feat: 스크린샷 리포트 반영 + 영구 저장 인프라 강화
- transformReport: channel_data.screenshots → report.screenshots 자동 매핑 - transformReport: youtubeAudit/instagramAudit/facebookAudit diagnosis에 evidenceIds 자동 연결 (채널별 스크린샷 → 진단 항목 연결) - collect-channel-data: 스크린샷 아카이브를 병렬→순차로 변경 (rate-limit 방지), 실패 시 상세 로그 - scripts/archive-screenshots.py: 기존 GCS 임시 URL → Supabase Storage 일괄 재아카이브 스크립트 추가 - TypeScript 기존 에러 3개 수정 (SectionErrorBoundary, ClinicSnapshot, reviewCount 유니언 타입) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
claude/bold-hawking
parent
2d1937944a
commit
9c4d10609f
|
|
@ -0,0 +1,184 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
archive-screenshots.py
|
||||
GCS 임시 URL(7일 만료)로 저장된 스크린샷을 Supabase Storage에 영구 아카이브.
|
||||
|
||||
실행: python3 scripts/archive-screenshots.py
|
||||
환경: SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY (.env 또는 환경변수)
|
||||
"""
|
||||
|
||||
import json
import os
import re
import urllib.error
import urllib.parse
import urllib.request
|
||||
|
||||
# ── 환경변수 로드 ──────────────────────────────────────────────────────────────
|
||||
def load_env():
    """Read key/value pairs from ../.env, then overlay real environment vars.

    Returns a dict of settings; process environment variables always win
    over values found in the .env file.
    """
    values = {}
    dotenv_file = os.path.join(os.path.dirname(__file__), '..', '.env')
    try:
        with open(dotenv_file) as fh:
            for raw in fh:
                entry = raw.strip()
                # Skip blanks, comments, and lines without an assignment.
                if not entry or entry.startswith('#') or '=' not in entry:
                    continue
                key, _, val = entry.partition('=')
                values[key.strip()] = val.strip().strip('"').strip("'")
    except FileNotFoundError:
        pass  # no .env file is fine — rely on the process environment
    # Process environment takes precedence over the .env file.
    for name in ('SUPABASE_URL', 'SUPABASE_SERVICE_ROLE_KEY', 'VITE_SUPABASE_URL'):
        if os.environ.get(name):
            values[name] = os.environ[name]
    return values
|
||||
|
||||
env = load_env()

# Resolve Supabase connection settings. SUPABASE_URL wins; VITE_SUPABASE_URL
# (same value under the frontend's naming) is the fallback.
# NOTE: the previous `.replace('VITE_', '')` was applied to the URL *value*,
# not the variable name — a no-op at best, a URL-mangling bug at worst.
SUPABASE_URL = env.get('SUPABASE_URL') or env.get('VITE_SUPABASE_URL', '')
SERVICE_KEY = env.get('SUPABASE_SERVICE_ROLE_KEY', '')
BUCKET = 'screenshots'  # Supabase Storage bucket for archived screenshots
DB_DIR = os.path.join(os.path.dirname(__file__), '..', 'src', 'data', 'db')

# Both URL and service-role key are mandatory — abort early with guidance.
if not SUPABASE_URL or not SERVICE_KEY:
    print('❌ SUPABASE_URL 또는 SUPABASE_SERVICE_ROLE_KEY 환경변수가 없습니다.')
    print('   .env 파일에 추가하거나 환경변수로 설정해주세요.')
    raise SystemExit(1)
|
||||
|
||||
# ── Supabase Storage 업로드 ────────────────────────────────────────────────────
|
||||
def upload_to_storage(storage_path: str, image_bytes: bytes) -> str:
    """Upload PNG bytes to Supabase Storage and return the public URL.

    Args:
        storage_path: Object path inside the bucket
            (e.g. clinics/{domain}/{reportId}/screenshots/{id}.png).
        image_bytes: Raw PNG payload.

    Raises:
        RuntimeError: If the Storage API responds with an HTTP error
            (chained to the original HTTPError).
    """
    encoded_path = urllib.parse.quote(storage_path, safe='/')
    url = f'{SUPABASE_URL}/storage/v1/object/{BUCKET}/{encoded_path}'
    req = urllib.request.Request(
        url,
        data=image_bytes,
        method='POST',
        headers={
            'Authorization': f'Bearer {SERVICE_KEY}',
            'Content-Type': 'image/png',
            'x-upsert': 'true',  # overwrite if the object already exists
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as r:
            r.read()
    except urllib.error.HTTPError as e:
        # errors='replace' so a non-UTF-8 error body can't mask the real
        # failure; `from e` preserves the original traceback for debugging.
        body = e.read().decode(errors='replace')
        raise RuntimeError(f'Upload failed {e.code}: {body}') from e

    return f'{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/{encoded_path}'
|
||||
|
||||
# ── GCS URL에서 이미지 다운로드 ────────────────────────────────────────────────
|
||||
def fetch_image(gcs_url: str) -> bytes:
    """Download and return the raw image bytes behind a (signed) GCS URL."""
    # A browser-like User-Agent avoids naive bot blocking on the CDN side.
    request = urllib.request.Request(gcs_url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(request, timeout=20) as response:
        return response.read()
|
||||
|
||||
# ── Supabase DB에서 모든 리포트 조회 + URL 업데이트 ───────────────────────────
|
||||
def fetch_reports():
    """Fetch id/name/url plus both JSONB payloads for every report row."""
    endpoint = f'{SUPABASE_URL}/rest/v1/marketing_reports?select=id,clinic_name,url,channel_data,report'
    headers = {
        'Authorization': f'Bearer {SERVICE_KEY}',
        'apikey': SERVICE_KEY,
        'Accept': 'application/json',
    }
    request = urllib.request.Request(endpoint, headers=headers)
    with urllib.request.urlopen(request, timeout=30) as response:
        return json.loads(response.read())
|
||||
|
||||
def update_report_screenshots(report_id: str, channel_data: dict, report: dict):
    """Persist the updated channel_data / report JSONB columns to Supabase."""
    body = json.dumps({'channel_data': channel_data, 'report': report}).encode()
    endpoint = f'{SUPABASE_URL}/rest/v1/marketing_reports?id=eq.{report_id}'
    headers = {
        'Authorization': f'Bearer {SERVICE_KEY}',
        'apikey': SERVICE_KEY,
        'Content-Type': 'application/json',
        'Prefer': 'return=minimal',  # no body needed back from PostgREST
    }
    request = urllib.request.Request(endpoint, data=body, method='PATCH', headers=headers)
    with urllib.request.urlopen(request, timeout=30) as response:
        response.read()
|
||||
|
||||
# ── 메인 ───────────────────────────────────────────────────────────────────────
|
||||
def get_domain(site_url: str) -> str:
    """Extract the hostname (minus a leading 'www.') from a site URL.

    Returns 'unknown' when the URL has no parseable hostname.
    """
    try:
        host = urllib.parse.urlparse(site_url).hostname
        if not host:
            return 'unknown'
        # Strip only a *leading* 'www.' — str.replace('www.', '') would also
        # mangle hostnames like 'sub.www.example.com'.
        stripped = host[4:] if host.startswith('www.') else host
        return stripped or 'unknown'
    except Exception:
        return 'unknown'
|
||||
|
||||
def archive_screenshots_for_report(row: dict) -> int:
    """Re-archive one report's temporary screenshot URLs into Supabase Storage.

    Downloads every GCS/Firecrawl-hosted screenshot referenced by either
    channel_data.screenshots or report.screenshots, uploads it permanently,
    rewrites the URLs in place, and persists the updated JSONB back to the DB
    when anything changed.

    Returns the number of screenshots successfully archived.
    """
    report_id = row['id']
    clinic_name = row.get('clinic_name', '?')
    domain = get_domain(row.get('url', ''))
    channel_data = row.get('channel_data') or {}
    report = row.get('report') or {}

    ss_list = channel_data.get('screenshots', [])
    rss_list = report.get('screenshots', [])
    # De-duplicate by id across both lists (a report entry wins on collision).
    # Entries missing an id or url cannot be archived — skip them instead of
    # raising KeyError (the rest of this function uses s.get('id')).
    all_ss = {s['id']: s for s in ss_list + rss_list if s.get('url') and s.get('id')}

    archived = 0
    for ss_id, ss in all_ss.items():
        url = ss.get('url', '')
        if 'googleapis.com' not in url and 'firecrawl' not in url:
            continue  # already a Supabase URL, or some other permanent host

        storage_path = f'clinics/{domain}/{report_id}/screenshots/{ss_id}.png'

        try:
            print(f' → 다운로드: {ss_id} ({url[:60]}...)')
            image_bytes = fetch_image(url)
            public_url = upload_to_storage(storage_path, image_bytes)

            # In-place URL swap. `ss` aliases whichever list object "won" the
            # dict above, so also patch any same-id entry in the other list.
            ss['url'] = public_url
            if ss_id in {s.get('id') for s in ss_list}:
                for s in ss_list:
                    if s.get('id') == ss_id:
                        s['url'] = public_url
            if ss_id in {s.get('id') for s in rss_list}:
                for s in rss_list:
                    if s.get('id') == ss_id:
                        s['url'] = public_url

            print(f' ✅ 아카이브 완료 → {public_url[:70]}...')
            archived += 1

        except Exception as e:
            # Non-fatal: the screenshot keeps its temporary URL.
            print(f' ❌ 실패: {e}')

    if archived > 0:
        # Persist the rewritten URLs back to the JSONB columns.
        update_report_screenshots(report_id, channel_data, report)
        print(f' 💾 DB 업데이트 완료 ({clinic_name})')

    return archived
|
||||
|
||||
def main():
    """Scan every report and archive screenshots still on temporary hosts."""
    print('=== Supabase Screenshot 영구 아카이브 ===')
    print(f'대상: {SUPABASE_URL}')
    print()

    print('DB에서 리포트 목록 조회 중...')
    reports = fetch_reports()
    print(f'총 {len(reports)}개 리포트')
    print()

    total_archived = 0
    for row in reports:
        name = row.get('clinic_name', '?')
        ss = (row.get('channel_data') or {}).get('screenshots', [])
        rss = (row.get('report') or {}).get('screenshots', [])
        # Skip rows with no temporary URLs. Check both hosts that
        # archive_screenshots_for_report() handles (GCS *and* Firecrawl) —
        # checking only googleapis.com silently skipped firecrawl-only rows.
        has_temp = any(
            'googleapis.com' in (s.get('url', '')) or 'firecrawl' in (s.get('url', ''))
            for s in ss + rss
        )
        if not has_temp:
            continue

        print(f'[{name}] channel_data={len(ss)}개 / report={len(rss)}개 스크린샷')
        total_archived += archive_screenshots_for_report(row)
        print()

    print(f'=== 완료: 총 {total_archived}개 스크린샷 영구 저장 ===')
|
||||
|
||||
# Entry point: run the archive pass only when executed as a script.
if __name__ == '__main__':
    main()
|
||||
|
|
@ -28,7 +28,7 @@ const infoFields = (data: ClinicSnapshotType): InfoField[] => [
|
|||
data.phone ? { label: '전화', value: data.phone, icon: Phone, href: `tel:${data.phone.replace(/[^+0-9]/g, '')}` } : null,
|
||||
data.domain ? { label: '도메인', value: data.domain, icon: Globe, href: `https://${data.domain.replace(/^https?:\/\//, '')}` } : null,
|
||||
data.registryData?.websiteEn ? { label: '영문 사이트', value: data.registryData.websiteEn, icon: Globe, href: data.registryData.websiteEn } : null,
|
||||
].filter((f): f is NonNullable<InfoField> => f !== null);
|
||||
].filter(Boolean) as InfoField[];
|
||||
|
||||
export default function ClinicSnapshot({ data }: ClinicSnapshotProps) {
|
||||
const fields = infoFields(data);
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ interface State {
|
|||
}
|
||||
|
||||
export class SectionErrorBoundary extends Component<Props, State> {
|
||||
declare props: Props;
|
||||
state: State = { hasError: false };
|
||||
|
||||
static getDerivedStateFromError() {
|
||||
|
|
|
|||
|
|
@ -550,7 +550,7 @@ export function transformApiReport(
|
|||
name: doctor?.name || '',
|
||||
credentials: doctor?.specialty || '',
|
||||
rating: doctor?.rating ?? 0,
|
||||
reviewCount: doctor?.reviewCount ?? doctor?.reviews ?? 0,
|
||||
reviewCount: (doctor as { reviewCount?: number })?.reviewCount ?? (doctor as { reviews?: number })?.reviews ?? 0,
|
||||
},
|
||||
// 강남언니 is 10-point scale. AI sometimes gives 5-point — auto-correct.
|
||||
overallRating: (() => {
|
||||
|
|
@ -746,6 +746,15 @@ export interface EnrichmentData {
|
|||
badges?: string[];
|
||||
sourceUrl?: string;
|
||||
};
|
||||
// 스크래핑 시 캡처된 스크린샷 목록 (channel_data.screenshots)
|
||||
screenshots?: {
|
||||
id: string;
|
||||
url: string;
|
||||
channel: string;
|
||||
caption: string;
|
||||
capturedAt?: string;
|
||||
sourceUrl?: string;
|
||||
}[];
|
||||
naverBlog?: {
|
||||
totalResults?: number;
|
||||
searchQuery?: string;
|
||||
|
|
@ -1119,7 +1128,7 @@ export function mergeEnrichment(
|
|||
linkedDomain: fb.website || '',
|
||||
reviews: (() => {
|
||||
// Facebook rating 문자열 파싱: "Not yet rated (3 Reviews)" or "4.8 (120 Reviews)"
|
||||
const m = (fb.rating || '').match(/\((\d+)\s+Reviews?\)/i);
|
||||
const m = String(fb.rating || '').match(/\((\d+)\s+Reviews?\)/i);
|
||||
return m ? parseInt(m[1], 10) : 0;
|
||||
})(),
|
||||
recentPostAge: '',
|
||||
|
|
@ -1178,5 +1187,74 @@ export function mergeEnrichment(
|
|||
merged.problemDiagnosis = [...merged.problemDiagnosis, ...enrichDiagnosis];
|
||||
}
|
||||
|
||||
// ── 스크린샷 영구 반영 ──────────────────────────────────────────────────────
|
||||
// channel_data.screenshots → report.screenshots 로 옮기고,
|
||||
// 채널별로 diagnosis evidenceIds 자동 연결
|
||||
if (enrichment.screenshots?.length) {
|
||||
const ss = enrichment.screenshots;
|
||||
|
||||
// 1) report.screenshots 세팅 (ScreenshotEvidence 형식으로 변환)
|
||||
merged.screenshots = ss.map(s => ({
|
||||
id: s.id,
|
||||
url: s.url,
|
||||
channel: s.channel,
|
||||
caption: s.caption,
|
||||
capturedAt: s.capturedAt ?? new Date().toISOString(),
|
||||
sourceUrl: s.sourceUrl,
|
||||
}));
|
||||
|
||||
// 2) 채널명 → screenshot IDs 매핑 테이블 생성
|
||||
// channel_data의 channel 필드: "YouTube", "웹사이트", "Instagram", "Facebook" 등
|
||||
const CHANNEL_ALIAS: Record<string, string[]> = {
|
||||
youtube: ['youtube', 'YouTube', 'yt'],
|
||||
instagram: ['instagram', 'Instagram', 'ig'],
|
||||
facebook: ['facebook', 'Facebook', 'fb'],
|
||||
website: ['웹사이트', 'website', 'Website'],
|
||||
gangnamUnni: ['강남언니', 'gangnamUnni'],
|
||||
naverPlace: ['네이버 플레이스', 'naverPlace'],
|
||||
naverBlog: ['네이버 블로그', 'naverBlog'],
|
||||
};
|
||||
|
||||
const channelToIds: Record<string, string[]> = {};
|
||||
for (const s of ss) {
|
||||
for (const [key, aliases] of Object.entries(CHANNEL_ALIAS)) {
|
||||
if (aliases.some(a => s.channel.toLowerCase().includes(a.toLowerCase()))) {
|
||||
channelToIds[key] = [...(channelToIds[key] ?? []), s.id];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 3) 채널별 audit.diagnosis 배열에 evidenceIds 연결
|
||||
// YouTubeAudit / InstagramAudit / FacebookAudit 컴포넌트가 이 필드를 사용함
|
||||
const linkIds = (diagItems: import('../types/report').DiagnosisItem[], channelKey: string): import('../types/report').DiagnosisItem[] => {
|
||||
const ids = channelToIds[channelKey] ?? [];
|
||||
if (!ids.length) return diagItems;
|
||||
return diagItems.map(item => ({ ...item, evidenceIds: [...(item.evidenceIds ?? []), ...ids] }));
|
||||
};
|
||||
|
||||
if (merged.youtubeAudit?.diagnosis?.length) {
|
||||
merged.youtubeAudit = { ...merged.youtubeAudit, diagnosis: linkIds(merged.youtubeAudit.diagnosis, 'youtube') };
|
||||
}
|
||||
if (merged.instagramAudit?.diagnosis?.length) {
|
||||
merged.instagramAudit = { ...merged.instagramAudit, diagnosis: linkIds(merged.instagramAudit.diagnosis, 'instagram') };
|
||||
}
|
||||
if (merged.facebookAudit?.diagnosis?.length) {
|
||||
merged.facebookAudit = { ...merged.facebookAudit, diagnosis: linkIds(merged.facebookAudit.diagnosis, 'facebook') };
|
||||
}
|
||||
// websiteAudit / 기타 채널은 EvidenceGallery를 직접 받지 않으므로 problemDiagnosis에만 연결
|
||||
merged.problemDiagnosis = merged.problemDiagnosis.map(item => {
|
||||
const catLower = item.category.toLowerCase();
|
||||
let ids: string[] = [];
|
||||
for (const [key, ssIds] of Object.entries(channelToIds)) {
|
||||
if (catLower.includes(key) || key.includes(catLower)) {
|
||||
ids = [...ids, ...ssIds];
|
||||
}
|
||||
}
|
||||
return ids.length > 0 ? { ...item, evidenceIds: ids } : item;
|
||||
});
|
||||
}
|
||||
// ───────────────────────────────────────────────────────────────────────────
|
||||
|
||||
return merged;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -638,60 +638,52 @@ Deno.serve(async (req) => {
|
|||
throw new Error(`No screenshots captured: ${debugInfo}`);
|
||||
}
|
||||
|
||||
// ─── Step 2: Archive to Supabase Storage (replace 7-day GCS URLs) ───────
|
||||
// Firecrawl returns signed GCS URLs that expire after ~7 days.
|
||||
// We already have the image as base64 in memory — upload it permanently
|
||||
// to Supabase Storage and replace ss.url in-place before storing to DB.
|
||||
//
|
||||
// Upload happens in parallel; failures are non-fatal — the screenshot
|
||||
// keeps its GCS URL as a fallback so Vision analysis still proceeds.
|
||||
// clinics/{domain}/{reportId}/screenshots/{id}.png
|
||||
// ─── Step 2: Archive to Supabase Storage (GCS 7일 임시 URL → 영구 저장) ──
|
||||
// base64가 메모리에 있는 지금 즉시 업로드. 실패 시 GCS URL 유지(비치명적).
|
||||
// 경로: clinics/{domain}/{reportId}/screenshots/{id}.png
|
||||
const domain = (() => {
|
||||
try { return new URL(row.url || "").hostname.replace('www.', ''); } catch { return "unknown"; }
|
||||
try {
|
||||
const h = new URL(row.url || "").hostname.replace('www.', '');
|
||||
return h || "unknown";
|
||||
} catch { return "unknown"; }
|
||||
})();
|
||||
const SUPABASE_STORAGE_BUCKET = "screenshots";
|
||||
const archiveTasks = screenshots.map(async (ss) => {
|
||||
if (!ss.base64) return; // no image data — skip
|
||||
|
||||
// 순차 업로드 (병렬 시 Supabase rate-limit 위험 방지)
|
||||
for (const ss of screenshots) {
|
||||
if (!ss.base64) {
|
||||
console.warn(`[archive] ${ss.id}: base64 없음 — GCS URL 유지`);
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
// base64 → Uint8Array
|
||||
const binaryStr = atob(ss.base64);
|
||||
const bytes = new Uint8Array(binaryStr.length);
|
||||
for (let i = 0; i < binaryStr.length; i++) {
|
||||
bytes[i] = binaryStr.charCodeAt(i);
|
||||
}
|
||||
for (let i = 0; i < binaryStr.length; i++) bytes[i] = binaryStr.charCodeAt(i);
|
||||
|
||||
// Upload: clinics/{domain}/{reportId}/screenshots/{screenshotId}.png
|
||||
const storagePath = `clinics/${domain}/${reportId}/screenshots/${ss.id}.png`;
|
||||
const { error: uploadError } = await supabase.storage
|
||||
.from(SUPABASE_STORAGE_BUCKET)
|
||||
.upload(storagePath, bytes, {
|
||||
contentType: "image/png",
|
||||
upsert: true, // overwrite if re-running same analysis
|
||||
});
|
||||
.upload(storagePath, bytes, { contentType: "image/png", upsert: true });
|
||||
|
||||
if (uploadError) {
|
||||
// Non-fatal: log and keep GCS URL as fallback
|
||||
console.warn(`[archive] Storage upload failed for ${ss.id}: ${uploadError.message}`);
|
||||
return;
|
||||
console.error(`[archive] ❌ 업로드 실패 ${ss.id}: ${uploadError.message}`);
|
||||
continue; // GCS URL 유지
|
||||
}
|
||||
|
||||
// Replace GCS temp URL with permanent Supabase Storage public URL
|
||||
const { data: { publicUrl } } = supabase.storage
|
||||
.from(SUPABASE_STORAGE_BUCKET)
|
||||
.getPublicUrl(storagePath);
|
||||
|
||||
ss.url = publicUrl; // in-place replace — all downstream code uses permanent URL
|
||||
console.log(`[archive] ${ss.id} → clinics/${domain}/${reportId}/screenshots/`);
|
||||
} catch (archiveErr) {
|
||||
// Non-fatal: Vision analysis still proceeds with base64
|
||||
console.warn(`[archive] Exception for ${ss.id}:`, archiveErr instanceof Error ? archiveErr.message : archiveErr);
|
||||
ss.url = publicUrl;
|
||||
ss.archived = true;
|
||||
console.log(`[archive] ✅ ${ss.id} → Supabase Storage`);
|
||||
} catch (err) {
|
||||
console.error(`[archive] ❌ 예외 ${ss.id}:`, err instanceof Error ? err.message : err);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
await Promise.allSettled(archiveTasks);
|
||||
|
||||
const archivedCount = screenshots.filter(ss => ss.url.includes("supabase")).length;
|
||||
console.log(`[archive] ${archivedCount}/${screenshots.length} screenshots archived to Supabase Storage`);
|
||||
const archivedCount = screenshots.filter(ss => ss.archived).length;
|
||||
console.log(`[archive] ${archivedCount}/${screenshots.length}개 영구 저장 완료`);
|
||||
|
||||
// Step 3: Run Gemini Vision on captured screenshots (base64 still in memory)
|
||||
if (GEMINI_API_KEY && screenshots.length > 0) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue