feat: 스크린샷 리포트 반영 + 영구 저장 인프라 강화

- transformReport: channel_data.screenshots → report.screenshots 자동 매핑
- transformReport: youtubeAudit/instagramAudit/facebookAudit diagnosis에 evidenceIds 자동 연결 (채널별 스크린샷 → 진단 항목 연결)
- collect-channel-data: 스크린샷 아카이브를 병렬→순차로 변경 (rate-limit 방지), 실패 시 상세 로그
- scripts/archive-screenshots.py: 기존 GCS 임시 URL → Supabase Storage 일괄 재아카이브 스크립트 추가
- TypeScript 기존 에러 3개 수정 (SectionErrorBoundary, ClinicSnapshot, reviewCount 유니언 타입)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
claude/bold-hawking
Haewon Kam 2026-04-07 14:43:25 +09:00
parent 2d1937944a
commit 9c4d10609f
5 changed files with 292 additions and 37 deletions

View File

@ -0,0 +1,184 @@
#!/usr/bin/env python3
"""
archive-screenshots.py
GCS 임시 URL(7일 만료)로 저장된 스크린샷을 Supabase Storage에 영구 아카이브.
실행: python3 scripts/archive-screenshots.py
환경: SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY (.env 또는 환경변수)
"""
import json
import os
import re
import urllib.error
import urllib.parse
import urllib.request
# ── 환경변수 로드 ──────────────────────────────────────────────────────────────
def load_env():
    """Load key/value pairs from the project-root .env file, then overlay
    Supabase-related process environment variables (env vars win).

    Returns:
        dict[str, str]: merged configuration values.
    """
    env = {}
    env_path = os.path.join(os.path.dirname(__file__), '..', '.env')
    try:
        # Explicit UTF-8: values may contain non-ASCII text, and the platform
        # default encoding (e.g. cp949/cp1252 on Windows) would mangle them.
        with open(env_path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith('#') and '=' in line:
                    k, v = line.split('=', 1)
                    # Strip surrounding double or single quotes from the value.
                    env[k.strip()] = v.strip().strip('"').strip("'")
    except FileNotFoundError:
        pass  # no .env file is fine — rely on process env vars only
    # Process environment variables take precedence over .env contents.
    for k in ('SUPABASE_URL', 'SUPABASE_SERVICE_ROLE_KEY', 'VITE_SUPABASE_URL'):
        if os.environ.get(k):
            env[k] = os.environ[k]
    return env
env = load_env()
# Prefer SUPABASE_URL; fall back to the Vite-prefixed variable the frontend
# build uses (same value under a different key).  NOTE(review): the original
# applied .replace('VITE_', '') to the *value* — a no-op, since the prefix
# lives in the key, not the URL — so it was dropped.
SUPABASE_URL = env.get('SUPABASE_URL') or env.get('VITE_SUPABASE_URL', '')
SERVICE_KEY = env.get('SUPABASE_SERVICE_ROLE_KEY', '')
BUCKET = 'screenshots'  # Supabase Storage bucket name
DB_DIR = os.path.join(os.path.dirname(__file__), '..', 'src', 'data', 'db')
# Fail fast with a readable message when credentials are missing.
if not SUPABASE_URL or not SERVICE_KEY:
    print('❌ SUPABASE_URL 또는 SUPABASE_SERVICE_ROLE_KEY 환경변수가 없습니다.')
    print(' .env 파일에 추가하거나 환경변수로 설정해주세요.')
    raise SystemExit(1)
# ── Supabase Storage 업로드 ────────────────────────────────────────────────────
def upload_to_storage(storage_path: str, image_bytes: bytes) -> str:
    """Upload PNG bytes to the Supabase Storage bucket; return the public URL.

    Raises:
        RuntimeError: when the Storage API responds with an HTTP error.
    """
    safe_path = urllib.parse.quote(storage_path, safe='/')
    endpoint = f'{SUPABASE_URL}/storage/v1/object/{BUCKET}/{safe_path}'
    headers = {
        'Authorization': f'Bearer {SERVICE_KEY}',
        'Content-Type': 'image/png',
        'x-upsert': 'true',  # overwrite when re-running the same archive
    }
    request = urllib.request.Request(
        endpoint, data=image_bytes, method='POST', headers=headers
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            resp.read()
    except urllib.error.HTTPError as err:
        detail = err.read().decode()
        raise RuntimeError(f'Upload failed {err.code}: {detail}')
    return f'{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/{safe_path}'
# ── GCS URL에서 이미지 다운로드 ────────────────────────────────────────────────
def fetch_image(gcs_url: str) -> bytes:
    """Download the image at *gcs_url* and return its raw bytes."""
    # Browser-like User-Agent — presumably to avoid bot filtering; confirm.
    request = urllib.request.Request(gcs_url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(request, timeout=20) as resp:
        return resp.read()
# ── Supabase DB에서 모든 리포트 조회 + URL 업데이트 ───────────────────────────
def fetch_reports():
    """Fetch id/clinic_name/url/channel_data/report for all marketing reports."""
    endpoint = (
        f'{SUPABASE_URL}/rest/v1/marketing_reports'
        '?select=id,clinic_name,url,channel_data,report'
    )
    headers = {
        'Authorization': f'Bearer {SERVICE_KEY}',
        'apikey': SERVICE_KEY,
        'Accept': 'application/json',
    }
    request = urllib.request.Request(endpoint, headers=headers)
    with urllib.request.urlopen(request, timeout=30) as resp:
        return json.loads(resp.read())
def update_report_screenshots(report_id: str, channel_data: dict, report: dict):
    """PATCH the updated channel_data and report JSONB back to Supabase."""
    body = json.dumps({'channel_data': channel_data, 'report': report}).encode()
    endpoint = f'{SUPABASE_URL}/rest/v1/marketing_reports?id=eq.{report_id}'
    headers = {
        'Authorization': f'Bearer {SERVICE_KEY}',
        'apikey': SERVICE_KEY,
        'Content-Type': 'application/json',
        'Prefer': 'return=minimal',  # no response body needed
    }
    request = urllib.request.Request(
        endpoint, data=body, method='PATCH', headers=headers
    )
    with urllib.request.urlopen(request, timeout=30) as resp:
        resp.read()
# ── 메인 ───────────────────────────────────────────────────────────────────────
def get_domain(site_url: str) -> str:
    """Return the hostname of *site_url* without a leading ``www.`` prefix.

    Falls back to ``'unknown'`` for empty or invalid URLs (urlparse yields a
    ``None`` hostname, whose attribute access raises and is caught below).
    """
    try:
        host = urllib.parse.urlparse(site_url).hostname
        # removeprefix strips only a *leading* 'www.' — str.replace would
        # also mangle hostnames containing 'www.' in the middle.
        return host.removeprefix('www.') or 'unknown'
    except Exception:
        return 'unknown'
def archive_screenshots_for_report(row: dict) -> int:
    """Re-archive all expiring screenshot URLs belonging to one report row.

    Downloads each GCS/Firecrawl-hosted image, uploads it to Supabase
    Storage, rewrites the URL in both ``channel_data.screenshots`` and
    ``report.screenshots``, and persists the updated JSONB back to the DB
    when anything changed.  Per-screenshot failures are non-fatal: the
    entry keeps its old URL as a fallback.

    Returns:
        Number of screenshots successfully archived.
    """
    report_id = row['id']
    clinic_name = row.get('clinic_name', '?')
    domain = get_domain(row.get('url', ''))
    channel_data = row.get('channel_data') or {}
    report = row.get('report') or {}
    ss_list = channel_data.get('screenshots', [])
    rss_list = report.get('screenshots', [])
    # De-duplicate by id across both lists.  Also guard s.get('id') — the
    # original indexed s['id'] directly and raised KeyError on entries
    # missing an id.
    all_ss = {s['id']: s for s in ss_list + rss_list if s.get('url') and s.get('id')}
    archived = 0
    for ss_id, ss in all_ss.items():
        url = ss.get('url', '')
        if 'googleapis.com' not in url and 'firecrawl' not in url:
            continue  # already a Supabase (or other permanent) URL
        storage_path = f'clinics/{domain}/{report_id}/screenshots/{ss_id}.png'
        try:
            print(f' → 다운로드: {ss_id} ({url[:60]}...)')
            image_bytes = fetch_image(url)
            public_url = upload_to_storage(storage_path, image_bytes)
            # Rewrite every entry carrying this id, in both lists (the
            # dict value is one of these entries, so it is covered too).
            for s in ss_list:
                if s.get('id') == ss_id:
                    s['url'] = public_url
            for s in rss_list:
                if s.get('id') == ss_id:
                    s['url'] = public_url
            print(f' ✅ 아카이브 완료 → {public_url[:70]}...')
            archived += 1
        except Exception as e:
            print(f' ❌ 실패: {e}')  # non-fatal: old URL stays as fallback
    if archived > 0:
        # Persist both rewritten JSONB columns in one PATCH.
        update_report_screenshots(report_id, channel_data, report)
        print(f' 💾 DB 업데이트 완료 ({clinic_name})')
    return archived
def main():
    """Scan every marketing report and archive its expiring screenshot URLs."""
    print('=== Supabase Screenshot 영구 아카이브 ===')
    print(f'대상: {SUPABASE_URL}')
    print()
    print('DB에서 리포트 목록 조회 중...')
    reports = fetch_reports()
    print(f'{len(reports)}개 리포트')
    print()
    total_archived = 0
    for row in reports:
        name = row.get('clinic_name', '?')
        ch_shots = (row.get('channel_data') or {}).get('screenshots', [])
        rp_shots = (row.get('report') or {}).get('screenshots', [])
        # Pre-filter rows with nothing to archive.  This test must match the
        # URL check inside archive_screenshots_for_report — the original only
        # looked for 'googleapis.com' and silently skipped rows whose
        # screenshots were all firecrawl-hosted.
        needs_archive = any(
            'googleapis.com' in (s.get('url', '')) or 'firecrawl' in (s.get('url', ''))
            for s in ch_shots + rp_shots
        )
        if not needs_archive:
            continue
        print(f'[{name}] channel_data={len(ch_shots)}개 / report={len(rp_shots)}개 스크린샷')
        total_archived += archive_screenshots_for_report(row)
    print()
    print(f'=== 완료: 총 {total_archived}개 스크린샷 영구 저장 ===')


if __name__ == '__main__':
    main()

View File

@ -28,7 +28,7 @@ const infoFields = (data: ClinicSnapshotType): InfoField[] => [
data.phone ? { label: '전화', value: data.phone, icon: Phone, href: `tel:${data.phone.replace(/[^+0-9]/g, '')}` } : null,
data.domain ? { label: '도메인', value: data.domain, icon: Globe, href: `https://${data.domain.replace(/^https?:\/\//, '')}` } : null,
data.registryData?.websiteEn ? { label: '영문 사이트', value: data.registryData.websiteEn, icon: Globe, href: data.registryData.websiteEn } : null,
].filter((f): f is NonNullable<InfoField> => f !== null);
].filter(Boolean) as InfoField[];
export default function ClinicSnapshot({ data }: ClinicSnapshotProps) {
const fields = infoFields(data);

View File

@ -10,6 +10,7 @@ interface State {
}
export class SectionErrorBoundary extends Component<Props, State> {
declare props: Props;
state: State = { hasError: false };
static getDerivedStateFromError() {

View File

@ -550,7 +550,7 @@ export function transformApiReport(
name: doctor?.name || '',
credentials: doctor?.specialty || '',
rating: doctor?.rating ?? 0,
reviewCount: doctor?.reviewCount ?? doctor?.reviews ?? 0,
reviewCount: (doctor as { reviewCount?: number })?.reviewCount ?? (doctor as { reviews?: number })?.reviews ?? 0,
},
// 강남언니 is 10-point scale. AI sometimes gives 5-point — auto-correct.
overallRating: (() => {
@ -746,6 +746,15 @@ export interface EnrichmentData {
badges?: string[];
sourceUrl?: string;
};
// 스크래핑 시 캡처된 스크린샷 목록 (channel_data.screenshots)
screenshots?: {
id: string;
url: string;
channel: string;
caption: string;
capturedAt?: string;
sourceUrl?: string;
}[];
naverBlog?: {
totalResults?: number;
searchQuery?: string;
@ -1119,7 +1128,7 @@ export function mergeEnrichment(
linkedDomain: fb.website || '',
reviews: (() => {
// Facebook rating 문자열 파싱: "Not yet rated (3 Reviews)" or "4.8 (120 Reviews)"
const m = (fb.rating || '').match(/\((\d+)\s+Reviews?\)/i);
const m = String(fb.rating || '').match(/\((\d+)\s+Reviews?\)/i);
return m ? parseInt(m[1], 10) : 0;
})(),
recentPostAge: '',
@ -1178,5 +1187,74 @@ export function mergeEnrichment(
merged.problemDiagnosis = [...merged.problemDiagnosis, ...enrichDiagnosis];
}
// ── 스크린샷 영구 반영 ──────────────────────────────────────────────────────
// channel_data.screenshots → report.screenshots 로 옮기고,
// 채널별로 diagnosis evidenceIds 자동 연결
if (enrichment.screenshots?.length) {
const ss = enrichment.screenshots;
// 1) report.screenshots 세팅 (ScreenshotEvidence 형식으로 변환)
merged.screenshots = ss.map(s => ({
id: s.id,
url: s.url,
channel: s.channel,
caption: s.caption,
capturedAt: s.capturedAt ?? new Date().toISOString(),
sourceUrl: s.sourceUrl,
}));
// 2) 채널명 → screenshot IDs 매핑 테이블 생성
// channel_data의 channel 필드: "YouTube", "웹사이트", "Instagram", "Facebook" 등
const CHANNEL_ALIAS: Record<string, string[]> = {
youtube: ['youtube', 'YouTube', 'yt'],
instagram: ['instagram', 'Instagram', 'ig'],
facebook: ['facebook', 'Facebook', 'fb'],
website: ['웹사이트', 'website', 'Website'],
gangnamUnni: ['강남언니', 'gangnamUnni'],
naverPlace: ['네이버 플레이스', 'naverPlace'],
naverBlog: ['네이버 블로그', 'naverBlog'],
};
const channelToIds: Record<string, string[]> = {};
for (const s of ss) {
for (const [key, aliases] of Object.entries(CHANNEL_ALIAS)) {
if (aliases.some(a => s.channel.toLowerCase().includes(a.toLowerCase()))) {
channelToIds[key] = [...(channelToIds[key] ?? []), s.id];
break;
}
}
}
// 3) 채널별 audit.diagnosis 배열에 evidenceIds 연결
// YouTubeAudit / InstagramAudit / FacebookAudit 컴포넌트가 이 필드를 사용함
const linkIds = (diagItems: import('../types/report').DiagnosisItem[], channelKey: string): import('../types/report').DiagnosisItem[] => {
const ids = channelToIds[channelKey] ?? [];
if (!ids.length) return diagItems;
return diagItems.map(item => ({ ...item, evidenceIds: [...(item.evidenceIds ?? []), ...ids] }));
};
if (merged.youtubeAudit?.diagnosis?.length) {
merged.youtubeAudit = { ...merged.youtubeAudit, diagnosis: linkIds(merged.youtubeAudit.diagnosis, 'youtube') };
}
if (merged.instagramAudit?.diagnosis?.length) {
merged.instagramAudit = { ...merged.instagramAudit, diagnosis: linkIds(merged.instagramAudit.diagnosis, 'instagram') };
}
if (merged.facebookAudit?.diagnosis?.length) {
merged.facebookAudit = { ...merged.facebookAudit, diagnosis: linkIds(merged.facebookAudit.diagnosis, 'facebook') };
}
// websiteAudit / 기타 채널은 EvidenceGallery를 직접 받지 않으므로 problemDiagnosis에만 연결
merged.problemDiagnosis = merged.problemDiagnosis.map(item => {
const catLower = item.category.toLowerCase();
let ids: string[] = [];
for (const [key, ssIds] of Object.entries(channelToIds)) {
if (catLower.includes(key) || key.includes(catLower)) {
ids = [...ids, ...ssIds];
}
}
return ids.length > 0 ? { ...item, evidenceIds: ids } : item;
});
}
// ───────────────────────────────────────────────────────────────────────────
return merged;
}

View File

@ -638,60 +638,52 @@ Deno.serve(async (req) => {
throw new Error(`No screenshots captured: ${debugInfo}`);
}
// ─── Step 2: Archive to Supabase Storage (replace 7-day GCS URLs) ───────
// Firecrawl returns signed GCS URLs that expire after ~7 days.
// We already have the image as base64 in memory — upload it permanently
// to Supabase Storage and replace ss.url in-place before storing to DB.
//
// Upload happens in parallel; failures are non-fatal — the screenshot
// keeps its GCS URL as a fallback so Vision analysis still proceeds.
// clinics/{domain}/{reportId}/screenshots/{id}.png
// ─── Step 2: Archive to Supabase Storage (GCS 7일 임시 URL → 영구 저장) ──
// base64가 메모리에 있는 지금 즉시 업로드. 실패 시 GCS URL 유지(비치명적).
// 경로: clinics/{domain}/{reportId}/screenshots/{id}.png
const domain = (() => {
try { return new URL(row.url || "").hostname.replace('www.', ''); } catch { return "unknown"; }
try {
const h = new URL(row.url || "").hostname.replace('www.', '');
return h || "unknown";
} catch { return "unknown"; }
})();
const SUPABASE_STORAGE_BUCKET = "screenshots";
const archiveTasks = screenshots.map(async (ss) => {
if (!ss.base64) return; // no image data — skip
// 순차 업로드 (병렬 시 Supabase rate-limit 위험 방지)
for (const ss of screenshots) {
if (!ss.base64) {
console.warn(`[archive] ${ss.id}: base64 없음 — GCS URL 유지`);
continue;
}
try {
// base64 → Uint8Array
const binaryStr = atob(ss.base64);
const bytes = new Uint8Array(binaryStr.length);
for (let i = 0; i < binaryStr.length; i++) {
bytes[i] = binaryStr.charCodeAt(i);
}
for (let i = 0; i < binaryStr.length; i++) bytes[i] = binaryStr.charCodeAt(i);
// Upload: clinics/{domain}/{reportId}/screenshots/{screenshotId}.png
const storagePath = `clinics/${domain}/${reportId}/screenshots/${ss.id}.png`;
const { error: uploadError } = await supabase.storage
.from(SUPABASE_STORAGE_BUCKET)
.upload(storagePath, bytes, {
contentType: "image/png",
upsert: true, // overwrite if re-running same analysis
});
.upload(storagePath, bytes, { contentType: "image/png", upsert: true });
if (uploadError) {
// Non-fatal: log and keep GCS URL as fallback
console.warn(`[archive] Storage upload failed for ${ss.id}: ${uploadError.message}`);
return;
console.error(`[archive] ❌ 업로드 실패 ${ss.id}: ${uploadError.message}`);
continue; // GCS URL 유지
}
// Replace GCS temp URL with permanent Supabase Storage public URL
const { data: { publicUrl } } = supabase.storage
.from(SUPABASE_STORAGE_BUCKET)
.getPublicUrl(storagePath);
ss.url = publicUrl; // in-place replace — all downstream code uses permanent URL
console.log(`[archive] ${ss.id} → clinics/${domain}/${reportId}/screenshots/`);
} catch (archiveErr) {
// Non-fatal: Vision analysis still proceeds with base64
console.warn(`[archive] Exception for ${ss.id}:`, archiveErr instanceof Error ? archiveErr.message : archiveErr);
ss.url = publicUrl;
ss.archived = true;
console.log(`[archive] ✅ ${ss.id} → Supabase Storage`);
} catch (err) {
console.error(`[archive] ❌ 예외 ${ss.id}:`, err instanceof Error ? err.message : err);
}
}
});
await Promise.allSettled(archiveTasks);
const archivedCount = screenshots.filter(ss => ss.url.includes("supabase")).length;
console.log(`[archive] ${archivedCount}/${screenshots.length} screenshots archived to Supabase Storage`);
const archivedCount = screenshots.filter(ss => ss.archived).length;
console.log(`[archive] ${archivedCount}/${screenshots.length}개 영구 저장 완료`);
// Step 3: Run Gemini Vision on captured screenshots (base64 still in memory)
if (GEMINI_API_KEY && screenshots.length > 0) {