From 36d2f1cf498cf2dce6879b4b884c550ae4e48885 Mon Sep 17 00:00:00 2001 From: Haewon Kam Date: Tue, 7 Apr 2026 09:51:31 +0900 Subject: [PATCH] feat: archive Firecrawl screenshots to Supabase Storage (permanent URLs) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## 문제 Firecrawl이 반환하는 스크린샷 URL은 GCS Signed URL로 7일 후 만료. 리포트에 저장된 이미지 URL이 일주일 후 전부 깨짐 (403 Access Denied). ## 해결 collect-channel-data의 Vision 단계에 아카이빙 스텝 추가. 캡처 직후 base64(이미 메모리에 있음)를 Supabase Storage에 영구 업로드. ### 처리 흐름 (변경 후) 1. captureAllScreenshots() → GCS URL + base64 반환 (기존) 2. [신규] archiveTasks: base64 → Supabase Storage 업로드 (병렬) - 경로: screenshots/{reportId}/{screenshotId}.png - 성공 시 ss.url을 영구 Supabase URL로 in-place 교체 - 실패 시 non-fatal — GCS URL fallback으로 Vision 분석 계속 진행 3. runVisionAnalysis() — base64 여전히 메모리에 있어 정상 실행 (기존) 4. channelData.screenshots 저장 시 영구 URL 사용 (자동) - archived: true/false 플래그 추가 (모니터링용) ### 비용/성능 - 추가 API 호출 없음 (base64 이미 캡처 시 다운로드됨) - 업로드: ~1-3초/장 (병렬), 5MB limit, PNG/JPEG/WebP 허용 - 버킷: public (URL만 있으면 열람) + 서비스 역할만 업로드 가능 ## 마이그레이션 supabase/migrations/20260407_screenshots_storage.sql - screenshots 버킷 생성 (public, 5MB limit) - RLS: public read / service_role write - delete_old_screenshots() 함수: 90일 이상 된 파일 정리 (pg_cron 연동 가능) ## 타입 ScreenshotResult.archived?: boolean 필드 추가 (영구 vs GCS fallback 구분) Co-Authored-By: Claude Sonnet 4.6 --- supabase/functions/_shared/visionAnalysis.ts | 8 +- .../functions/collect-channel-data/index.ts | 88 +++++++++++++++---- .../20260407_screenshots_storage.sql | 49 +++++++++++ 3 files changed, 125 insertions(+), 20 deletions(-) create mode 100644 supabase/migrations/20260407_screenshots_storage.sql diff --git a/supabase/functions/_shared/visionAnalysis.ts b/supabase/functions/_shared/visionAnalysis.ts index 1e7220a..5e9dc3e 100644 --- a/supabase/functions/_shared/visionAnalysis.ts +++ b/supabase/functions/_shared/visionAnalysis.ts @@ -19,12 +19,16 @@ const FIRECRAWL_BASE = "https://api.firecrawl.dev/v1"; export interface ScreenshotResult { id: string; - url: string; // Supabase Storage signed URL or data URI + /** Image URL. Initially a GCS signed URL from Firecrawl (~7-day expiry). + * collect-channel-data archives it to Supabase Storage and replaces this + * with a permanent public URL. Check `archived` flag to know which. */ + url: string; channel: string; // 'website', 'youtube', 'instagram', 'gangnamUnni', etc. capturedAt: string; caption: string; sourceUrl: string; // Original page URL - base64?: string; // Raw base64 (for Vision analysis, not stored in report) + base64?: string; // Raw base64 (for Vision analysis — NOT stored in report or DB) + archived?: boolean; // true = permanent Supabase Storage URL; false = GCS fallback } export interface VisionAnalysisResult { diff --git a/supabase/functions/collect-channel-data/index.ts b/supabase/functions/collect-channel-data/index.ts index 9682926..d873dae 100644 --- a/supabase/functions/collect-channel-data/index.ts +++ b/supabase/functions/collect-channel-data/index.ts @@ -597,32 +597,84 @@ Deno.serve(async (req) => { const siteMap: string[] = row.scrape_data?.siteMap || []; channelTasks.push(wrapChannelTask("vision", async () => { - // Capture screenshots of relevant pages + social channel landings + // Step 1: Capture screenshots of relevant pages + social channel landings screenshots = await captureAllScreenshots(mainUrl, siteMap, verified, FIRECRAWL_API_KEY); - // Run Gemini Vision on captured screenshots - if (GEMINI_API_KEY && screenshots.length > 0) { - const vision = await runVisionAnalysis(screenshots, GEMINI_API_KEY); - channelData.visionAnalysis = vision.merged; - channelData.visionPerPage = vision.perPage; - } - - // Store screenshots metadata (NOT base64 — use the GCS URL from Firecrawl) - channelData.screenshots = screenshots.map(ss => ({ - id: ss.id, - url: ss.url, // GCS signed URL (valid ~7 days) - channel: ss.channel, - capturedAt: ss.capturedAt, - caption: ss.caption, - sourceUrl: ss.sourceUrl, - })); - if (screenshots.length === 0) { const debugInfo = screenshotErrors.length > 0 ? screenshotErrors.join(" | ") : "No errors recorded — check FIRECRAWL_API_KEY"; throw new Error(`No screenshots captured: ${debugInfo}`); } + + // ─── Step 2: Archive to Supabase Storage (replace 7-day GCS URLs) ─────── + // Firecrawl returns signed GCS URLs that expire after ~7 days. + // We already have the image as base64 in memory — upload it permanently + // to Supabase Storage and replace ss.url in-place before storing to DB. + // + // Upload happens in parallel; failures are non-fatal — the screenshot + // keeps its GCS URL as a fallback so Vision analysis still proceeds. + const SUPABASE_STORAGE_BUCKET = "screenshots"; + const archiveTasks = screenshots.map(async (ss) => { + if (!ss.base64) return; // no image data — skip + try { + // base64 → Uint8Array + const binaryStr = atob(ss.base64); + const bytes = new Uint8Array(binaryStr.length); + for (let i = 0; i < binaryStr.length; i++) { + bytes[i] = binaryStr.charCodeAt(i); + } + + // Upload: screenshots/{reportId}/{screenshotId}.png + const storagePath = `${reportId}/${ss.id}.png`; + const { error: uploadError } = await supabase.storage + .from(SUPABASE_STORAGE_BUCKET) + .upload(storagePath, bytes, { + contentType: "image/png", + upsert: true, // overwrite if re-running same analysis + }); + + if (uploadError) { + // Non-fatal: log and keep GCS URL as fallback + console.warn(`[archive] Storage upload failed for ${ss.id}: ${uploadError.message}`); + return; + } + + // Replace GCS temp URL with permanent Supabase Storage public URL + const { data: { publicUrl } } = supabase.storage + .from(SUPABASE_STORAGE_BUCKET) + .getPublicUrl(storagePath); + + ss.url = publicUrl; // in-place replace — all downstream code uses permanent URL + console.log(`[archive] ${ss.id} → ${publicUrl.slice(-60)}`); + } catch (archiveErr) { + // Non-fatal: Vision analysis still proceeds with base64 + console.warn(`[archive] Exception for ${ss.id}:`, archiveErr instanceof Error ? archiveErr.message : archiveErr); + } + }); + + await Promise.allSettled(archiveTasks); + + const archivedCount = screenshots.filter(ss => ss.url.includes("supabase")).length; + console.log(`[archive] ${archivedCount}/${screenshots.length} screenshots archived to Supabase Storage`); + + // Step 3: Run Gemini Vision on captured screenshots (base64 still in memory) + if (GEMINI_API_KEY && screenshots.length > 0) { + const vision = await runVisionAnalysis(screenshots, GEMINI_API_KEY); + channelData.visionAnalysis = vision.merged; + channelData.visionPerPage = vision.perPage; + } + + // Step 4: Store screenshots metadata — ss.url is now the permanent URL (or GCS fallback) + channelData.screenshots = screenshots.map(ss => ({ + id: ss.id, + url: ss.url, // permanent Supabase Storage URL (or GCS fallback if archive failed) + channel: ss.channel, + capturedAt: ss.capturedAt, + caption: ss.caption, + sourceUrl: ss.sourceUrl, + archived: ss.url.includes("supabase"), // flag: true = permanent, false = GCS fallback + })); })); } diff --git a/supabase/migrations/20260407_screenshots_storage.sql b/supabase/migrations/20260407_screenshots_storage.sql new file mode 100644 index 0000000..155480c --- /dev/null +++ b/supabase/migrations/20260407_screenshots_storage.sql @@ -0,0 +1,49 @@ +-- ═══════════════════════════════════════════════════════════════ +-- Screenshots: Supabase Storage bucket + RLS +-- ═══════════════════════════════════════════════════════════════ +-- Firecrawl이 반환하는 GCS URL은 7일 후 만료됨. +-- collect-channel-data가 스크린샷 캡처 시 base64를 이 버킷에 영구 저장. +-- 경로 규칙: screenshots/{reportId}/{screenshotId}.png + +-- Storage 버킷 생성 (이미 있으면 무시) +INSERT INTO storage.buckets (id, name, public, file_size_limit, allowed_mime_types) +VALUES ( + 'screenshots', + 'screenshots', + true, -- public bucket: 인증 없이 URL만으로 열람 가능 + 5242880, -- 5MB per file limit + ARRAY['image/png', 'image/jpeg', 'image/webp'] +) +ON CONFLICT (id) DO NOTHING; + +-- ─── RLS Policies ─────────────────────────────────────────────── + +-- 1. 누구나 읽기 가능 (리포트 공유 URL에서 이미지 표시) +CREATE POLICY "public_read_screenshots" + ON storage.objects FOR SELECT + USING (bucket_id = 'screenshots'); + +-- 2. 서비스 역할만 업로드/삭제 (Edge Function이 service_role_key 사용) +CREATE POLICY "service_upload_screenshots" + ON storage.objects FOR INSERT + WITH CHECK (bucket_id = 'screenshots' AND auth.role() = 'service_role'); + +CREATE POLICY "service_delete_screenshots" + ON storage.objects FOR DELETE + USING (bucket_id = 'screenshots' AND auth.role() = 'service_role'); + +-- ─── 정리 함수 ───────────────────────────────────────────────── +-- 90일 이상 된 스크린샷 자동 삭제 (선택적 — pg_cron 스케줄로 실행) +-- 병원 리포트는 보통 3개월 후 재분석하므로 이전 스크린샷은 불필요. +CREATE OR REPLACE FUNCTION delete_old_screenshots() +RETURNS void +LANGUAGE sql +AS $$ + DELETE FROM storage.objects + WHERE bucket_id = 'screenshots' + AND created_at < now() - interval '90 days'; +$$; + +-- 코멘트 +COMMENT ON FUNCTION delete_old_screenshots() IS + 'Deletes screenshots older than 90 days. Schedule with pg_cron: SELECT cron.schedule(''weekly-screenshot-cleanup'', ''0 3 * * 0'', ''SELECT delete_old_screenshots()'');';