feat: archive Firecrawl screenshots to Supabase Storage (permanent URLs)

## 문제
Firecrawl이 반환하는 스크린샷 URL은 GCS Signed URL로 7일 후 만료.
리포트에 저장된 이미지 URL이 일주일 후 전부 깨짐 (403 Access Denied).

## 해결
collect-channel-data의 Vision 단계에 아카이빙 스텝 추가.
캡처 직후 base64(이미 메모리에 있음)를 Supabase Storage에 영구 업로드.

### 처리 흐름 (변경 후)
1. captureAllScreenshots() → GCS URL + base64 반환 (기존)
2. [신규] archiveTasks: base64 → Supabase Storage 업로드 (병렬)
   - 경로: screenshots/{reportId}/{screenshotId}.png
   - 성공 시 ss.url을 영구 Supabase URL로 in-place 교체
   - 실패 시 non-fatal — GCS URL fallback으로 Vision 분석 계속 진행
3. runVisionAnalysis() — base64 여전히 메모리에 있어 정상 실행 (기존)
4. channelData.screenshots 저장 시 영구 URL 사용 (자동)
   - archived: true/false 플래그 추가 (모니터링용)

### 비용/성능
- 추가 API 호출 없음 (base64 이미 캡처 시 다운로드됨)
- 업로드: ~1-3초/장 (병렬), 5MB limit, PNG/JPEG/WebP 허용
- 버킷: public (URL만 있으면 열람) + 서비스 역할만 업로드 가능

## 마이그레이션
supabase/migrations/20260407_screenshots_storage.sql
- screenshots 버킷 생성 (public, 5MB limit)
- RLS: public read / service_role write
- delete_old_screenshots() 함수: 90일 이상 된 파일 정리 (pg_cron 연동 가능)

## 타입
ScreenshotResult.archived?: boolean 필드 추가 (영구 vs GCS fallback 구분)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
claude/bold-hawking
Haewon Kam 2026-04-07 09:51:31 +09:00
parent bcc0b6ea5e
commit 36d2f1cf49
3 changed files with 125 additions and 20 deletions

View File

@ -19,12 +19,16 @@ const FIRECRAWL_BASE = "https://api.firecrawl.dev/v1";
export interface ScreenshotResult { export interface ScreenshotResult {
id: string; id: string;
url: string; // Supabase Storage signed URL or data URI /** Image URL. Initially a GCS signed URL from Firecrawl (~7-day expiry).
* collect-channel-data archives it to Supabase Storage and replaces this
* with a permanent public URL. Check `archived` flag to know which. */
url: string;
channel: string; // 'website', 'youtube', 'instagram', 'gangnamUnni', etc. channel: string; // 'website', 'youtube', 'instagram', 'gangnamUnni', etc.
capturedAt: string; capturedAt: string;
caption: string; caption: string;
sourceUrl: string; // Original page URL sourceUrl: string; // Original page URL
base64?: string; // Raw base64 (for Vision analysis, not stored in report) base64?: string; // Raw base64 (for Vision analysis — NOT stored in report or DB)
archived?: boolean; // true = permanent Supabase Storage URL; false = GCS fallback
} }
export interface VisionAnalysisResult { export interface VisionAnalysisResult {

View File

@ -597,32 +597,84 @@ Deno.serve(async (req) => {
const siteMap: string[] = row.scrape_data?.siteMap || []; const siteMap: string[] = row.scrape_data?.siteMap || [];
channelTasks.push(wrapChannelTask("vision", async () => { channelTasks.push(wrapChannelTask("vision", async () => {
// Capture screenshots of relevant pages + social channel landings // Step 1: Capture screenshots of relevant pages + social channel landings
screenshots = await captureAllScreenshots(mainUrl, siteMap, verified, FIRECRAWL_API_KEY); screenshots = await captureAllScreenshots(mainUrl, siteMap, verified, FIRECRAWL_API_KEY);
// Run Gemini Vision on captured screenshots
if (GEMINI_API_KEY && screenshots.length > 0) {
const vision = await runVisionAnalysis(screenshots, GEMINI_API_KEY);
channelData.visionAnalysis = vision.merged;
channelData.visionPerPage = vision.perPage;
}
// Store screenshots metadata (NOT base64 — use the GCS URL from Firecrawl)
channelData.screenshots = screenshots.map(ss => ({
id: ss.id,
url: ss.url, // GCS signed URL (valid ~7 days)
channel: ss.channel,
capturedAt: ss.capturedAt,
caption: ss.caption,
sourceUrl: ss.sourceUrl,
}));
if (screenshots.length === 0) { if (screenshots.length === 0) {
const debugInfo = screenshotErrors.length > 0 const debugInfo = screenshotErrors.length > 0
? screenshotErrors.join(" | ") ? screenshotErrors.join(" | ")
: "No errors recorded — check FIRECRAWL_API_KEY"; : "No errors recorded — check FIRECRAWL_API_KEY";
throw new Error(`No screenshots captured: ${debugInfo}`); throw new Error(`No screenshots captured: ${debugInfo}`);
} }
// ─── Step 2: Archive to Supabase Storage (replace 7-day GCS URLs) ───────
// Firecrawl returns signed GCS URLs that expire after ~7 days.
// We already have the image as base64 in memory — upload it permanently
// to Supabase Storage and replace ss.url in-place before storing to DB.
//
// Upload happens in parallel; failures are non-fatal — the screenshot
// keeps its GCS URL as a fallback so Vision analysis still proceeds.
const SUPABASE_STORAGE_BUCKET = "screenshots";
const archiveTasks = screenshots.map(async (ss) => {
if (!ss.base64) return; // no image data — skip
try {
// base64 → Uint8Array
const binaryStr = atob(ss.base64);
const bytes = new Uint8Array(binaryStr.length);
for (let i = 0; i < binaryStr.length; i++) {
bytes[i] = binaryStr.charCodeAt(i);
}
// Upload: screenshots/{reportId}/{screenshotId}.png
const storagePath = `${reportId}/${ss.id}.png`;
const { error: uploadError } = await supabase.storage
.from(SUPABASE_STORAGE_BUCKET)
.upload(storagePath, bytes, {
contentType: "image/png",
upsert: true, // overwrite if re-running same analysis
});
if (uploadError) {
// Non-fatal: log and keep GCS URL as fallback
console.warn(`[archive] Storage upload failed for ${ss.id}: ${uploadError.message}`);
return;
}
// Replace GCS temp URL with permanent Supabase Storage public URL
const { data: { publicUrl } } = supabase.storage
.from(SUPABASE_STORAGE_BUCKET)
.getPublicUrl(storagePath);
ss.url = publicUrl; // in-place replace — all downstream code uses permanent URL
console.log(`[archive] ${ss.id}${publicUrl.slice(-60)}`);
} catch (archiveErr) {
// Non-fatal: Vision analysis still proceeds with base64
console.warn(`[archive] Exception for ${ss.id}:`, archiveErr instanceof Error ? archiveErr.message : archiveErr);
}
});
await Promise.allSettled(archiveTasks);
const archivedCount = screenshots.filter(ss => ss.url.includes("supabase")).length;
console.log(`[archive] ${archivedCount}/${screenshots.length} screenshots archived to Supabase Storage`);
// Step 3: Run Gemini Vision on captured screenshots (base64 still in memory)
if (GEMINI_API_KEY && screenshots.length > 0) {
const vision = await runVisionAnalysis(screenshots, GEMINI_API_KEY);
channelData.visionAnalysis = vision.merged;
channelData.visionPerPage = vision.perPage;
}
// Step 4: Store screenshots metadata — ss.url is now the permanent URL (or GCS fallback)
channelData.screenshots = screenshots.map(ss => ({
id: ss.id,
url: ss.url, // permanent Supabase Storage URL (or GCS fallback if archive failed)
channel: ss.channel,
capturedAt: ss.capturedAt,
caption: ss.caption,
sourceUrl: ss.sourceUrl,
archived: ss.url.includes("supabase"), // flag: true = permanent, false = GCS fallback
}));
})); }));
} }

View File

@ -0,0 +1,49 @@
-- ═══════════════════════════════════════════════════════════════
-- Screenshots: Supabase Storage bucket + RLS
-- ═══════════════════════════════════════════════════════════════
-- Firecrawl이 반환하는 GCS URL은 7일 후 만료됨.
-- collect-channel-data가 스크린샷 캡처 시 base64를 이 버킷에 영구 저장.
-- 경로 규칙: screenshots/{reportId}/{screenshotId}.png
-- Storage 버킷 생성 (이미 있으면 무시)
INSERT INTO storage.buckets (id, name, public, file_size_limit, allowed_mime_types)
VALUES (
'screenshots',
'screenshots',
true, -- public bucket: 인증 없이 URL만으로 열람 가능
5242880, -- 5MB per file limit
ARRAY['image/png', 'image/jpeg', 'image/webp']
)
ON CONFLICT (id) DO NOTHING;
-- ─── RLS Policies ───────────────────────────────────────────────
-- 1. 누구나 읽기 가능 (리포트 공유 URL에서 이미지 표시)
CREATE POLICY "public_read_screenshots"
ON storage.objects FOR SELECT
USING (bucket_id = 'screenshots');
-- 2. 서비스 역할만 업로드/삭제 (Edge Function이 service_role_key 사용)
CREATE POLICY "service_upload_screenshots"
ON storage.objects FOR INSERT
WITH CHECK (bucket_id = 'screenshots' AND auth.role() = 'service_role');
CREATE POLICY "service_delete_screenshots"
ON storage.objects FOR DELETE
USING (bucket_id = 'screenshots' AND auth.role() = 'service_role');
-- ─── 정리 함수 ─────────────────────────────────────────────────
-- 90일 이상 된 스크린샷 자동 삭제 (선택적 — pg_cron 스케줄로 실행)
-- 병원 리포트는 보통 3개월 후 재분석하므로 이전 스크린샷은 불필요.
CREATE OR REPLACE FUNCTION delete_old_screenshots()
RETURNS void
LANGUAGE sql
AS $$
DELETE FROM storage.objects
WHERE bucket_id = 'screenshots'
AND created_at < now() - interval '90 days';
$$;
-- 코멘트
COMMENT ON FUNCTION delete_old_screenshots() IS
'Deletes screenshots older than 90 days. Schedule with pg_cron: SELECT cron.schedule(''weekly-screenshot-cleanup'', ''0 3 * * 0'', ''SELECT delete_old_screenshots()'');';