feat: archive Firecrawl screenshots to Supabase Storage (permanent URLs)
## 문제
Firecrawl이 반환하는 스크린샷 URL은 GCS Signed URL로 7일 후 만료.
리포트에 저장된 이미지 URL이 일주일 후 전부 깨짐 (403 Access Denied).
## 해결
collect-channel-data의 Vision 단계에 아카이빙 스텝 추가.
캡처 직후 base64(이미 메모리에 있음)를 Supabase Storage에 영구 업로드.
### 처리 흐름 (변경 후)
1. captureAllScreenshots() → GCS URL + base64 반환 (기존)
2. [신규] archiveTasks: base64 → Supabase Storage 업로드 (병렬)
- 경로: screenshots/{reportId}/{screenshotId}.png
- 성공 시 ss.url을 영구 Supabase URL로 in-place 교체
- 실패 시 non-fatal — GCS URL fallback으로 Vision 분석 계속 진행
3. runVisionAnalysis() — base64 여전히 메모리에 있어 정상 실행 (기존)
4. channelData.screenshots 저장 시 영구 URL 사용 (자동)
- archived: true/false 플래그 추가 (모니터링용)
### 비용/성능
- 추가 API 호출 없음 (base64 이미 캡처 시 다운로드됨)
- 업로드: ~1-3초/장 (병렬), 5MB limit, PNG/JPEG/WebP 허용
- 버킷: public (URL만 있으면 열람) + 서비스 역할만 업로드 가능
## 마이그레이션
supabase/migrations/20260407_screenshots_storage.sql
- screenshots 버킷 생성 (public, 5MB limit)
- RLS: public read / service_role write
- delete_old_screenshots() 함수: 90일 이상 된 파일 정리 (pg_cron 연동 가능)
## 타입
ScreenshotResult.archived?: boolean 필드 추가 (영구 vs GCS fallback 구분)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
claude/bold-hawking
parent
bcc0b6ea5e
commit
36d2f1cf49
|
|
@ -19,12 +19,16 @@ const FIRECRAWL_BASE = "https://api.firecrawl.dev/v1";
|
|||
|
||||
export interface ScreenshotResult {
|
||||
id: string;
|
||||
url: string; // Supabase Storage signed URL or data URI
|
||||
/** Image URL. Initially a GCS signed URL from Firecrawl (~7-day expiry).
|
||||
* collect-channel-data archives it to Supabase Storage and replaces this
|
||||
* with a permanent public URL. Check `archived` flag to know which. */
|
||||
url: string;
|
||||
channel: string; // 'website', 'youtube', 'instagram', 'gangnamUnni', etc.
|
||||
capturedAt: string;
|
||||
caption: string;
|
||||
sourceUrl: string; // Original page URL
|
||||
base64?: string; // Raw base64 (for Vision analysis, not stored in report)
|
||||
base64?: string; // Raw base64 (for Vision analysis — NOT stored in report or DB)
|
||||
archived?: boolean; // true = permanent Supabase Storage URL; false = GCS fallback
|
||||
}
|
||||
|
||||
export interface VisionAnalysisResult {
|
||||
|
|
|
|||
|
|
@ -597,32 +597,84 @@ Deno.serve(async (req) => {
|
|||
const siteMap: string[] = row.scrape_data?.siteMap || [];
|
||||
|
||||
channelTasks.push(wrapChannelTask("vision", async () => {
|
||||
// Capture screenshots of relevant pages + social channel landings
|
||||
// Step 1: Capture screenshots of relevant pages + social channel landings
|
||||
screenshots = await captureAllScreenshots(mainUrl, siteMap, verified, FIRECRAWL_API_KEY);
|
||||
|
||||
// Run Gemini Vision on captured screenshots
|
||||
if (GEMINI_API_KEY && screenshots.length > 0) {
|
||||
const vision = await runVisionAnalysis(screenshots, GEMINI_API_KEY);
|
||||
channelData.visionAnalysis = vision.merged;
|
||||
channelData.visionPerPage = vision.perPage;
|
||||
}
|
||||
|
||||
// Store screenshots metadata (NOT base64 — use the GCS URL from Firecrawl)
|
||||
channelData.screenshots = screenshots.map(ss => ({
|
||||
id: ss.id,
|
||||
url: ss.url, // GCS signed URL (valid ~7 days)
|
||||
channel: ss.channel,
|
||||
capturedAt: ss.capturedAt,
|
||||
caption: ss.caption,
|
||||
sourceUrl: ss.sourceUrl,
|
||||
}));
|
||||
|
||||
if (screenshots.length === 0) {
|
||||
const debugInfo = screenshotErrors.length > 0
|
||||
? screenshotErrors.join(" | ")
|
||||
: "No errors recorded — check FIRECRAWL_API_KEY";
|
||||
throw new Error(`No screenshots captured: ${debugInfo}`);
|
||||
}
|
||||
|
||||
// ─── Step 2: Archive to Supabase Storage (replace 7-day GCS URLs) ───────
|
||||
// Firecrawl returns signed GCS URLs that expire after ~7 days.
|
||||
// We already have the image as base64 in memory — upload it permanently
|
||||
// to Supabase Storage and replace ss.url in-place before storing to DB.
|
||||
//
|
||||
// Upload happens in parallel; failures are non-fatal — the screenshot
|
||||
// keeps its GCS URL as a fallback so Vision analysis still proceeds.
|
||||
const SUPABASE_STORAGE_BUCKET = "screenshots";
|
||||
const archiveTasks = screenshots.map(async (ss) => {
|
||||
if (!ss.base64) return; // no image data — skip
|
||||
try {
|
||||
// base64 → Uint8Array
|
||||
const binaryStr = atob(ss.base64);
|
||||
const bytes = new Uint8Array(binaryStr.length);
|
||||
for (let i = 0; i < binaryStr.length; i++) {
|
||||
bytes[i] = binaryStr.charCodeAt(i);
|
||||
}
|
||||
|
||||
// Upload: screenshots/{reportId}/{screenshotId}.png
|
||||
const storagePath = `${reportId}/${ss.id}.png`;
|
||||
const { error: uploadError } = await supabase.storage
|
||||
.from(SUPABASE_STORAGE_BUCKET)
|
||||
.upload(storagePath, bytes, {
|
||||
contentType: "image/png",
|
||||
upsert: true, // overwrite if re-running same analysis
|
||||
});
|
||||
|
||||
if (uploadError) {
|
||||
// Non-fatal: log and keep GCS URL as fallback
|
||||
console.warn(`[archive] Storage upload failed for ${ss.id}: ${uploadError.message}`);
|
||||
return;
|
||||
}
|
||||
|
||||
// Replace GCS temp URL with permanent Supabase Storage public URL
|
||||
const { data: { publicUrl } } = supabase.storage
|
||||
.from(SUPABASE_STORAGE_BUCKET)
|
||||
.getPublicUrl(storagePath);
|
||||
|
||||
ss.url = publicUrl; // in-place replace — all downstream code uses permanent URL
|
||||
console.log(`[archive] ${ss.id} → ${publicUrl.slice(-60)}`);
|
||||
} catch (archiveErr) {
|
||||
// Non-fatal: Vision analysis still proceeds with base64
|
||||
console.warn(`[archive] Exception for ${ss.id}:`, archiveErr instanceof Error ? archiveErr.message : archiveErr);
|
||||
}
|
||||
});
|
||||
|
||||
await Promise.allSettled(archiveTasks);
|
||||
|
||||
const archivedCount = screenshots.filter(ss => ss.url.includes("supabase")).length;
|
||||
console.log(`[archive] ${archivedCount}/${screenshots.length} screenshots archived to Supabase Storage`);
|
||||
|
||||
// Step 3: Run Gemini Vision on captured screenshots (base64 still in memory)
|
||||
if (GEMINI_API_KEY && screenshots.length > 0) {
|
||||
const vision = await runVisionAnalysis(screenshots, GEMINI_API_KEY);
|
||||
channelData.visionAnalysis = vision.merged;
|
||||
channelData.visionPerPage = vision.perPage;
|
||||
}
|
||||
|
||||
// Step 4: Store screenshots metadata — ss.url is now the permanent URL (or GCS fallback)
|
||||
channelData.screenshots = screenshots.map(ss => ({
|
||||
id: ss.id,
|
||||
url: ss.url, // permanent Supabase Storage URL (or GCS fallback if archive failed)
|
||||
channel: ss.channel,
|
||||
capturedAt: ss.capturedAt,
|
||||
caption: ss.caption,
|
||||
sourceUrl: ss.sourceUrl,
|
||||
archived: ss.url.includes("supabase"), // flag: true = permanent, false = GCS fallback
|
||||
}));
|
||||
}));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,49 @@
|
|||
-- ═══════════════════════════════════════════════════════════════
-- Screenshots: Supabase Storage bucket + RLS
-- ═══════════════════════════════════════════════════════════════
-- GCS URLs returned by Firecrawl expire after ~7 days.
-- collect-channel-data uploads each captured screenshot (base64) into this
-- bucket so report image URLs stay valid permanently.
-- Path convention: screenshots/{reportId}/{screenshotId}.png

-- Create the storage bucket (no-op if it already exists)
INSERT INTO storage.buckets (id, name, public, file_size_limit, allowed_mime_types)
VALUES (
  'screenshots',
  'screenshots',
  true,       -- public bucket: readable by URL alone, no auth required
  5242880,    -- 5MB per-file limit
  ARRAY['image/png', 'image/jpeg', 'image/webp']
)
ON CONFLICT (id) DO NOTHING;

-- ─── RLS Policies ───────────────────────────────────────────────
-- Drop-then-create keeps the migration idempotent: CREATE POLICY has no
-- IF NOT EXISTS clause and would fail on a re-run otherwise.

-- 1. Anyone may read (report share pages embed these images)
DROP POLICY IF EXISTS "public_read_screenshots" ON storage.objects;
CREATE POLICY "public_read_screenshots"
  ON storage.objects FOR SELECT
  USING (bucket_id = 'screenshots');

-- 2. Only the service role may upload/delete (Edge Function uses service_role_key)
DROP POLICY IF EXISTS "service_upload_screenshots" ON storage.objects;
CREATE POLICY "service_upload_screenshots"
  ON storage.objects FOR INSERT
  WITH CHECK (bucket_id = 'screenshots' AND auth.role() = 'service_role');

DROP POLICY IF EXISTS "service_delete_screenshots" ON storage.objects;
CREATE POLICY "service_delete_screenshots"
  ON storage.objects FOR DELETE
  USING (bucket_id = 'screenshots' AND auth.role() = 'service_role');

-- ─── Cleanup function ───────────────────────────────────────────
-- Deletes screenshots older than 90 days (optional — schedule via pg_cron).
-- Reports are typically re-analyzed after ~3 months, so older captures are stale.
-- NOTE(review): deleting rows from storage.objects removes only the metadata;
-- on S3-backed Supabase deployments the underlying object may be orphaned.
-- Prefer the Storage API for deletion if orphaned files become a concern.
CREATE OR REPLACE FUNCTION delete_old_screenshots()
RETURNS void
LANGUAGE sql
AS $$
  DELETE FROM storage.objects
  WHERE bucket_id = 'screenshots'
    AND created_at < now() - interval '90 days';
$$;

-- Document the intended pg_cron schedule alongside the function.
COMMENT ON FUNCTION delete_old_screenshots() IS
'Deletes screenshots older than 90 days. Schedule with pg_cron: SELECT cron.schedule(''weekly-screenshot-cleanup'', ''0 3 * * 0'', ''SELECT delete_old_screenshots()'');';
|
||||
Loading…
Reference in New Issue