feat: archive Firecrawl screenshots to Supabase Storage (permanent URLs)
## 문제
Firecrawl이 반환하는 스크린샷 URL은 GCS Signed URL로 7일 후 만료.
리포트에 저장된 이미지 URL이 일주일 후 전부 깨짐 (403 Access Denied).
## 해결
collect-channel-data의 Vision 단계에 아카이빙 스텝 추가.
캡처 직후 base64(이미 메모리에 있음)를 Supabase Storage에 영구 업로드.
### 처리 흐름 (변경 후)
1. captureAllScreenshots() → GCS URL + base64 반환 (기존)
2. [신규] archiveTasks: base64 → Supabase Storage 업로드 (병렬)
- 경로: screenshots/{reportId}/{screenshotId}.png
- 성공 시 ss.url을 영구 Supabase URL로 in-place 교체
- 실패 시 non-fatal — ss.url은 GCS URL을 그대로 유지(fallback)하고, Vision 분석은 메모리의 base64로 계속 진행
3. runVisionAnalysis() — base64 여전히 메모리에 있어 정상 실행 (기존)
4. channelData.screenshots 저장 시 영구 URL 사용 (자동)
- archived: true/false 플래그 추가 (모니터링용)
### 비용/성능
- 추가 Firecrawl 호출 없음 (base64는 캡처 시 이미 다운로드됨) — 새로 생기는 호출은 Supabase Storage 업로드뿐
- 업로드: ~1-3초/장 (병렬), 5MB limit, PNG/JPEG/WebP 허용
- 버킷: public (URL만 있으면 열람) + 서비스 역할만 업로드 가능
## 마이그레이션
supabase/migrations/20260407_screenshots_storage.sql
- screenshots 버킷 생성 (public, 5MB limit)
- RLS: public read / service_role write
- delete_old_screenshots() 함수: 90일 이상 된 파일 정리 (pg_cron 연동 가능)
## 타입
ScreenshotResult.archived?: boolean 필드 추가 (영구 vs GCS fallback 구분)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
claude/bold-hawking
parent
bcc0b6ea5e
commit
36d2f1cf49
|
|
@ -19,12 +19,16 @@ const FIRECRAWL_BASE = "https://api.firecrawl.dev/v1";
|
||||||
|
|
||||||
export interface ScreenshotResult {
|
export interface ScreenshotResult {
|
||||||
id: string;
|
id: string;
|
||||||
url: string; // Supabase Storage signed URL or data URI
|
/** Image URL. Initially a GCS signed URL from Firecrawl (~7-day expiry).
|
||||||
|
* collect-channel-data archives it to Supabase Storage and replaces this
|
||||||
|
* with a permanent public URL. Check `archived` flag to know which. */
|
||||||
|
url: string;
|
||||||
channel: string; // 'website', 'youtube', 'instagram', 'gangnamUnni', etc.
|
channel: string; // 'website', 'youtube', 'instagram', 'gangnamUnni', etc.
|
||||||
capturedAt: string;
|
capturedAt: string;
|
||||||
caption: string;
|
caption: string;
|
||||||
sourceUrl: string; // Original page URL
|
sourceUrl: string; // Original page URL
|
||||||
base64?: string; // Raw base64 (for Vision analysis, not stored in report)
|
base64?: string; // Raw base64 (for Vision analysis — NOT stored in report or DB)
|
||||||
|
archived?: boolean; // true = permanent Supabase Storage URL; false = GCS fallback
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface VisionAnalysisResult {
|
export interface VisionAnalysisResult {
|
||||||
|
|
|
||||||
|
|
@ -597,32 +597,84 @@ Deno.serve(async (req) => {
|
||||||
const siteMap: string[] = row.scrape_data?.siteMap || [];
|
const siteMap: string[] = row.scrape_data?.siteMap || [];
|
||||||
|
|
||||||
channelTasks.push(wrapChannelTask("vision", async () => {
|
channelTasks.push(wrapChannelTask("vision", async () => {
|
||||||
// Capture screenshots of relevant pages + social channel landings
|
// Step 1: Capture screenshots of relevant pages + social channel landings
|
||||||
screenshots = await captureAllScreenshots(mainUrl, siteMap, verified, FIRECRAWL_API_KEY);
|
screenshots = await captureAllScreenshots(mainUrl, siteMap, verified, FIRECRAWL_API_KEY);
|
||||||
|
|
||||||
// Run Gemini Vision on captured screenshots
|
|
||||||
if (GEMINI_API_KEY && screenshots.length > 0) {
|
|
||||||
const vision = await runVisionAnalysis(screenshots, GEMINI_API_KEY);
|
|
||||||
channelData.visionAnalysis = vision.merged;
|
|
||||||
channelData.visionPerPage = vision.perPage;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Store screenshots metadata (NOT base64 — use the GCS URL from Firecrawl)
|
|
||||||
channelData.screenshots = screenshots.map(ss => ({
|
|
||||||
id: ss.id,
|
|
||||||
url: ss.url, // GCS signed URL (valid ~7 days)
|
|
||||||
channel: ss.channel,
|
|
||||||
capturedAt: ss.capturedAt,
|
|
||||||
caption: ss.caption,
|
|
||||||
sourceUrl: ss.sourceUrl,
|
|
||||||
}));
|
|
||||||
|
|
||||||
if (screenshots.length === 0) {
|
if (screenshots.length === 0) {
|
||||||
const debugInfo = screenshotErrors.length > 0
|
const debugInfo = screenshotErrors.length > 0
|
||||||
? screenshotErrors.join(" | ")
|
? screenshotErrors.join(" | ")
|
||||||
: "No errors recorded — check FIRECRAWL_API_KEY";
|
: "No errors recorded — check FIRECRAWL_API_KEY";
|
||||||
throw new Error(`No screenshots captured: ${debugInfo}`);
|
throw new Error(`No screenshots captured: ${debugInfo}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─── Step 2: Archive to Supabase Storage (replace 7-day GCS URLs) ───────
|
||||||
|
// Firecrawl returns signed GCS URLs that expire after ~7 days.
|
||||||
|
// We already have the image as base64 in memory — upload it permanently
|
||||||
|
// to Supabase Storage and replace ss.url in-place before storing to DB.
|
||||||
|
//
|
||||||
|
// Upload happens in parallel; failures are non-fatal — the screenshot
|
||||||
|
// keeps its GCS URL as a fallback, and Vision analysis still proceeds from the in-memory base64.
|
||||||
|
const SUPABASE_STORAGE_BUCKET = "screenshots";
|
||||||
|
// One upload task per screenshot. Each task records its own outcome on ss.archived, so later steps never have to infer success from the URL text.
|
||||||
|
const archiveTasks = screenshots.map(async (ss) => {
|
||||||
|
if (!ss.base64) return; // no image data — skip (ss.archived stays unset → treated as not archived)
|
||||||
|
try {
|
||||||
|
// base64 → Uint8Array
|
||||||
|
const binaryStr = atob(ss.base64);
|
||||||
|
const bytes = new Uint8Array(binaryStr.length);
|
||||||
|
for (let i = 0; i < binaryStr.length; i++) {
|
||||||
|
bytes[i] = binaryStr.charCodeAt(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Upload into the "screenshots" bucket at {reportId}/{screenshotId}.png
|
||||||
|
const storagePath = `${reportId}/${ss.id}.png`;
|
||||||
|
const { error: uploadError } = await supabase.storage
|
||||||
|
.from(SUPABASE_STORAGE_BUCKET)
|
||||||
|
.upload(storagePath, bytes, {
|
||||||
|
contentType: "image/png",
|
||||||
|
upsert: true, // overwrite if re-running same analysis
|
||||||
|
});
|
||||||
|
|
||||||
|
if (uploadError) {
|
||||||
|
// Non-fatal: log and keep GCS URL as fallback
|
||||||
|
console.warn(`[archive] Storage upload failed for ${ss.id}: ${uploadError.message}`);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Replace GCS temp URL with permanent Supabase Storage public URL
|
||||||
|
const { data: { publicUrl } } = supabase.storage
|
||||||
|
.from(SUPABASE_STORAGE_BUCKET)
|
||||||
|
.getPublicUrl(storagePath);
|
||||||
|
|
||||||
|
ss.url = publicUrl; // in-place replace — all downstream code uses permanent URL
|
||||||
|
ss.archived = true; // explicit success marker — the public URL is not guaranteed to contain "supabase" (custom domain, self-hosted, local gateway)
|
||||||
|
console.log(`[archive] ${ss.id} → ${publicUrl.slice(-60)}`);
|
||||||
|
} catch (archiveErr) {
|
||||||
|
// Non-fatal: Vision analysis still proceeds with base64
|
||||||
|
console.warn(`[archive] Exception for ${ss.id}:`, archiveErr instanceof Error ? archiveErr.message : archiveErr);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
await Promise.allSettled(archiveTasks);
|
||||||
|
|
||||||
|
const archivedCount = screenshots.filter(ss => ss.archived === true).length; // count confirmed uploads, not URL substrings
|
||||||
|
console.log(`[archive] ${archivedCount}/${screenshots.length} screenshots archived to Supabase Storage`);
|
||||||
|
|
||||||
|
// Step 3: Run Gemini Vision on captured screenshots (base64 still in memory)
|
||||||
|
if (GEMINI_API_KEY && screenshots.length > 0) {
|
||||||
|
const vision = await runVisionAnalysis(screenshots, GEMINI_API_KEY);
|
||||||
|
channelData.visionAnalysis = vision.merged;
|
||||||
|
channelData.visionPerPage = vision.perPage;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 4: Store screenshots metadata — ss.url is now the permanent URL (or GCS fallback); base64 is deliberately left out
|
||||||
|
channelData.screenshots = screenshots.map(ss => ({
|
||||||
|
id: ss.id,
|
||||||
|
url: ss.url, // permanent Supabase Storage URL (or GCS fallback if archive failed)
|
||||||
|
channel: ss.channel,
|
||||||
|
capturedAt: ss.capturedAt,
|
||||||
|
caption: ss.caption,
|
||||||
|
sourceUrl: ss.sourceUrl,
|
||||||
|
archived: ss.archived === true, // flag: true = upload confirmed (permanent URL), false = GCS fallback or skipped
|
||||||
|
}));
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,49 @@
|
||||||
|
-- ═══════════════════════════════════════════════════════════════
|
||||||
|
-- Screenshots: Supabase Storage bucket + RLS
|
||||||
|
-- ═══════════════════════════════════════════════════════════════
|
||||||
|
-- Firecrawl이 반환하는 GCS URL은 7일 후 만료됨.
|
||||||
|
-- collect-channel-data가 스크린샷 캡처 시 base64를 이 버킷에 영구 저장.
|
||||||
|
-- 경로 규칙: screenshots/{reportId}/{screenshotId}.png
|
||||||
|
|
||||||
|
-- Create the Storage bucket (no-op if it already exists)
|
||||||
|
INSERT INTO storage.buckets (id, name, public, file_size_limit, allowed_mime_types)
|
||||||
|
VALUES (
|
||||||
|
'screenshots', -- id
|
||||||
|
'screenshots', -- name
|
||||||
|
true, -- public bucket: objects are viewable with just the URL, no authentication
|
||||||
|
5242880, -- 5MB per file limit
|
||||||
|
ARRAY['image/png', 'image/jpeg', 'image/webp'] -- allowed_mime_types
|
||||||
|
)
|
||||||
|
ON CONFLICT (id) DO NOTHING; -- an existing bucket row is left as-is, so its settings are NOT updated by this statement
|
||||||
|
|
||||||
|
-- ─── RLS Policies ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
-- 1. Anyone may read (so images render on shared report URLs)
-- NOTE(review): CREATE POLICY has no IF NOT EXISTS form, so re-running this migration would fail on these statements — confirm that is acceptable.
|
||||||
|
CREATE POLICY "public_read_screenshots"
|
||||||
|
ON storage.objects FOR SELECT
|
||||||
|
USING (bucket_id = 'screenshots');
|
||||||
|
|
||||||
|
-- 2. Only the service role may upload/delete (the Edge Function uses service_role_key)
-- NOTE(review): no UPDATE policy is defined, yet the uploader calls upload(..., { upsert: true }); this presumably relies on the service role bypassing RLS — confirm.
|
||||||
|
CREATE POLICY "service_upload_screenshots"
|
||||||
|
ON storage.objects FOR INSERT
|
||||||
|
WITH CHECK (bucket_id = 'screenshots' AND auth.role() = 'service_role');
|
||||||
|
|
||||||
|
CREATE POLICY "service_delete_screenshots"
|
||||||
|
ON storage.objects FOR DELETE
|
||||||
|
USING (bucket_id = 'screenshots' AND auth.role() = 'service_role');
|
||||||
|
|
||||||
|
-- ─── 정리 함수 ─────────────────────────────────────────────────
|
||||||
|
-- Deletes screenshots older than 90 days (optional — intended to be run on a pg_cron schedule)
|
||||||
|
-- Hospital reports are usually re-analyzed after about 3 months, so earlier screenshots are no longer needed.
-- Takes no arguments and returns nothing; age is judged by created_at on rows in the 'screenshots' bucket.
-- NOTE(review): this removes rows from storage.objects directly. Supabase's Storage documentation warns that
-- deleting objects via SQL removes only the metadata row and can leave the underlying file orphaned in the
-- bucket — confirm, and consider performing the cleanup through the Storage API instead.
|
||||||
|
CREATE OR REPLACE FUNCTION delete_old_screenshots()
|
||||||
|
RETURNS void
|
||||||
|
LANGUAGE sql
|
||||||
|
AS $$
|
||||||
|
DELETE FROM storage.objects
|
||||||
|
WHERE bucket_id = 'screenshots'
|
||||||
|
AND created_at < now() - interval '90 days';
|
||||||
|
$$;
|
||||||
|
|
||||||
|
-- 코멘트
|
||||||
|
COMMENT ON FUNCTION delete_old_screenshots() IS
|
||||||
|
'Deletes screenshots older than 90 days. Schedule with pg_cron: SELECT cron.schedule(''weekly-screenshot-cleanup'', ''0 3 * * 0'', ''SELECT delete_old_screenshots()'');';
|
||||||
Loading…
Reference in New Issue