# o2o-infinith-demo/scripts/archive-screenshots.py

#!/usr/bin/env python3
"""
archive-screenshots.py
GCS 임시 URL(7일 만료)로 저장된 스크린샷을 Supabase Storage에 영구 아카이브.

실행: python3 scripts/archive-screenshots.py
환경: SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY (.env 또는 환경변수)
"""

import os, json, urllib.request, urllib.parse
from supabase import create_client, Client

# ── 환경변수 로드 ──────────────────────────────────────────────────────────────
def load_env():
    """Collect settings from the project's ``.env`` file and the process environment.

    The ``.env`` file is looked up one directory above this script. Every
    non-blank line that is not a ``#`` comment and contains ``=`` is split on
    the first ``=``; the key and value are trimmed and surrounding double and
    single quotes are removed from the value. A missing file is not an error.

    Afterwards, non-empty values of ``SUPABASE_URL``,
    ``SUPABASE_SERVICE_ROLE_KEY`` and ``VITE_SUPABASE_URL`` found in
    ``os.environ`` override whatever the file provided.

    Returns:
        dict: Variable name -> string value.
    """
    env = {}
    env_path = os.path.join(os.path.dirname(__file__), '..', '.env')
    try:
        # Decode explicitly instead of relying on the locale's default codec,
        # which differs between machines. "utf-8-sig" reads plain UTF-8 and
        # also drops a leading BOM that would otherwise corrupt the first key.
        with open(env_path, encoding='utf-8-sig') as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith('#') and '=' in line:
                    k, v = line.split('=', 1)
                    env[k.strip()] = v.strip().strip('"').strip("'")
    except FileNotFoundError:
        # The file is optional; the process environment may supply everything.
        pass
    for k in ('SUPABASE_URL', 'SUPABASE_SERVICE_ROLE_KEY', 'VITE_SUPABASE_URL'):
        # Only non-empty environment values take precedence over the file.
        if os.environ.get(k):
            env[k] = os.environ[k]
    return env

env = load_env()

# Fixed bucket name and credentials used for the whole run.
BUCKET       = 'screenshots'
SERVICE_KEY  = env.get('SUPABASE_SERVICE_ROLE_KEY', '')
# SUPABASE_URL wins; VITE_SUPABASE_URL is the fallback. Trailing slashes are dropped.
SUPABASE_URL = (env.get('SUPABASE_URL') or env.get('VITE_SUPABASE_URL', '')).rstrip('/')

# Stop with exit status 1 unless both the URL and the service key are present.
if not (SUPABASE_URL and SERVICE_KEY):
    print('❌ SUPABASE_URL 또는 SUPABASE_SERVICE_ROLE_KEY 환경변수가 없습니다.')
    raise SystemExit(1)

# supabase-py client (original note: supports every key format, including sb_secret_*).
sb: Client = create_client(SUPABASE_URL, SERVICE_KEY)

# ── Storage 버킷 초기화 ────────────────────────────────────────────────────────
def ensure_bucket():
    """Make sure the ``screenshots`` bucket exists, creating it as public when absent.

    Any exception raised while listing or creating buckets is reported on
    stdout and swallowed, so the caller carries on either way.
    """
    try:
        known_names = [bucket.name for bucket in sb.storage.list_buckets()]
        if BUCKET in known_names:
            print(f'✅ Storage 버킷 "{BUCKET}" 확인됨')
            return
        sb.storage.create_bucket(BUCKET, options={"public": True})
        print(f'✅ Storage 버킷 "{BUCKET}" 생성 완료')
    except Exception as err:
        # Best effort only: a failure here must not abort the run.
        print(f'⚠️  버킷 확인/생성 실패: {err}')

# ── GCS URL에서 이미지 다운로드 ────────────────────────────────────────────────
def fetch_image(gcs_url: str) -> bytes:
    """Download the resource at *gcs_url* and return its raw bytes.

    The request carries a ``Mozilla/5.0`` User-Agent header and uses a
    20-second timeout. Errors raised by ``urllib`` propagate to the caller.
    """
    request = urllib.request.Request(gcs_url, headers={'User-Agent': 'Mozilla/5.0'})
    response = urllib.request.urlopen(request, timeout=20)
    with response:
        payload = response.read()
    return payload

# ── Supabase Storage 업로드 ────────────────────────────────────────────────────
def upload_to_storage(storage_path: str, image_bytes: bytes) -> str:
    """Upload *image_bytes* to the bucket at *storage_path* and return its public URL.

    The object is sent with content type ``image/png`` and the ``upsert``
    option set to ``"true"``.
    """
    bucket = sb.storage.from_(BUCKET)
    upload_options = {"content-type": "image/png", "upsert": "true"}
    bucket.upload(path=storage_path, file=image_bytes, file_options=upload_options)
    return bucket.get_public_url(storage_path)

# ── DB 조회 / 업데이트 ─────────────────────────────────────────────────────────
def fetch_reports(page_size: int = 1000):
    """Return every row of ``marketing_reports`` with the columns this script needs.

    A single unpaginated ``select`` is capped by the API's maximum row count
    (1000 by default on Supabase), which would silently drop the remaining
    reports. The table is therefore read page by page in a stable ``id``
    order until an empty page comes back.

    Args:
        page_size: Number of rows requested per round trip (must be >= 1).

    Returns:
        list[dict]: Rows with ``id``, ``clinic_name``, ``url``,
        ``channel_data`` and ``report``.

    Raises:
        ValueError: If *page_size* is smaller than 1.
    """
    if page_size < 1:
        raise ValueError('page_size must be at least 1')

    rows = []
    start = 0
    while True:
        res = (
            sb.table('marketing_reports')
            .select('id,clinic_name,url,channel_data,report')
            .order('id')
            .range(start, start + page_size - 1)  # both bounds are inclusive
            .execute()
        )
        batch = res.data or []
        if not batch:
            return rows
        rows.extend(batch)
        # Advance by what was actually received, so a server-side cap lower
        # than page_size cannot make the loop skip rows or stop early.
        start += len(batch)

def update_report_screenshots(report_id: str, channel_data: dict, report: dict):
    """Write *channel_data* and *report* back to the ``marketing_reports`` row with this id."""
    new_values = {'channel_data': channel_data, 'report': report}
    query = sb.table('marketing_reports').update(new_values).eq('id', report_id)
    query.execute()

# ── 헬퍼 ───────────────────────────────────────────────────────────────────────
def get_domain(site_url: str) -> str:
    """Return the host name of *site_url* without a leading ``www.`` label.

    Args:
        site_url: Absolute URL such as ``https://www.example.com/page``.

    Returns:
        The lower-cased host name with one leading ``www.`` removed, or
        ``'unknown'`` when the input is empty, is not a string, cannot be
        parsed, or has no host part (for example a URL without a scheme).
    """
    if not isinstance(site_url, str) or not site_url:
        return 'unknown'
    try:
        host = urllib.parse.urlparse(site_url).hostname
    except ValueError:
        # urlparse rejects malformed input such as an unterminated IPv6 literal.
        return 'unknown'
    if not host:
        return 'unknown'
    # Strip only a leading "www." label; str.replace would also remove the
    # substring from the middle of a host such as "awww.example.com".
    if host.startswith('www.'):
        host = host[len('www.'):]
    return host or 'unknown'

def is_gcs(url: str) -> bool:
    """Tell whether *url* is one of the temporary screenshot URLs to archive.

    A URL qualifies when it contains ``googleapis.com`` or ``firecrawl``.
    Anything that is not a string (for example ``None`` coming from a JSON
    null) is reported as ``False`` instead of raising ``TypeError``.
    """
    if not isinstance(url, str):
        return False
    return 'googleapis.com' in url or 'firecrawl' in url

# ── 리포트별 아카이브 ───────────────────────────────────────────────────────────
def archive_screenshots_for_report(row: dict) -> int:
    """Archive every temporary screenshot of one report row and persist the new URLs.

    Screenshots are read from ``row['channel_data']['screenshots']`` and
    ``row['report']['screenshots']``. Each distinct screenshot id whose URL is
    still temporary (per ``is_gcs``) is downloaded, uploaded to
    ``clinics/<domain>/<report id>/screenshots/<screenshot id>.png`` and the
    resulting public URL replaces the old one in both lists. When at least one
    screenshot was archived, both JSON documents are written back to the row.

    A failure on one screenshot is printed and skipped so the remaining ones
    are still processed. Errors from the final database update propagate.

    Args:
        row: A ``marketing_reports`` row with at least ``id``; ``clinic_name``,
            ``url``, ``channel_data`` and ``report`` are optional.

    Returns:
        Number of screenshots archived for this row.
    """
    report_id    = row['id']
    clinic_name  = row.get('clinic_name', '?')
    domain       = get_domain(row.get('url', ''))
    channel_data = row.get('channel_data') or {}
    report       = row.get('report') or {}

    # "or []" also covers an explicit JSON null stored under "screenshots".
    ss_list  = channel_data.get('screenshots') or []
    rss_list = report.get('screenshots') or []

    # Screenshot id -> temporary URL. Only still-temporary entries are indexed,
    # so an already-archived duplicate of the same id cannot hide one that
    # still needs archiving. Entries without an id or a string URL are ignored
    # because they cannot be named or downloaded.
    pending = {}
    for s in ss_list + rss_list:
        if not isinstance(s, dict):
            continue
        ss_id = s.get('id')
        url = s.get('url')
        if ss_id is None or not isinstance(url, str) or not is_gcs(url):
            continue
        pending[ss_id] = url

    archived = 0
    for ss_id, url in pending.items():
        storage_path = f'clinics/{domain}/{report_id}/screenshots/{ss_id}.png'

        try:
            print(f'  → 다운로드: {ss_id}')
            print(f'     URL: {url[:80]}...')
            image_bytes = fetch_image(url)
            public_url  = upload_to_storage(storage_path, image_bytes)

            # Swap the URL in both channel_data and report.
            for lst in (ss_list, rss_list):
                for s in lst:
                    if isinstance(s, dict) and s.get('id') == ss_id:
                        s['url'] = public_url

            print(f'     ✅ 완료 → {public_url[:80]}...')
            archived += 1

        except Exception as e:
            # Best effort per screenshot: report the failure and move on.
            print(f'     ❌ 실패: {e}')

    if archived > 0:
        update_report_screenshots(report_id, channel_data, report)
        print(f'  💾 DB 업데이트 완료 ({clinic_name})')

    return archived

# ── 메인 ───────────────────────────────────────────────────────────────────────
def main():
    """Archive the temporary screenshots of every report and print a summary.

    Ensures the storage bucket exists, loads all reports, skips rows without
    any temporary screenshot URL, archives the rest one row at a time and
    prints the total number of screenshots stored.
    """
    print('=== Supabase Screenshot 영구 아카이브 ===')
    print(f'대상: {SUPABASE_URL}')
    print()

    ensure_bucket()
    print()
    print('DB에서 리포트 목록 조회 중...')
    reports = fetch_reports()
    print(f'총 {len(reports)}개 리포트')
    print()

    total_archived = 0
    for row in reports:
        name      = row.get('clinic_name', '?')
        # "or []" covers an explicit JSON null stored under "screenshots".
        ss_list   = (row.get('channel_data') or {}).get('screenshots') or []
        rss_list  = (row.get('report') or {}).get('screenshots') or []
        # Only string URLs are tested, so a null "url" cannot abort the scan.
        has_gcs   = any(
            isinstance(s, dict) and isinstance(s.get('url'), str) and is_gcs(s['url'])
            for s in ss_list + rss_list
        )
        if not has_gcs:
            continue

        print(f'[{name}] channel_data={len(ss_list)}개 / report={len(rss_list)}개 스크린샷')
        n = archive_screenshots_for_report(row)
        total_archived += n
        print()

    print(f'=== 완료: 총 {total_archived}개 스크린샷 영구 저장 ===')

if __name__ == '__main__':
    main()