import "@supabase/functions-js/edge-runtime.d.ts"; import { createClient } from "https://esm.sh/@supabase/supabase-js@2"; import { extractSocialLinks, mergeSocialLinks } from "../_shared/extractSocialLinks.ts"; import { PERPLEXITY_MODEL } from "../_shared/config.ts"; import { verifyAllHandles, type VerifiedChannels } from "../_shared/verifyHandles.ts"; import { RESEARCH_SYSTEM_PROMPT, buildResearchUserPrompt } from "../_shared/researchPrompt.ts"; const corsHeaders = { "Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type", }; const APIFY_BASE = "https://api.apify.com/v2"; // ─── Registry Helper: Convert registry row → VerifiedChannels ─── function extractHandleFromUrl(url: string, platform: string): string | null { if (!url) return null; try { if (platform === 'instagram') { const m = url.match(/instagram\.com\/([a-zA-Z0-9._]+)/); return m ? m[1] : null; } if (platform === 'youtube') { const m = url.match(/youtube\.com\/(?:@([a-zA-Z0-9._-]+)|channel\/(UC[a-zA-Z0-9_-]+)|c\/([a-zA-Z0-9._-]+)|user\/([a-zA-Z0-9._-]+))/); if (m) return m[1] ? `@${m[1]}` : m[2] || m[3] || m[4] || null; return null; } if (platform === 'facebook') { const m = url.match(/facebook\.com\/([a-zA-Z0-9._-]+)/); return m ? m[1] : null; } if (platform === 'naverBlog') { const m = url.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)/); return m ? m[1] : null; } if (platform === 'tiktok') { const m = url.match(/tiktok\.com\/@([a-zA-Z0-9._-]+)/); return m ? m[1] : null; } } catch { /* ignore */ } return null; } interface RegistryRow { name: string; domain: string; website_url: string; brand_group?: string; district?: string; branches?: string; website_en?: string; youtube_url?: string; instagram_url?: string; instagram_en_url?: string; facebook_url?: string; tiktok_url?: string; naver_blog_url?: string; naver_place_url?: string; gangnam_unni_url?: string; google_maps_url?: string; founded_year?: number; } function registryToVerifiedChannels(reg: RegistryRow): import("../_shared/verifyHandles.ts").VerifiedChannels { const igHandles: import("../_shared/verifyHandles.ts").VerifiedChannel[] = []; const igHandle = extractHandleFromUrl(reg.instagram_url || '', 'instagram'); if (igHandle) igHandles.push({ handle: igHandle, verified: true, url: reg.instagram_url! }); const igEnHandle = extractHandleFromUrl(reg.instagram_en_url || '', 'instagram'); if (igEnHandle) igHandles.push({ handle: igEnHandle, verified: true, url: reg.instagram_en_url! }); const ytHandle = extractHandleFromUrl(reg.youtube_url || '', 'youtube'); const fbHandle = extractHandleFromUrl(reg.facebook_url || '', 'facebook'); const blogHandle = extractHandleFromUrl(reg.naver_blog_url || '', 'naverBlog'); const ttHandle = extractHandleFromUrl(reg.tiktok_url || '', 'tiktok'); return { instagram: igHandles, youtube: ytHandle ? { handle: ytHandle, verified: true, url: reg.youtube_url! } : null, facebook: fbHandle ? { handle: fbHandle, verified: true, url: reg.facebook_url! } : null, naverBlog: blogHandle ? { handle: blogHandle, verified: true, url: reg.naver_blog_url! } : null, gangnamUnni: reg.gangnam_unni_url ? { handle: reg.gangnam_unni_url, verified: true, url: reg.gangnam_unni_url } : null, tiktok: ttHandle ? { handle: ttHandle, verified: true, url: reg.tiktok_url! } : null, }; } interface DiscoverRequest { url: string; clinicName?: string; } function extractHandle(raw: string, platform: string): string | null { if (!raw || raw.length < 2) return null; let h = raw.trim(); if (platform === 'instagram') { const m = h.match(/instagram\.com\/([a-zA-Z0-9._]+)/); if (m) return m[1]; h = h.replace(/^@/, '').replace(/\/$/, ''); if (/^[a-zA-Z0-9._]+$/.test(h) && h.length >= 2) return h; return null; } if (platform === 'youtube') { const m = h.match(/youtube\.com\/(?:@([a-zA-Z0-9._-]+)|channel\/(UC[a-zA-Z0-9_-]+)|c\/([a-zA-Z0-9._-]+))/); if (m) return m[1] ? `@${m[1]}` : m[2] || m[3] || null; h = h.replace(/^@/, ''); if (h.includes('http') || h.includes('/') || h.includes('.com')) return null; if (/^UC[a-zA-Z0-9_-]{22}$/.test(h)) return h; // YouTube channel IDs are exactly 24 chars (UC + 22) if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return `@${h}`; return null; } if (platform === 'facebook') { const m = h.match(/facebook\.com\/([a-zA-Z0-9._-]+)/); if (m) return m[1]; h = h.replace(/^@/, '').replace(/\/$/, ''); if (h.includes('http') || h.includes('/')) return null; if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return h; return null; } if (platform === 'naverBlog') { const m = h.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)/); if (m) return m[1]; if (h.includes('http') || h.includes('/')) return null; if (/^[a-zA-Z0-9_-]+$/.test(h) && h.length >= 2) return h; return null; } if (platform === 'tiktok') { const m = h.match(/tiktok\.com\/@([a-zA-Z0-9._-]+)/); if (m) return m[1]; h = h.replace(/^@/, ''); if (h.includes('http') || h.includes('/')) return null; if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return h; return null; } return h; } /** * Phase 1: Discover & Verify Channels * * API-first, Perplexity-supplement approach: * Stage A: Firecrawl scrape + map → clinicName + social links from HTML * Stage B: Direct API searches (YouTube, Naver, Firecrawl) + Perplexity * Stage C: Merge all sources + Verify handles */ Deno.serve(async (req) => { if (req.method === "OPTIONS") { return new Response("ok", { headers: corsHeaders }); } try { const { url, clinicName: inputClinicName } = (await req.json()) as DiscoverRequest; if (!url) { return new Response( JSON.stringify({ error: "URL is required" }), { status: 400, headers: { ...corsHeaders, "Content-Type": "application/json" } }, ); } // ═══════════════════════════════════════════ // REGISTRY CHECK: Pre-verified clinic DB lookup // If domain is registered, skip all API discovery // ═══════════════════════════════════════════ const supabaseUrl = Deno.env.get("SUPABASE_URL")!; const supabaseKey = Deno.env.get("SUPABASE_SERVICE_ROLE_KEY")!; const supabase = createClient(supabaseUrl, supabaseKey); let registryDomain: string; try { registryDomain = new URL(url).hostname.replace(/^www\./, ''); } catch { registryDomain = ''; } if (registryDomain) { const { data: registered } = await supabase .from("clinic_registry") .select("*") .eq("domain", registryDomain) .eq("is_active", true) .maybeSingle(); if (registered) { console.log(`[registry] Hit: ${registered.name} (${registryDomain})`); const verified = registryToVerifiedChannels(registered as RegistryRow); const scrapeDataFromRegistry = { clinic: { clinicName: registered.name }, branding: {}, siteLinks: [], siteMap: [], sourceUrl: url, scrapedAt: new Date().toISOString(), source: "registry", registryData: { district: registered.district, branches: registered.branches, brandGroup: registered.brand_group, foundedYear: registered.founded_year, websiteEn: registered.website_en, naverPlaceUrl: registered.naver_place_url, googleMapsUrl: registered.google_maps_url, }, }; // Legacy: marketing_reports const { data: saved, error: saveError } = await supabase .from("marketing_reports") .insert({ url, clinic_name: registered.name, status: "discovered", verified_channels: verified, scrape_data: scrapeDataFromRegistry, report: {}, pipeline_started_at: new Date().toISOString(), }) .select("id") .single(); if (saveError) throw new Error(`DB save failed: ${saveError.message}`); // V3: clinics + analysis_runs let clinicId: string | null = null; let runId: string | null = null; try { const { data: clinicRow } = await supabase .from("clinics") .upsert({ url, name: registered.name, name_en: null, domain: registryDomain, address: null, phone: null, services: [], branding: {}, social_handles: { instagram: verified.instagram?.map((v: Record) => v.handle) || [], youtube: (verified.youtube as Record)?.handle || null, facebook: (verified.facebook as Record)?.handle || null, naverBlog: (verified.naverBlog as Record)?.handle || null, }, verified_channels: verified, last_analyzed_at: new Date().toISOString(), updated_at: new Date().toISOString(), }, { onConflict: 'url' }) .select("id") .single(); clinicId = clinicRow?.id || null; if (clinicId) { const { data: runRow } = await supabase .from("analysis_runs") .insert({ clinic_id: clinicId, status: "discovering", scrape_data: scrapeDataFromRegistry, discovered_channels: verified, trigger: "manual", pipeline_started_at: new Date().toISOString(), }) .select("id") .single(); runId = runRow?.id || null; } } catch (e) { console.error("V3 dual-write error (registry):", e); } return new Response( JSON.stringify({ success: true, reportId: saved.id, clinicId, runId, clinicName: registered.name, verifiedChannels: verified, address: "", services: [], scrapeData: scrapeDataFromRegistry, source: "registry", }), { headers: { ...corsHeaders, "Content-Type": "application/json" } }, ); } } // ═══════════════════════════════════════════ // NOT REGISTERED: Return error for unregistered domains // (Registry-only mode — no API fallback) // ═══════════════════════════════════════════ console.log(`[registry] Miss: ${registryDomain} — returning CLINIC_NOT_REGISTERED`); return new Response( JSON.stringify({ success: false, error: "CLINIC_NOT_REGISTERED", message: "현재 지원하지 않는 병원입니다. 등록된 병원만 분석 가능합니다.", domain: registryDomain, }), { status: 404, headers: { ...corsHeaders, "Content-Type": "application/json" } }, ); // ═══════════════════════════════════════════ // LEGACY FALLBACK: Full API discovery (disabled — registry-only mode) // Kept for reference; unreachable in production // ═══════════════════════════════════════════ const FIRECRAWL_API_KEY = Deno.env.get("FIRECRAWL_API_KEY") || ""; const PERPLEXITY_API_KEY = Deno.env.get("PERPLEXITY_API_KEY") || ""; const YOUTUBE_API_KEY = Deno.env.get("YOUTUBE_API_KEY") || ""; const NAVER_CLIENT_ID = Deno.env.get("NAVER_CLIENT_ID") || ""; const NAVER_CLIENT_SECRET = Deno.env.get("NAVER_CLIENT_SECRET") || ""; if (!FIRECRAWL_API_KEY) throw new Error("FIRECRAWL_API_KEY not configured"); // ═══════════════════════════════════════════ // STAGE A: Firecrawl scrape + map (parallel) // ═══════════════════════════════════════════ const [scrapeResult, mapResult, brandResult, socialButtonResult] = await Promise.allSettled([ // A1. Main scrape — clinic info + links fetch("https://api.firecrawl.dev/v1/scrape", { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, body: JSON.stringify({ url, formats: ["json", "links"], jsonOptions: { prompt: "Extract: clinic name (Korean), clinic name (English), address, phone, services offered, doctors with specialties, ALL social media links (instagram handles/URLs, youtube channel URL/handle, naver blog URL, facebook page URL, tiktok, kakao channel), business hours, slogan", schema: { type: "object", properties: { clinicName: { type: "string" }, clinicNameEn: { type: "string" }, address: { type: "string" }, phone: { type: "string" }, businessHours: { type: "string" }, slogan: { type: "string" }, services: { type: "array", items: { type: "string" } }, doctors: { type: "array", items: { type: "object", properties: { name: { type: "string" }, title: { type: "string" }, specialty: { type: "string" } } } }, socialMedia: { type: "object", properties: { instagram: { type: "string" }, youtube: { type: "string" }, blog: { type: "string" }, facebook: { type: "string" }, tiktok: { type: "string" }, kakao: { type: "string" } } }, }, }, }, waitFor: 5000, }), }).then(r => r.json()), // A2. Map site fetch("https://api.firecrawl.dev/v1/map", { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, body: JSON.stringify({ url, limit: 50 }), }).then(r => r.json()), // A3. Branding extraction fetch("https://api.firecrawl.dev/v1/scrape", { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, body: JSON.stringify({ url, formats: ["json"], jsonOptions: { prompt: "Extract brand identity: primary/accent/background/text colors (hex), heading/body fonts, logo URL, favicon URL, tagline", schema: { type: "object", properties: { primaryColor: { type: "string" }, accentColor: { type: "string" }, backgroundColor: { type: "string" }, textColor: { type: "string" }, headingFont: { type: "string" }, bodyFont: { type: "string" }, logoUrl: { type: "string" }, faviconUrl: { type: "string" }, tagline: { type: "string" } } }, }, waitFor: 3000, }), }).then(r => r.json()).catch(() => ({ data: { json: {} } })), // A4. Social button links — execute JS to extract all pointing to social platforms fetch("https://api.firecrawl.dev/v1/scrape", { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, body: JSON.stringify({ url, formats: ["json"], jsonOptions: { prompt: "Find ALL social media link URLs on this page. Look in the header, footer, sidebar, and floating buttons. Extract the actual href URLs (not just text) for: Instagram, YouTube, Facebook, TikTok, Naver Blog (blog.naver.com), KakaoTalk (pf.kakao.com), Twitter/X. Include ALL variants found.", schema: { type: "object", properties: { socialLinks: { type: "array", items: { type: "object", properties: { platform: { type: "string" }, url: { type: "string" }, }, }, }, }, }, }, waitFor: 5000, actions: [ { type: "wait", milliseconds: 3000 }, { type: "scrape" }, ], }), }).then(r => r.json()).catch(() => ({ data: { json: {} } })), ]); const scrapeData = scrapeResult.status === "fulfilled" ? scrapeResult.value : { data: {} }; const mapData = mapResult.status === "fulfilled" ? mapResult.value : {}; const brandData = brandResult.status === "fulfilled" ? brandResult.value : { data: { json: {} } }; // A4 result: social buttons from JS-rendered page const socialButtonData = socialButtonResult.status === "fulfilled" ? socialButtonResult.value : { data: { json: {} } }; const socialButtons = (socialButtonData.data?.json?.socialLinks || []) as { platform?: string; url?: string }[]; const clinic = scrapeData.data?.json || {}; let resolvedName = inputClinicName || clinic.clinicName || clinic.clinicNameEn || ""; // Fallback: ask Perplexity to identify clinic name from URL if (!resolvedName && PERPLEXITY_API_KEY) { try { const nameRes = await fetch("https://api.perplexity.ai/chat/completions", { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, body: JSON.stringify({ model: PERPLEXITY_MODEL, messages: [ { role: "system", content: "Respond with ONLY the clinic name in Korean, nothing else." }, { role: "user", content: `${url} 이 URL의 병원/클리닉 한국어 이름이 뭐야?` }, ], temperature: 0.1, }), }); const nameData = await nameRes.json(); const aiName = (nameData.choices?.[0]?.message?.content || "").trim().replace(/["""]/g, '').split('\n')[0].trim(); if (aiName && aiName.length >= 2 && aiName.length <= 30) resolvedName = aiName; } catch { /* fallback to domain */ } } if (!resolvedName) resolvedName = new URL(url).hostname.replace('www.', '').split('.')[0]; // Source 1: Parse social links from HTML const siteLinks: string[] = scrapeData.data?.links || []; const siteMap: string[] = mapData.links || []; const linkHandles = extractSocialLinks([...siteLinks, ...siteMap]); // Source 2: Firecrawl JSON extraction const scrapeSocial = clinic.socialMedia || {}; const firecrawlHandles = { instagram: scrapeSocial.instagram ? [scrapeSocial.instagram] : [], youtube: scrapeSocial.youtube ? [scrapeSocial.youtube] : [], facebook: scrapeSocial.facebook ? [scrapeSocial.facebook] : [], naverBlog: scrapeSocial.blog ? [scrapeSocial.blog] : [], tiktok: scrapeSocial.tiktok ? [scrapeSocial.tiktok] : [], kakao: scrapeSocial.kakao ? [scrapeSocial.kakao] : [], }; // Source 3: Social button links from JS-rendered page (most reliable!) const socialButtonUrls = socialButtons .map(b => typeof b.url === 'string' ? b.url : '') .filter(u => u.length > 5); const buttonHandles = extractSocialLinks(socialButtonUrls); // ═══════════════════════════════════════════ // STAGE B: Direct API searches + Perplexity (ALL PARALLEL) // Each API directly searches for the clinic's presence // ═══════════════════════════════════════════ const apiHandles: Partial = { instagram: [], youtube: [], facebook: [], naverBlog: [], tiktok: [], kakao: [], }; let gangnamUnniHintUrl: string | undefined; const stageBTasks: Promise[] = []; // ─── B1. YouTube Data API: Search for channel by name ─── if (YOUTUBE_API_KEY) { stageBTasks.push((async () => { try { const q = encodeURIComponent(resolvedName); const res = await fetch( `https://www.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=${q}&maxResults=3&key=${YOUTUBE_API_KEY}` ); const data = await res.json(); // Add ALL search results — let verifyAllHandles pick the best match by name for (const item of (data.items || [])) { const channelId = item.snippet?.channelId || item.id?.channelId; if (channelId) { apiHandles.youtube!.push(channelId); } } } catch { /* skip */ } })()); } // ─── B2. Naver Search API: Find blog + social URLs ─── if (NAVER_CLIENT_ID && NAVER_CLIENT_SECRET) { const naverHeaders = { "X-Naver-Client-Id": NAVER_CLIENT_ID, "X-Naver-Client-Secret": NAVER_CLIENT_SECRET, }; // B2a. Blog search → find official Naver blog stageBTasks.push((async () => { try { const q = encodeURIComponent(`${resolvedName} 공식 블로그`); const res = await fetch( `https://openapi.naver.com/v1/search/blog.json?query=${q}&display=5&sort=sim`, { headers: naverHeaders } ); const data = await res.json(); for (const item of (data.items || [])) { const link = item.link || ""; if (link.includes("blog.naver.com/")) { const m = link.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)/); if (m) apiHandles.naverBlog!.push(m[1]); } } } catch { /* skip */ } })()); // B2b. Web search → find Instagram/YouTube/Facebook URLs stageBTasks.push((async () => { try { const q = encodeURIComponent(`${resolvedName} 인스타그램 유튜브 공식`); const res = await fetch( `https://openapi.naver.com/v1/search/webkr.json?query=${q}&display=10`, { headers: naverHeaders } ); const data = await res.json(); const urls: string[] = (data.items || []).map((item: Record) => item.link).filter(Boolean); // Extract social handles from search result URLs const found = extractSocialLinks(urls); if (found.instagram.length) apiHandles.instagram!.push(...found.instagram); if (found.youtube.length) apiHandles.youtube!.push(...found.youtube); if (found.facebook.length) apiHandles.facebook!.push(...found.facebook); if (found.tiktok.length) apiHandles.tiktok!.push(...found.tiktok); } catch { /* skip */ } })()); } // ─── B3. Firecrawl Search: Find social URLs via web search ─── stageBTasks.push((async () => { try { const res = await fetch("https://api.firecrawl.dev/v1/search", { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, body: JSON.stringify({ query: `${resolvedName} 성형외과 instagram youtube 공식`, limit: 10, }), }); const data = await res.json(); const urls: string[] = (data.data || []).map((r: Record) => r.url).filter(Boolean); const found = extractSocialLinks(urls); if (found.instagram.length) apiHandles.instagram!.push(...found.instagram); if (found.youtube.length) apiHandles.youtube!.push(...found.youtube); if (found.facebook.length) apiHandles.facebook!.push(...found.facebook); if (found.tiktok.length) apiHandles.tiktok!.push(...found.tiktok); } catch { /* skip */ } })()); // ─── B4. Perplexity: Single comprehensive query (proven pattern) ─── let perplexityResearch: Record | null = null; if (PERPLEXITY_API_KEY) { // Build clinic name with English variant for better search const clinicNameEn = clinic.clinicNameEn || ''; const searchName = clinicNameEn ? `${resolvedName} (${clinicNameEn})` : resolvedName; stageBTasks.push((async () => { try { const res = await fetch("https://api.perplexity.ai/chat/completions", { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, body: JSON.stringify({ model: PERPLEXITY_MODEL, messages: [ { role: "system", content: "You are a social media researcher. Search the web and find social media accounts. Respond ONLY with valid JSON." }, { role: "user", content: `${searchName} 병원의 인스타그램, 유튜브, 페이스북, 틱톡, 네이버블로그 계정을 검색해서 찾아줘. 검색 결과에서 발견된 계정을 모두 알려줘. 인스타그램은 여러 계정이 있을 수 있어.\n\n{"instagram": ["handle1", "handle2"], "youtube": "channel URL or handle", "facebook": "page name or URL", "tiktok": "handle", "naverBlog": "blog ID"}` }, ], temperature: 0.1, }), }); const data = await res.json(); let text = data.choices?.[0]?.message?.content || ""; const m = text.match(/\{[\s\S]*\}/); if (m) { const parsed = JSON.parse(m[0]); perplexityResearch = parsed; const ig = Array.isArray(parsed.instagram) ? parsed.instagram : parsed.instagram ? [parsed.instagram] : []; const yt = Array.isArray(parsed.youtube) ? parsed.youtube : parsed.youtube ? [parsed.youtube] : []; ig.forEach((h: unknown) => { if (h && typeof h === 'string') apiHandles.instagram!.push(h); }); yt.forEach((h: unknown) => { if (h && typeof h === 'string') apiHandles.youtube!.push(h); }); if (parsed.facebook && typeof parsed.facebook === 'string') apiHandles.facebook!.push(parsed.facebook); if (parsed.tiktok && typeof parsed.tiktok === 'string') apiHandles.tiktok!.push(parsed.tiktok); if (parsed.naverBlog && typeof parsed.naverBlog === 'string') apiHandles.naverBlog!.push(parsed.naverBlog); } } catch { /* skip */ } })()); // B4b. 강남언니 검색 (별도 — gangnamunni URL 힌트 필요) stageBTasks.push((async () => { try { const res = await fetch("https://api.perplexity.ai/chat/completions", { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, body: JSON.stringify({ model: PERPLEXITY_MODEL, messages: [ { role: "system", content: "You search for clinic listings on medical platforms. Respond ONLY with valid JSON." }, { role: "user", content: `${resolvedName} 병원 강남언니 gangnamunni.com 페이지를 찾아줘.\n\n{"gangnamUnni": {"url": "https://gangnamunni.com/hospitals/...", "rating": 9.5, "reviews": 1000}}` }, ], temperature: 0.1, }), }); const data = await res.json(); let text = data.choices?.[0]?.message?.content || ""; const m = text.match(/\{[\s\S]*\}/); if (m) { const parsed = JSON.parse(m[0]); if (parsed.gangnamUnni?.url) gangnamUnniHintUrl = String(parsed.gangnamUnni.url); } } catch { /* skip */ } })()); } // ─── B5. Apify Instagram: Direct profile search by clinic name variants ─── const APIFY_TOKEN = Deno.env.get("APIFY_API_TOKEN") || ""; if (APIFY_TOKEN) { stageBTasks.push((async () => { try { // Generate handle candidates from clinic name const baseName = resolvedName.replace(/성형외과|병원|의원|클리닉|피부과/g, '').trim().toLowerCase(); const baseNameEn = (clinic.clinicNameEn || '').replace(/\s+/g, '').toLowerCase(); const candidates: string[] = []; if (baseNameEn && baseNameEn.length >= 3) { candidates.push(baseNameEn, `${baseNameEn}_official`, `${baseNameEn}_ps`, `${baseNameEn}_clinic`); } if (baseName && /^[a-zA-Z]/.test(baseName)) { candidates.push(baseName, `${baseName}_official`, `${baseName}_ps`); } // Also try domain-based const domainBase = new URL(url).hostname.replace('www.', '').split('.')[0].toLowerCase(); if (domainBase.length >= 3 && !candidates.includes(domainBase)) { candidates.push(domainBase, `${domainBase}_official`); } // Quick check each candidate with Apify for (const handle of candidates.slice(0, 6)) { try { const apifyRes = await fetch( `${APIFY_BASE}/acts/apify~instagram-profile-scraper/runs?token=${APIFY_TOKEN}&waitForFinish=45`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ usernames: [handle], resultsLimit: 1 }), } ); const run = await apifyRes.json(); const datasetId = run.data?.defaultDatasetId; if (!datasetId) continue; const itemsRes = await fetch(`${APIFY_BASE}/datasets/${datasetId}/items?token=${APIFY_TOKEN}&limit=1`); const items = await itemsRes.json(); const profile = (items as Record[])[0]; if (profile && !profile.error && (profile.followersCount as number) >= 50) { apiHandles.instagram!.push(profile.username as string || handle); break; // Found one valid — stop searching } } catch { /* try next */ } } } catch { /* skip */ } })()); } // Run all Stage B tasks in parallel await Promise.allSettled(stageBTasks); // ═══════════════════════════════════════════ // STAGE C: Merge ALL sources + Verify // ═══════════════════════════════════════════ const merged = mergeSocialLinks(linkHandles, firecrawlHandles, buttonHandles, apiHandles); const cleanHandles = { instagram: [...new Set(merged.instagram.map(h => extractHandle(h, 'instagram')).filter((h): h is string => h !== null))], youtube: [...new Set(merged.youtube.map(h => extractHandle(h, 'youtube')).filter((h): h is string => h !== null))], facebook: [...new Set(merged.facebook.map(h => extractHandle(h, 'facebook')).filter((h): h is string => h !== null))], naverBlog: [...new Set(merged.naverBlog.map(h => extractHandle(h, 'naverBlog')).filter((h): h is string => h !== null))], tiktok: [...new Set(merged.tiktok.map(h => extractHandle(h, 'tiktok')).filter((h): h is string => h !== null))], }; // Fallback: try domain name as Facebook/Instagram handle if no candidates found try { const domain = new URL(url).hostname.replace('www.', '').split('.')[0]; // e.g. "idhospital" if (domain && domain.length >= 3) { if (cleanHandles.facebook.length === 0) cleanHandles.facebook.push(domain); if (cleanHandles.instagram.length === 0) cleanHandles.instagram.push(domain); } } catch { /* ignore */ } const verified: VerifiedChannels = await verifyAllHandles( cleanHandles, resolvedName, gangnamUnniHintUrl, ); // ═══════════════════════════════════════════ // Save to DB (supabase client reused from registry check above) // ═══════════════════════════════════════════ const scrapeDataFull = { clinic, branding: brandData.data?.json || {}, siteLinks, siteMap: mapData.links || [], sourceUrl: url, scrapedAt: new Date().toISOString(), // Perplexity research results — raw channel data with subscriber counts etc. onlinePresenceResearch: perplexityResearch, }; // ─── Legacy: marketing_reports (backward compat) ─── const { data: saved, error: saveError } = await supabase .from("marketing_reports") .insert({ url, clinic_name: resolvedName, status: "discovered", verified_channels: verified, scrape_data: scrapeDataFull, report: {}, pipeline_started_at: new Date().toISOString(), }) .select("id") .single(); if (saveError) throw new Error(`DB save failed: ${saveError.message}`); // ─── V3: clinics + analysis_runs (dual-write) ─── let clinicId: string | null = null; let runId: string | null = null; try { // UPSERT clinic (url 기준 — 같은 URL이면 기존 행 업데이트) const { data: clinicRow } = await supabase .from("clinics") .upsert({ url, name: resolvedName, name_en: clinic.clinicNameEn || null, domain: new URL(url).hostname.replace('www.', ''), address: clinic.address || null, phone: clinic.phone || null, services: clinic.services || [], branding: brandData.data?.json || {}, social_handles: { instagram: verified.instagram?.map((v: Record) => v.handle) || [], youtube: (verified.youtube as Record)?.handle || null, facebook: (verified.facebook as Record)?.handle || null, naverBlog: (verified.naverBlog as Record)?.handle || null, }, verified_channels: verified, last_analyzed_at: new Date().toISOString(), updated_at: new Date().toISOString(), }, { onConflict: 'url' }) .select("id") .single(); clinicId = clinicRow?.id || null; // INSERT analysis_run if (clinicId) { const { data: runRow } = await supabase .from("analysis_runs") .insert({ clinic_id: clinicId, status: "discovering", scrape_data: scrapeDataFull, discovered_channels: verified, trigger: "manual", pipeline_started_at: new Date().toISOString(), }) .select("id") .single(); runId = runRow?.id || null; } } catch (e) { // V3 write failure should not block the pipeline console.error("V3 dual-write error:", e); } return new Response( JSON.stringify({ success: true, reportId: saved.id, clinicId, runId, // V3 IDs for downstream phases clinicName: resolvedName, verifiedChannels: verified, address: clinic.address || "", services: clinic.services || [], scrapeData: scrapeDataFull, }), { headers: { ...corsHeaders, "Content-Type": "application/json" } }, ); } catch (error) { return new Response( JSON.stringify({ success: false, error: error.message }), { status: 500, headers: { ...corsHeaders, "Content-Type": "application/json" } }, ); } });