import "@supabase/functions-js/edge-runtime.d.ts";
import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
import { extractSocialLinks, mergeSocialLinks } from "../_shared/extractSocialLinks.ts";
import { verifyAllHandles, type VerifiedChannels } from "../_shared/verifyHandles.ts";

const corsHeaders = {
  "Access-Control-Allow-Origin": "*",
  "Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type",
};

interface DiscoverRequest {
  url: string;
  clinicName?: string;
}

/** Platforms for which raw handles are collected in this function. */
type SocialPlatform = "instagram" | "youtube" | "facebook" | "naverBlog" | "tiktok" | "kakao";

/** Raw (not yet normalized) handle candidates per platform. */
type SocialHandles = Record<SocialPlatform, string[]>;

const FIRECRAWL_SCRAPE_URL = "https://api.firecrawl.dev/v1/scrape";
const FIRECRAWL_MAP_URL = "https://api.firecrawl.dev/v1/map";
const PERPLEXITY_CHAT_URL = "https://api.perplexity.ai/chat/completions";

// First path segments that are site features rather than profile names.
// Without this filter, "instagram.com/p/<postId>" would be reported as the handle "p".
const INSTAGRAM_RESERVED_PATHS = new Set(["p", "reel", "reels", "explore", "stories", "tv", "accounts"]);
const FACEBOOK_RESERVED_PATHS = new Set([
  "profile.php", "sharer", "sharer.php", "share", "pages", "groups",
  "events", "watch", "login", "plugins", "dialog", "tr",
]);

/**
 * Robust handle extraction — the input may be a full URL, an @handle, or a bare username.
 *
 * For each known platform the value is first matched against that platform's URL
 * pattern; otherwise it is treated as a bare handle and checked against the
 * platform's allowed character set. Values that look like a URL of some other
 * site are rejected.
 *
 * @param raw      Candidate handle or URL.
 * @param platform One of 'instagram', 'youtube', 'facebook', 'naverBlog', 'tiktok'.
 *                 Any other value returns the trimmed input unchanged.
 * @returns The normalized handle (YouTube handles carry a leading "@", channel
 *          IDs starting with "UC" are returned as-is), or null if the value is
 *          not a plausible handle for the platform.
 */
function extractHandle(raw: string, platform: string): string | null {
  // The typeof guard protects against non-string values coming from parsed JSON.
  if (typeof raw !== "string" || raw.length < 2) return null;
  const trimmed = raw.trim();
  const looksLikeUrl = (s: string): boolean => s.includes("http") || s.includes("/");

  switch (platform) {
    case "instagram": {
      const m = trimmed.match(/instagram\.com\/([a-zA-Z0-9._]+)/);
      if (m) return INSTAGRAM_RESERVED_PATHS.has(m[1].toLowerCase()) ? null : m[1];
      const h = trimmed.replace(/^@/, "").replace(/\/$/, "");
      return /^[a-zA-Z0-9._]+$/.test(h) && h.length >= 2 ? h : null;
    }
    case "youtube": {
      const m = trimmed.match(
        /youtube\.com\/(?:@([a-zA-Z0-9._-]+)|channel\/(UC[a-zA-Z0-9_-]+)|c\/([a-zA-Z0-9._-]+))/,
      );
      if (m) return m[1] ? `@${m[1]}` : m[2] || m[3] || null;
      const h = trimmed.replace(/^@/, "");
      if (looksLikeUrl(h) || h.includes(".com")) return null;
      // Channel IDs are kept verbatim; everything else is treated as an @handle.
      if (/^UC[a-zA-Z0-9_-]{20,}$/.test(h)) return h;
      return /^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2 ? `@${h}` : null;
    }
    case "facebook": {
      const m = trimmed.match(/facebook\.com\/([a-zA-Z0-9._-]+)/);
      if (m) return FACEBOOK_RESERVED_PATHS.has(m[1].toLowerCase()) ? null : m[1];
      const h = trimmed.replace(/^@/, "").replace(/\/$/, "");
      if (looksLikeUrl(h)) return null;
      return /^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2 ? h : null;
    }
    case "naverBlog": {
      const m = trimmed.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)/);
      if (m) return m[1];
      if (looksLikeUrl(trimmed)) return null;
      return /^[a-zA-Z0-9_-]+$/.test(trimmed) && trimmed.length >= 2 ? trimmed : null;
    }
    case "tiktok": {
      const m = trimmed.match(/tiktok\.com\/@([a-zA-Z0-9._-]+)/);
      if (m) return m[1];
      const h = trimmed.replace(/^@/, "");
      if (looksLikeUrl(h)) return null;
      return /^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2 ? h : null;
    }
    default:
      return trimmed;
  }
}

/** Builds a JSON response carrying the shared CORS headers. */
function jsonResponse(body: unknown, status = 200): Response {
  return new Response(JSON.stringify(body), {
    status,
    headers: { ...corsHeaders, "Content-Type": "application/json" },
  });
}

/**
 * POSTs a JSON payload with a Bearer token and returns the decoded JSON body.
 * Rejects if the request fails or the body is not valid JSON; the HTTP status
 * is not inspected, so API error payloads are returned to the caller as data.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- third-party API payloads are untyped here
async function postJson(endpoint: string, apiKey: string, payload: unknown): Promise<any> {
  const res = await fetch(endpoint, {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiKey}` },
    body: JSON.stringify(payload),
  });
  return res.json();
}

/** Sends a system + user prompt to the Perplexity "sonar" model and returns the raw API response. */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- third-party API payloads are untyped here
function askPerplexity(apiKey: string, system: string, user: string): Promise<any> {
  return postJson(PERPLEXITY_CHAT_URL, apiKey, {
    model: "sonar",
    messages: [
      { role: "system", content: system },
      { role: "user", content: user },
    ],
    temperature: 0.1,
  });
}

/**
 * Normalizes an untrusted value (string, array, or anything else) into a list
 * of non-blank strings. Non-string entries are dropped so that later string
 * operations cannot throw.
 */
function toStringList(value: unknown): string[] {
  const items: unknown[] = Array.isArray(value) ? value : [value];
  return items.filter((v): v is string => typeof v === "string" && v.trim().length > 0);
}

/**
 * Extracts a JSON object from a chat-model reply.
 * Tries, in order: the contents of a ``` / ```json fence (or the whole text if
 * there is no fence), then the outermost `{ … }` span of the text.
 * Returns null when nothing parses to a plain object.
 */
function parseLlmJson(content: unknown): Record<string, unknown> | null {
  if (typeof content !== "string") return null;

  const fenced = content.match(/```(?:json)?\n?([\s\S]*?)```/);
  const candidates = [fenced ? fenced[1] : content];

  const open = content.indexOf("{");
  const close = content.lastIndexOf("}");
  if (open !== -1 && close > open) candidates.push(content.slice(open, close + 1));

  for (const candidate of candidates) {
    try {
      const parsed: unknown = JSON.parse(candidate);
      if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
        return parsed as Record<string, unknown>;
      }
    } catch {
      /* not valid JSON — try the next candidate */
    }
  }
  return null;
}

/**
 * Derives a short readable label from a URL's host: the first dot-separated
 * label after removing a leading "www.". Never throws — scheme-less input is
 * retried with "https://" prepended, and unparseable input is used as-is.
 */
function domainLabel(rawUrl: string): string {
  let host = rawUrl;
  for (const candidate of [rawUrl, `https://${rawUrl}`]) {
    try {
      const parsedHost = new URL(candidate).hostname;
      if (parsedHost) {
        host = parsedHost;
        break;
      }
    } catch {
      /* not parseable in this form — try the next one */
    }
  }
  return host.replace(/^www\./, "").split(".")[0];
}

/**
 * Asks Perplexity for the Korean clinic name behind a URL.
 * Returns "" when the call fails or the answer is not a 2–30 character single line.
 */
async function lookupClinicName(url: string, apiKey: string): Promise<string> {
  try {
    const data = await askPerplexity(
      apiKey,
      "Respond with ONLY the clinic name in Korean, nothing else.",
      `${url} 이 URL의 병원/클리닉 한국어 이름이 뭐야?`,
    );
    const content: unknown = data?.choices?.[0]?.message?.content;
    if (typeof content !== "string") return "";
    // Strip straight and curly double quotes, then keep only the first line.
    const name = content.trim().replace(/["\u201C\u201D]/g, "").split("\n")[0].trim();
    return name.length >= 2 && name.length <= 30 ? name : "";
  } catch {
    return ""; // best effort — caller falls back to the domain label
  }
}

/** Normalizes raw handle candidates for one platform, dropping invalid entries and duplicates. */
function cleanHandleList(rawHandles: readonly string[] | undefined, platform: string): string[] {
  const cleaned = (rawHandles ?? [])
    .map((h) => extractHandle(h, platform))
    .filter((h): h is string => h !== null);
  // A URL and an @handle for the same account normalize to the same value.
  return [...new Set(cleaned)];
}

/**
 * Phase 1: Discover & Verify Channels
 *
 * Stage A: Firecrawl scrape + map + brand scrape (parallel) → clinic info + social links
 * Stage B: Perplexity search using the clinic name (parallel) → more handles, Gangnam Unni URL hint
 * Stage C: Merge + normalize + verify all handles, then store the result in `marketing_reports`
 *
 * Request body: { url: string, clinicName?: string }
 * Responses: 400 when `url` is missing, 500 with { success: false, error } on any
 * failure, otherwise 200 with the report id, resolved name, verified channels and scrape data.
 */
Deno.serve(async (req) => {
  if (req.method === "OPTIONS") {
    return new Response("ok", { headers: corsHeaders });
  }

  try {
    const { url, clinicName: inputClinicName } = (await req.json()) as DiscoverRequest;
    if (!url || typeof url !== "string") {
      return jsonResponse({ error: "URL is required" }, 400);
    }

    // Check configuration up front so no paid API call is made for a request that cannot be saved.
    const FIRECRAWL_API_KEY = Deno.env.get("FIRECRAWL_API_KEY");
    const PERPLEXITY_API_KEY = Deno.env.get("PERPLEXITY_API_KEY");
    const supabaseUrl = Deno.env.get("SUPABASE_URL");
    const supabaseKey = Deno.env.get("SUPABASE_SERVICE_ROLE_KEY");
    if (!FIRECRAWL_API_KEY) throw new Error("FIRECRAWL_API_KEY not configured");
    if (!supabaseUrl || !supabaseKey) {
      throw new Error("SUPABASE_URL / SUPABASE_SERVICE_ROLE_KEY not configured");
    }

    // ═══════════════════════════════════════════
    // STAGE A: Firecrawl scrape + map (parallel)
    // → Extract clinicName + social links from HTML
    // ═══════════════════════════════════════════
    const [scrapeResult, mapResult, brandResult] = await Promise.allSettled([
      postJson(FIRECRAWL_SCRAPE_URL, FIRECRAWL_API_KEY, {
        url,
        formats: ["json", "links"],
        jsonOptions: {
          prompt:
            "Extract: clinic name (Korean), clinic name (English), address, phone, services offered, doctors with specialties, ALL social media links (instagram handles/URLs, youtube channel URL/handle, naver blog URL, facebook page URL, tiktok, kakao channel), business hours, slogan",
          schema: {
            type: "object",
            properties: {
              clinicName: { type: "string" },
              clinicNameEn: { type: "string" },
              address: { type: "string" },
              phone: { type: "string" },
              businessHours: { type: "string" },
              slogan: { type: "string" },
              services: { type: "array", items: { type: "string" } },
              doctors: {
                type: "array",
                items: {
                  type: "object",
                  properties: {
                    name: { type: "string" },
                    title: { type: "string" },
                    specialty: { type: "string" },
                  },
                },
              },
              socialMedia: {
                type: "object",
                properties: {
                  instagram: { type: "string" },
                  youtube: { type: "string" },
                  blog: { type: "string" },
                  facebook: { type: "string" },
                  tiktok: { type: "string" },
                  kakao: { type: "string" },
                },
              },
            },
          },
        },
        waitFor: 5000,
      }),
      postJson(FIRECRAWL_MAP_URL, FIRECRAWL_API_KEY, { url, limit: 50 }),
      postJson(FIRECRAWL_SCRAPE_URL, FIRECRAWL_API_KEY, {
        url,
        formats: ["json"],
        jsonOptions: {
          prompt:
            "Extract brand identity: primary/accent/background/text colors (hex), heading/body fonts, logo URL, favicon URL, tagline",
          schema: {
            type: "object",
            properties: {
              primaryColor: { type: "string" },
              accentColor: { type: "string" },
              backgroundColor: { type: "string" },
              textColor: { type: "string" },
              headingFont: { type: "string" },
              bodyFont: { type: "string" },
              logoUrl: { type: "string" },
              faviconUrl: { type: "string" },
              tagline: { type: "string" },
            },
          },
        },
        waitFor: 3000,
      }),
    ]);

    // A failed call degrades to an empty result instead of failing the request.
    const scrapeData = scrapeResult.status === "fulfilled" ? scrapeResult.value : { data: {} };
    const mapData = mapResult.status === "fulfilled" ? mapResult.value : {};
    const brandData = brandResult.status === "fulfilled" ? brandResult.value : { data: { json: {} } };

    const clinic = scrapeData?.data?.json || {};

    // Name priority: caller-supplied → scraped Korean name → scraped English name
    // → Perplexity lookup by URL → first label of the domain.
    let resolvedName: string = inputClinicName || clinic.clinicName || clinic.clinicNameEn || "";
    if (!resolvedName && PERPLEXITY_API_KEY) {
      resolvedName = await lookupClinicName(url, PERPLEXITY_API_KEY);
    }
    if (!resolvedName) resolvedName = domainLabel(url);

    const siteLinks = toStringList(scrapeData?.data?.links);
    const siteMap = toStringList(mapData?.links);

    // Source 1: Parse links from HTML
    const linkHandles = extractSocialLinks([...siteLinks, ...siteMap]);

    // Source 2: Firecrawl JSON extraction socialMedia field
    const scrapeSocial = clinic.socialMedia || {};
    const firecrawlHandles: SocialHandles = {
      instagram: toStringList(scrapeSocial.instagram),
      youtube: toStringList(scrapeSocial.youtube),
      facebook: toStringList(scrapeSocial.facebook),
      naverBlog: toStringList(scrapeSocial.blog),
      tiktok: toStringList(scrapeSocial.tiktok),
      kakao: toStringList(scrapeSocial.kakao),
    };

    // ═══════════════════════════════════════════
    // STAGE B: Perplexity search using CLINIC NAME
    // → Find social handles that Firecrawl missed
    // ═══════════════════════════════════════════
    let perplexityHandles: Partial<SocialHandles> = {};
    let gangnamUnniHintUrl: string | undefined;

    if (PERPLEXITY_API_KEY && resolvedName) {
      const [socialResult, platformResult] = await Promise.allSettled([
        // Query 1: Social media accounts — search-based, not verification
        askPerplexity(
          PERPLEXITY_API_KEY,
          "You are a social media researcher. Search the web and find social media accounts. Respond ONLY with valid JSON, no explanation.",
          `${resolvedName} 병원의 인스타그램, 유튜브, 페이스북, 틱톡, 네이버블로그, 카카오채널 계정을 검색해서 찾아줘. 검색 결과에서 발견된 계정을 모두 알려줘. 인스타그램은 여러 계정이 있을 수 있어.\n\n{"instagram": ["handle1", "handle2"], "youtube": "channel URL or @handle", "facebook": "page URL or name", "tiktok": "@handle", "naverBlog": "blog ID", "kakao": "channel ID"}`,
        ),
        // Query 2: Platform presence — Gangnam Unni, Naver, Babitok
        askPerplexity(
          PERPLEXITY_API_KEY,
          "You are a medical platform researcher. Search the web for clinic listings. Respond ONLY with valid JSON, no explanation.",
          `${resolvedName} 병원이 강남언니(gangnamunni.com), 네이버 플레이스, 바비톡에 등록되어 있는지 검색해줘. URL도 찾아줘.\n\n{"gangnamUnni": {"registered": true, "url": "https://gangnamunni.com/hospitals/...", "rating": 9.5}, "naverPlace": {"registered": true}, "babitok": {"registered": false}}`,
        ),
      ]);

      // Parse social handles (an unparseable reply simply contributes nothing)
      if (socialResult.status === "fulfilled") {
        const parsed = parseLlmJson(socialResult.value?.choices?.[0]?.message?.content);
        if (parsed) {
          perplexityHandles = {
            instagram: toStringList(parsed.instagram),
            youtube: toStringList(parsed.youtube),
            facebook: toStringList(parsed.facebook),
            naverBlog: toStringList(parsed.naverBlog),
            tiktok: toStringList(parsed.tiktok),
            kakao: toStringList(parsed.kakao),
          };
        }
      }

      // Parse platform presence — only the Gangnam Unni URL is used, as a hint for verification
      if (platformResult.status === "fulfilled") {
        const parsed = parseLlmJson(platformResult.value?.choices?.[0]?.message?.content);
        const gangnamUnni = parsed?.gangnamUnni;
        if (gangnamUnni !== null && typeof gangnamUnni === "object") {
          const hint = (gangnamUnni as { url?: unknown }).url;
          if (typeof hint === "string" && hint) gangnamUnniHintUrl = hint;
        }
      }
    }

    // ═══════════════════════════════════════════
    // STAGE C: Merge + Deduplicate + Verify
    // ═══════════════════════════════════════════
    const merged = mergeSocialLinks(linkHandles, firecrawlHandles, perplexityHandles);

    // Kakao candidates are collected above but are not passed on to verification.
    const cleanHandles = {
      instagram: cleanHandleList(merged.instagram, "instagram"),
      youtube: cleanHandleList(merged.youtube, "youtube"),
      facebook: cleanHandleList(merged.facebook, "facebook"),
      naverBlog: cleanHandleList(merged.naverBlog, "naverBlog"),
      tiktok: cleanHandleList(merged.tiktok, "tiktok"),
    };

    const verified: VerifiedChannels = await verifyAllHandles(
      cleanHandles,
      resolvedName,
      gangnamUnniHintUrl,
    );

    // ═══════════════════════════════════════════
    // Save to DB
    // ═══════════════════════════════════════════
    const supabase = createClient(supabaseUrl, supabaseKey);

    const scrapeDataFull = {
      clinic,
      branding: brandData?.data?.json || {},
      siteLinks,
      siteMap,
      sourceUrl: url,
      scrapedAt: new Date().toISOString(),
    };

    const { data: saved, error: saveError } = await supabase
      .from("marketing_reports")
      .insert({
        url,
        clinic_name: resolvedName,
        status: "discovered",
        verified_channels: verified,
        scrape_data: scrapeDataFull,
        report: {},
        pipeline_started_at: new Date().toISOString(),
      })
      .select("id")
      .single();

    if (saveError) throw new Error(`DB save failed: ${saveError.message}`);

    return jsonResponse({
      success: true,
      reportId: saved.id,
      clinicName: resolvedName,
      verifiedChannels: verified,
      address: clinic.address || "",
      services: clinic.services || [],
      scrapeData: scrapeDataFull,
    });
  } catch (error: unknown) {
    // Anything can be thrown; only Error instances are guaranteed to carry a message.
    const message = error instanceof Error ? error.message : String(error);
    console.error("discover-channels failed:", message);
    return jsonResponse({ success: false, error: message }, 500);
  }
});