/**
 * First path segments that can follow a platform's domain without being an
 * account name (post permalinks, share dialogs, tracking endpoints, ...).
 * Compared case-insensitively against the segment captured from a URL.
 */
const NON_PROFILE_SEGMENTS: Readonly<Record<string, ReadonlySet<string>>> = {
  instagram: new Set(['p', 'reel', 'reels', 'tv', 'explore', 'stories', 'accounts', 'direct']),
  facebook: new Set([
    'profile.php', 'people', 'pages', 'groups', 'events', 'watch',
    'sharer', 'sharer.php', 'share', 'share.php', 'dialog', 'plugins',
    'login', 'login.php', 'photo', 'photo.php', 'permalink.php', 'story.php',
    'hashtag', 'tr',
  ]),
};

/** Returns `segment`, or `null` when it is a known non-profile path for `platform`. */
function profileSegmentOrNull(segment: string, platform: string): string | null {
  const reserved = NON_PROFILE_SEGMENTS[platform];
  return reserved !== undefined && reserved.has(segment.toLowerCase()) ? null : segment;
}

/**
 * Normalizes a raw social-media reference into a canonical handle.
 *
 * `raw` may be a full URL, an `@handle`, or a bare username. The value is
 * accepted only if it fits the expected shape for `platform`:
 *
 * - `instagram`: username from an `instagram.com/<name>` URL, or a bare name of
 *   letters, digits, `.` and `_`. Post/reel/explore style URLs are rejected.
 * - `youtube`: `@name` for `youtube.com/@name` URLs and bare names; the raw ID
 *   for `youtube.com/channel/UC...` and bare `UC...` IDs; the custom name for
 *   `youtube.com/c/<name>`. Anything else that looks like a URL is rejected.
 * - `facebook`: page name from a `facebook.com/<name>` URL or a bare name.
 *   `profile.php`, share dialogs, the `tr` endpoint, etc. are rejected.
 * - `naverBlog`: blog ID from `blog.naver.com/<id>` or from a `blogId=` query
 *   parameter; page endpoints such as `PostList.naver` are rejected.
 * - `tiktok`: name from a `tiktok.com/@name` URL or a bare name.
 * - any other platform: the trimmed input is returned unchanged.
 *
 * @param raw - Candidate value; checked at runtime because callers pass values
 *   taken from parsed external JSON, so the static type is not a guarantee.
 * @param platform - Platform key selecting the validation rules.
 * @returns The normalized handle, or `null` when the value is not usable.
 */
function extractHandle(raw: string, platform: string): string | null {
  // Guard against non-string JSON values (numbers, arrays, objects): calling
  // `.trim()` on them would throw and abort the whole request.
  if (typeof raw !== 'string') return null;

  // Trim first so surrounding whitespace does not count toward the minimum length.
  let h = raw.trim();
  if (h.length < 2) return null;

  if (platform === 'instagram') {
    const m = h.match(/instagram\.com\/([a-zA-Z0-9._]+)/);
    if (m) return profileSegmentOrNull(m[1], platform);
    h = h.replace(/^@/, '').replace(/\/$/, '');
    if (/^[a-zA-Z0-9._]+$/.test(h) && h.length >= 2) return h;
    return null;
  }

  if (platform === 'youtube') {
    const m = h.match(/youtube\.com\/(?:@([a-zA-Z0-9._-]+)|channel\/(UC[a-zA-Z0-9_-]+)|c\/([a-zA-Z0-9._-]+))/);
    if (m) return m[1] ? `@${m[1]}` : m[2] || m[3] || null;
    h = h.replace(/^@/, '');
    // Anything still URL-shaped here is not a recognized YouTube URL.
    if (h.includes('http') || h.includes('/') || h.includes('.com')) return null;
    // Channel IDs start with "UC" and must not receive an "@" prefix.
    if (/^UC[a-zA-Z0-9_-]{20,}$/.test(h)) return h;
    if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return `@${h}`;
    return null;
  }

  if (platform === 'facebook') {
    const m = h.match(/facebook\.com\/([a-zA-Z0-9._-]+)/);
    if (m) return profileSegmentOrNull(m[1], platform);
    h = h.replace(/^@/, '').replace(/\/$/, '');
    if (h.includes('http') || h.includes('/')) return null;
    if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return h;
    return null;
  }

  if (platform === 'naverBlog') {
    // Post/list URLs carry the blog ID in the query string, not in the path.
    const byQuery = h.match(/blog\.naver\.com\/[^?#\s]*\?(?:[^#\s]*&)?blogId=([a-zA-Z0-9_-]+)/);
    if (byQuery) return byQuery[1];
    // A first segment followed by an extension (e.g. "PostList.naver") is a
    // page endpoint, not a blog ID.
    const m = h.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)(\.[a-zA-Z0-9]+)?/);
    if (m) return m[2] ? null : m[1];
    if (h.includes('http') || h.includes('/')) return null;
    if (/^[a-zA-Z0-9_-]+$/.test(h) && h.length >= 2) return h;
    return null;
  }

  if (platform === 'tiktok') {
    const m = h.match(/tiktok\.com\/@([a-zA-Z0-9._-]+)/);
    if (m) return m[1];
    h = h.replace(/^@/, '');
    if (h.includes('http') || h.includes('/')) return null;
    if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return h;
    return null;
  }

  // Platforms without specific rules: pass the trimmed value through.
  return h;
}
Merge + deduplicate → verify each handle exists + * Two-stage discovery: + * Stage A: Firecrawl scrape + map (parallel) → extract clinicName + social links + * Stage B: Perplexity search using clinicName (parallel) → find more handles + * Stage C: Merge + Verify all handles */ Deno.serve(async (req) => { if (req.method === "OPTIONS") { @@ -27,7 +77,7 @@ Deno.serve(async (req) => { } try { - const { url, clinicName } = (await req.json()) as DiscoverRequest; + const { url, clinicName: inputClinicName } = (await req.json()) as DiscoverRequest; if (!url) { return new Response( JSON.stringify({ error: "URL is required" }), @@ -39,10 +89,12 @@ Deno.serve(async (req) => { const PERPLEXITY_API_KEY = Deno.env.get("PERPLEXITY_API_KEY"); if (!FIRECRAWL_API_KEY) throw new Error("FIRECRAWL_API_KEY not configured"); - // ─── A. Parallel: Firecrawl scrape/map + Perplexity search ─── + // ═══════════════════════════════════════════ + // STAGE A: Firecrawl scrape + map (parallel) + // → Extract clinicName + social links from HTML + // ═══════════════════════════════════════════ - const [scrapeResult, mapResult, brandResult, perplexityResult] = await Promise.allSettled([ - // A1. 
Scrape website — structured JSON + links + const [scrapeResult, mapResult, brandResult] = await Promise.allSettled([ fetch("https://api.firecrawl.dev/v1/scrape", { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, @@ -50,11 +102,12 @@ Deno.serve(async (req) => { url, formats: ["json", "links"], jsonOptions: { - prompt: "Extract: clinic name, address, phone, services offered, doctors with specialties, social media links (instagram, youtube, blog, facebook, tiktok, kakao), business hours, slogan", + prompt: "Extract: clinic name (Korean), clinic name (English), address, phone, services offered, doctors with specialties, ALL social media links (instagram handles/URLs, youtube channel URL/handle, naver blog URL, facebook page URL, tiktok, kakao channel), business hours, slogan", schema: { type: "object", properties: { clinicName: { type: "string" }, + clinicNameEn: { type: "string" }, address: { type: "string" }, phone: { type: "string" }, businessHours: { type: "string" }, @@ -69,88 +122,41 @@ Deno.serve(async (req) => { }), }).then(r => r.json()), - // A2. Map site — discover all linked pages fetch("https://api.firecrawl.dev/v1/map", { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, body: JSON.stringify({ url, limit: 50 }), }).then(r => r.json()), - // A3. 
Branding extraction fetch("https://api.firecrawl.dev/v1/scrape", { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, body: JSON.stringify({ - url, - formats: ["json"], + url, formats: ["json"], jsonOptions: { prompt: "Extract brand identity: primary/accent/background/text colors (hex), heading/body fonts, logo URL, favicon URL, tagline", - schema: { - type: "object", - properties: { - primaryColor: { type: "string" }, accentColor: { type: "string" }, - backgroundColor: { type: "string" }, textColor: { type: "string" }, - headingFont: { type: "string" }, bodyFont: { type: "string" }, - logoUrl: { type: "string" }, faviconUrl: { type: "string" }, tagline: { type: "string" }, - }, - }, + schema: { type: "object", properties: { primaryColor: { type: "string" }, accentColor: { type: "string" }, backgroundColor: { type: "string" }, textColor: { type: "string" }, headingFont: { type: "string" }, bodyFont: { type: "string" }, logoUrl: { type: "string" }, faviconUrl: { type: "string" }, tagline: { type: "string" } } }, }, waitFor: 3000, }), }).then(r => r.json()).catch(() => ({ data: { json: {} } })), - - // A4. Perplexity — find social handles via web search - PERPLEXITY_API_KEY - ? Promise.allSettled([ - // Query 1: Social media handles - fetch("https://api.perplexity.ai/chat/completions", { - method: "POST", - headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, - body: JSON.stringify({ - model: "sonar", - messages: [ - { role: "system", content: "You find official social media accounts for Korean medical clinics. Respond ONLY with valid JSON. If unsure, use null. Never guess." }, - { role: "user", content: `"${clinicName || url}" 성형외과의 공식 소셜 미디어 계정을 찾아줘. 
반드시 확인된 계정만 포함.\n\n{"instagram": ["핸들1", "핸들2"], "youtube": "핸들 또는 URL", "facebook": "페이지명", "tiktok": "핸들", "naverBlog": "블로그ID", "kakao": "채널ID"}` }, - ], - temperature: 0.1, - }), - }).then(r => r.json()), - - // Query 2: Platform presence (강남언니, 네이버, 바비톡) - fetch("https://api.perplexity.ai/chat/completions", { - method: "POST", - headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, - body: JSON.stringify({ - model: "sonar", - messages: [ - { role: "system", content: "You research Korean medical clinic platform presence. Respond ONLY with valid JSON." }, - { role: "user", content: `"${clinicName || url}" 성형외과의 강남언니, 네이버 플레이스, 바비톡 등록 현황을 찾아줘.\n\n{"gangnamUnni": {"registered": true/false, "url": "URL 또는 null", "rating": 숫자 또는 null}, "naverPlace": {"registered": true/false, "rating": 숫자 또는 null}, "babitok": {"registered": true/false}}` }, - ], - temperature: 0.1, - }), - }).then(r => r.json()), - ]) - : Promise.resolve([]), ]); - // ─── B. Parse results ─── - const scrapeData = scrapeResult.status === "fulfilled" ? scrapeResult.value : { data: {} }; const mapData = mapResult.status === "fulfilled" ? mapResult.value : {}; const brandData = brandResult.status === "fulfilled" ? 
brandResult.value : { data: { json: {} } }; const clinic = scrapeData.data?.json || {}; - const resolvedName = clinicName || clinic.clinicName || url; + const resolvedName = inputClinicName || clinic.clinicName || clinic.clinicNameEn || new URL(url).hostname.replace('www.', '').split('.')[0]; const siteLinks: string[] = scrapeData.data?.links || []; const siteMap: string[] = mapData.links || []; - const allUrls = [...siteLinks, ...siteMap]; // Source 1: Parse links from HTML - const linkHandles = extractSocialLinks(allUrls); + const linkHandles = extractSocialLinks([...siteLinks, ...siteMap]); - // Source 2: Parse Firecrawl JSON extraction socialMedia field + // Source 2: Firecrawl JSON extraction socialMedia field const scrapeSocial = clinic.socialMedia || {}; - const firecrawlHandles: Partial = { + const firecrawlHandles = { instagram: scrapeSocial.instagram ? [scrapeSocial.instagram] : [], youtube: scrapeSocial.youtube ? [scrapeSocial.youtube] : [], facebook: scrapeSocial.facebook ? [scrapeSocial.facebook] : [], @@ -159,14 +165,46 @@ Deno.serve(async (req) => { kakao: scrapeSocial.kakao ? 
[scrapeSocial.kakao] : [], }; - // Source 3: Parse Perplexity results + // ═══════════════════════════════════════════ + // STAGE B: Perplexity search using CLINIC NAME + // → Find social handles that Firecrawl missed + // ═══════════════════════════════════════════ + let perplexityHandles: Partial = {}; let gangnamUnniHintUrl: string | undefined; - if (perplexityResult.status === "fulfilled" && Array.isArray(perplexityResult.value)) { - const pResults = perplexityResult.value; + if (PERPLEXITY_API_KEY && resolvedName) { + const pResults = await Promise.allSettled([ + // Query 1: Social media accounts — using clinic name, not URL + fetch("https://api.perplexity.ai/chat/completions", { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, + body: JSON.stringify({ + model: "sonar", + messages: [ + { role: "system", content: "You find official social media accounts for Korean medical clinics. Respond ONLY with valid JSON. If unsure, use null. Never guess or make up handles." }, + { role: "user", content: `"${resolvedName}" 성형외과/병원의 공식 소셜 미디어 계정을 찾아줘. 인스타그램 계정이 여러개일 수 있어 (국문용, 영문용 등). 반드시 확인된 계정만 포함.\n\n{"instagram": ["핸들1", "핸들2"], "youtube": "채널 핸들 또는 URL (@ 포함)", "facebook": "페이지명 또는 URL", "tiktok": "핸들", "naverBlog": "블로그ID", "kakao": "채널ID"}` }, + ], + temperature: 0.1, + }), + }).then(r => r.json()), - // Social handles query + // Query 2: Platform presence — 강남언니, 네이버, 바비톡 + fetch("https://api.perplexity.ai/chat/completions", { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, + body: JSON.stringify({ + model: "sonar", + messages: [ + { role: "system", content: "You research Korean medical clinic platform presence. Respond ONLY with valid JSON." 
}, + { role: "user", content: `"${resolvedName}" 성형외과/병원의 강남언니, 네이버 플레이스, 바비톡 등록 현황을 찾아줘.\n\n{"gangnamUnni": {"registered": true/false, "url": "gangnamunni.com URL 또는 null", "rating": 숫자/10 또는 null}, "naverPlace": {"registered": true/false}, "babitok": {"registered": true/false}}` }, + ], + temperature: 0.1, + }), + }).then(r => r.json()), + ]); + + // Parse social handles if (pResults[0]?.status === "fulfilled") { try { let text = pResults[0].value?.choices?.[0]?.message?.content || ""; @@ -181,77 +219,27 @@ Deno.serve(async (req) => { tiktok: parsed.tiktok ? [parsed.tiktok] : [], kakao: parsed.kakao ? [parsed.kakao] : [], }; - } catch { /* JSON parse failed — skip */ } + } catch { /* JSON parse failed */ } } - // Platform presence query + // Parse platform presence if (pResults[1]?.status === "fulfilled") { try { let text = pResults[1].value?.choices?.[0]?.message?.content || ""; const jsonMatch = text.match(/```(?:json)?\n?([\s\S]*?)```/); if (jsonMatch) text = jsonMatch[1]; const parsed = JSON.parse(text); - if (parsed.gangnamUnni?.url) { - gangnamUnniHintUrl = parsed.gangnamUnni.url; - } - } catch { /* JSON parse failed — skip */ } + if (parsed.gangnamUnni?.url) gangnamUnniHintUrl = parsed.gangnamUnni.url; + } catch { /* JSON parse failed */ } } } - // ─── C. 
Merge + Deduplicate + Verify ─── + // ═══════════════════════════════════════════ + // STAGE C: Merge + Deduplicate + Verify + // ═══════════════════════════════════════════ const merged = mergeSocialLinks(linkHandles, firecrawlHandles, perplexityHandles); - // Robust handle extraction — handles may be full URLs, @handles, or bare usernames - function extractHandle(raw: string, platform: string): string | null { - if (!raw || raw.length < 2) return null; - let h = raw.trim(); - - // Platform-specific URL extraction - if (platform === 'instagram') { - const m = h.match(/instagram\.com\/([a-zA-Z0-9._]+)/); - if (m) return m[1]; - h = h.replace(/^@/, '').replace(/\/$/, ''); - if (/^[a-zA-Z0-9._]+$/.test(h) && h.length >= 2) return h; - return null; - } - if (platform === 'youtube') { - const m = h.match(/youtube\.com\/(?:@([a-zA-Z0-9._-]+)|channel\/(UC[a-zA-Z0-9_-]+)|c\/([a-zA-Z0-9._-]+))/); - if (m) return m[1] ? `@${m[1]}` : m[2] || m[3] || null; - h = h.replace(/^@/, ''); - // Reject if it looks like a non-YouTube URL - if (h.includes('http') || h.includes('/') || h.includes('.com')) return null; - // Channel IDs start with UC — don't add @ prefix - if (/^UC[a-zA-Z0-9_-]{20,}$/.test(h)) return h; - if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return `@${h}`; - return null; - } - if (platform === 'facebook') { - const m = h.match(/facebook\.com\/([a-zA-Z0-9._-]+)/); - if (m) return m[1]; - h = h.replace(/^@/, '').replace(/\/$/, ''); - if (h.includes('http') || h.includes('/')) return null; - if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return h; - return null; - } - if (platform === 'naverBlog') { - const m = h.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)/); - if (m) return m[1]; - if (h.includes('http') || h.includes('/')) return null; - if (/^[a-zA-Z0-9_-]+$/.test(h) && h.length >= 2) return h; - return null; - } - if (platform === 'tiktok') { - const m = h.match(/tiktok\.com\/@([a-zA-Z0-9._-]+)/); - if (m) return m[1]; - h = h.replace(/^@/, ''); - if 
(h.includes('http') || h.includes('/')) return null; - if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return h; - return null; - } - return h; - } - const cleanHandles = { instagram: merged.instagram.map(h => extractHandle(h, 'instagram')).filter((h): h is string => h !== null), youtube: merged.youtube.map(h => extractHandle(h, 'youtube')).filter((h): h is string => h !== null), @@ -261,31 +249,27 @@ Deno.serve(async (req) => { }; const verified: VerifiedChannels = await verifyAllHandles( - cleanHandles, - resolvedName, - gangnamUnniHintUrl, + cleanHandles, resolvedName, gangnamUnniHintUrl, ); - // ─── D. Save to DB ─── + // ═══════════════════════════════════════════ + // Save to DB + // ═══════════════════════════════════════════ const supabaseUrl = Deno.env.get("SUPABASE_URL")!; const supabaseKey = Deno.env.get("SUPABASE_SERVICE_ROLE_KEY")!; const supabase = createClient(supabaseUrl, supabaseKey); const scrapeDataFull = { - clinic, - branding: brandData.data?.json || {}, - siteLinks, - siteMap: mapData.links || [], - sourceUrl: url, - scrapedAt: new Date().toISOString(), + clinic, branding: brandData.data?.json || {}, + siteLinks, siteMap: mapData.links || [], + sourceUrl: url, scrapedAt: new Date().toISOString(), }; const { data: saved, error: saveError } = await supabase .from("marketing_reports") .insert({ - url, - clinic_name: resolvedName, + url, clinic_name: resolvedName, status: "discovered", verified_channels: verified, scrape_data: scrapeDataFull, @@ -299,8 +283,7 @@ Deno.serve(async (req) => { return new Response( JSON.stringify({ - success: true, - reportId: saved.id, + success: true, reportId: saved.id, clinicName: resolvedName, verifiedChannels: verified, address: clinic.address || "",