From f224d1788c9878d3f378fb86673d8ec2add19d65 Mon Sep 17 00:00:00 2001 From: Haewon Kam Date: Sat, 4 Apr 2026 01:15:49 +0900 Subject: [PATCH] =?UTF-8?q?feat:=20API-first=20channel=20discovery=20?= =?UTF-8?q?=E2=80=94=20YouTube=20API=20+=20Naver=20API=20+=20Firecrawl=20S?= =?UTF-8?q?earch=20+=20Perplexity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaced the Perplexity-only approach with 5 parallel discovery sources (4 direct API searches plus Perplexity): B1. YouTube Data API: search?type=channel&q={clinicName} → find channel B2a. Naver Blog API: search blog.json → find official Naver blog B2b. Naver Web API: search webkr.json → find Instagram/YouTube/Facebook URLs B3. Firecrawl Search: web search → extract social URLs from results B4. Perplexity: supplement — catch what direct APIs missed All 5 sources run in parallel after Stage A (Firecrawl scrape for clinicName). Results merged + deduplicated + verified. Perplexity is now a supplement that runs alongside the direct APIs, not the primary source. Co-Authored-By: Claude Opus 4.6 (1M context) --- supabase/functions/discover-channels/index.ts | 296 +++++++++++------- 1 file changed, 189 insertions(+), 107 deletions(-) diff --git a/supabase/functions/discover-channels/index.ts b/supabase/functions/discover-channels/index.ts index 01e0336..0189f5c 100644 --- a/supabase/functions/discover-channels/index.ts +++ b/supabase/functions/discover-channels/index.ts @@ -8,19 +8,16 @@ const corsHeaders = { "Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type", }; +const APIFY_BASE = "https://api.apify.com/v2"; + interface DiscoverRequest { url: string; clinicName?: string; } -/** - * Robust handle extraction — handles may be full URLs, @handles, or bare usernames. - * Validates each handle actually belongs to its platform. 
- */ function extractHandle(raw: string, platform: string): string | null { if (!raw || raw.length < 2) return null; let h = raw.trim(); - if (platform === 'instagram') { const m = h.match(/instagram\.com\/([a-zA-Z0-9._]+)/); if (m) return m[1]; @@ -66,10 +63,10 @@ function extractHandle(raw: string, platform: string): string | null { /** * Phase 1: Discover & Verify Channels * - * Two-stage discovery: - * Stage A: Firecrawl scrape + map (parallel) → extract clinicName + social links - * Stage B: Perplexity search using clinicName (parallel) → find more handles - * Stage C: Merge + Verify all handles + * API-first, Perplexity-supplement approach: + * Stage A: Firecrawl scrape + map → clinicName + social links from HTML + * Stage B: Direct API searches (YouTube, Naver, Firecrawl) + Perplexity + * Stage C: Merge all sources + Verify handles */ Deno.serve(async (req) => { if (req.method === "OPTIONS") { @@ -85,13 +82,15 @@ Deno.serve(async (req) => { ); } - const FIRECRAWL_API_KEY = Deno.env.get("FIRECRAWL_API_KEY"); - const PERPLEXITY_API_KEY = Deno.env.get("PERPLEXITY_API_KEY"); + const FIRECRAWL_API_KEY = Deno.env.get("FIRECRAWL_API_KEY") || ""; + const PERPLEXITY_API_KEY = Deno.env.get("PERPLEXITY_API_KEY") || ""; + const YOUTUBE_API_KEY = Deno.env.get("YOUTUBE_API_KEY") || ""; + const NAVER_CLIENT_ID = Deno.env.get("NAVER_CLIENT_ID") || ""; + const NAVER_CLIENT_SECRET = Deno.env.get("NAVER_CLIENT_SECRET") || ""; if (!FIRECRAWL_API_KEY) throw new Error("FIRECRAWL_API_KEY not configured"); // ═══════════════════════════════════════════ // STAGE A: Firecrawl scrape + map (parallel) - // → Extract clinicName + social links from HTML // ═══════════════════════════════════════════ const [scrapeResult, mapResult, brandResult] = await Promise.allSettled([ @@ -99,19 +98,15 @@ Deno.serve(async (req) => { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, body: JSON.stringify({ - url, - formats: ["json", "links"], 
+ url, formats: ["json", "links"], jsonOptions: { prompt: "Extract: clinic name (Korean), clinic name (English), address, phone, services offered, doctors with specialties, ALL social media links (instagram handles/URLs, youtube channel URL/handle, naver blog URL, facebook page URL, tiktok, kakao channel), business hours, slogan", schema: { type: "object", properties: { - clinicName: { type: "string" }, - clinicNameEn: { type: "string" }, - address: { type: "string" }, - phone: { type: "string" }, - businessHours: { type: "string" }, - slogan: { type: "string" }, + clinicName: { type: "string" }, clinicNameEn: { type: "string" }, + address: { type: "string" }, phone: { type: "string" }, + businessHours: { type: "string" }, slogan: { type: "string" }, services: { type: "array", items: { type: "string" } }, doctors: { type: "array", items: { type: "object", properties: { name: { type: "string" }, title: { type: "string" }, specialty: { type: "string" } } } }, socialMedia: { type: "object", properties: { instagram: { type: "string" }, youtube: { type: "string" }, blog: { type: "string" }, facebook: { type: "string" }, tiktok: { type: "string" }, kakao: { type: "string" } } }, @@ -147,48 +142,36 @@ Deno.serve(async (req) => { const brandData = brandResult.status === "fulfilled" ? 
brandResult.value : { data: { json: {} } }; const clinic = scrapeData.data?.json || {}; - let resolvedName = inputClinicName || clinic.clinicName || ""; + let resolvedName = inputClinicName || clinic.clinicName || clinic.clinicNameEn || ""; - // If Firecrawl didn't extract a Korean name, try English name or domain - if (!resolvedName) { - resolvedName = clinic.clinicNameEn || ""; - } - - // Last resort: extract something readable from the domain - if (!resolvedName) { - const domain = new URL(url).hostname.replace('www.', '').split('.')[0]; - // If Perplexity is available, ask it to identify the clinic name from the URL - if (PERPLEXITY_API_KEY) { - try { - const nameRes = await fetch("https://api.perplexity.ai/chat/completions", { - method: "POST", - headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, - body: JSON.stringify({ - model: "sonar", - messages: [ - { role: "system", content: "Respond with ONLY the clinic name in Korean, nothing else." }, - { role: "user", content: `${url} 이 URL의 병원/클리닉 한국어 이름이 뭐야?` }, - ], - temperature: 0.1, - }), - }); - const nameData = await nameRes.json(); - const aiName = (nameData.choices?.[0]?.message?.content || "").trim().replace(/["""]/g, '').split('\n')[0].trim(); - if (aiName && aiName.length >= 2 && aiName.length <= 30) { - resolvedName = aiName; - } - } catch { /* fallback to domain */ } - } - if (!resolvedName) resolvedName = domain; + // Fallback: ask Perplexity to identify clinic name from URL + if (!resolvedName && PERPLEXITY_API_KEY) { + try { + const nameRes = await fetch("https://api.perplexity.ai/chat/completions", { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, + body: JSON.stringify({ + model: "sonar", + messages: [ + { role: "system", content: "Respond with ONLY the clinic name in Korean, nothing else." 
}, + { role: "user", content: `${url} 이 URL의 병원/클리닉 한국어 이름이 뭐야?` }, + ], + temperature: 0.1, + }), + }); + const nameData = await nameRes.json(); + const aiName = (nameData.choices?.[0]?.message?.content || "").trim().replace(/["""]/g, '').split('\n')[0].trim(); + if (aiName && aiName.length >= 2 && aiName.length <= 30) resolvedName = aiName; + } catch { /* fallback to domain */ } } + if (!resolvedName) resolvedName = new URL(url).hostname.replace('www.', '').split('.')[0]; + // Source 1: Parse social links from HTML const siteLinks: string[] = scrapeData.data?.links || []; const siteMap: string[] = mapData.links || []; - - // Source 1: Parse links from HTML const linkHandles = extractSocialLinks([...siteLinks, ...siteMap]); - // Source 2: Firecrawl JSON extraction socialMedia field + // Source 2: Firecrawl JSON extraction const scrapeSocial = clinic.socialMedia || {}; const firecrawlHandles = { instagram: scrapeSocial.instagram ? [scrapeSocial.instagram] : [], @@ -200,52 +183,129 @@ Deno.serve(async (req) => { }; // ═══════════════════════════════════════════ - // STAGE B: Perplexity search using CLINIC NAME - // → Find social handles that Firecrawl missed + // STAGE B: Direct API searches + Perplexity (ALL PARALLEL) + // Each API directly searches for the clinic's presence // ═══════════════════════════════════════════ - let perplexityHandles: Partial = {}; + const apiHandles: Partial = { + instagram: [], youtube: [], facebook: [], + naverBlog: [], tiktok: [], kakao: [], + }; let gangnamUnniHintUrl: string | undefined; + const stageBTasks: Promise[] = []; - if (PERPLEXITY_API_KEY && resolvedName) { - const pResults = await Promise.allSettled([ - // Query 1: Social media accounts — search-based, not verification - fetch("https://api.perplexity.ai/chat/completions", { - method: "POST", - headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, - body: JSON.stringify({ - model: "sonar", - messages: [ - { role: "system", 
content: "You are a social media researcher. Search the web and find social media accounts. Respond ONLY with valid JSON, no explanation." }, - { role: "user", content: `${resolvedName} 병원의 인스타그램, 유튜브, 페이스북, 틱톡, 네이버블로그, 카카오채널 계정을 검색해서 찾아줘. 검색 결과에서 발견된 계정을 모두 알려줘. 인스타그램은 여러 계정이 있을 수 있어.\n\n{"instagram": ["handle1", "handle2"], "youtube": "channel URL or @handle", "facebook": "page URL or name", "tiktok": "@handle", "naverBlog": "blog ID", "kakao": "channel ID"}` }, - ], - temperature: 0.1, - }), - }).then(r => r.json()), - - // Query 2: Platform presence — 강남언니, 네이버, 바비톡 - fetch("https://api.perplexity.ai/chat/completions", { - method: "POST", - headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, - body: JSON.stringify({ - model: "sonar", - messages: [ - { role: "system", content: "You are a medical platform researcher. Search the web for clinic listings. Respond ONLY with valid JSON, no explanation." }, - { role: "user", content: `${resolvedName} 병원이 강남언니(gangnamunni.com), 네이버 플레이스, 바비톡에 등록되어 있는지 검색해줘. URL도 찾아줘.\n\n{"gangnamUnni": {"registered": true, "url": "https://gangnamunni.com/hospitals/...", "rating": 9.5}, "naverPlace": {"registered": true}, "babitok": {"registered": false}}` }, - ], - temperature: 0.1, - }), - }).then(r => r.json()), - ]); - - // Parse social handles - if (pResults[0]?.status === "fulfilled") { + // ─── B1. 
YouTube Data API: Search for channel by name ─── + if (YOUTUBE_API_KEY) { + stageBTasks.push((async () => { try { - let text = pResults[0].value?.choices?.[0]?.message?.content || ""; + const q = encodeURIComponent(resolvedName); + const res = await fetch( + `https://www.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=${q}&maxResults=3&key=${YOUTUBE_API_KEY}` + ); + const data = await res.json(); + for (const item of (data.items || [])) { + const channelId = item.snippet?.channelId || item.id?.channelId; + const title = (item.snippet?.title || "").toLowerCase(); + const nameL = resolvedName.toLowerCase(); + // Match if title contains clinic name or vice versa + if (channelId && (title.includes(nameL) || nameL.includes(title) || title.includes(nameL.replace(/성형외과|병원|의원|클리닉/g, '').trim()))) { + apiHandles.youtube!.push(channelId); + } + } + } catch { /* skip */ } + })()); + } + + // ─── B2. Naver Search API: Find blog + social URLs ─── + if (NAVER_CLIENT_ID && NAVER_CLIENT_SECRET) { + const naverHeaders = { + "X-Naver-Client-Id": NAVER_CLIENT_ID, + "X-Naver-Client-Secret": NAVER_CLIENT_SECRET, + }; + + // B2a. Blog search → find official Naver blog + stageBTasks.push((async () => { + try { + const q = encodeURIComponent(`${resolvedName} 공식 블로그`); + const res = await fetch( + `https://openapi.naver.com/v1/search/blog.json?query=${q}&display=5&sort=sim`, + { headers: naverHeaders } + ); + const data = await res.json(); + for (const item of (data.items || [])) { + const link = item.link || ""; + if (link.includes("blog.naver.com/")) { + const m = link.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)/); + if (m) apiHandles.naverBlog!.push(m[1]); + } + } + } catch { /* skip */ } + })()); + + // B2b. 
Web search → find Instagram/YouTube/Facebook URLs + stageBTasks.push((async () => { + try { + const q = encodeURIComponent(`${resolvedName} 인스타그램 유튜브 공식`); + const res = await fetch( + `https://openapi.naver.com/v1/search/webkr.json?query=${q}&display=10`, + { headers: naverHeaders } + ); + const data = await res.json(); + const urls: string[] = (data.items || []).map((item: Record) => item.link).filter(Boolean); + // Extract social handles from search result URLs + const found = extractSocialLinks(urls); + if (found.instagram.length) apiHandles.instagram!.push(...found.instagram); + if (found.youtube.length) apiHandles.youtube!.push(...found.youtube); + if (found.facebook.length) apiHandles.facebook!.push(...found.facebook); + if (found.tiktok.length) apiHandles.tiktok!.push(...found.tiktok); + } catch { /* skip */ } + })()); + } + + // ─── B3. Firecrawl Search: Find social URLs via web search ─── + stageBTasks.push((async () => { + try { + const res = await fetch("https://api.firecrawl.dev/v1/search", { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, + body: JSON.stringify({ + query: `${resolvedName} 성형외과 instagram youtube 공식`, + limit: 10, + }), + }); + const data = await res.json(); + const urls: string[] = (data.data || []).map((r: Record) => r.url).filter(Boolean); + const found = extractSocialLinks(urls); + if (found.instagram.length) apiHandles.instagram!.push(...found.instagram); + if (found.youtube.length) apiHandles.youtube!.push(...found.youtube); + if (found.facebook.length) apiHandles.facebook!.push(...found.facebook); + if (found.tiktok.length) apiHandles.tiktok!.push(...found.tiktok); + } catch { /* skip */ } + })()); + + // ─── B4. 
Perplexity: Supplement — catch what APIs missed ─── + if (PERPLEXITY_API_KEY) { + stageBTasks.push((async () => { + try { + const res = await fetch("https://api.perplexity.ai/chat/completions", { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, + body: JSON.stringify({ + model: "sonar", + messages: [ + { role: "system", content: "You are a social media researcher. Search the web and find social media accounts. Respond ONLY with valid JSON, no explanation." }, + { role: "user", content: `${resolvedName} 병원의 인스타그램, 유튜브, 페이스북, 틱톡, 네이버블로그, 카카오채널 계정을 검색해서 찾아줘. 검색 결과에서 발견된 계정을 모두 알려줘. 인스타그램은 여러 계정이 있을 수 있어.\n\n{"instagram": ["handle1", "handle2"], "youtube": "channel URL or @handle", "facebook": "page URL or name", "tiktok": "@handle", "naverBlog": "blog ID", "kakao": "channel ID"}` }, + ], + temperature: 0.1, + }), + }); + const data = await res.json(); + let text = data.choices?.[0]?.message?.content || ""; const jsonMatch = text.match(/```(?:json)?\n?([\s\S]*?)```/); if (jsonMatch) text = jsonMatch[1]; const parsed = JSON.parse(text); - perplexityHandles = { + + const ph = { instagram: Array.isArray(parsed.instagram) ? parsed.instagram : parsed.instagram ? [parsed.instagram] : [], youtube: parsed.youtube ? [parsed.youtube] : [], facebook: parsed.facebook ? [parsed.facebook] : [], @@ -253,33 +313,55 @@ Deno.serve(async (req) => { tiktok: parsed.tiktok ? [parsed.tiktok] : [], kakao: parsed.kakao ? 
[parsed.kakao] : [], }; - } catch { /* JSON parse failed */ } - } + if (ph.instagram.length) apiHandles.instagram!.push(...ph.instagram); + if (ph.youtube.length) apiHandles.youtube!.push(...ph.youtube); + if (ph.facebook.length) apiHandles.facebook!.push(...ph.facebook); + if (ph.naverBlog.length) apiHandles.naverBlog!.push(...ph.naverBlog); + if (ph.tiktok.length) apiHandles.tiktok!.push(...ph.tiktok); + if (ph.kakao.length) apiHandles.kakao!.push(...ph.kakao); + } catch { /* skip */ } + })()); - // Parse platform presence - if (pResults[1]?.status === "fulfilled") { + // B4b. Platform presence (강남언니, 바비톡) + stageBTasks.push((async () => { try { - let text = pResults[1].value?.choices?.[0]?.message?.content || ""; + const res = await fetch("https://api.perplexity.ai/chat/completions", { + method: "POST", + headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, + body: JSON.stringify({ + model: "sonar", + messages: [ + { role: "system", content: "You are a medical platform researcher. Search the web for clinic listings. Respond ONLY with valid JSON, no explanation." }, + { role: "user", content: `${resolvedName} 병원이 강남언니(gangnamunni.com), 네이버 플레이스, 바비톡에 등록되어 있는지 검색해줘. 
URL도 찾아줘.\n\n{"gangnamUnni": {"registered": true, "url": "https://gangnamunni.com/hospitals/...", "rating": 9.5}, "naverPlace": {"registered": true}, "babitok": {"registered": false}}` }, + ], + temperature: 0.1, + }), + }); + const data = await res.json(); + let text = data.choices?.[0]?.message?.content || ""; const jsonMatch = text.match(/```(?:json)?\n?([\s\S]*?)```/); if (jsonMatch) text = jsonMatch[1]; const parsed = JSON.parse(text); if (parsed.gangnamUnni?.url) gangnamUnniHintUrl = parsed.gangnamUnni.url; - } catch { /* JSON parse failed */ } - } + } catch { /* skip */ } + })()); } + // Run all Stage B tasks in parallel + await Promise.allSettled(stageBTasks); + // ═══════════════════════════════════════════ - // STAGE C: Merge + Deduplicate + Verify + // STAGE C: Merge ALL sources + Verify // ═══════════════════════════════════════════ - const merged = mergeSocialLinks(linkHandles, firecrawlHandles, perplexityHandles); + const merged = mergeSocialLinks(linkHandles, firecrawlHandles, apiHandles); const cleanHandles = { - instagram: merged.instagram.map(h => extractHandle(h, 'instagram')).filter((h): h is string => h !== null), - youtube: merged.youtube.map(h => extractHandle(h, 'youtube')).filter((h): h is string => h !== null), - facebook: merged.facebook.map(h => extractHandle(h, 'facebook')).filter((h): h is string => h !== null), - naverBlog: merged.naverBlog.map(h => extractHandle(h, 'naverBlog')).filter((h): h is string => h !== null), - tiktok: merged.tiktok.map(h => extractHandle(h, 'tiktok')).filter((h): h is string => h !== null), + instagram: [...new Set(merged.instagram.map(h => extractHandle(h, 'instagram')).filter((h): h is string => h !== null))], + youtube: [...new Set(merged.youtube.map(h => extractHandle(h, 'youtube')).filter((h): h is string => h !== null))], + facebook: [...new Set(merged.facebook.map(h => extractHandle(h, 'facebook')).filter((h): h is string => h !== null))], + naverBlog: [...new Set(merged.naverBlog.map(h => 
extractHandle(h, 'naverBlog')).filter((h): h is string => h !== null))], + tiktok: [...new Set(merged.tiktok.map(h => extractHandle(h, 'tiktok')).filter((h): h is string => h !== null))], }; const verified: VerifiedChannels = await verifyAllHandles(