feat: API-first channel discovery — YouTube API + Naver API + Firecrawl Search + Perplexity

Replaced the Perplexity-only approach with 5 parallel searches (4 direct API searches plus Perplexity as a supplement):

B1. YouTube Data API: search?type=channel&q={clinicName} → find channel
B2a. Naver Blog API: search blog.json → find official Naver blog
B2b. Naver Web API: search webkr.json → find Instagram/YouTube/Facebook URLs
B3. Firecrawl Search: web search → extract social URLs from results
B4. Perplexity: supplement — catch what direct APIs missed

All 5 sources run in parallel after Stage A (Firecrawl scrape for clinicName).
Results are merged, deduplicated, and verified. Perplexity is now a supplement
that runs alongside the direct APIs, not the primary source.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
claude/bold-hawking
Haewon Kam 2026-04-04 01:15:49 +09:00
parent 159de36e38
commit f224d1788c
1 changed file with 189 additions and 107 deletions

View File

@ -8,19 +8,16 @@ const corsHeaders = {
"Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type", "Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type",
}; };
const APIFY_BASE = "https://api.apify.com/v2";
interface DiscoverRequest { interface DiscoverRequest {
url: string; url: string;
clinicName?: string; clinicName?: string;
} }
/**
* Robust handle extraction — handles may be full URLs, @handles, or bare usernames.
* Validates each handle actually belongs to its platform.
*/
function extractHandle(raw: string, platform: string): string | null { function extractHandle(raw: string, platform: string): string | null {
if (!raw || raw.length < 2) return null; if (!raw || raw.length < 2) return null;
let h = raw.trim(); let h = raw.trim();
if (platform === 'instagram') { if (platform === 'instagram') {
const m = h.match(/instagram\.com\/([a-zA-Z0-9._]+)/); const m = h.match(/instagram\.com\/([a-zA-Z0-9._]+)/);
if (m) return m[1]; if (m) return m[1];
@ -66,10 +63,10 @@ function extractHandle(raw: string, platform: string): string | null {
/** /**
* Phase 1: Discover & Verify Channels * Phase 1: Discover & Verify Channels
* *
* Two-stage discovery: * API-first, Perplexity-supplement approach:
* Stage A: Firecrawl scrape + map (parallel) extract clinicName + social links * Stage A: Firecrawl scrape + map clinicName + social links from HTML
* Stage B: Perplexity search using clinicName (parallel) find more handles * Stage B: Direct API searches (YouTube, Naver, Firecrawl) + Perplexity
* Stage C: Merge + Verify all handles * Stage C: Merge all sources + Verify handles
*/ */
Deno.serve(async (req) => { Deno.serve(async (req) => {
if (req.method === "OPTIONS") { if (req.method === "OPTIONS") {
@ -85,13 +82,15 @@ Deno.serve(async (req) => {
); );
} }
const FIRECRAWL_API_KEY = Deno.env.get("FIRECRAWL_API_KEY"); const FIRECRAWL_API_KEY = Deno.env.get("FIRECRAWL_API_KEY") || "";
const PERPLEXITY_API_KEY = Deno.env.get("PERPLEXITY_API_KEY"); const PERPLEXITY_API_KEY = Deno.env.get("PERPLEXITY_API_KEY") || "";
const YOUTUBE_API_KEY = Deno.env.get("YOUTUBE_API_KEY") || "";
const NAVER_CLIENT_ID = Deno.env.get("NAVER_CLIENT_ID") || "";
const NAVER_CLIENT_SECRET = Deno.env.get("NAVER_CLIENT_SECRET") || "";
if (!FIRECRAWL_API_KEY) throw new Error("FIRECRAWL_API_KEY not configured"); if (!FIRECRAWL_API_KEY) throw new Error("FIRECRAWL_API_KEY not configured");
// ═══════════════════════════════════════════ // ═══════════════════════════════════════════
// STAGE A: Firecrawl scrape + map (parallel) // STAGE A: Firecrawl scrape + map (parallel)
// → Extract clinicName + social links from HTML
// ═══════════════════════════════════════════ // ═══════════════════════════════════════════
const [scrapeResult, mapResult, brandResult] = await Promise.allSettled([ const [scrapeResult, mapResult, brandResult] = await Promise.allSettled([
@ -99,19 +98,15 @@ Deno.serve(async (req) => {
method: "POST", method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
body: JSON.stringify({ body: JSON.stringify({
url, url, formats: ["json", "links"],
formats: ["json", "links"],
jsonOptions: { jsonOptions: {
prompt: "Extract: clinic name (Korean), clinic name (English), address, phone, services offered, doctors with specialties, ALL social media links (instagram handles/URLs, youtube channel URL/handle, naver blog URL, facebook page URL, tiktok, kakao channel), business hours, slogan", prompt: "Extract: clinic name (Korean), clinic name (English), address, phone, services offered, doctors with specialties, ALL social media links (instagram handles/URLs, youtube channel URL/handle, naver blog URL, facebook page URL, tiktok, kakao channel), business hours, slogan",
schema: { schema: {
type: "object", type: "object",
properties: { properties: {
clinicName: { type: "string" }, clinicName: { type: "string" }, clinicNameEn: { type: "string" },
clinicNameEn: { type: "string" }, address: { type: "string" }, phone: { type: "string" },
address: { type: "string" }, businessHours: { type: "string" }, slogan: { type: "string" },
phone: { type: "string" },
businessHours: { type: "string" },
slogan: { type: "string" },
services: { type: "array", items: { type: "string" } }, services: { type: "array", items: { type: "string" } },
doctors: { type: "array", items: { type: "object", properties: { name: { type: "string" }, title: { type: "string" }, specialty: { type: "string" } } } }, doctors: { type: "array", items: { type: "object", properties: { name: { type: "string" }, title: { type: "string" }, specialty: { type: "string" } } } },
socialMedia: { type: "object", properties: { instagram: { type: "string" }, youtube: { type: "string" }, blog: { type: "string" }, facebook: { type: "string" }, tiktok: { type: "string" }, kakao: { type: "string" } } }, socialMedia: { type: "object", properties: { instagram: { type: "string" }, youtube: { type: "string" }, blog: { type: "string" }, facebook: { type: "string" }, tiktok: { type: "string" }, kakao: { type: "string" } } },
@ -147,48 +142,36 @@ Deno.serve(async (req) => {
const brandData = brandResult.status === "fulfilled" ? brandResult.value : { data: { json: {} } }; const brandData = brandResult.status === "fulfilled" ? brandResult.value : { data: { json: {} } };
const clinic = scrapeData.data?.json || {}; const clinic = scrapeData.data?.json || {};
let resolvedName = inputClinicName || clinic.clinicName || ""; let resolvedName = inputClinicName || clinic.clinicName || clinic.clinicNameEn || "";
// If Firecrawl didn't extract a Korean name, try English name or domain // Fallback: ask Perplexity to identify clinic name from URL
if (!resolvedName) { if (!resolvedName && PERPLEXITY_API_KEY) {
resolvedName = clinic.clinicNameEn || ""; try {
} const nameRes = await fetch("https://api.perplexity.ai/chat/completions", {
method: "POST",
// Last resort: extract something readable from the domain headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` },
if (!resolvedName) { body: JSON.stringify({
const domain = new URL(url).hostname.replace('www.', '').split('.')[0]; model: "sonar",
// If Perplexity is available, ask it to identify the clinic name from the URL messages: [
if (PERPLEXITY_API_KEY) { { role: "system", content: "Respond with ONLY the clinic name in Korean, nothing else." },
try { { role: "user", content: `${url} 이 URL의 병원/클리닉 한국어 이름이 뭐야?` },
const nameRes = await fetch("https://api.perplexity.ai/chat/completions", { ],
method: "POST", temperature: 0.1,
headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` }, }),
body: JSON.stringify({ });
model: "sonar", const nameData = await nameRes.json();
messages: [ const aiName = (nameData.choices?.[0]?.message?.content || "").trim().replace(/["""]/g, '').split('\n')[0].trim();
{ role: "system", content: "Respond with ONLY the clinic name in Korean, nothing else." }, if (aiName && aiName.length >= 2 && aiName.length <= 30) resolvedName = aiName;
{ role: "user", content: `${url} 이 URL의 병원/클리닉 한국어 이름이 뭐야?` }, } catch { /* fallback to domain */ }
],
temperature: 0.1,
}),
});
const nameData = await nameRes.json();
const aiName = (nameData.choices?.[0]?.message?.content || "").trim().replace(/["""]/g, '').split('\n')[0].trim();
if (aiName && aiName.length >= 2 && aiName.length <= 30) {
resolvedName = aiName;
}
} catch { /* fallback to domain */ }
}
if (!resolvedName) resolvedName = domain;
} }
if (!resolvedName) resolvedName = new URL(url).hostname.replace('www.', '').split('.')[0];
// Source 1: Parse social links from HTML
const siteLinks: string[] = scrapeData.data?.links || []; const siteLinks: string[] = scrapeData.data?.links || [];
const siteMap: string[] = mapData.links || []; const siteMap: string[] = mapData.links || [];
// Source 1: Parse links from HTML
const linkHandles = extractSocialLinks([...siteLinks, ...siteMap]); const linkHandles = extractSocialLinks([...siteLinks, ...siteMap]);
// Source 2: Firecrawl JSON extraction socialMedia field // Source 2: Firecrawl JSON extraction
const scrapeSocial = clinic.socialMedia || {}; const scrapeSocial = clinic.socialMedia || {};
const firecrawlHandles = { const firecrawlHandles = {
instagram: scrapeSocial.instagram ? [scrapeSocial.instagram] : [], instagram: scrapeSocial.instagram ? [scrapeSocial.instagram] : [],
@ -200,52 +183,129 @@ Deno.serve(async (req) => {
}; };
// ═══════════════════════════════════════════ // ═══════════════════════════════════════════
// STAGE B: Perplexity search using CLINIC NAME // STAGE B: Direct API searches + Perplexity (ALL PARALLEL)
// → Find social handles that Firecrawl missed // Each API directly searches for the clinic's presence
// ═══════════════════════════════════════════ // ═══════════════════════════════════════════
let perplexityHandles: Partial<typeof linkHandles> = {}; const apiHandles: Partial<typeof linkHandles> = {
instagram: [], youtube: [], facebook: [],
naverBlog: [], tiktok: [], kakao: [],
};
let gangnamUnniHintUrl: string | undefined; let gangnamUnniHintUrl: string | undefined;
const stageBTasks: Promise<void>[] = [];
if (PERPLEXITY_API_KEY && resolvedName) { // ─── B1. YouTube Data API: Search for channel by name ───
const pResults = await Promise.allSettled([ if (YOUTUBE_API_KEY) {
// Query 1: Social media accounts — search-based, not verification stageBTasks.push((async () => {
fetch("https://api.perplexity.ai/chat/completions", {
method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` },
body: JSON.stringify({
model: "sonar",
messages: [
{ role: "system", content: "You are a social media researcher. Search the web and find social media accounts. Respond ONLY with valid JSON, no explanation." },
{ role: "user", content: `${resolvedName} 병원의 인스타그램, 유튜브, 페이스북, 틱톡, 네이버블로그, 카카오채널 계정을 검색해서 찾아줘. 검색 결과에서 발견된 계정을 모두 알려줘. 인스타그램은 여러 계정이 있을 수 있어.\n\n{"instagram": ["handle1", "handle2"], "youtube": "channel URL or @handle", "facebook": "page URL or name", "tiktok": "@handle", "naverBlog": "blog ID", "kakao": "channel ID"}` },
],
temperature: 0.1,
}),
}).then(r => r.json()),
// Query 2: Platform presence — 강남언니, 네이버, 바비톡
fetch("https://api.perplexity.ai/chat/completions", {
method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` },
body: JSON.stringify({
model: "sonar",
messages: [
{ role: "system", content: "You are a medical platform researcher. Search the web for clinic listings. Respond ONLY with valid JSON, no explanation." },
{ role: "user", content: `${resolvedName} 병원이 강남언니(gangnamunni.com), 네이버 플레이스, 바비톡에 등록되어 있는지 검색해줘. URL도 찾아줘.\n\n{"gangnamUnni": {"registered": true, "url": "https://gangnamunni.com/hospitals/...", "rating": 9.5}, "naverPlace": {"registered": true}, "babitok": {"registered": false}}` },
],
temperature: 0.1,
}),
}).then(r => r.json()),
]);
// Parse social handles
if (pResults[0]?.status === "fulfilled") {
try { try {
let text = pResults[0].value?.choices?.[0]?.message?.content || ""; const q = encodeURIComponent(resolvedName);
const res = await fetch(
`https://www.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=${q}&maxResults=3&key=${YOUTUBE_API_KEY}`
);
const data = await res.json();
for (const item of (data.items || [])) {
const channelId = item.snippet?.channelId || item.id?.channelId;
const title = (item.snippet?.title || "").toLowerCase();
const nameL = resolvedName.toLowerCase();
// Match if title contains clinic name or vice versa
if (channelId && (title.includes(nameL) || nameL.includes(title) || title.includes(nameL.replace(/성형외과|병원|의원|클리닉/g, '').trim()))) {
apiHandles.youtube!.push(channelId);
}
}
} catch { /* skip */ }
})());
}
// ─── B2. Naver Search API: Find blog + social URLs ───
if (NAVER_CLIENT_ID && NAVER_CLIENT_SECRET) {
const naverHeaders = {
"X-Naver-Client-Id": NAVER_CLIENT_ID,
"X-Naver-Client-Secret": NAVER_CLIENT_SECRET,
};
// B2a. Blog search → find official Naver blog
stageBTasks.push((async () => {
try {
const q = encodeURIComponent(`${resolvedName} 공식 블로그`);
const res = await fetch(
`https://openapi.naver.com/v1/search/blog.json?query=${q}&display=5&sort=sim`,
{ headers: naverHeaders }
);
const data = await res.json();
for (const item of (data.items || [])) {
const link = item.link || "";
if (link.includes("blog.naver.com/")) {
const m = link.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)/);
if (m) apiHandles.naverBlog!.push(m[1]);
}
}
} catch { /* skip */ }
})());
// B2b. Web search → find Instagram/YouTube/Facebook URLs
stageBTasks.push((async () => {
try {
const q = encodeURIComponent(`${resolvedName} 인스타그램 유튜브 공식`);
const res = await fetch(
`https://openapi.naver.com/v1/search/webkr.json?query=${q}&display=10`,
{ headers: naverHeaders }
);
const data = await res.json();
const urls: string[] = (data.items || []).map((item: Record<string, string>) => item.link).filter(Boolean);
// Extract social handles from search result URLs
const found = extractSocialLinks(urls);
if (found.instagram.length) apiHandles.instagram!.push(...found.instagram);
if (found.youtube.length) apiHandles.youtube!.push(...found.youtube);
if (found.facebook.length) apiHandles.facebook!.push(...found.facebook);
if (found.tiktok.length) apiHandles.tiktok!.push(...found.tiktok);
} catch { /* skip */ }
})());
}
// ─── B3. Firecrawl Search: Find social URLs via web search ───
stageBTasks.push((async () => {
try {
const res = await fetch("https://api.firecrawl.dev/v1/search", {
method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
body: JSON.stringify({
query: `${resolvedName} 성형외과 instagram youtube 공식`,
limit: 10,
}),
});
const data = await res.json();
const urls: string[] = (data.data || []).map((r: Record<string, string>) => r.url).filter(Boolean);
const found = extractSocialLinks(urls);
if (found.instagram.length) apiHandles.instagram!.push(...found.instagram);
if (found.youtube.length) apiHandles.youtube!.push(...found.youtube);
if (found.facebook.length) apiHandles.facebook!.push(...found.facebook);
if (found.tiktok.length) apiHandles.tiktok!.push(...found.tiktok);
} catch { /* skip */ }
})());
// ─── B4. Perplexity: Supplement — catch what APIs missed ───
if (PERPLEXITY_API_KEY) {
stageBTasks.push((async () => {
try {
const res = await fetch("https://api.perplexity.ai/chat/completions", {
method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` },
body: JSON.stringify({
model: "sonar",
messages: [
{ role: "system", content: "You are a social media researcher. Search the web and find social media accounts. Respond ONLY with valid JSON, no explanation." },
{ role: "user", content: `${resolvedName} 병원의 인스타그램, 유튜브, 페이스북, 틱톡, 네이버블로그, 카카오채널 계정을 검색해서 찾아줘. 검색 결과에서 발견된 계정을 모두 알려줘. 인스타그램은 여러 계정이 있을 수 있어.\n\n{"instagram": ["handle1", "handle2"], "youtube": "channel URL or @handle", "facebook": "page URL or name", "tiktok": "@handle", "naverBlog": "blog ID", "kakao": "channel ID"}` },
],
temperature: 0.1,
}),
});
const data = await res.json();
let text = data.choices?.[0]?.message?.content || "";
const jsonMatch = text.match(/```(?:json)?\n?([\s\S]*?)```/); const jsonMatch = text.match(/```(?:json)?\n?([\s\S]*?)```/);
if (jsonMatch) text = jsonMatch[1]; if (jsonMatch) text = jsonMatch[1];
const parsed = JSON.parse(text); const parsed = JSON.parse(text);
perplexityHandles = {
const ph = {
instagram: Array.isArray(parsed.instagram) ? parsed.instagram : parsed.instagram ? [parsed.instagram] : [], instagram: Array.isArray(parsed.instagram) ? parsed.instagram : parsed.instagram ? [parsed.instagram] : [],
youtube: parsed.youtube ? [parsed.youtube] : [], youtube: parsed.youtube ? [parsed.youtube] : [],
facebook: parsed.facebook ? [parsed.facebook] : [], facebook: parsed.facebook ? [parsed.facebook] : [],
@ -253,33 +313,55 @@ Deno.serve(async (req) => {
tiktok: parsed.tiktok ? [parsed.tiktok] : [], tiktok: parsed.tiktok ? [parsed.tiktok] : [],
kakao: parsed.kakao ? [parsed.kakao] : [], kakao: parsed.kakao ? [parsed.kakao] : [],
}; };
} catch { /* JSON parse failed */ } if (ph.instagram.length) apiHandles.instagram!.push(...ph.instagram);
} if (ph.youtube.length) apiHandles.youtube!.push(...ph.youtube);
if (ph.facebook.length) apiHandles.facebook!.push(...ph.facebook);
if (ph.naverBlog.length) apiHandles.naverBlog!.push(...ph.naverBlog);
if (ph.tiktok.length) apiHandles.tiktok!.push(...ph.tiktok);
if (ph.kakao.length) apiHandles.kakao!.push(...ph.kakao);
} catch { /* skip */ }
})());
// Parse platform presence // B4b. Platform presence (강남언니, 바비톡)
if (pResults[1]?.status === "fulfilled") { stageBTasks.push((async () => {
try { try {
let text = pResults[1].value?.choices?.[0]?.message?.content || ""; const res = await fetch("https://api.perplexity.ai/chat/completions", {
method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` },
body: JSON.stringify({
model: "sonar",
messages: [
{ role: "system", content: "You are a medical platform researcher. Search the web for clinic listings. Respond ONLY with valid JSON, no explanation." },
{ role: "user", content: `${resolvedName} 병원이 강남언니(gangnamunni.com), 네이버 플레이스, 바비톡에 등록되어 있는지 검색해줘. URL도 찾아줘.\n\n{"gangnamUnni": {"registered": true, "url": "https://gangnamunni.com/hospitals/...", "rating": 9.5}, "naverPlace": {"registered": true}, "babitok": {"registered": false}}` },
],
temperature: 0.1,
}),
});
const data = await res.json();
let text = data.choices?.[0]?.message?.content || "";
const jsonMatch = text.match(/```(?:json)?\n?([\s\S]*?)```/); const jsonMatch = text.match(/```(?:json)?\n?([\s\S]*?)```/);
if (jsonMatch) text = jsonMatch[1]; if (jsonMatch) text = jsonMatch[1];
const parsed = JSON.parse(text); const parsed = JSON.parse(text);
if (parsed.gangnamUnni?.url) gangnamUnniHintUrl = parsed.gangnamUnni.url; if (parsed.gangnamUnni?.url) gangnamUnniHintUrl = parsed.gangnamUnni.url;
} catch { /* JSON parse failed */ } } catch { /* skip */ }
} })());
} }
// Run all Stage B tasks in parallel
await Promise.allSettled(stageBTasks);
// ═══════════════════════════════════════════ // ═══════════════════════════════════════════
// STAGE C: Merge + Deduplicate + Verify // STAGE C: Merge ALL sources + Verify
// ═══════════════════════════════════════════ // ═══════════════════════════════════════════
const merged = mergeSocialLinks(linkHandles, firecrawlHandles, perplexityHandles); const merged = mergeSocialLinks(linkHandles, firecrawlHandles, apiHandles);
const cleanHandles = { const cleanHandles = {
instagram: merged.instagram.map(h => extractHandle(h, 'instagram')).filter((h): h is string => h !== null), instagram: [...new Set(merged.instagram.map(h => extractHandle(h, 'instagram')).filter((h): h is string => h !== null))],
youtube: merged.youtube.map(h => extractHandle(h, 'youtube')).filter((h): h is string => h !== null), youtube: [...new Set(merged.youtube.map(h => extractHandle(h, 'youtube')).filter((h): h is string => h !== null))],
facebook: merged.facebook.map(h => extractHandle(h, 'facebook')).filter((h): h is string => h !== null), facebook: [...new Set(merged.facebook.map(h => extractHandle(h, 'facebook')).filter((h): h is string => h !== null))],
naverBlog: merged.naverBlog.map(h => extractHandle(h, 'naverBlog')).filter((h): h is string => h !== null), naverBlog: [...new Set(merged.naverBlog.map(h => extractHandle(h, 'naverBlog')).filter((h): h is string => h !== null))],
tiktok: merged.tiktok.map(h => extractHandle(h, 'tiktok')).filter((h): h is string => h !== null), tiktok: [...new Set(merged.tiktok.map(h => extractHandle(h, 'tiktok')).filter((h): h is string => h !== null))],
}; };
const verified: VerifiedChannels = await verifyAllHandles( const verified: VerifiedChannels = await verifyAllHandles(