feat: API-first channel discovery — YouTube API + Naver API + Firecrawl Search + Perplexity

Replaced the Perplexity-only approach with 5 parallel searches (4 direct API calls plus a Perplexity supplement):

B1. YouTube Data API: search?type=channel&q={clinicName} → find channel
B2a. Naver Blog API: search blog.json → find official Naver blog
B2b. Naver Web API: search webkr.json → find Instagram/YouTube/Facebook URLs
B3. Firecrawl Search: web search → extract social URLs from results
B4. Perplexity: supplement — catch what direct APIs missed

All 5 sources run in parallel after Stage A (Firecrawl scrape for clinicName).
Results are merged, deduplicated, and verified. Perplexity is now a supplement,
not the primary source.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
claude/bold-hawking
Haewon Kam 2026-04-04 01:15:49 +09:00
parent 159de36e38
commit f224d1788c
1 changed files with 189 additions and 107 deletions

View File

@ -8,19 +8,16 @@ const corsHeaders = {
"Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type",
};
// Base URL of the Apify v2 REST API (its use sites are not visible in this view).
const APIFY_BASE = "https://api.apify.com/v2";
/**
 * Input shape for the channel-discovery request.
 * NOTE(review): presumably the JSON body parsed by the Deno.serve handler —
 * the parsing code is not visible in this view; confirm.
 */
interface DiscoverRequest {
// Clinic website URL. The handler sends a `url` to Firecrawl scrape and falls
// back to its hostname when no clinic name can be resolved — presumably this field.
url: string;
// Optional caller-supplied clinic name. Presumably surfaces as `inputClinicName`,
// which takes precedence over names extracted from the page — TODO confirm.
clinicName?: string;
}
/**
* Robust handle extraction — handles may be full URLs, @handles, or bare usernames.
* Validates each handle actually belongs to its platform.
*/
function extractHandle(raw: string, platform: string): string | null {
if (!raw || raw.length < 2) return null;
let h = raw.trim();
if (platform === 'instagram') {
const m = h.match(/instagram\.com\/([a-zA-Z0-9._]+)/);
if (m) return m[1];
@ -66,10 +63,10 @@ function extractHandle(raw: string, platform: string): string | null {
/**
* Phase 1: Discover & Verify Channels
*
* Two-stage discovery:
* Stage A: Firecrawl scrape + map (parallel) → extract clinicName + social links
* Stage B: Perplexity search using clinicName (parallel) → find more handles
* Stage C: Merge + Verify all handles
* API-first, Perplexity-supplement approach:
* Stage A: Firecrawl scrape + map → clinicName + social links from HTML
* Stage B: Direct API searches (YouTube, Naver, Firecrawl) + Perplexity
* Stage C: Merge all sources + Verify handles
*/
Deno.serve(async (req) => {
if (req.method === "OPTIONS") {
@ -85,13 +82,15 @@ Deno.serve(async (req) => {
);
}
const FIRECRAWL_API_KEY = Deno.env.get("FIRECRAWL_API_KEY");
const PERPLEXITY_API_KEY = Deno.env.get("PERPLEXITY_API_KEY");
const FIRECRAWL_API_KEY = Deno.env.get("FIRECRAWL_API_KEY") || "";
const PERPLEXITY_API_KEY = Deno.env.get("PERPLEXITY_API_KEY") || "";
const YOUTUBE_API_KEY = Deno.env.get("YOUTUBE_API_KEY") || "";
const NAVER_CLIENT_ID = Deno.env.get("NAVER_CLIENT_ID") || "";
const NAVER_CLIENT_SECRET = Deno.env.get("NAVER_CLIENT_SECRET") || "";
if (!FIRECRAWL_API_KEY) throw new Error("FIRECRAWL_API_KEY not configured");
// ═══════════════════════════════════════════
// STAGE A: Firecrawl scrape + map (parallel)
// → Extract clinicName + social links from HTML
// ═══════════════════════════════════════════
const [scrapeResult, mapResult, brandResult] = await Promise.allSettled([
@ -99,19 +98,15 @@ Deno.serve(async (req) => {
method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
body: JSON.stringify({
url,
formats: ["json", "links"],
url, formats: ["json", "links"],
jsonOptions: {
prompt: "Extract: clinic name (Korean), clinic name (English), address, phone, services offered, doctors with specialties, ALL social media links (instagram handles/URLs, youtube channel URL/handle, naver blog URL, facebook page URL, tiktok, kakao channel), business hours, slogan",
schema: {
type: "object",
properties: {
clinicName: { type: "string" },
clinicNameEn: { type: "string" },
address: { type: "string" },
phone: { type: "string" },
businessHours: { type: "string" },
slogan: { type: "string" },
clinicName: { type: "string" }, clinicNameEn: { type: "string" },
address: { type: "string" }, phone: { type: "string" },
businessHours: { type: "string" }, slogan: { type: "string" },
services: { type: "array", items: { type: "string" } },
doctors: { type: "array", items: { type: "object", properties: { name: { type: "string" }, title: { type: "string" }, specialty: { type: "string" } } } },
socialMedia: { type: "object", properties: { instagram: { type: "string" }, youtube: { type: "string" }, blog: { type: "string" }, facebook: { type: "string" }, tiktok: { type: "string" }, kakao: { type: "string" } } },
@ -147,18 +142,10 @@ Deno.serve(async (req) => {
const brandData = brandResult.status === "fulfilled" ? brandResult.value : { data: { json: {} } };
const clinic = scrapeData.data?.json || {};
let resolvedName = inputClinicName || clinic.clinicName || "";
let resolvedName = inputClinicName || clinic.clinicName || clinic.clinicNameEn || "";
// If Firecrawl didn't extract a Korean name, try English name or domain
if (!resolvedName) {
resolvedName = clinic.clinicNameEn || "";
}
// Last resort: extract something readable from the domain
if (!resolvedName) {
const domain = new URL(url).hostname.replace('www.', '').split('.')[0];
// If Perplexity is available, ask it to identify the clinic name from the URL
if (PERPLEXITY_API_KEY) {
// Fallback: ask Perplexity to identify clinic name from URL
if (!resolvedName && PERPLEXITY_API_KEY) {
try {
const nameRes = await fetch("https://api.perplexity.ai/chat/completions", {
method: "POST",
@ -174,21 +161,17 @@ Deno.serve(async (req) => {
});
const nameData = await nameRes.json();
const aiName = (nameData.choices?.[0]?.message?.content || "").trim().replace(/["""]/g, '').split('\n')[0].trim();
if (aiName && aiName.length >= 2 && aiName.length <= 30) {
resolvedName = aiName;
}
if (aiName && aiName.length >= 2 && aiName.length <= 30) resolvedName = aiName;
} catch { /* fallback to domain */ }
}
if (!resolvedName) resolvedName = domain;
}
if (!resolvedName) resolvedName = new URL(url).hostname.replace('www.', '').split('.')[0];
// Source 1: Parse social links from HTML
const siteLinks: string[] = scrapeData.data?.links || [];
const siteMap: string[] = mapData.links || [];
// Source 1: Parse links from HTML
const linkHandles = extractSocialLinks([...siteLinks, ...siteMap]);
// Source 2: Firecrawl JSON extraction socialMedia field
// Source 2: Firecrawl JSON extraction
const scrapeSocial = clinic.socialMedia || {};
const firecrawlHandles = {
instagram: scrapeSocial.instagram ? [scrapeSocial.instagram] : [],
@ -200,17 +183,111 @@ Deno.serve(async (req) => {
};
// ═══════════════════════════════════════════
// STAGE B: Perplexity search using CLINIC NAME
// → Find social handles that Firecrawl missed
// STAGE B: Direct API searches + Perplexity (ALL PARALLEL)
// Each API directly searches for the clinic's presence
// ═══════════════════════════════════════════
let perplexityHandles: Partial<typeof linkHandles> = {};
const apiHandles: Partial<typeof linkHandles> = {
instagram: [], youtube: [], facebook: [],
naverBlog: [], tiktok: [], kakao: [],
};
let gangnamUnniHintUrl: string | undefined;
const stageBTasks: Promise<void>[] = [];
if (PERPLEXITY_API_KEY && resolvedName) {
const pResults = await Promise.allSettled([
// Query 1: Social media accounts — search-based, not verification
fetch("https://api.perplexity.ai/chat/completions", {
// ─── B1. YouTube Data API: Search for channel by name ───
if (YOUTUBE_API_KEY) {
stageBTasks.push((async () => {
try {
const q = encodeURIComponent(resolvedName);
const res = await fetch(
`https://www.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=${q}&maxResults=3&key=${YOUTUBE_API_KEY}`
);
const data = await res.json();
for (const item of (data.items || [])) {
const channelId = item.snippet?.channelId || item.id?.channelId;
const title = (item.snippet?.title || "").toLowerCase();
const nameL = resolvedName.toLowerCase();
// Match if title contains clinic name or vice versa
if (channelId && (title.includes(nameL) || nameL.includes(title) || title.includes(nameL.replace(/성형외과|병원|의원|클리닉/g, '').trim()))) {
apiHandles.youtube!.push(channelId);
}
}
} catch { /* skip */ }
})());
}
// ─── B2. Naver Search API: Find blog + social URLs ───
if (NAVER_CLIENT_ID && NAVER_CLIENT_SECRET) {
const naverHeaders = {
"X-Naver-Client-Id": NAVER_CLIENT_ID,
"X-Naver-Client-Secret": NAVER_CLIENT_SECRET,
};
// B2a. Blog search → find official Naver blog
stageBTasks.push((async () => {
try {
const q = encodeURIComponent(`${resolvedName} 공식 블로그`);
const res = await fetch(
`https://openapi.naver.com/v1/search/blog.json?query=${q}&display=5&sort=sim`,
{ headers: naverHeaders }
);
const data = await res.json();
for (const item of (data.items || [])) {
const link = item.link || "";
if (link.includes("blog.naver.com/")) {
const m = link.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)/);
if (m) apiHandles.naverBlog!.push(m[1]);
}
}
} catch { /* skip */ }
})());
// B2b. Web search → find Instagram/YouTube/Facebook URLs
stageBTasks.push((async () => {
try {
const q = encodeURIComponent(`${resolvedName} 인스타그램 유튜브 공식`);
const res = await fetch(
`https://openapi.naver.com/v1/search/webkr.json?query=${q}&display=10`,
{ headers: naverHeaders }
);
const data = await res.json();
const urls: string[] = (data.items || []).map((item: Record<string, string>) => item.link).filter(Boolean);
// Extract social handles from search result URLs
const found = extractSocialLinks(urls);
if (found.instagram.length) apiHandles.instagram!.push(...found.instagram);
if (found.youtube.length) apiHandles.youtube!.push(...found.youtube);
if (found.facebook.length) apiHandles.facebook!.push(...found.facebook);
if (found.tiktok.length) apiHandles.tiktok!.push(...found.tiktok);
} catch { /* skip */ }
})());
}
// ─── B3. Firecrawl Search: Find social URLs via web search ───
stageBTasks.push((async () => {
try {
const res = await fetch("https://api.firecrawl.dev/v1/search", {
method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
body: JSON.stringify({
query: `${resolvedName} 성형외과 instagram youtube 공식`,
limit: 10,
}),
});
const data = await res.json();
const urls: string[] = (data.data || []).map((r: Record<string, string>) => r.url).filter(Boolean);
const found = extractSocialLinks(urls);
if (found.instagram.length) apiHandles.instagram!.push(...found.instagram);
if (found.youtube.length) apiHandles.youtube!.push(...found.youtube);
if (found.facebook.length) apiHandles.facebook!.push(...found.facebook);
if (found.tiktok.length) apiHandles.tiktok!.push(...found.tiktok);
} catch { /* skip */ }
})());
// ─── B4. Perplexity: Supplement — catch what APIs missed ───
if (PERPLEXITY_API_KEY) {
stageBTasks.push((async () => {
try {
const res = await fetch("https://api.perplexity.ai/chat/completions", {
method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` },
body: JSON.stringify({
@ -221,10 +298,34 @@ Deno.serve(async (req) => {
],
temperature: 0.1,
}),
}).then(r => r.json()),
});
const data = await res.json();
let text = data.choices?.[0]?.message?.content || "";
const jsonMatch = text.match(/```(?:json)?\n?([\s\S]*?)```/);
if (jsonMatch) text = jsonMatch[1];
const parsed = JSON.parse(text);
// Query 2: Platform presence — 강남언니, 네이버, 바비톡
fetch("https://api.perplexity.ai/chat/completions", {
const ph = {
instagram: Array.isArray(parsed.instagram) ? parsed.instagram : parsed.instagram ? [parsed.instagram] : [],
youtube: parsed.youtube ? [parsed.youtube] : [],
facebook: parsed.facebook ? [parsed.facebook] : [],
naverBlog: parsed.naverBlog ? [parsed.naverBlog] : [],
tiktok: parsed.tiktok ? [parsed.tiktok] : [],
kakao: parsed.kakao ? [parsed.kakao] : [],
};
if (ph.instagram.length) apiHandles.instagram!.push(...ph.instagram);
if (ph.youtube.length) apiHandles.youtube!.push(...ph.youtube);
if (ph.facebook.length) apiHandles.facebook!.push(...ph.facebook);
if (ph.naverBlog.length) apiHandles.naverBlog!.push(...ph.naverBlog);
if (ph.tiktok.length) apiHandles.tiktok!.push(...ph.tiktok);
if (ph.kakao.length) apiHandles.kakao!.push(...ph.kakao);
} catch { /* skip */ }
})());
// B4b. Platform presence (강남언니, 바비톡)
stageBTasks.push((async () => {
try {
const res = await fetch("https://api.perplexity.ai/chat/completions", {
method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` },
body: JSON.stringify({
@ -235,51 +336,32 @@ Deno.serve(async (req) => {
],
temperature: 0.1,
}),
}).then(r => r.json()),
]);
// Parse social handles
if (pResults[0]?.status === "fulfilled") {
try {
let text = pResults[0].value?.choices?.[0]?.message?.content || "";
const jsonMatch = text.match(/```(?:json)?\n?([\s\S]*?)```/);
if (jsonMatch) text = jsonMatch[1];
const parsed = JSON.parse(text);
perplexityHandles = {
instagram: Array.isArray(parsed.instagram) ? parsed.instagram : parsed.instagram ? [parsed.instagram] : [],
youtube: parsed.youtube ? [parsed.youtube] : [],
facebook: parsed.facebook ? [parsed.facebook] : [],
naverBlog: parsed.naverBlog ? [parsed.naverBlog] : [],
tiktok: parsed.tiktok ? [parsed.tiktok] : [],
kakao: parsed.kakao ? [parsed.kakao] : [],
};
} catch { /* JSON parse failed */ }
}
// Parse platform presence
if (pResults[1]?.status === "fulfilled") {
try {
let text = pResults[1].value?.choices?.[0]?.message?.content || "";
});
const data = await res.json();
let text = data.choices?.[0]?.message?.content || "";
const jsonMatch = text.match(/```(?:json)?\n?([\s\S]*?)```/);
if (jsonMatch) text = jsonMatch[1];
const parsed = JSON.parse(text);
if (parsed.gangnamUnni?.url) gangnamUnniHintUrl = parsed.gangnamUnni.url;
} catch { /* JSON parse failed */ }
}
} catch { /* skip */ }
})());
}
// Run all Stage B tasks in parallel
await Promise.allSettled(stageBTasks);
// ═══════════════════════════════════════════
// STAGE C: Merge + Deduplicate + Verify
// STAGE C: Merge ALL sources + Verify
// ═══════════════════════════════════════════
const merged = mergeSocialLinks(linkHandles, firecrawlHandles, perplexityHandles);
const merged = mergeSocialLinks(linkHandles, firecrawlHandles, apiHandles);
const cleanHandles = {
instagram: merged.instagram.map(h => extractHandle(h, 'instagram')).filter((h): h is string => h !== null),
youtube: merged.youtube.map(h => extractHandle(h, 'youtube')).filter((h): h is string => h !== null),
facebook: merged.facebook.map(h => extractHandle(h, 'facebook')).filter((h): h is string => h !== null),
naverBlog: merged.naverBlog.map(h => extractHandle(h, 'naverBlog')).filter((h): h is string => h !== null),
tiktok: merged.tiktok.map(h => extractHandle(h, 'tiktok')).filter((h): h is string => h !== null),
instagram: [...new Set(merged.instagram.map(h => extractHandle(h, 'instagram')).filter((h): h is string => h !== null))],
youtube: [...new Set(merged.youtube.map(h => extractHandle(h, 'youtube')).filter((h): h is string => h !== null))],
facebook: [...new Set(merged.facebook.map(h => extractHandle(h, 'facebook')).filter((h): h is string => h !== null))],
naverBlog: [...new Set(merged.naverBlog.map(h => extractHandle(h, 'naverBlog')).filter((h): h is string => h !== null))],
tiktok: [...new Set(merged.tiktok.map(h => extractHandle(h, 'tiktok')).filter((h): h is string => h !== null))],
};
const verified: VerifiedChannels = await verifyAllHandles(