feat: extract social links from JS-rendered buttons on clinic website

Added a fourth parallel Firecrawl scrape call (A4) with actions
[wait 3s, scrape] so that the page's JavaScript executes before the
social button href URLs are extracted from the header/footer. This is
the most reliable source — most Korean clinics have
Facebook/Instagram/YouTube/Blog icons in their nav.

The results are merged as Source 3 (buttonHandles) alongside the HTML
links, the JSON extraction, and the API searches.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
claude/bold-hawking
Haewon Kam 2026-04-04 23:41:27 +09:00
parent 80c57147e7
commit ed37f23f78
1 changed file with 48 additions and 2 deletions

View File

@ -95,7 +95,8 @@ Deno.serve(async (req) => {
// STAGE A: Firecrawl scrape + map (parallel) // STAGE A: Firecrawl scrape + map (parallel)
// ═══════════════════════════════════════════ // ═══════════════════════════════════════════
const [scrapeResult, mapResult, brandResult] = await Promise.allSettled([ const [scrapeResult, mapResult, brandResult, socialButtonResult] = await Promise.allSettled([
// A1. Main scrape — clinic info + links
fetch("https://api.firecrawl.dev/v1/scrape", { fetch("https://api.firecrawl.dev/v1/scrape", {
method: "POST", method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
@ -119,12 +120,14 @@ Deno.serve(async (req) => {
}), }),
}).then(r => r.json()), }).then(r => r.json()),
// A2. Map site
fetch("https://api.firecrawl.dev/v1/map", { fetch("https://api.firecrawl.dev/v1/map", {
method: "POST", method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
body: JSON.stringify({ url, limit: 50 }), body: JSON.stringify({ url, limit: 50 }),
}).then(r => r.json()), }).then(r => r.json()),
// A3. Branding extraction
fetch("https://api.firecrawl.dev/v1/scrape", { fetch("https://api.firecrawl.dev/v1/scrape", {
method: "POST", method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` }, headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
@ -137,12 +140,49 @@ Deno.serve(async (req) => {
waitFor: 3000, waitFor: 3000,
}), }),
}).then(r => r.json()).catch(() => ({ data: { json: {} } })), }).then(r => r.json()).catch(() => ({ data: { json: {} } })),
// A4. Social button links — execute JS to extract all <a href> pointing to social platforms
fetch("https://api.firecrawl.dev/v1/scrape", {
method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
body: JSON.stringify({
url,
formats: ["json"],
jsonOptions: {
prompt: "Find ALL social media link URLs on this page. Look in the header, footer, sidebar, and floating buttons. Extract the actual href URLs (not just text) for: Instagram, YouTube, Facebook, TikTok, Naver Blog (blog.naver.com), KakaoTalk (pf.kakao.com), Twitter/X. Include ALL variants found.",
schema: {
type: "object",
properties: {
socialLinks: {
type: "array",
items: {
type: "object",
properties: {
platform: { type: "string" },
url: { type: "string" },
},
},
},
},
},
},
waitFor: 5000,
actions: [
{ type: "wait", milliseconds: 3000 },
{ type: "scrape" },
],
}),
}).then(r => r.json()).catch(() => ({ data: { json: {} } })),
]); ]);
const scrapeData = scrapeResult.status === "fulfilled" ? scrapeResult.value : { data: {} }; const scrapeData = scrapeResult.status === "fulfilled" ? scrapeResult.value : { data: {} };
const mapData = mapResult.status === "fulfilled" ? mapResult.value : {}; const mapData = mapResult.status === "fulfilled" ? mapResult.value : {};
const brandData = brandResult.status === "fulfilled" ? brandResult.value : { data: { json: {} } }; const brandData = brandResult.status === "fulfilled" ? brandResult.value : { data: { json: {} } };
// A4 result: social buttons from JS-rendered page
const socialButtonData = socialButtonResult.status === "fulfilled" ? socialButtonResult.value : { data: { json: {} } };
const socialButtons = (socialButtonData.data?.json?.socialLinks || []) as { platform?: string; url?: string }[];
const clinic = scrapeData.data?.json || {}; const clinic = scrapeData.data?.json || {};
let resolvedName = inputClinicName || clinic.clinicName || clinic.clinicNameEn || ""; let resolvedName = inputClinicName || clinic.clinicName || clinic.clinicNameEn || "";
@ -184,6 +224,12 @@ Deno.serve(async (req) => {
kakao: scrapeSocial.kakao ? [scrapeSocial.kakao] : [], kakao: scrapeSocial.kakao ? [scrapeSocial.kakao] : [],
}; };
// Source 3: Social button links from JS-rendered page (most reliable!)
const socialButtonUrls = socialButtons
.map(b => typeof b.url === 'string' ? b.url : '')
.filter(u => u.length > 5);
const buttonHandles = extractSocialLinks(socialButtonUrls);
// ═══════════════════════════════════════════ // ═══════════════════════════════════════════
// STAGE B: Direct API searches + Perplexity (ALL PARALLEL) // STAGE B: Direct API searches + Perplexity (ALL PARALLEL)
// Each API directly searches for the clinic's presence // Each API directly searches for the clinic's presence
@ -410,7 +456,7 @@ Deno.serve(async (req) => {
// STAGE C: Merge ALL sources + Verify // STAGE C: Merge ALL sources + Verify
// ═══════════════════════════════════════════ // ═══════════════════════════════════════════
const merged = mergeSocialLinks(linkHandles, firecrawlHandles, apiHandles); const merged = mergeSocialLinks(linkHandles, firecrawlHandles, buttonHandles, apiHandles);
const cleanHandles = { const cleanHandles = {
instagram: [...new Set(merged.instagram.map(h => extractHandle(h, 'instagram')).filter((h): h is string => h !== null))], instagram: [...new Set(merged.instagram.map(h => extractHandle(h, 'instagram')).filter((h): h is string => h !== null))],