feat: extract social links from JS-rendered buttons on clinic website
Added a fourth parallel Firecrawl call (A4) with actions: [wait 3s, scrape] to execute JavaScript and extract social button href URLs from the header/footer. This is the most reliable source — most Korean clinics have Facebook/Instagram/YouTube/Blog icons in their nav. Results are merged as Source 3 (buttonHandles) alongside HTML links, JSON extraction, and API searches.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
claude/bold-hawking
parent
80c57147e7
commit
ed37f23f78
|
|
@ -95,7 +95,8 @@ Deno.serve(async (req) => {
|
||||||
// STAGE A: Firecrawl scrape + map (parallel)
|
// STAGE A: Firecrawl scrape + map (parallel)
|
||||||
// ═══════════════════════════════════════════
|
// ═══════════════════════════════════════════
|
||||||
|
|
||||||
const [scrapeResult, mapResult, brandResult] = await Promise.allSettled([
|
const [scrapeResult, mapResult, brandResult, socialButtonResult] = await Promise.allSettled([
|
||||||
|
// A1. Main scrape — clinic info + links
|
||||||
fetch("https://api.firecrawl.dev/v1/scrape", {
|
fetch("https://api.firecrawl.dev/v1/scrape", {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
|
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
|
||||||
|
|
@ -119,12 +120,14 @@ Deno.serve(async (req) => {
|
||||||
}),
|
}),
|
||||||
}).then(r => r.json()),
|
}).then(r => r.json()),
|
||||||
|
|
||||||
|
// A2. Map site
|
||||||
fetch("https://api.firecrawl.dev/v1/map", {
|
fetch("https://api.firecrawl.dev/v1/map", {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
|
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
|
||||||
body: JSON.stringify({ url, limit: 50 }),
|
body: JSON.stringify({ url, limit: 50 }),
|
||||||
}).then(r => r.json()),
|
}).then(r => r.json()),
|
||||||
|
|
||||||
|
// A3. Branding extraction
|
||||||
fetch("https://api.firecrawl.dev/v1/scrape", {
|
fetch("https://api.firecrawl.dev/v1/scrape", {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
|
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
|
||||||
|
|
@ -137,12 +140,49 @@ Deno.serve(async (req) => {
|
||||||
waitFor: 3000,
|
waitFor: 3000,
|
||||||
}),
|
}),
|
||||||
}).then(r => r.json()).catch(() => ({ data: { json: {} } })),
|
}).then(r => r.json()).catch(() => ({ data: { json: {} } })),
|
||||||
|
|
||||||
|
// A4. Social button links — execute JS to extract all <a href> pointing to social platforms
|
||||||
|
fetch("https://api.firecrawl.dev/v1/scrape", {
|
||||||
|
method: "POST",
|
||||||
|
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
|
||||||
|
body: JSON.stringify({
|
||||||
|
url,
|
||||||
|
formats: ["json"],
|
||||||
|
jsonOptions: {
|
||||||
|
prompt: "Find ALL social media link URLs on this page. Look in the header, footer, sidebar, and floating buttons. Extract the actual href URLs (not just text) for: Instagram, YouTube, Facebook, TikTok, Naver Blog (blog.naver.com), KakaoTalk (pf.kakao.com), Twitter/X. Include ALL variants found.",
|
||||||
|
schema: {
|
||||||
|
type: "object",
|
||||||
|
properties: {
|
||||||
|
socialLinks: {
|
||||||
|
type: "array",
|
||||||
|
items: {
|
||||||
|
type: "object",
|
||||||
|
properties: {
|
||||||
|
platform: { type: "string" },
|
||||||
|
url: { type: "string" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
waitFor: 5000,
|
||||||
|
actions: [
|
||||||
|
{ type: "wait", milliseconds: 3000 },
|
||||||
|
{ type: "scrape" },
|
||||||
|
],
|
||||||
|
}),
|
||||||
|
}).then(r => r.json()).catch(() => ({ data: { json: {} } })),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const scrapeData = scrapeResult.status === "fulfilled" ? scrapeResult.value : { data: {} };
|
const scrapeData = scrapeResult.status === "fulfilled" ? scrapeResult.value : { data: {} };
|
||||||
const mapData = mapResult.status === "fulfilled" ? mapResult.value : {};
|
const mapData = mapResult.status === "fulfilled" ? mapResult.value : {};
|
||||||
const brandData = brandResult.status === "fulfilled" ? brandResult.value : { data: { json: {} } };
|
const brandData = brandResult.status === "fulfilled" ? brandResult.value : { data: { json: {} } };
|
||||||
|
|
||||||
|
// A4 result: social buttons from JS-rendered page
|
||||||
|
const socialButtonData = socialButtonResult.status === "fulfilled" ? socialButtonResult.value : { data: { json: {} } };
|
||||||
|
const socialButtons = (socialButtonData.data?.json?.socialLinks || []) as { platform?: string; url?: string }[];
|
||||||
|
|
||||||
const clinic = scrapeData.data?.json || {};
|
const clinic = scrapeData.data?.json || {};
|
||||||
let resolvedName = inputClinicName || clinic.clinicName || clinic.clinicNameEn || "";
|
let resolvedName = inputClinicName || clinic.clinicName || clinic.clinicNameEn || "";
|
||||||
|
|
||||||
|
|
@ -184,6 +224,12 @@ Deno.serve(async (req) => {
|
||||||
kakao: scrapeSocial.kakao ? [scrapeSocial.kakao] : [],
|
kakao: scrapeSocial.kakao ? [scrapeSocial.kakao] : [],
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Source 3: Social button links from JS-rendered page (most reliable!)
|
||||||
|
const socialButtonUrls = socialButtons
|
||||||
|
.map(b => typeof b.url === 'string' ? b.url : '')
|
||||||
|
.filter(u => u.length > 5);
|
||||||
|
const buttonHandles = extractSocialLinks(socialButtonUrls);
|
||||||
|
|
||||||
// ═══════════════════════════════════════════
|
// ═══════════════════════════════════════════
|
||||||
// STAGE B: Direct API searches + Perplexity (ALL PARALLEL)
|
// STAGE B: Direct API searches + Perplexity (ALL PARALLEL)
|
||||||
// Each API directly searches for the clinic's presence
|
// Each API directly searches for the clinic's presence
|
||||||
|
|
@ -410,7 +456,7 @@ Deno.serve(async (req) => {
|
||||||
// STAGE C: Merge ALL sources + Verify
|
// STAGE C: Merge ALL sources + Verify
|
||||||
// ═══════════════════════════════════════════
|
// ═══════════════════════════════════════════
|
||||||
|
|
||||||
const merged = mergeSocialLinks(linkHandles, firecrawlHandles, apiHandles);
|
const merged = mergeSocialLinks(linkHandles, firecrawlHandles, buttonHandles, apiHandles);
|
||||||
|
|
||||||
const cleanHandles = {
|
const cleanHandles = {
|
||||||
instagram: [...new Set(merged.instagram.map(h => extractHandle(h, 'instagram')).filter((h): h is string => h !== null))],
|
instagram: [...new Set(merged.instagram.map(h => extractHandle(h, 'instagram')).filter((h): h is string => h !== null))],
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue