// o2o-infinith-demo/supabase/functions/discover-channels/index.ts (TypeScript; 302 lines, 14 KiB at time of capture)

import "@supabase/functions-js/edge-runtime.d.ts";
import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
import { extractSocialLinks, mergeSocialLinks } from "../_shared/extractSocialLinks.ts";
import { verifyAllHandles, type VerifiedChannels } from "../_shared/verifyHandles.ts";
/**
 * CORS headers sent with every response produced by this function,
 * including the reply to the OPTIONS preflight request.
 */
const corsHeaders = {
  "Access-Control-Allow-Origin": "*",
  "Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type",
};
/** JSON body accepted by the discover-channels endpoint. */
interface DiscoverRequest {
  /** Clinic website URL; it is forwarded to the Firecrawl scrape and map calls. */
  url: string;
  /** Optional clinic name; when present it takes precedence over any name extracted from the site. */
  clinicName?: string;
}
/** First path segments on instagram.com that address content or system pages, not a profile. */
const INSTAGRAM_NON_PROFILE_PATHS = new Set([
  'p', 'reel', 'reels', 'tv', 'explore', 'stories', 'accounts', 'direct',
]);

/** First path segments on facebook.com that address share/system endpoints, not a page vanity name. */
const FACEBOOK_NON_PROFILE_PATHS = new Set([
  'sharer', 'sharer.php', 'share', 'share.php', 'profile.php', 'pages', 'people', 'groups',
  'events', 'watch', 'photo', 'photo.php', 'plugins', 'dialog', 'login', 'login.php',
  'hashtag', 'help', 'policies', 'l.php', 'tr',
]);

/**
 * Robust handle extraction — the input may be a full URL, an `@handle`, or a bare username.
 *
 * Per platform:
 * - `instagram`: username from an instagram.com URL, or a bare/`@` handle.
 * - `youtube`:   `@handle` (from `/@x`), channel ID (from `/channel/UC…`), or legacy `/c/x` name;
 *                bare names are returned with a leading `@`, bare `UC…` IDs unchanged.
 * - `facebook`:  page vanity name from a facebook.com URL, or a bare/`@` handle.
 * - `naverBlog`: blog ID from a blog.naver.com URL (path form or `?blogId=` query form), or a bare ID.
 * - `tiktok`:    username (without `@`) from a tiktok.com URL, or a bare/`@` handle.
 * - any other platform: the trimmed input is returned unchanged.
 *
 * URLs that point at a post, share dialog or other non-profile path (e.g. `instagram.com/p/…`,
 * `facebook.com/profile.php?id=…`, `blog.naver.com/PostList.naver` without `blogId`) yield `null`
 * instead of the path keyword.
 *
 * @param raw       Candidate handle. Typed as `string`, but values that are not strings at
 *                  runtime (e.g. items parsed from untyped JSON) are rejected rather than thrown on.
 * @param platform  Platform key selecting the validation rules.
 * @returns The normalised handle, or `null` when the input is not a usable handle for the platform.
 */
function extractHandle(raw: string, platform: string): string | null {
  // The static type is not a runtime guarantee: numbers/arrays/objects would crash on `.trim()`.
  if (typeof raw !== 'string' || raw.length < 2) return null;
  let h = raw.trim();

  if (platform === 'instagram') {
    const fromUrl = h.match(/instagram\.com\/([a-zA-Z0-9._]+)/)?.[1];
    if (fromUrl) return INSTAGRAM_NON_PROFILE_PATHS.has(fromUrl.toLowerCase()) ? null : fromUrl;
    h = h.replace(/^@/, '').replace(/\/$/, '');
    if (/^[a-zA-Z0-9._]+$/.test(h) && h.length >= 2) return h;
    return null;
  }

  if (platform === 'youtube') {
    const m = h.match(/youtube\.com\/(?:@([a-zA-Z0-9._-]+)|channel\/(UC[a-zA-Z0-9_-]+)|c\/([a-zA-Z0-9._-]+))/);
    if (m) return m[1] ? `@${m[1]}` : m[2] || m[3] || null;
    h = h.replace(/^@/, '');
    // Anything still looking like a URL (watch links, other domains) is not a channel handle.
    if (h.includes('http') || h.includes('/') || h.includes('.com')) return null;
    if (/^UC[a-zA-Z0-9_-]{20,}$/.test(h)) return h;
    if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return `@${h}`;
    return null;
  }

  if (platform === 'facebook') {
    const fromUrl = h.match(/facebook\.com\/([a-zA-Z0-9._-]+)/)?.[1];
    if (fromUrl) return FACEBOOK_NON_PROFILE_PATHS.has(fromUrl.toLowerCase()) ? null : fromUrl;
    h = h.replace(/^@/, '').replace(/\/$/, '');
    if (h.includes('http') || h.includes('/')) return null;
    if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return h;
    return null;
  }

  if (platform === 'naverBlog') {
    if (/blog\.naver\.com\//.test(h)) {
      // Query form: blog.naver.com/PostList.naver?blogId=<id> (also PostView, legacy .nhn).
      const fromQuery = h.match(/[?&]blogId=([a-zA-Z0-9_-]+)/)?.[1];
      if (fromQuery) return fromQuery;
    }
    const m = h.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)(\.(?:naver|nhn))?/);
    // A segment followed by .naver/.nhn is an endpoint name (PostList, PostView, …), not a blog ID.
    if (m) return m[2] ? null : m[1] ?? null;
    if (h.includes('http') || h.includes('/')) return null;
    if (/^[a-zA-Z0-9_-]+$/.test(h) && h.length >= 2) return h;
    return null;
  }

  if (platform === 'tiktok') {
    const fromUrl = h.match(/tiktok\.com\/@([a-zA-Z0-9._-]+)/)?.[1];
    if (fromUrl) return fromUrl;
    h = h.replace(/^@/, '');
    if (h.includes('http') || h.includes('/')) return null;
    if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return h;
    return null;
  }

  // Unknown platform: no validation rules, pass the trimmed value through.
  return h;
}
/** Builds a JSON response that always carries the CORS headers. */
function jsonResponse(body: unknown, status = 200): Response {
  return new Response(JSON.stringify(body), {
    status,
    headers: { ...corsHeaders, "Content-Type": "application/json" },
  });
}

/**
 * POSTs `payload` to a Firecrawl v1 endpoint and resolves with the decoded JSON body.
 * Rejects when the request fails or the body is not JSON; the HTTP status is not inspected.
 */
// deno-lint-ignore no-explicit-any
async function firecrawlPost(endpoint: "scrape" | "map", apiKey: string, payload: unknown): Promise<any> {
  const res = await fetch(`https://api.firecrawl.dev/v1/${endpoint}`, {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiKey}` },
    body: JSON.stringify(payload),
  });
  return res.json();
}

/** Sends one system + user prompt pair to the Perplexity "sonar" model and resolves with the decoded JSON body. */
// deno-lint-ignore no-explicit-any
async function askPerplexity(apiKey: string, system: string, user: string): Promise<any> {
  const res = await fetch("https://api.perplexity.ai/chat/completions", {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiKey}` },
    body: JSON.stringify({
      model: "sonar",
      messages: [
        { role: "system", content: system },
        { role: "user", content: user },
      ],
      temperature: 0.1,
    }),
  });
  return res.json();
}

/**
 * Extracts a JSON object from a model reply.
 * Accepts a bare JSON document, a ```json fenced block, or JSON embedded in surrounding prose
 * (first `{` to last `}`). Returns `null` when no JSON object can be parsed.
 */
function parseModelJson(content: unknown): Record<string, unknown> | null {
  if (typeof content !== "string" || !content.trim()) return null;
  const fenced = content.match(/```(?:json)?\n?([\s\S]*?)```/);
  const text = fenced?.[1] ?? content;
  const candidates = [text];
  const start = text.indexOf("{");
  const end = text.lastIndexOf("}");
  if (start !== -1 && end > start) candidates.push(text.slice(start, end + 1));
  for (const candidate of candidates) {
    try {
      const parsed: unknown = JSON.parse(candidate);
      if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
        return parsed as Record<string, unknown>;
      }
    } catch { /* not valid JSON — try the next candidate */ }
  }
  return null;
}

/**
 * Normalises an untyped value (string, array, null, …) into a flat list of non-empty strings,
 * so a model answering with an array where a string was requested (or vice versa) cannot
 * produce nested arrays or non-string handles.
 */
function toHandleList(value: unknown): string[] {
  const items: unknown[] = Array.isArray(value) ? value : [value];
  return items.filter((item): item is string => typeof item === "string" && item.trim().length > 0);
}

/**
 * Derives a fallback clinic name from the first hostname label of `rawUrl`
 * (a leading "www." is ignored). Scheme-less input such as "example.com" is retried
 * with an "https://" prefix. Returns `null` when no hostname can be parsed.
 */
function fallbackNameFromUrl(rawUrl: string): string | null {
  for (const candidate of [rawUrl, `https://${rawUrl}`]) {
    let host: string;
    try {
      host = new URL(candidate).hostname;
    } catch {
      continue;
    }
    if (host) return host.replace(/^www\./, "").split(".")[0] ?? "";
  }
  return null;
}

/**
 * Phase 1: Discover & Verify Channels
 *
 * Three-stage discovery:
 * Stage A: Firecrawl scrape + map + brand scrape (parallel) → extract clinicName + social links
 * Stage B: Perplexity search using clinicName (parallel, only when PERPLEXITY_API_KEY is set) → find more handles
 * Stage C: Merge + clean + verify all handles, then insert a row into `marketing_reports`
 *
 * Responses (all JSON, all with CORS headers):
 * - 200 `{ success: true, reportId, clinicName, verifiedChannels, address, services, scrapeData }`
 * - 400 when the body is not JSON, `url` is missing, or no clinic name is available and the URL has no parseable hostname
 * - 500 `{ success: false, error }` for configuration, database or other unexpected failures
 *
 * Stage A/B upstream failures are tolerated: a failed call contributes empty data instead of aborting.
 */
Deno.serve(async (req) => {
  // CORS preflight
  if (req.method === "OPTIONS") {
    return new Response("ok", { headers: corsHeaders });
  }

  try {
    // ── Request validation: client mistakes are reported as 400, not 500 ──
    let payload: unknown;
    try {
      payload = await req.json();
    } catch {
      return jsonResponse({ success: false, error: "Request body must be valid JSON" }, 400);
    }
    const { url, clinicName: inputClinicName } =
      (payload && typeof payload === "object" ? payload : {}) as Partial<DiscoverRequest>;
    if (!url || typeof url !== "string") {
      return jsonResponse({ error: "URL is required" }, 400);
    }
    const requestedName = typeof inputClinicName === "string" ? inputClinicName.trim() : "";

    const FIRECRAWL_API_KEY = Deno.env.get("FIRECRAWL_API_KEY");
    const PERPLEXITY_API_KEY = Deno.env.get("PERPLEXITY_API_KEY");
    if (!FIRECRAWL_API_KEY) throw new Error("FIRECRAWL_API_KEY not configured");

    // ═══════════════════════════════════════════
    // STAGE A: Firecrawl scrape + map (parallel)
    // → Extract clinicName + social links from HTML
    // ═══════════════════════════════════════════
    const [scrapeResult, mapResult, brandResult] = await Promise.allSettled([
      firecrawlPost("scrape", FIRECRAWL_API_KEY, {
        url,
        formats: ["json", "links"],
        jsonOptions: {
          prompt: "Extract: clinic name (Korean), clinic name (English), address, phone, services offered, doctors with specialties, ALL social media links (instagram handles/URLs, youtube channel URL/handle, naver blog URL, facebook page URL, tiktok, kakao channel), business hours, slogan",
          schema: {
            type: "object",
            properties: {
              clinicName: { type: "string" },
              clinicNameEn: { type: "string" },
              address: { type: "string" },
              phone: { type: "string" },
              businessHours: { type: "string" },
              slogan: { type: "string" },
              services: { type: "array", items: { type: "string" } },
              doctors: { type: "array", items: { type: "object", properties: { name: { type: "string" }, title: { type: "string" }, specialty: { type: "string" } } } },
              socialMedia: { type: "object", properties: { instagram: { type: "string" }, youtube: { type: "string" }, blog: { type: "string" }, facebook: { type: "string" }, tiktok: { type: "string" }, kakao: { type: "string" } } },
            },
          },
        },
        waitFor: 5000,
      }),
      firecrawlPost("map", FIRECRAWL_API_KEY, { url, limit: 50 }),
      firecrawlPost("scrape", FIRECRAWL_API_KEY, {
        url, formats: ["json"],
        jsonOptions: {
          prompt: "Extract brand identity: primary/accent/background/text colors (hex), heading/body fonts, logo URL, favicon URL, tagline",
          schema: { type: "object", properties: { primaryColor: { type: "string" }, accentColor: { type: "string" }, backgroundColor: { type: "string" }, textColor: { type: "string" }, headingFont: { type: "string" }, bodyFont: { type: "string" }, logoUrl: { type: "string" }, faviconUrl: { type: "string" }, tagline: { type: "string" } } },
        },
        waitFor: 3000,
      }),
    ]);

    // A rejected call degrades to an empty result so the remaining sources can still be used.
    const scrapeData = scrapeResult.status === "fulfilled" ? scrapeResult.value : { data: {} };
    const mapData = mapResult.status === "fulfilled" ? mapResult.value : {};
    const brandData = brandResult.status === "fulfilled" ? brandResult.value : { data: { json: {} } };

    const clinic = scrapeData?.data?.json || {};
    const siteLinks: string[] = Array.isArray(scrapeData?.data?.links) ? scrapeData.data.links : [];
    const siteMap: string[] = Array.isArray(mapData?.links) ? mapData.links : [];

    // Name precedence: caller-supplied → scraped Korean name → scraped English name → hostname label.
    let resolvedName: string = requestedName || clinic.clinicName || clinic.clinicNameEn || "";
    if (!resolvedName) {
      const fromUrl = fallbackNameFromUrl(url);
      if (fromUrl === null) {
        return jsonResponse({ success: false, error: "Invalid URL" }, 400);
      }
      resolvedName = fromUrl;
    }

    // Source 1: Parse links from HTML
    const linkHandles = extractSocialLinks([...siteLinks, ...siteMap]);

    // Source 2: Firecrawl JSON extraction socialMedia field
    const scrapeSocial = clinic.socialMedia || {};
    const firecrawlHandles = {
      instagram: toHandleList(scrapeSocial.instagram),
      youtube: toHandleList(scrapeSocial.youtube),
      facebook: toHandleList(scrapeSocial.facebook),
      naverBlog: toHandleList(scrapeSocial.blog),
      tiktok: toHandleList(scrapeSocial.tiktok),
      kakao: toHandleList(scrapeSocial.kakao),
    };

    // ═══════════════════════════════════════════
    // STAGE B: Perplexity search using CLINIC NAME
    // → Find social handles that Firecrawl missed
    // ═══════════════════════════════════════════
    let perplexityHandles: Partial<typeof linkHandles> = {};
    let gangnamUnniHintUrl: string | undefined;

    if (PERPLEXITY_API_KEY && resolvedName) {
      const [socialResult, presenceResult] = await Promise.allSettled([
        // Query 1: Social media accounts — using clinic name, not URL
        askPerplexity(
          PERPLEXITY_API_KEY,
          "You find official social media accounts for Korean medical clinics. Respond ONLY with valid JSON. If unsure, use null. Never guess or make up handles.",
          `"${resolvedName}" 성형외과/병원의 공식 소셜 미디어 계정을 찾아줘. 인스타그램 계정이 여러개일 수 있어 (국문용, 영문용 등). 반드시 확인된 계정만 포함.\n\n{"instagram": ["핸들1", "핸들2"], "youtube": "채널 핸들 또는 URL (@ 포함)", "facebook": "페이지명 또는 URL", "tiktok": "핸들", "naverBlog": "블로그ID", "kakao": "채널ID"}`,
        ),
        // Query 2: Platform presence — 강남언니, 네이버, 바비톡
        askPerplexity(
          PERPLEXITY_API_KEY,
          "You research Korean medical clinic platform presence. Respond ONLY with valid JSON.",
          `"${resolvedName}" 성형외과/병원의 강남언니, 네이버 플레이스, 바비톡 등록 현황을 찾아줘.\n\n{"gangnamUnni": {"registered": true/false, "url": "gangnamunni.com URL 또는 null", "rating": 숫자/10 또는 null}, "naverPlace": {"registered": true/false}, "babitok": {"registered": true/false}}`,
        ),
      ]);

      // Parse social handles (an unparseable reply simply contributes nothing)
      if (socialResult.status === "fulfilled") {
        const parsed = parseModelJson(socialResult.value?.choices?.[0]?.message?.content);
        if (parsed) {
          perplexityHandles = {
            instagram: toHandleList(parsed.instagram),
            youtube: toHandleList(parsed.youtube),
            facebook: toHandleList(parsed.facebook),
            naverBlog: toHandleList(parsed.naverBlog),
            tiktok: toHandleList(parsed.tiktok),
            kakao: toHandleList(parsed.kakao),
          };
        }
      }

      // Parse platform presence — only the 강남언니 URL is used, as a hint for verification
      if (presenceResult.status === "fulfilled") {
        const parsed = parseModelJson(presenceResult.value?.choices?.[0]?.message?.content);
        const hint = (parsed?.gangnamUnni as { url?: unknown } | null | undefined)?.url;
        if (typeof hint === "string" && hint) gangnamUnniHintUrl = hint;
      }
    }

    // ═══════════════════════════════════════════
    // STAGE C: Merge + Deduplicate + Verify
    // ═══════════════════════════════════════════
    const merged = mergeSocialLinks(linkHandles, firecrawlHandles, perplexityHandles);

    // Normalise every candidate to a canonical handle and drop the ones that fail validation.
    const clean = (candidates: string[], platform: string): string[] =>
      candidates
        .map((candidate) => extractHandle(candidate, platform))
        .filter((handle): handle is string => handle !== null);

    const cleanHandles = {
      instagram: clean(merged.instagram, "instagram"),
      youtube: clean(merged.youtube, "youtube"),
      facebook: clean(merged.facebook, "facebook"),
      naverBlog: clean(merged.naverBlog, "naverBlog"),
      tiktok: clean(merged.tiktok, "tiktok"),
    };

    const verified: VerifiedChannels = await verifyAllHandles(
      cleanHandles, resolvedName, gangnamUnniHintUrl,
    );

    // ═══════════════════════════════════════════
    // Save to DB
    // ═══════════════════════════════════════════
    const supabaseUrl = Deno.env.get("SUPABASE_URL");
    const supabaseKey = Deno.env.get("SUPABASE_SERVICE_ROLE_KEY");
    if (!supabaseUrl || !supabaseKey) {
      throw new Error("SUPABASE_URL / SUPABASE_SERVICE_ROLE_KEY not configured");
    }
    const supabase = createClient(supabaseUrl, supabaseKey);

    const scrapeDataFull = {
      clinic, branding: brandData?.data?.json || {},
      siteLinks, siteMap,
      sourceUrl: url, scrapedAt: new Date().toISOString(),
    };

    const { data: saved, error: saveError } = await supabase
      .from("marketing_reports")
      .insert({
        url, clinic_name: resolvedName,
        status: "discovered",
        verified_channels: verified,
        scrape_data: scrapeDataFull,
        report: {},
        pipeline_started_at: new Date().toISOString(),
      })
      .select("id")
      .single();

    if (saveError) throw new Error(`DB save failed: ${saveError.message}`);

    return jsonResponse({
      success: true, reportId: saved.id,
      clinicName: resolvedName,
      verifiedChannels: verified,
      address: clinic.address || "",
      services: clinic.services || [],
      scrapeData: scrapeDataFull,
    });
  } catch (error) {
    // `error` is `unknown`: anything can be thrown, so narrow before reading `.message`.
    const message = error instanceof Error ? error.message : String(error);
    console.error("discover-channels failed:", error);
    return jsonResponse({ success: false, error: message }, 500);
  }
});