828 lines
36 KiB
TypeScript
828 lines
36 KiB
TypeScript
import "@supabase/functions-js/edge-runtime.d.ts";
|
|
import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
|
|
import { extractSocialLinks, mergeSocialLinks } from "../_shared/extractSocialLinks.ts";
|
|
import { PERPLEXITY_MODEL } from "../_shared/config.ts";
|
|
import { verifyAllHandles, type VerifiedChannels } from "../_shared/verifyHandles.ts";
|
|
import { RESEARCH_SYSTEM_PROMPT, buildResearchUserPrompt } from "../_shared/researchPrompt.ts";
|
|
|
|
const corsHeaders = {
|
|
"Access-Control-Allow-Origin": "*",
|
|
"Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type",
|
|
};
|
|
|
|
const APIFY_BASE = "https://api.apify.com/v2";
|
|
|
|
// ─── Registry Helper: Convert registry row → VerifiedChannels ───
|
|
|
|
function extractHandleFromUrl(url: string, platform: string): string | null {
|
|
if (!url) return null;
|
|
try {
|
|
if (platform === 'instagram') {
|
|
const m = url.match(/instagram\.com\/([a-zA-Z0-9._]+)/);
|
|
return m ? m[1] : null;
|
|
}
|
|
if (platform === 'youtube') {
|
|
const m = url.match(/youtube\.com\/(?:@([a-zA-Z0-9._-]+)|channel\/(UC[a-zA-Z0-9_-]+)|c\/([a-zA-Z0-9._-]+)|user\/([a-zA-Z0-9._-]+))/);
|
|
if (m) return m[1] ? `@${m[1]}` : m[2] || m[3] || m[4] || null;
|
|
return null;
|
|
}
|
|
if (platform === 'facebook') {
|
|
const m = url.match(/facebook\.com\/([a-zA-Z0-9._-]+)/);
|
|
return m ? m[1] : null;
|
|
}
|
|
if (platform === 'naverBlog') {
|
|
const m = url.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)/);
|
|
return m ? m[1] : null;
|
|
}
|
|
if (platform === 'tiktok') {
|
|
const m = url.match(/tiktok\.com\/@([a-zA-Z0-9._-]+)/);
|
|
return m ? m[1] : null;
|
|
}
|
|
} catch { /* ignore */ }
|
|
return null;
|
|
}
|
|
|
|
interface RegistryRow {
|
|
name: string;
|
|
domain: string;
|
|
website_url: string;
|
|
brand_group?: string;
|
|
district?: string;
|
|
branches?: string;
|
|
website_en?: string;
|
|
youtube_url?: string;
|
|
instagram_url?: string;
|
|
instagram_en_url?: string;
|
|
facebook_url?: string;
|
|
tiktok_url?: string;
|
|
naver_blog_url?: string;
|
|
naver_place_url?: string;
|
|
gangnam_unni_url?: string;
|
|
google_maps_url?: string;
|
|
founded_year?: number;
|
|
}
|
|
|
|
function registryToVerifiedChannels(reg: RegistryRow): import("../_shared/verifyHandles.ts").VerifiedChannels {
|
|
const igHandles: import("../_shared/verifyHandles.ts").VerifiedChannel[] = [];
|
|
const igHandle = extractHandleFromUrl(reg.instagram_url || '', 'instagram');
|
|
if (igHandle) igHandles.push({ handle: igHandle, verified: true, url: reg.instagram_url! });
|
|
const igEnHandle = extractHandleFromUrl(reg.instagram_en_url || '', 'instagram');
|
|
if (igEnHandle) igHandles.push({ handle: igEnHandle, verified: true, url: reg.instagram_en_url! });
|
|
|
|
const ytHandle = extractHandleFromUrl(reg.youtube_url || '', 'youtube');
|
|
const fbHandle = extractHandleFromUrl(reg.facebook_url || '', 'facebook');
|
|
const blogHandle = extractHandleFromUrl(reg.naver_blog_url || '', 'naverBlog');
|
|
const ttHandle = extractHandleFromUrl(reg.tiktok_url || '', 'tiktok');
|
|
|
|
return {
|
|
instagram: igHandles,
|
|
youtube: ytHandle ? { handle: ytHandle, verified: true, url: reg.youtube_url! } : null,
|
|
facebook: fbHandle ? { handle: fbHandle, verified: true, url: reg.facebook_url! } : null,
|
|
naverBlog: blogHandle ? { handle: blogHandle, verified: true, url: reg.naver_blog_url! } : null,
|
|
gangnamUnni: reg.gangnam_unni_url ? { handle: reg.gangnam_unni_url, verified: true, url: reg.gangnam_unni_url } : null,
|
|
tiktok: ttHandle ? { handle: ttHandle, verified: true, url: reg.tiktok_url! } : null,
|
|
};
|
|
}
|
|
|
|
interface DiscoverRequest {
|
|
url: string;
|
|
clinicName?: string;
|
|
}
|
|
|
|
function extractHandle(raw: string, platform: string): string | null {
|
|
if (!raw || raw.length < 2) return null;
|
|
let h = raw.trim();
|
|
if (platform === 'instagram') {
|
|
const m = h.match(/instagram\.com\/([a-zA-Z0-9._]+)/);
|
|
if (m) return m[1];
|
|
h = h.replace(/^@/, '').replace(/\/$/, '');
|
|
if (/^[a-zA-Z0-9._]+$/.test(h) && h.length >= 2) return h;
|
|
return null;
|
|
}
|
|
if (platform === 'youtube') {
|
|
const m = h.match(/youtube\.com\/(?:@([a-zA-Z0-9._-]+)|channel\/(UC[a-zA-Z0-9_-]+)|c\/([a-zA-Z0-9._-]+))/);
|
|
if (m) return m[1] ? `@${m[1]}` : m[2] || m[3] || null;
|
|
h = h.replace(/^@/, '');
|
|
if (h.includes('http') || h.includes('/') || h.includes('.com')) return null;
|
|
if (/^UC[a-zA-Z0-9_-]{22}$/.test(h)) return h; // YouTube channel IDs are exactly 24 chars (UC + 22)
|
|
if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return `@${h}`;
|
|
return null;
|
|
}
|
|
if (platform === 'facebook') {
|
|
const m = h.match(/facebook\.com\/([a-zA-Z0-9._-]+)/);
|
|
if (m) return m[1];
|
|
h = h.replace(/^@/, '').replace(/\/$/, '');
|
|
if (h.includes('http') || h.includes('/')) return null;
|
|
if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return h;
|
|
return null;
|
|
}
|
|
if (platform === 'naverBlog') {
|
|
const m = h.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)/);
|
|
if (m) return m[1];
|
|
if (h.includes('http') || h.includes('/')) return null;
|
|
if (/^[a-zA-Z0-9_-]+$/.test(h) && h.length >= 2) return h;
|
|
return null;
|
|
}
|
|
if (platform === 'tiktok') {
|
|
const m = h.match(/tiktok\.com\/@([a-zA-Z0-9._-]+)/);
|
|
if (m) return m[1];
|
|
h = h.replace(/^@/, '');
|
|
if (h.includes('http') || h.includes('/')) return null;
|
|
if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return h;
|
|
return null;
|
|
}
|
|
return h;
|
|
}
|
|
|
|
/**
|
|
* Phase 1: Discover & Verify Channels
|
|
*
|
|
* API-first, Perplexity-supplement approach:
|
|
* Stage A: Firecrawl scrape + map → clinicName + social links from HTML
|
|
* Stage B: Direct API searches (YouTube, Naver, Firecrawl) + Perplexity
|
|
* Stage C: Merge all sources + Verify handles
|
|
*/
|
|
Deno.serve(async (req) => {
|
|
if (req.method === "OPTIONS") {
|
|
return new Response("ok", { headers: corsHeaders });
|
|
}
|
|
|
|
try {
|
|
const { url, clinicName: inputClinicName } = (await req.json()) as DiscoverRequest;
|
|
if (!url) {
|
|
return new Response(
|
|
JSON.stringify({ error: "URL is required" }),
|
|
{ status: 400, headers: { ...corsHeaders, "Content-Type": "application/json" } },
|
|
);
|
|
}
|
|
|
|
// ═══════════════════════════════════════════
|
|
// REGISTRY CHECK: Pre-verified clinic DB lookup
|
|
// If domain is registered, skip all API discovery
|
|
// ═══════════════════════════════════════════
|
|
const supabaseUrl = Deno.env.get("SUPABASE_URL")!;
|
|
const supabaseKey = Deno.env.get("SUPABASE_SERVICE_ROLE_KEY")!;
|
|
const supabase = createClient(supabaseUrl, supabaseKey);
|
|
|
|
let registryDomain: string;
|
|
try {
|
|
registryDomain = new URL(url).hostname.replace(/^www\./, '');
|
|
} catch {
|
|
registryDomain = '';
|
|
}
|
|
|
|
if (registryDomain) {
|
|
const { data: registered } = await supabase
|
|
.from("clinic_registry")
|
|
.select("*")
|
|
.eq("domain", registryDomain)
|
|
.eq("is_active", true)
|
|
.maybeSingle();
|
|
|
|
if (registered) {
|
|
console.log(`[registry] Hit: ${registered.name} (${registryDomain})`);
|
|
const verified = registryToVerifiedChannels(registered as RegistryRow);
|
|
|
|
const scrapeDataFromRegistry = {
|
|
clinic: { clinicName: registered.name },
|
|
branding: {},
|
|
siteLinks: [],
|
|
siteMap: [],
|
|
sourceUrl: url,
|
|
scrapedAt: new Date().toISOString(),
|
|
source: "registry",
|
|
registryData: {
|
|
district: registered.district,
|
|
branches: registered.branches,
|
|
brandGroup: registered.brand_group,
|
|
foundedYear: registered.founded_year,
|
|
websiteEn: registered.website_en,
|
|
naverPlaceUrl: registered.naver_place_url,
|
|
googleMapsUrl: registered.google_maps_url,
|
|
},
|
|
};
|
|
|
|
// Legacy: marketing_reports
|
|
const { data: saved, error: saveError } = await supabase
|
|
.from("marketing_reports")
|
|
.insert({
|
|
url, clinic_name: registered.name,
|
|
status: "discovered",
|
|
verified_channels: verified,
|
|
scrape_data: scrapeDataFromRegistry,
|
|
report: {},
|
|
pipeline_started_at: new Date().toISOString(),
|
|
})
|
|
.select("id")
|
|
.single();
|
|
|
|
if (saveError) throw new Error(`DB save failed: ${saveError.message}`);
|
|
|
|
// V3: clinics + analysis_runs
|
|
let clinicId: string | null = null;
|
|
let runId: string | null = null;
|
|
try {
|
|
const { data: clinicRow } = await supabase
|
|
.from("clinics")
|
|
.upsert({
|
|
url,
|
|
name: registered.name,
|
|
name_en: null,
|
|
domain: registryDomain,
|
|
address: null,
|
|
phone: null,
|
|
services: [],
|
|
branding: {},
|
|
social_handles: {
|
|
instagram: verified.instagram?.map((v: Record<string, unknown>) => v.handle) || [],
|
|
youtube: (verified.youtube as Record<string, unknown>)?.handle || null,
|
|
facebook: (verified.facebook as Record<string, unknown>)?.handle || null,
|
|
naverBlog: (verified.naverBlog as Record<string, unknown>)?.handle || null,
|
|
},
|
|
verified_channels: verified,
|
|
last_analyzed_at: new Date().toISOString(),
|
|
updated_at: new Date().toISOString(),
|
|
}, { onConflict: 'url' })
|
|
.select("id")
|
|
.single();
|
|
|
|
clinicId = clinicRow?.id || null;
|
|
|
|
if (clinicId) {
|
|
const { data: runRow } = await supabase
|
|
.from("analysis_runs")
|
|
.insert({
|
|
clinic_id: clinicId,
|
|
status: "discovering",
|
|
scrape_data: scrapeDataFromRegistry,
|
|
discovered_channels: verified,
|
|
trigger: "manual",
|
|
pipeline_started_at: new Date().toISOString(),
|
|
})
|
|
.select("id")
|
|
.single();
|
|
runId = runRow?.id || null;
|
|
}
|
|
} catch (e) {
|
|
console.error("V3 dual-write error (registry):", e);
|
|
}
|
|
|
|
return new Response(
|
|
JSON.stringify({
|
|
success: true, reportId: saved.id,
|
|
clinicId, runId,
|
|
clinicName: registered.name,
|
|
verifiedChannels: verified,
|
|
address: "",
|
|
services: [],
|
|
scrapeData: scrapeDataFromRegistry,
|
|
source: "registry",
|
|
}),
|
|
{ headers: { ...corsHeaders, "Content-Type": "application/json" } },
|
|
);
|
|
}
|
|
}
|
|
|
|
// ═══════════════════════════════════════════
|
|
// NOT REGISTERED: Return error for unregistered domains
|
|
// (Registry-only mode — no API fallback)
|
|
// ═══════════════════════════════════════════
|
|
console.log(`[registry] Miss: ${registryDomain} — returning CLINIC_NOT_REGISTERED`);
|
|
return new Response(
|
|
JSON.stringify({
|
|
success: false,
|
|
error: "CLINIC_NOT_REGISTERED",
|
|
message: "현재 지원하지 않는 병원입니다. 등록된 병원만 분석 가능합니다.",
|
|
domain: registryDomain,
|
|
}),
|
|
{ status: 404, headers: { ...corsHeaders, "Content-Type": "application/json" } },
|
|
);
|
|
|
|
// ═══════════════════════════════════════════
|
|
// LEGACY FALLBACK: Full API discovery (disabled — registry-only mode)
|
|
// Kept for reference; unreachable in production
|
|
// ═══════════════════════════════════════════
|
|
|
|
const FIRECRAWL_API_KEY = Deno.env.get("FIRECRAWL_API_KEY") || "";
|
|
const PERPLEXITY_API_KEY = Deno.env.get("PERPLEXITY_API_KEY") || "";
|
|
const YOUTUBE_API_KEY = Deno.env.get("YOUTUBE_API_KEY") || "";
|
|
const NAVER_CLIENT_ID = Deno.env.get("NAVER_CLIENT_ID") || "";
|
|
const NAVER_CLIENT_SECRET = Deno.env.get("NAVER_CLIENT_SECRET") || "";
|
|
if (!FIRECRAWL_API_KEY) throw new Error("FIRECRAWL_API_KEY not configured");
|
|
|
|
// ═══════════════════════════════════════════
|
|
// STAGE A: Firecrawl scrape + map (parallel)
|
|
// ═══════════════════════════════════════════
|
|
|
|
const [scrapeResult, mapResult, brandResult, socialButtonResult] = await Promise.allSettled([
|
|
// A1. Main scrape — clinic info + links
|
|
fetch("https://api.firecrawl.dev/v1/scrape", {
|
|
method: "POST",
|
|
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
|
|
body: JSON.stringify({
|
|
url, formats: ["json", "links"],
|
|
jsonOptions: {
|
|
prompt: "Extract: clinic name (Korean), clinic name (English), address, phone, services offered, doctors with specialties, ALL social media links (instagram handles/URLs, youtube channel URL/handle, naver blog URL, facebook page URL, tiktok, kakao channel), business hours, slogan",
|
|
schema: {
|
|
type: "object",
|
|
properties: {
|
|
clinicName: { type: "string" }, clinicNameEn: { type: "string" },
|
|
address: { type: "string" }, phone: { type: "string" },
|
|
businessHours: { type: "string" }, slogan: { type: "string" },
|
|
services: { type: "array", items: { type: "string" } },
|
|
doctors: { type: "array", items: { type: "object", properties: { name: { type: "string" }, title: { type: "string" }, specialty: { type: "string" } } } },
|
|
socialMedia: { type: "object", properties: { instagram: { type: "string" }, youtube: { type: "string" }, blog: { type: "string" }, facebook: { type: "string" }, tiktok: { type: "string" }, kakao: { type: "string" } } },
|
|
},
|
|
},
|
|
},
|
|
waitFor: 5000,
|
|
}),
|
|
}).then(r => r.json()),
|
|
|
|
// A2. Map site
|
|
fetch("https://api.firecrawl.dev/v1/map", {
|
|
method: "POST",
|
|
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
|
|
body: JSON.stringify({ url, limit: 50 }),
|
|
}).then(r => r.json()),
|
|
|
|
// A3. Branding extraction
|
|
fetch("https://api.firecrawl.dev/v1/scrape", {
|
|
method: "POST",
|
|
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
|
|
body: JSON.stringify({
|
|
url, formats: ["json"],
|
|
jsonOptions: {
|
|
prompt: "Extract brand identity: primary/accent/background/text colors (hex), heading/body fonts, logo URL, favicon URL, tagline",
|
|
schema: { type: "object", properties: { primaryColor: { type: "string" }, accentColor: { type: "string" }, backgroundColor: { type: "string" }, textColor: { type: "string" }, headingFont: { type: "string" }, bodyFont: { type: "string" }, logoUrl: { type: "string" }, faviconUrl: { type: "string" }, tagline: { type: "string" } } },
|
|
},
|
|
waitFor: 3000,
|
|
}),
|
|
}).then(r => r.json()).catch(() => ({ data: { json: {} } })),
|
|
|
|
// A4. Social button links — execute JS to extract all <a href> pointing to social platforms
|
|
fetch("https://api.firecrawl.dev/v1/scrape", {
|
|
method: "POST",
|
|
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
|
|
body: JSON.stringify({
|
|
url,
|
|
formats: ["json"],
|
|
jsonOptions: {
|
|
prompt: "Find ALL social media link URLs on this page. Look in the header, footer, sidebar, and floating buttons. Extract the actual href URLs (not just text) for: Instagram, YouTube, Facebook, TikTok, Naver Blog (blog.naver.com), KakaoTalk (pf.kakao.com), Twitter/X. Include ALL variants found.",
|
|
schema: {
|
|
type: "object",
|
|
properties: {
|
|
socialLinks: {
|
|
type: "array",
|
|
items: {
|
|
type: "object",
|
|
properties: {
|
|
platform: { type: "string" },
|
|
url: { type: "string" },
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
waitFor: 5000,
|
|
actions: [
|
|
{ type: "wait", milliseconds: 3000 },
|
|
{ type: "scrape" },
|
|
],
|
|
}),
|
|
}).then(r => r.json()).catch(() => ({ data: { json: {} } })),
|
|
]);
|
|
|
|
const scrapeData = scrapeResult.status === "fulfilled" ? scrapeResult.value : { data: {} };
|
|
const mapData = mapResult.status === "fulfilled" ? mapResult.value : {};
|
|
const brandData = brandResult.status === "fulfilled" ? brandResult.value : { data: { json: {} } };
|
|
|
|
// A4 result: social buttons from JS-rendered page
|
|
const socialButtonData = socialButtonResult.status === "fulfilled" ? socialButtonResult.value : { data: { json: {} } };
|
|
const socialButtons = (socialButtonData.data?.json?.socialLinks || []) as { platform?: string; url?: string }[];
|
|
|
|
const clinic = scrapeData.data?.json || {};
|
|
let resolvedName = inputClinicName || clinic.clinicName || clinic.clinicNameEn || "";
|
|
|
|
// Fallback: ask Perplexity to identify clinic name from URL
|
|
if (!resolvedName && PERPLEXITY_API_KEY) {
|
|
try {
|
|
const nameRes = await fetch("https://api.perplexity.ai/chat/completions", {
|
|
method: "POST",
|
|
headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` },
|
|
body: JSON.stringify({
|
|
model: PERPLEXITY_MODEL,
|
|
messages: [
|
|
{ role: "system", content: "Respond with ONLY the clinic name in Korean, nothing else." },
|
|
{ role: "user", content: `${url} 이 URL의 병원/클리닉 한국어 이름이 뭐야?` },
|
|
],
|
|
temperature: 0.1,
|
|
}),
|
|
});
|
|
const nameData = await nameRes.json();
|
|
const aiName = (nameData.choices?.[0]?.message?.content || "").trim().replace(/["""]/g, '').split('\n')[0].trim();
|
|
if (aiName && aiName.length >= 2 && aiName.length <= 30) resolvedName = aiName;
|
|
} catch { /* fallback to domain */ }
|
|
}
|
|
if (!resolvedName) resolvedName = new URL(url).hostname.replace('www.', '').split('.')[0];
|
|
|
|
// Source 1: Parse social links from HTML
|
|
const siteLinks: string[] = scrapeData.data?.links || [];
|
|
const siteMap: string[] = mapData.links || [];
|
|
const linkHandles = extractSocialLinks([...siteLinks, ...siteMap]);
|
|
|
|
// Source 2: Firecrawl JSON extraction
|
|
const scrapeSocial = clinic.socialMedia || {};
|
|
const firecrawlHandles = {
|
|
instagram: scrapeSocial.instagram ? [scrapeSocial.instagram] : [],
|
|
youtube: scrapeSocial.youtube ? [scrapeSocial.youtube] : [],
|
|
facebook: scrapeSocial.facebook ? [scrapeSocial.facebook] : [],
|
|
naverBlog: scrapeSocial.blog ? [scrapeSocial.blog] : [],
|
|
tiktok: scrapeSocial.tiktok ? [scrapeSocial.tiktok] : [],
|
|
kakao: scrapeSocial.kakao ? [scrapeSocial.kakao] : [],
|
|
};
|
|
|
|
// Source 3: Social button links from JS-rendered page (most reliable!)
|
|
const socialButtonUrls = socialButtons
|
|
.map(b => typeof b.url === 'string' ? b.url : '')
|
|
.filter(u => u.length > 5);
|
|
const buttonHandles = extractSocialLinks(socialButtonUrls);
|
|
|
|
// ═══════════════════════════════════════════
|
|
// STAGE B: Direct API searches + Perplexity (ALL PARALLEL)
|
|
// Each API directly searches for the clinic's presence
|
|
// ═══════════════════════════════════════════
|
|
|
|
const apiHandles: Partial<typeof linkHandles> = {
|
|
instagram: [], youtube: [], facebook: [],
|
|
naverBlog: [], tiktok: [], kakao: [],
|
|
};
|
|
let gangnamUnniHintUrl: string | undefined;
|
|
const stageBTasks: Promise<void>[] = [];
|
|
|
|
// ─── B1. YouTube Data API: Search for channel by name ───
|
|
if (YOUTUBE_API_KEY) {
|
|
stageBTasks.push((async () => {
|
|
try {
|
|
const q = encodeURIComponent(resolvedName);
|
|
const res = await fetch(
|
|
`https://www.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=${q}&maxResults=3&key=${YOUTUBE_API_KEY}`
|
|
);
|
|
const data = await res.json();
|
|
// Add ALL search results — let verifyAllHandles pick the best match by name
|
|
for (const item of (data.items || [])) {
|
|
const channelId = item.snippet?.channelId || item.id?.channelId;
|
|
if (channelId) {
|
|
apiHandles.youtube!.push(channelId);
|
|
}
|
|
}
|
|
} catch { /* skip */ }
|
|
})());
|
|
}
|
|
|
|
// ─── B2. Naver Search API: Find blog + social URLs ───
|
|
if (NAVER_CLIENT_ID && NAVER_CLIENT_SECRET) {
|
|
const naverHeaders = {
|
|
"X-Naver-Client-Id": NAVER_CLIENT_ID,
|
|
"X-Naver-Client-Secret": NAVER_CLIENT_SECRET,
|
|
};
|
|
|
|
// B2a. Blog search → find official Naver blog
|
|
stageBTasks.push((async () => {
|
|
try {
|
|
const q = encodeURIComponent(`${resolvedName} 공식 블로그`);
|
|
const res = await fetch(
|
|
`https://openapi.naver.com/v1/search/blog.json?query=${q}&display=5&sort=sim`,
|
|
{ headers: naverHeaders }
|
|
);
|
|
const data = await res.json();
|
|
for (const item of (data.items || [])) {
|
|
const link = item.link || "";
|
|
if (link.includes("blog.naver.com/")) {
|
|
const m = link.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)/);
|
|
if (m) apiHandles.naverBlog!.push(m[1]);
|
|
}
|
|
}
|
|
} catch { /* skip */ }
|
|
})());
|
|
|
|
// B2b. Web search → find Instagram/YouTube/Facebook URLs
|
|
stageBTasks.push((async () => {
|
|
try {
|
|
const q = encodeURIComponent(`${resolvedName} 인스타그램 유튜브 공식`);
|
|
const res = await fetch(
|
|
`https://openapi.naver.com/v1/search/webkr.json?query=${q}&display=10`,
|
|
{ headers: naverHeaders }
|
|
);
|
|
const data = await res.json();
|
|
const urls: string[] = (data.items || []).map((item: Record<string, string>) => item.link).filter(Boolean);
|
|
// Extract social handles from search result URLs
|
|
const found = extractSocialLinks(urls);
|
|
if (found.instagram.length) apiHandles.instagram!.push(...found.instagram);
|
|
if (found.youtube.length) apiHandles.youtube!.push(...found.youtube);
|
|
if (found.facebook.length) apiHandles.facebook!.push(...found.facebook);
|
|
if (found.tiktok.length) apiHandles.tiktok!.push(...found.tiktok);
|
|
} catch { /* skip */ }
|
|
})());
|
|
}
|
|
|
|
// ─── B3. Firecrawl Search: Find social URLs via web search ───
|
|
stageBTasks.push((async () => {
|
|
try {
|
|
const res = await fetch("https://api.firecrawl.dev/v1/search", {
|
|
method: "POST",
|
|
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
|
|
body: JSON.stringify({
|
|
query: `${resolvedName} 성형외과 instagram youtube 공식`,
|
|
limit: 10,
|
|
}),
|
|
});
|
|
const data = await res.json();
|
|
const urls: string[] = (data.data || []).map((r: Record<string, string>) => r.url).filter(Boolean);
|
|
const found = extractSocialLinks(urls);
|
|
if (found.instagram.length) apiHandles.instagram!.push(...found.instagram);
|
|
if (found.youtube.length) apiHandles.youtube!.push(...found.youtube);
|
|
if (found.facebook.length) apiHandles.facebook!.push(...found.facebook);
|
|
if (found.tiktok.length) apiHandles.tiktok!.push(...found.tiktok);
|
|
} catch { /* skip */ }
|
|
})());
|
|
|
|
// ─── B4. Perplexity: Single comprehensive query (proven pattern) ───
|
|
let perplexityResearch: Record<string, unknown> | null = null;
|
|
|
|
if (PERPLEXITY_API_KEY) {
|
|
// Build clinic name with English variant for better search
|
|
const clinicNameEn = clinic.clinicNameEn || '';
|
|
const searchName = clinicNameEn
|
|
? `${resolvedName} (${clinicNameEn})`
|
|
: resolvedName;
|
|
|
|
stageBTasks.push((async () => {
|
|
try {
|
|
const res = await fetch("https://api.perplexity.ai/chat/completions", {
|
|
method: "POST",
|
|
headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` },
|
|
body: JSON.stringify({
|
|
model: PERPLEXITY_MODEL,
|
|
messages: [
|
|
{ role: "system", content: "You are a social media researcher. Search the web and find social media accounts. Respond ONLY with valid JSON." },
|
|
{ role: "user", content: `${searchName} 병원의 인스타그램, 유튜브, 페이스북, 틱톡, 네이버블로그 계정을 검색해서 찾아줘. 검색 결과에서 발견된 계정을 모두 알려줘. 인스타그램은 여러 계정이 있을 수 있어.\n\n{"instagram": ["handle1", "handle2"], "youtube": "channel URL or handle", "facebook": "page name or URL", "tiktok": "handle", "naverBlog": "blog ID"}` },
|
|
],
|
|
temperature: 0.1,
|
|
}),
|
|
});
|
|
const data = await res.json();
|
|
let text = data.choices?.[0]?.message?.content || "";
|
|
const m = text.match(/\{[\s\S]*\}/);
|
|
if (m) {
|
|
const parsed = JSON.parse(m[0]);
|
|
perplexityResearch = parsed;
|
|
|
|
const ig = Array.isArray(parsed.instagram) ? parsed.instagram : parsed.instagram ? [parsed.instagram] : [];
|
|
const yt = Array.isArray(parsed.youtube) ? parsed.youtube : parsed.youtube ? [parsed.youtube] : [];
|
|
ig.forEach((h: unknown) => { if (h && typeof h === 'string') apiHandles.instagram!.push(h); });
|
|
yt.forEach((h: unknown) => { if (h && typeof h === 'string') apiHandles.youtube!.push(h); });
|
|
if (parsed.facebook && typeof parsed.facebook === 'string') apiHandles.facebook!.push(parsed.facebook);
|
|
if (parsed.tiktok && typeof parsed.tiktok === 'string') apiHandles.tiktok!.push(parsed.tiktok);
|
|
if (parsed.naverBlog && typeof parsed.naverBlog === 'string') apiHandles.naverBlog!.push(parsed.naverBlog);
|
|
}
|
|
} catch { /* skip */ }
|
|
})());
|
|
|
|
// B4b. 강남언니 검색 (별도 — gangnamunni URL 힌트 필요)
|
|
stageBTasks.push((async () => {
|
|
try {
|
|
const res = await fetch("https://api.perplexity.ai/chat/completions", {
|
|
method: "POST",
|
|
headers: { "Content-Type": "application/json", Authorization: `Bearer ${PERPLEXITY_API_KEY}` },
|
|
body: JSON.stringify({
|
|
model: PERPLEXITY_MODEL,
|
|
messages: [
|
|
{ role: "system", content: "You search for clinic listings on medical platforms. Respond ONLY with valid JSON." },
|
|
{ role: "user", content: `${resolvedName} 병원 강남언니 gangnamunni.com 페이지를 찾아줘.\n\n{"gangnamUnni": {"url": "https://gangnamunni.com/hospitals/...", "rating": 9.5, "reviews": 1000}}` },
|
|
],
|
|
temperature: 0.1,
|
|
}),
|
|
});
|
|
const data = await res.json();
|
|
let text = data.choices?.[0]?.message?.content || "";
|
|
const m = text.match(/\{[\s\S]*\}/);
|
|
if (m) {
|
|
const parsed = JSON.parse(m[0]);
|
|
if (parsed.gangnamUnni?.url) gangnamUnniHintUrl = String(parsed.gangnamUnni.url);
|
|
}
|
|
} catch { /* skip */ }
|
|
})());
|
|
}
|
|
|
|
// ─── B5. Apify Instagram: Direct profile search by clinic name variants ───
|
|
const APIFY_TOKEN = Deno.env.get("APIFY_API_TOKEN") || "";
|
|
if (APIFY_TOKEN) {
|
|
stageBTasks.push((async () => {
|
|
try {
|
|
// Generate handle candidates from clinic name
|
|
const baseName = resolvedName.replace(/성형외과|병원|의원|클리닉|피부과/g, '').trim().toLowerCase();
|
|
const baseNameEn = (clinic.clinicNameEn || '').replace(/\s+/g, '').toLowerCase();
|
|
const candidates: string[] = [];
|
|
if (baseNameEn && baseNameEn.length >= 3) {
|
|
candidates.push(baseNameEn, `${baseNameEn}_official`, `${baseNameEn}_ps`, `${baseNameEn}_clinic`);
|
|
}
|
|
if (baseName && /^[a-zA-Z]/.test(baseName)) {
|
|
candidates.push(baseName, `${baseName}_official`, `${baseName}_ps`);
|
|
}
|
|
// Also try domain-based
|
|
const domainBase = new URL(url).hostname.replace('www.', '').split('.')[0].toLowerCase();
|
|
if (domainBase.length >= 3 && !candidates.includes(domainBase)) {
|
|
candidates.push(domainBase, `${domainBase}_official`);
|
|
}
|
|
|
|
// Quick check each candidate with Apify
|
|
for (const handle of candidates.slice(0, 6)) {
|
|
try {
|
|
const apifyRes = await fetch(
|
|
`${APIFY_BASE}/acts/apify~instagram-profile-scraper/runs?token=${APIFY_TOKEN}&waitForFinish=45`,
|
|
{
|
|
method: "POST",
|
|
headers: { "Content-Type": "application/json" },
|
|
body: JSON.stringify({ usernames: [handle], resultsLimit: 1 }),
|
|
}
|
|
);
|
|
const run = await apifyRes.json();
|
|
const datasetId = run.data?.defaultDatasetId;
|
|
if (!datasetId) continue;
|
|
|
|
const itemsRes = await fetch(`${APIFY_BASE}/datasets/${datasetId}/items?token=${APIFY_TOKEN}&limit=1`);
|
|
const items = await itemsRes.json();
|
|
const profile = (items as Record<string, unknown>[])[0];
|
|
|
|
if (profile && !profile.error && (profile.followersCount as number) >= 50) {
|
|
apiHandles.instagram!.push(profile.username as string || handle);
|
|
break; // Found one valid — stop searching
|
|
}
|
|
} catch { /* try next */ }
|
|
}
|
|
} catch { /* skip */ }
|
|
})());
|
|
}
|
|
|
|
// Run all Stage B tasks in parallel
|
|
await Promise.allSettled(stageBTasks);
|
|
|
|
// ═══════════════════════════════════════════
|
|
// STAGE C: Merge ALL sources + Verify
|
|
// ═══════════════════════════════════════════
|
|
|
|
const merged = mergeSocialLinks(linkHandles, firecrawlHandles, buttonHandles, apiHandles);
|
|
|
|
const cleanHandles = {
|
|
instagram: [...new Set(merged.instagram.map(h => extractHandle(h, 'instagram')).filter((h): h is string => h !== null))],
|
|
youtube: [...new Set(merged.youtube.map(h => extractHandle(h, 'youtube')).filter((h): h is string => h !== null))],
|
|
facebook: [...new Set(merged.facebook.map(h => extractHandle(h, 'facebook')).filter((h): h is string => h !== null))],
|
|
naverBlog: [...new Set(merged.naverBlog.map(h => extractHandle(h, 'naverBlog')).filter((h): h is string => h !== null))],
|
|
tiktok: [...new Set(merged.tiktok.map(h => extractHandle(h, 'tiktok')).filter((h): h is string => h !== null))],
|
|
};
|
|
|
|
// Fallback: try domain name as Facebook/Instagram handle if no candidates found
|
|
try {
|
|
const domain = new URL(url).hostname.replace('www.', '').split('.')[0]; // e.g. "idhospital"
|
|
if (domain && domain.length >= 3) {
|
|
if (cleanHandles.facebook.length === 0) cleanHandles.facebook.push(domain);
|
|
if (cleanHandles.instagram.length === 0) cleanHandles.instagram.push(domain);
|
|
}
|
|
} catch { /* ignore */ }
|
|
|
|
const verified: VerifiedChannels = await verifyAllHandles(
|
|
cleanHandles, resolvedName, gangnamUnniHintUrl,
|
|
);
|
|
|
|
// ═══════════════════════════════════════════
|
|
// Save to DB (supabase client reused from registry check above)
|
|
// ═══════════════════════════════════════════
|
|
|
|
const scrapeDataFull = {
|
|
clinic, branding: brandData.data?.json || {},
|
|
siteLinks, siteMap: mapData.links || [],
|
|
sourceUrl: url, scrapedAt: new Date().toISOString(),
|
|
// Perplexity research results — raw channel data with subscriber counts etc.
|
|
onlinePresenceResearch: perplexityResearch,
|
|
};
|
|
|
|
// ─── Legacy: marketing_reports (backward compat) ───
|
|
const { data: saved, error: saveError } = await supabase
|
|
.from("marketing_reports")
|
|
.insert({
|
|
url, clinic_name: resolvedName,
|
|
status: "discovered",
|
|
verified_channels: verified,
|
|
scrape_data: scrapeDataFull,
|
|
report: {},
|
|
pipeline_started_at: new Date().toISOString(),
|
|
})
|
|
.select("id")
|
|
.single();
|
|
|
|
if (saveError) throw new Error(`DB save failed: ${saveError.message}`);
|
|
|
|
// ─── Storage: save scrape_data.json to clinics/{domain}/{reportId}/ ───
|
|
try {
|
|
const domain = new URL(url).hostname.replace('www.', '');
|
|
const jsonBytes = new TextEncoder().encode(JSON.stringify(scrapeDataFull, null, 2));
|
|
await supabase.storage
|
|
.from('clinic-data')
|
|
.upload(`${domain}/${saved.id}/scrape_data.json`, jsonBytes, {
|
|
contentType: 'application/json',
|
|
upsert: true,
|
|
});
|
|
} catch (e) {
|
|
console.warn('[storage] scrape_data.json upload failed:', e instanceof Error ? e.message : e);
|
|
}
|
|
|
|
// ─── V3: clinics + analysis_runs (dual-write) ───
|
|
let clinicId: string | null = null;
|
|
let runId: string | null = null;
|
|
try {
|
|
// UPSERT clinic (url 기준 — 같은 URL이면 기존 행 업데이트)
|
|
const { data: clinicRow } = await supabase
|
|
.from("clinics")
|
|
.upsert({
|
|
url,
|
|
name: resolvedName,
|
|
name_en: clinic.clinicNameEn || null,
|
|
domain: new URL(url).hostname.replace('www.', ''),
|
|
address: clinic.address || null,
|
|
phone: clinic.phone || null,
|
|
services: clinic.services || [],
|
|
branding: brandData.data?.json || {},
|
|
social_handles: {
|
|
instagram: verified.instagram?.map((v: Record<string, unknown>) => v.handle) || [],
|
|
youtube: (verified.youtube as Record<string, unknown>)?.handle || null,
|
|
facebook: (verified.facebook as Record<string, unknown>)?.handle || null,
|
|
naverBlog: (verified.naverBlog as Record<string, unknown>)?.handle || null,
|
|
},
|
|
verified_channels: verified,
|
|
last_analyzed_at: new Date().toISOString(),
|
|
updated_at: new Date().toISOString(),
|
|
}, { onConflict: 'url' })
|
|
.select("id")
|
|
.single();
|
|
|
|
clinicId = clinicRow?.id || null;
|
|
|
|
// INSERT analysis_run
|
|
if (clinicId) {
|
|
const { data: runRow } = await supabase
|
|
.from("analysis_runs")
|
|
.insert({
|
|
clinic_id: clinicId,
|
|
status: "discovering",
|
|
scrape_data: scrapeDataFull,
|
|
discovered_channels: verified,
|
|
trigger: "manual",
|
|
pipeline_started_at: new Date().toISOString(),
|
|
})
|
|
.select("id")
|
|
.single();
|
|
runId = runRow?.id || null;
|
|
}
|
|
} catch (e) {
|
|
// V3 write failure should not block the pipeline
|
|
const errMsg = e instanceof Error ? e.message : String(e);
|
|
console.error("V3 dual-write error:", errMsg);
|
|
// Best-effort: record error so it's visible in DB
|
|
try {
|
|
if (runId) {
|
|
await supabase.from("analysis_runs").update({
|
|
error_message: `V3 dual-write (discover) failed: ${errMsg}`,
|
|
status: "error",
|
|
}).eq("id", runId);
|
|
} else {
|
|
// runId not yet created — fall back to marketing_reports
|
|
await supabase.from("marketing_reports").update({
|
|
status: "v3_write_error",
|
|
updated_at: new Date().toISOString(),
|
|
}).eq("id", saved.id);
|
|
}
|
|
} catch { /* ignore secondary failure */ }
|
|
}
|
|
|
|
return new Response(
|
|
JSON.stringify({
|
|
success: true, reportId: saved.id,
|
|
clinicId, runId, // V3 IDs for downstream phases
|
|
clinicName: resolvedName,
|
|
verifiedChannels: verified,
|
|
address: clinic.address || "",
|
|
services: clinic.services || [],
|
|
scrapeData: scrapeDataFull,
|
|
}),
|
|
{ headers: { ...corsHeaders, "Content-Type": "application/json" } },
|
|
);
|
|
} catch (error) {
|
|
return new Response(
|
|
JSON.stringify({ success: false, error: error.message }),
|
|
{ status: 500, headers: { ...corsHeaders, "Content-Type": "application/json" } },
|
|
);
|
|
}
|
|
});
|