o2o-infinith-demo/supabase/functions/discover-channels/index.ts

494 lines
23 KiB
TypeScript

import "@supabase/functions-js/edge-runtime.d.ts";
import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
import { extractSocialLinks, mergeSocialLinks } from "../_shared/extractSocialLinks.ts";
import { verifyAllHandles, type VerifiedChannels } from "../_shared/verifyHandles.ts";
import { RESEARCH_SYSTEM_PROMPT, buildResearchUserPrompt } from "../_shared/researchPrompt.ts";
/** CORS headers attached to every response this function returns, including the OPTIONS preflight reply. */
const corsHeaders = {
"Access-Control-Allow-Origin": "*",
"Access-Control-Allow-Headers": "authorization, x-client-info, apikey, content-type",
};
/** Base URL of the Apify REST API (v2); used for the Instagram profile-scraper runs and dataset reads. */
const APIFY_BASE = "https://api.apify.com/v2";
/** JSON body accepted by the discover-channels endpoint. */
interface DiscoverRequest {
// Website to analyse; the handler answers 400 when it is missing.
url: string;
// Optional caller-supplied clinic name; when present it takes precedence over the scraped name.
clinicName?: string;
}
/** First path segments on instagram.com that are site features, not profiles. */
const INSTAGRAM_NON_PROFILE_PATHS = new Set([
  "p", "reel", "reels", "tv", "explore", "stories", "accounts", "direct", "share",
]);

/** First path segments on facebook.com that are endpoints or sections, not pages. */
const FACEBOOK_NON_PROFILE_PATHS = new Set([
  "sharer", "sharer.php", "share", "share.php", "profile.php", "pages", "groups",
  "events", "watch", "dialog", "plugins", "login", "login.php", "tr", "hashtag",
  "photo.php", "story.php", "permalink.php",
]);

/**
 * Normalises a raw social reference (profile URL, "@handle" or bare handle)
 * into the canonical handle for `platform`.
 *
 * @param raw      Value as found in page links, scraper JSON or an LLM answer.
 * @param platform One of 'instagram' | 'youtube' | 'facebook' | 'naverBlog' |
 *                 'tiktok'. Any other platform returns the trimmed input as-is.
 * @returns The handle, or `null` when the input is not a usable account
 *          reference (too short, malformed, or a URL of a non-profile page).
 *          YouTube handles are returned with a leading "@"; YouTube channel
 *          ids ("UC…") are returned unprefixed.
 */
function extractHandle(raw: string, platform: string): string | null {
  // Upstream values come from untyped JSON, so guard the runtime type too.
  if (typeof raw !== 'string') return null;
  let h = raw.trim();
  // Measure after trimming so whitespace-only input never yields "".
  if (h.length < 2) return null;

  if (platform === 'instagram') {
    const m = h.match(/instagram\.com\/([a-zA-Z0-9._]+)/);
    if (m) {
      const segment = m[1] ?? '';
      // Post/reel/explore URLs are not accounts.
      return INSTAGRAM_NON_PROFILE_PATHS.has(segment.toLowerCase()) ? null : segment;
    }
    h = h.replace(/^@/, '').replace(/\/$/, '');
    if (/^[a-zA-Z0-9._]+$/.test(h) && h.length >= 2) return h;
    return null;
  }

  if (platform === 'youtube') {
    const m = h.match(/youtube\.com\/(?:@([a-zA-Z0-9._-]+)|channel\/(UC[a-zA-Z0-9_-]+)|c\/([a-zA-Z0-9._-]+))/);
    if (m) return m[1] ? `@${m[1]}` : m[2] || m[3] || null;
    h = h.replace(/^@/, '');
    // Any other URL shape (watch, playlist, user/…) is not a channel reference.
    if (h.includes('http') || h.includes('/') || h.includes('.com')) return null;
    if (/^UC[a-zA-Z0-9_-]{20,}$/.test(h)) return h;
    if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return `@${h}`;
    return null;
  }

  if (platform === 'facebook') {
    const m = h.match(/facebook\.com\/([a-zA-Z0-9._-]+)/);
    if (m) {
      const segment = m[1] ?? '';
      // Share buttons (sharer/sharer.php) and profile.php?id=… carry no page slug.
      return FACEBOOK_NON_PROFILE_PATHS.has(segment.toLowerCase()) ? null : segment;
    }
    h = h.replace(/^@/, '').replace(/\/$/, '');
    if (h.includes('http') || h.includes('/')) return null;
    if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return h;
    return null;
  }

  if (platform === 'naverBlog') {
    // Script-style URLs (PostView.naver?blogId=…) keep the id in the query string.
    const byQuery = h.match(/blog\.naver\.com\/[^#]*[?&]blogId=([a-zA-Z0-9_-]+)/);
    if (byQuery) return byQuery[1] ?? null;
    const m = h.match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)(\.[a-zA-Z]+)?/);
    if (m) {
      // A segment followed by an extension (".naver", ".nhn") is an endpoint, not a blog id.
      return m[2] ? null : m[1] ?? null;
    }
    if (h.includes('http') || h.includes('/')) return null;
    if (/^[a-zA-Z0-9_-]+$/.test(h) && h.length >= 2) return h;
    return null;
  }

  if (platform === 'tiktok') {
    const m = h.match(/tiktok\.com\/@([a-zA-Z0-9._-]+)/);
    if (m) return m[1] ?? null;
    h = h.replace(/^@/, '');
    if (h.includes('http') || h.includes('/')) return null;
    if (/^[a-zA-Z0-9._-]+$/.test(h) && h.length >= 2) return h;
    return null;
  }

  // Unknown platform: no normalisation rules, hand back the trimmed value.
  return h;
}
/**
 * Phase 1: Discover & Verify Channels
 *
 * API-first, Perplexity-supplement approach:
 *   Stage A: Firecrawl scrape + map → clinicName + social links from HTML
 *   Stage B: Direct API searches (YouTube, Naver, Firecrawl, Apify) + Perplexity
 *   Stage C: Merge all sources + verify handles, then insert a
 *            `marketing_reports` row with status "discovered"
 *
 * Request body: `{ url: string, clinicName?: string }`.
 * Responses: 200 with the report id and verified channels; 400 when the body
 * is not JSON or `url` is missing; 500 when a required secret is missing or
 * any non-best-effort step (e.g. the DB insert) fails.
 */

/** Extracts a printable message from whatever value was thrown. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** Logs a failed best-effort source; discovery continues with the remaining sources. */
function logSkippedSource(source: string, err: unknown): void {
  console.warn(`[discover-channels] ${source} skipped: ${describeError(err)}`);
}

/** Builds a JSON response that carries the shared CORS headers. */
function jsonResponse(body: unknown, status = 200): Response {
  return new Response(JSON.stringify(body), {
    status,
    headers: { ...corsHeaders, "Content-Type": "application/json" },
  });
}

/**
 * Returns the first DNS label of the site's host with a leading "www." removed
 * (e.g. "https://www.example.co.kr/x" → "example").
 * Scheme-less input is retried with "https://"; throws if it still cannot be parsed.
 */
function siteLabel(rawUrl: string): string {
  let host: string;
  try {
    host = new URL(rawUrl).hostname;
  } catch {
    host = new URL(`https://${rawUrl}`).hostname;
  }
  // Anchor the pattern: only a *leading* "www." is decoration.
  return host.replace(/^www\./, "").split(".")[0] ?? "";
}

/** Sends one system+user exchange to Perplexity ("sonar") and returns the answer text ("" if absent). */
async function perplexityChat(apiKey: string, system: string, user: string): Promise<string> {
  const res = await fetch("https://api.perplexity.ai/chat/completions", {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiKey}` },
    body: JSON.stringify({
      model: "sonar",
      messages: [
        { role: "system", content: system },
        { role: "user", content: user },
      ],
      temperature: 0.1,
    }),
  });
  const data = await res.json();
  return data.choices?.[0]?.message?.content || "";
}

/**
 * Asks Perplexity and parses the outermost `{…}` span of the answer as JSON.
 * Returns `null` when the answer contains no object; throws on invalid JSON.
 */
async function perplexityJson(apiKey: string, system: string, user: string): Promise<Record<string, unknown> | null> {
  const text = await perplexityChat(apiKey, system, user);
  const span = text.match(/\{[\s\S]*\}/);
  return span ? (JSON.parse(span[0]) as Record<string, unknown>) : null;
}

/** Normalises a string-or-array JSON field into a list of non-empty strings. */
function stringList(value: unknown): string[] {
  const items: unknown[] = Array.isArray(value) ? value : value ? [value] : [];
  return items.filter((v): v is string => typeof v === "string" && v.length > 0);
}

Deno.serve(async (req) => {
  if (req.method === "OPTIONS") {
    return new Response("ok", { headers: corsHeaders });
  }
  try {
    // A malformed body is the caller's fault: answer 400, not 500.
    let payload: Partial<DiscoverRequest> | null;
    try {
      payload = (await req.json()) as Partial<DiscoverRequest> | null;
    } catch {
      return jsonResponse({ error: "Request body must be valid JSON" }, 400);
    }
    const url = typeof payload?.url === "string" ? payload.url : "";
    const inputClinicName = typeof payload?.clinicName === "string" ? payload.clinicName : "";
    if (!url) {
      return jsonResponse({ error: "URL is required" }, 400);
    }

    const FIRECRAWL_API_KEY = Deno.env.get("FIRECRAWL_API_KEY") || "";
    const PERPLEXITY_API_KEY = Deno.env.get("PERPLEXITY_API_KEY") || "";
    const YOUTUBE_API_KEY = Deno.env.get("YOUTUBE_API_KEY") || "";
    const NAVER_CLIENT_ID = Deno.env.get("NAVER_CLIENT_ID") || "";
    const NAVER_CLIENT_SECRET = Deno.env.get("NAVER_CLIENT_SECRET") || "";
    const APIFY_TOKEN = Deno.env.get("APIFY_API_TOKEN") || "";
    const supabaseUrl = Deno.env.get("SUPABASE_URL");
    const supabaseKey = Deno.env.get("SUPABASE_SERVICE_ROLE_KEY");
    if (!FIRECRAWL_API_KEY) throw new Error("FIRECRAWL_API_KEY not configured");
    // Fail before spending any third-party API quota if the result could not be saved anyway.
    if (!supabaseUrl || !supabaseKey) {
      throw new Error("SUPABASE_URL / SUPABASE_SERVICE_ROLE_KEY not configured");
    }

    /** POSTs a JSON body to a Firecrawl v1 endpoint and returns the decoded reply. */
    const firecrawlPost = async (endpoint: string, body: unknown) => {
      const res = await fetch(`https://api.firecrawl.dev/v1/${endpoint}`, {
        method: "POST",
        headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_API_KEY}` },
        body: JSON.stringify(body),
      });
      return res.json();
    };

    // ═══════════════════════════════════════════
    // STAGE A: Firecrawl scrape + map (parallel)
    // ═══════════════════════════════════════════
    const str = { type: "string" };
    const [scrapeResult, mapResult, brandResult] = await Promise.allSettled([
      firecrawlPost("scrape", {
        url, formats: ["json", "links"],
        jsonOptions: {
          prompt: "Extract: clinic name (Korean), clinic name (English), address, phone, services offered, doctors with specialties, ALL social media links (instagram handles/URLs, youtube channel URL/handle, naver blog URL, facebook page URL, tiktok, kakao channel), business hours, slogan",
          schema: {
            type: "object",
            properties: {
              clinicName: str, clinicNameEn: str,
              address: str, phone: str,
              businessHours: str, slogan: str,
              services: { type: "array", items: str },
              doctors: { type: "array", items: { type: "object", properties: { name: str, title: str, specialty: str } } },
              socialMedia: { type: "object", properties: { instagram: str, youtube: str, blog: str, facebook: str, tiktok: str, kakao: str } },
            },
          },
        },
        waitFor: 5000,
      }),
      firecrawlPost("map", { url, limit: 50 }),
      firecrawlPost("scrape", {
        url, formats: ["json"],
        jsonOptions: {
          prompt: "Extract brand identity: primary/accent/background/text colors (hex), heading/body fonts, logo URL, favicon URL, tagline",
          schema: { type: "object", properties: { primaryColor: str, accentColor: str, backgroundColor: str, textColor: str, headingFont: str, bodyFont: str, logoUrl: str, faviconUrl: str, tagline: str } },
        },
        waitFor: 3000,
      }),
    ]);
    // Each Stage A call degrades to an empty result rather than failing the request.
    const scrapeData = (scrapeResult.status === "fulfilled" && scrapeResult.value) || { data: {} };
    const mapData = (mapResult.status === "fulfilled" && mapResult.value) || {};
    const brandData = (brandResult.status === "fulfilled" && brandResult.value) || { data: { json: {} } };
    const clinic = scrapeData.data?.json || {};

    let resolvedName: string = inputClinicName || clinic.clinicName || clinic.clinicNameEn || "";
    // Fallback: ask Perplexity to identify clinic name from URL
    if (!resolvedName && PERPLEXITY_API_KEY) {
      try {
        const answer = await perplexityChat(
          PERPLEXITY_API_KEY,
          "Respond with ONLY the clinic name in Korean, nothing else.",
          `${url} 이 URL의 병원/클리닉 한국어 이름이 뭐야?`,
        );
        // Strip straight and curly double quotes. The original class listed `"` three
        // times, presumably curly quotes lost to normalisation — TODO confirm.
        const aiName = answer.trim().replace(/["\u201C\u201D]/g, '').split('\n')[0].trim();
        if (aiName && aiName.length >= 2 && aiName.length <= 30) resolvedName = aiName;
      } catch (err) {
        logSkippedSource("Perplexity name lookup", err); // fall back to domain
      }
    }
    if (!resolvedName) resolvedName = siteLabel(url);

    // Source 1: Parse social links from HTML
    const siteLinks: string[] = Array.isArray(scrapeData.data?.links) ? scrapeData.data.links : [];
    const siteMap: string[] = Array.isArray(mapData.links) ? mapData.links : [];
    const linkHandles = extractSocialLinks([...siteLinks, ...siteMap]);

    // Source 2: Firecrawl JSON extraction
    const scrapeSocial = clinic.socialMedia || {};
    const firecrawlHandles = {
      instagram: scrapeSocial.instagram ? [scrapeSocial.instagram] : [],
      youtube: scrapeSocial.youtube ? [scrapeSocial.youtube] : [],
      facebook: scrapeSocial.facebook ? [scrapeSocial.facebook] : [],
      naverBlog: scrapeSocial.blog ? [scrapeSocial.blog] : [],
      tiktok: scrapeSocial.tiktok ? [scrapeSocial.tiktok] : [],
      kakao: scrapeSocial.kakao ? [scrapeSocial.kakao] : [],
    };

    // ═══════════════════════════════════════════
    // STAGE B: Direct API searches + Perplexity (ALL PARALLEL)
    // Each API directly searches for the clinic's presence
    // ═══════════════════════════════════════════
    const apiHandles: Record<"instagram" | "youtube" | "facebook" | "naverBlog" | "tiktok" | "kakao", string[]> = {
      instagram: [], youtube: [], facebook: [],
      naverBlog: [], tiktok: [], kakao: [],
    };
    let gangnamUnniHintUrl: string | undefined;
    let perplexityResearch: Record<string, unknown> | null = null;
    const stageBTasks: Promise<void>[] = [];

    /** Queues a best-effort source: a failure is logged and never rejects the batch. */
    const bestEffort = (source: string, task: () => Promise<void>): void => {
      stageBTasks.push(task().catch((err: unknown) => logSkippedSource(source, err)));
    };

    /** Adds the Instagram/YouTube/Facebook/TikTok handles found in search-result URLs. */
    const collectFromUrls = (urls: string[]): void => {
      const found = extractSocialLinks(urls);
      apiHandles.instagram.push(...found.instagram);
      apiHandles.youtube.push(...found.youtube);
      apiHandles.facebook.push(...found.facebook);
      apiHandles.tiktok.push(...found.tiktok);
    };

    // ─── B1. YouTube Data API: Search for channel by name ───
    if (YOUTUBE_API_KEY) {
      bestEffort("YouTube search", async () => {
        const q = encodeURIComponent(resolvedName);
        const res = await fetch(
          `https://www.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=${q}&maxResults=3&key=${YOUTUBE_API_KEY}`
        );
        const data = await res.json();
        const nameL = resolvedName.toLowerCase();
        const brandOnly = nameL.replace(/성형외과|병원|의원|클리닉/g, '').trim();
        for (const item of (data.items || [])) {
          const channelId = item.snippet?.channelId || item.id?.channelId;
          const title = (item.snippet?.title || "").toLowerCase();
          // An empty operand makes String.includes() trivially true, which
          // would accept every channel, so require real text on both sides.
          if (!channelId || !title.trim() || !nameL) continue;
          // Match if title contains clinic name or vice versa
          const matches = title.includes(nameL)
            || nameL.includes(title)
            || (brandOnly.length > 0 && title.includes(brandOnly));
          if (matches) apiHandles.youtube.push(channelId);
        }
      });
    }

    // ─── B2. Naver Search API: Find blog + social URLs ───
    if (NAVER_CLIENT_ID && NAVER_CLIENT_SECRET) {
      const naverHeaders = {
        "X-Naver-Client-Id": NAVER_CLIENT_ID,
        "X-Naver-Client-Secret": NAVER_CLIENT_SECRET,
      };
      // B2a. Blog search → find official Naver blog
      bestEffort("Naver blog search", async () => {
        const q = encodeURIComponent(`${resolvedName} 공식 블로그`);
        const res = await fetch(
          `https://openapi.naver.com/v1/search/blog.json?query=${q}&display=5&sort=sim`,
          { headers: naverHeaders }
        );
        const data = await res.json();
        for (const item of (data.items || [])) {
          const m = String(item.link || "").match(/blog\.naver\.com\/([a-zA-Z0-9_-]+)/);
          if (m?.[1]) apiHandles.naverBlog.push(m[1]);
        }
      });
      // B2b. Web search → find Instagram/YouTube/Facebook URLs
      bestEffort("Naver web search", async () => {
        const q = encodeURIComponent(`${resolvedName} 인스타그램 유튜브 공식`);
        const res = await fetch(
          `https://openapi.naver.com/v1/search/webkr.json?query=${q}&display=10`,
          { headers: naverHeaders }
        );
        const data = await res.json();
        const urls: string[] = (data.items || []).map((item: Record<string, string>) => item.link).filter(Boolean);
        collectFromUrls(urls);
      });
    }

    // ─── B3. Firecrawl Search: Find social URLs via web search ───
    bestEffort("Firecrawl search", async () => {
      const data = await firecrawlPost("search", {
        query: `${resolvedName} 성형외과 instagram youtube 공식`,
        limit: 10,
      });
      const urls: string[] = (data.data || []).map((r: Record<string, string>) => r.url).filter(Boolean);
      collectFromUrls(urls);
    });

    // ─── B4. Perplexity: 3 separate focused queries (like the research methodology) ───
    if (PERPLEXITY_API_KEY) {
      const ppxSystem = "You are a social media researcher. Search the web and find accounts. Respond ONLY with valid JSON, no explanation.";
      // B4a. Instagram + YouTube search (most important)
      bestEffort("Perplexity instagram/youtube", async () => {
        const parsed = await perplexityJson(
          PERPLEXITY_API_KEY, ppxSystem,
          `${resolvedName} 성형외과 공식 인스타그램 계정과 유튜브 채널을 찾아줘. 인스타는 여러 계정(국문, 영문, 원장 등)이 있을 수 있어.\n\n{"instagram": ["@handle1", "@handle2"], "youtube": ["@handle_or_url"]}`,
        );
        if (!parsed) return;
        apiHandles.instagram.push(...stringList(parsed.instagram));
        apiHandles.youtube.push(...stringList(parsed.youtube));
      });
      // B4b. Facebook + TikTok + Naver blog + Kakao
      bestEffort("Perplexity facebook/tiktok/naver/kakao", async () => {
        const parsed = await perplexityJson(
          PERPLEXITY_API_KEY, ppxSystem,
          `${resolvedName} 병원의 페이스북, 틱톡, 네이버블로그, 카카오채널 계정을 검색해줘.\n\n{"facebook": "page_or_url", "tiktok": "@handle", "naverBlog": "blogId", "kakao": "channelId"}`,
        );
        if (!parsed) return;
        // These fields are only accepted as single strings (arrays are ignored).
        for (const key of ["facebook", "tiktok", "naverBlog", "kakao"] as const) {
          const value = parsed[key];
          if (value && typeof value === 'string') apiHandles[key].push(value);
        }
      });
      // B4c. Gangnam Unni + review platforms
      bestEffort("Perplexity gangnamunni", async () => {
        const parsed = await perplexityJson(
          PERPLEXITY_API_KEY, ppxSystem,
          `${resolvedName} 병원이 강남언니(gangnamunni.com)에 등록되어있는지 검색해줘. URL도 찾아줘.\n\n{"gangnamUnni": {"registered": true, "url": "https://gangnamunni.com/hospitals/...", "rating": 9.5, "reviews": 1000}}`,
        );
        if (!parsed) return;
        perplexityResearch = parsed;
        const listing = parsed.gangnamUnni as { url?: unknown } | null | undefined;
        if (listing?.url) gangnamUnniHintUrl = String(listing.url);
      });
    }

    // ─── B5. Apify Instagram: Direct profile search by clinic name variants ───
    if (APIFY_TOKEN) {
      bestEffort("Apify instagram probe", async () => {
        // Generate handle candidates from clinic name
        const baseName = resolvedName.replace(/성형외과|병원|의원|클리닉|피부과/g, '').trim().toLowerCase();
        const baseNameEn = String(clinic.clinicNameEn || '').replace(/\s+/g, '').toLowerCase();
        const candidates: string[] = [];
        if (baseNameEn && baseNameEn.length >= 3) {
          candidates.push(baseNameEn, `${baseNameEn}_official`, `${baseNameEn}_ps`, `${baseNameEn}_clinic`);
        }
        if (baseName && /^[a-zA-Z]/.test(baseName)) {
          candidates.push(baseName, `${baseName}_official`, `${baseName}_ps`);
        }
        // Also try domain-based
        const domainBase = siteLabel(url).toLowerCase();
        if (domainBase.length >= 3 && !candidates.includes(domainBase)) {
          candidates.push(domainBase, `${domainBase}_official`);
        }
        // Quick check each candidate with Apify. Deliberately sequential and
        // capped at six: each run waits up to 30 s and we stop at the first hit.
        for (const handle of candidates.slice(0, 6)) {
          try {
            const apifyRes = await fetch(
              `${APIFY_BASE}/acts/apify~instagram-profile-scraper/runs?token=${APIFY_TOKEN}&waitForFinish=30`,
              {
                method: "POST",
                headers: { "Content-Type": "application/json" },
                body: JSON.stringify({ usernames: [handle], resultsLimit: 1 }),
              }
            );
            const run = await apifyRes.json();
            const datasetId = run.data?.defaultDatasetId;
            if (!datasetId) continue;
            const itemsRes = await fetch(`${APIFY_BASE}/datasets/${datasetId}/items?token=${APIFY_TOKEN}&limit=1`);
            const items: unknown = await itemsRes.json();
            const profile = Array.isArray(items) ? (items[0] as Record<string, unknown> | undefined) : undefined;
            if (profile && !profile.error && Number(profile.followersCount) >= 50) {
              const username = typeof profile.username === "string" && profile.username ? profile.username : handle;
              apiHandles.instagram.push(username);
              break; // Found one valid — stop searching
            }
          } catch (err) {
            logSkippedSource(`Apify candidate "${handle}"`, err); // try next
          }
        }
      });
    }

    // Run all Stage B tasks in parallel
    await Promise.allSettled(stageBTasks);

    // ═══════════════════════════════════════════
    // STAGE C: Merge ALL sources + Verify
    // ═══════════════════════════════════════════
    const merged = mergeSocialLinks(linkHandles, firecrawlHandles, apiHandles);
    /** Normalises every raw reference for one platform and drops invalid entries and duplicates. */
    const uniqueHandles = (rawHandles: string[], platform: string): string[] => [
      ...new Set(
        rawHandles
          .map((h) => extractHandle(h, platform))
          .filter((h): h is string => h !== null),
      ),
    ];
    const cleanHandles = {
      instagram: uniqueHandles(merged.instagram, 'instagram'),
      youtube: uniqueHandles(merged.youtube, 'youtube'),
      facebook: uniqueHandles(merged.facebook, 'facebook'),
      naverBlog: uniqueHandles(merged.naverBlog, 'naverBlog'),
      tiktok: uniqueHandles(merged.tiktok, 'tiktok'),
    };
    const verified: VerifiedChannels = await verifyAllHandles(
      cleanHandles, resolvedName, gangnamUnniHintUrl,
    );

    // ═══════════════════════════════════════════
    // Save to DB
    // ═══════════════════════════════════════════
    const supabase = createClient(supabaseUrl, supabaseKey);
    const scrapeDataFull = {
      clinic, branding: brandData.data?.json || {},
      siteLinks, siteMap,
      sourceUrl: url, scrapedAt: new Date().toISOString(),
      // Parsed JSON from the Perplexity Gangnam Unni query (B4c), or null if it produced none.
      onlinePresenceResearch: perplexityResearch,
    };
    const { data: saved, error: saveError } = await supabase
      .from("marketing_reports")
      .insert({
        url, clinic_name: resolvedName,
        status: "discovered",
        verified_channels: verified,
        scrape_data: scrapeDataFull,
        report: {},
        pipeline_started_at: new Date().toISOString(),
      })
      .select("id")
      .single();
    if (saveError) throw new Error(`DB save failed: ${saveError.message}`);

    return jsonResponse({
      success: true, reportId: saved.id,
      clinicName: resolvedName,
      verifiedChannels: verified,
      address: clinic.address || "",
      services: clinic.services || [],
      scrapeData: scrapeDataFull,
    });
  } catch (error) {
    // `error` is `unknown`: anything can be thrown, so derive the message safely.
    console.error(`[discover-channels] failed: ${describeError(error)}`);
    return jsonResponse({ success: false, error: describeError(error) }, 500);
  }
});