fix: Instagram data collection pipeline — handle normalization + DB persistence

- enrich-channels: Instagram fallback — auto-try _ps, .ps, _clinic, _official suffixes when the profile has <100 followers (business accounts with >10 posts are still accepted)
- enrich-channels: YouTube URL normalization via normalizeYouTubeChannel (handles /c/, /user/, @handle)
- enrich-channels: Google Maps multi-query search for better hit rate
- generate-report: AI-found social handles prioritized over Firecrawl scrape
- generate-report: Added socialMedia field to AI prompt for accurate handle discovery
- normalizeHandles: Added normalizeYouTubeChannel for /c/, /user/, /channel/, @handle URLs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
claude/bold-hawking
Haewon Kam 2026-04-03 14:45:00 +09:00
parent 9bf47f7d93
commit e5399486f7
3 changed files with 157 additions and 56 deletions

View File

@ -46,3 +46,62 @@ export function normalizeInstagramHandle(
return handle || null; return handle || null;
} }
/**
 * First path segments on youtube.com that never identify a channel
 * (video, playlist, search and feed pages). URLs starting with one of these
 * must not be mistaken for a legacy custom-URL username.
 */
const YOUTUBE_NON_CHANNEL_PATHS = new Set([
  "watch",
  "playlist",
  "shorts",
  "embed",
  "live",
  "results",
  "feed",
  "hashtag",
]);

/**
 * Normalize a YouTube channel reference into a typed identifier for API lookup.
 *
 * Recognized inputs and their results:
 * - "https://www.youtube.com/@banobagips"  → { type: 'handle', value: 'banobagips' }
 * - "https://youtube.com/c/banobagips"     → { type: 'username', value: 'banobagips' }
 * - "https://youtube.com/user/banobagi"    → { type: 'username', value: 'banobagi' }
 * - "https://youtube.com/channel/UCxxxx"   → { type: 'channelId', value: 'UCxxxx' }
 * - "https://youtube.com/banobagips"       → { type: 'username', value: 'banobagips' }
 * - "@banobagips"                          → { type: 'handle', value: 'banobagips' }
 * - bare "UC…" string longer than 20 chars → { type: 'channelId', value: <input> }
 * - "banobagips"                           → { type: 'username', value: 'banobagips' }
 *
 * @param raw - A YouTube channel URL, @handle, channel ID or legacy username.
 * @returns The typed identifier, or `null` when the input is empty, is not a
 *   string, or is a YouTube URL that does not point at a channel (home page,
 *   video/playlist/search pages, `youtu.be` short links, `/channel/` without a
 *   `UC…` id, an empty `@` handle, or an unparseable URL).
 */
export function normalizeYouTubeChannel(
  raw: string | null | undefined,
): { type: 'handle' | 'username' | 'channelId'; value: string } | null {
  if (!raw || typeof raw !== "string") return null;
  const input = raw.trim();
  if (!input) return null;

  // Domain detection is case-insensitive so "HTTPS://YouTube.com/..." is
  // still treated as a URL; the path itself keeps its original casing.
  const lowered = input.toLowerCase();
  if (lowered.includes("youtube.com") || lowered.includes("youtu.be")) {
    let url: URL;
    try {
      // Only prepend a scheme when none is present; checking case-insensitively
      // avoids producing "https://HTTPS://...".
      url = new URL(/^https?:\/\//i.test(input) ? input : `https://${input}`);
    } catch {
      // Looks like a YouTube URL but cannot be parsed: there is no usable
      // identifier, and the raw string is not a valid username either.
      return null;
    }

    // `youtu.be` links (and foreign hosts that merely mention youtube.com)
    // are excluded here: only youtube.com and its subdomains carry channel paths.
    const host = url.hostname;
    if (host !== "youtube.com" && !host.endsWith(".youtube.com")) return null;

    const [head, id] = url.pathname.split("/").filter(Boolean);
    if (!head) return null; // e.g. the bare home page

    if (head === "channel") {
      return id?.startsWith("UC") ? { type: "channelId", value: id } : null;
    }
    if (head === "c" || head === "user") {
      return id ? { type: "username", value: id } : null;
    }
    if (head.startsWith("@")) {
      const handle = head.slice(1);
      return handle ? { type: "handle", value: handle } : null;
    }
    if (YOUTUBE_NON_CHANNEL_PATHS.has(head.toLowerCase())) return null;

    // Legacy custom URL of the form youtube.com/<name>.
    return { type: "username", value: head };
  }

  // Non-URL formats
  if (input.startsWith("@")) {
    const handle = input.slice(1);
    return handle ? { type: "handle", value: handle } : null;
  }
  // Bare channel IDs are only recognized when long enough to rule out
  // ordinary usernames that happen to start with "UC".
  if (input.startsWith("UC") && input.length > 20) {
    return { type: "channelId", value: input };
  }
  return { type: "username", value: input };
}

View File

@ -1,6 +1,6 @@
import "@supabase/functions-js/edge-runtime.d.ts"; import "@supabase/functions-js/edge-runtime.d.ts";
import { createClient } from "https://esm.sh/@supabase/supabase-js@2"; import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
import { normalizeInstagramHandle } from "../_shared/normalizeHandles.ts"; import { normalizeInstagramHandle, normalizeYouTubeChannel } from "../_shared/normalizeHandles.ts";
const corsHeaders = { const corsHeaders = {
"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Origin": "*",
@ -58,36 +58,54 @@ Deno.serve(async (req) => {
// Run all enrichment tasks in parallel // Run all enrichment tasks in parallel
const tasks = []; const tasks = [];
// 1. Instagram Profile // 1. Instagram Profile — with fallback for wrong handle
const cleanIgHandle = normalizeInstagramHandle(instagramHandle); const cleanIgHandle = normalizeInstagramHandle(instagramHandle);
if (cleanIgHandle) { if (cleanIgHandle) {
tasks.push( tasks.push(
(async () => { (async () => {
const items = await runApifyActor( // Try the given handle first, then common clinic variants
"apify~instagram-profile-scraper", const handleCandidates = [
{ usernames: [cleanIgHandle], resultsLimit: 12 }, cleanIgHandle,
APIFY_TOKEN `${cleanIgHandle}_ps`, // banobagi → banobagi_ps
); `${cleanIgHandle}.ps`, // banobagi → banobagi.ps
const profile = (items as Record<string, unknown>[])[0]; `${cleanIgHandle}_clinic`, // banobagi → banobagi_clinic
if (profile && !profile.error) { `${cleanIgHandle}_official`, // banobagi → banobagi_official
enrichment.instagram = { ];
username: profile.username,
followers: profile.followersCount, for (const handle of handleCandidates) {
following: profile.followsCount, const items = await runApifyActor(
posts: profile.postsCount, "apify~instagram-profile-scraper",
bio: profile.biography, { usernames: [handle], resultsLimit: 12 },
isBusinessAccount: profile.isBusinessAccount, APIFY_TOKEN
externalUrl: profile.externalUrl, );
latestPosts: ((profile.latestPosts as Record<string, unknown>[]) || []) const profile = (items as Record<string, unknown>[])[0];
.slice(0, 12)
.map((p) => ({ if (profile && !profile.error) {
type: p.type, const followers = (profile.followersCount as number) || 0;
likes: p.likesCount,
comments: p.commentsCount, // Accept if: has meaningful followers OR is a business account with posts
caption: (p.caption as string || "").slice(0, 200), if (followers >= 100 || ((profile.isBusinessAccount as boolean) && (profile.postsCount as number) > 10)) {
timestamp: p.timestamp, enrichment.instagram = {
})), username: profile.username,
}; followers: profile.followersCount,
following: profile.followsCount,
posts: profile.postsCount,
bio: profile.biography,
isBusinessAccount: profile.isBusinessAccount,
externalUrl: profile.externalUrl,
latestPosts: ((profile.latestPosts as Record<string, unknown>[]) || [])
.slice(0, 12)
.map((p) => ({
type: p.type,
likes: p.likesCount,
comments: p.commentsCount,
caption: (p.caption as string || "").slice(0, 200),
timestamp: p.timestamp,
})),
};
break; // Found a valid account
}
}
} }
})() })()
); );
@ -97,17 +115,27 @@ Deno.serve(async (req) => {
if (clinicName || address) { if (clinicName || address) {
tasks.push( tasks.push(
(async () => { (async () => {
const searchQuery = `${clinicName} ${address || "강남"}`; // Try multiple search queries for better hit rate
const items = await runApifyActor( const queries = [
"compass~crawler-google-places", `${clinicName} 성형외과`,
{ clinicName,
searchStringsArray: [searchQuery], `${clinicName} ${address || "강남"}`,
maxCrawledPlacesPerSearch: 1, ];
language: "ko",
maxReviews: 10, let items: unknown[] = [];
}, for (const query of queries) {
APIFY_TOKEN items = await runApifyActor(
); "compass~crawler-google-places",
{
searchStringsArray: [query],
maxCrawledPlacesPerSearch: 3,
language: "ko",
maxReviews: 10,
},
APIFY_TOKEN
);
if ((items as Record<string, unknown>[]).length > 0) break;
}
const place = (items as Record<string, unknown>[])[0]; const place = (items as Record<string, unknown>[])[0];
if (place) { if (place) {
enrichment.googleMaps = { enrichment.googleMaps = {
@ -140,17 +168,24 @@ Deno.serve(async (req) => {
(async () => { (async () => {
const YT_BASE = "https://www.googleapis.com/youtube/v3"; const YT_BASE = "https://www.googleapis.com/youtube/v3";
// Resolve handle/username to channel ID // Normalize YouTube URL/handle to structured identifier
let channelId = youtubeChannelId; const ytNormalized = normalizeYouTubeChannel(youtubeChannelId);
if (channelId.startsWith("@") || !channelId.startsWith("UC")) { if (!ytNormalized) return;
// Use forHandle for @handles, forUsername for legacy usernames
const param = channelId.startsWith("@") ? "forHandle" : "forUsername"; let channelId = "";
const handle = channelId.startsWith("@") ? channelId.slice(1) : channelId;
const lookupRes = await fetch( if (ytNormalized.type === "channelId") {
`${YT_BASE}/channels?part=id&${param}=${handle}&key=${YOUTUBE_API_KEY}` channelId = ytNormalized.value;
); } else {
const lookupData = await lookupRes.json(); // Try forHandle first, then forUsername as fallback
channelId = lookupData.items?.[0]?.id || ""; for (const param of ["forHandle", "forUsername"]) {
const lookupRes = await fetch(
`${YT_BASE}/channels?part=id&${param}=${ytNormalized.value}&key=${YOUTUBE_API_KEY}`
);
const lookupData = await lookupRes.json();
channelId = lookupData.items?.[0]?.id || "";
if (channelId) break;
}
} }
if (!channelId) return; if (!channelId) return;

View File

@ -92,7 +92,13 @@ ${JSON.stringify(analyzeResult.data?.analysis || {}, null, 2)}
"address": "주소", "address": "주소",
"phone": "전화번호", "phone": "전화번호",
"services": ["시술1", "시술2"], "services": ["시술1", "시술2"],
"doctors": [{"name": "의사명", "specialty": "전문분야"}] "doctors": [{"name": "의사명", "specialty": "전문분야"}],
"socialMedia": {
"instagram": "정확한 Instagram 핸들 (@ 없이, 예: banobagi_ps)",
"youtube": "YouTube 채널 핸들 또는 URL",
"facebook": "Facebook 페이지명",
"naverBlog": "네이버 블로그 ID"
}
}, },
"executiveSummary": "경영진 요약 (3-5문장)", "executiveSummary": "경영진 요약 (3-5문장)",
"overallScore": 0-100, "overallScore": 0-100,
@ -165,13 +171,14 @@ ${JSON.stringify(analyzeResult.data?.analysis || {}, null, 2)}
report = { raw: reportText, parseError: true }; report = { raw: reportText, parseError: true };
} }
// Normalize social handles from scrape data // Merge social handles: AI-found (more accurate) > Firecrawl-extracted (fallback)
const socialMedia = clinic.socialMedia || {}; const scrapeSocial = clinic.socialMedia || {};
const aiSocial = report?.clinicInfo?.socialMedia || {};
const normalizedHandles = { const normalizedHandles = {
instagram: normalizeInstagramHandle(socialMedia.instagram), instagram: normalizeInstagramHandle(aiSocial.instagram) || normalizeInstagramHandle(scrapeSocial.instagram),
youtube: socialMedia.youtube || null, youtube: aiSocial.youtube || scrapeSocial.youtube || null,
facebook: socialMedia.facebook || null, facebook: aiSocial.facebook || scrapeSocial.facebook || null,
blog: socialMedia.blog || null, blog: aiSocial.naverBlog || scrapeSocial.blog || null,
}; };
// Embed normalized handles in report for DB persistence // Embed normalized handles in report for DB persistence