fix: Instagram data collection pipeline — handle normalization + DB persistence

- enrich-channels: Instagram fallback — auto-try _ps, .ps, _clinic, _official suffixes when the given handle has <100 followers and is not a business account with >10 posts
- enrich-channels: YouTube URL normalization via normalizeYouTubeChannel (handles /c/, /user/, @handle)
- enrich-channels: Google Maps multi-query search for better hit rate
- generate-report: AI-found social handles prioritized over Firecrawl scrape
- generate-report: Added socialMedia field to AI prompt for accurate handle discovery
- normalizeHandles: Added normalizeYouTubeChannel for /c/, /user/, /channel/, @handle URLs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
claude/bold-hawking
Haewon Kam 2026-04-03 14:45:00 +09:00
parent 9bf47f7d93
commit e5399486f7
3 changed files with 157 additions and 56 deletions

View File

@ -46,3 +46,62 @@ export function normalizeInstagramHandle(
return handle || null;
}
/**
 * Normalize a YouTube channel identifier from various URL and shorthand formats.
 *
 * Returns the best identifier type for an API lookup, or `null` when the input
 * cannot refer to a channel:
 * - "https://www.youtube.com/@banobagips"      → { type: 'handle', value: 'banobagips' }
 * - "https://youtube.com/c/banobagips"         → { type: 'username', value: 'banobagips' }
 * - "https://youtube.com/user/banobagi"        → { type: 'username', value: 'banobagi' }
 * - "https://youtube.com/channel/UCxxxx"       → { type: 'channelId', value: 'UCxxxx' }
 * - "https://youtube.com/banobagips"           → { type: 'username', value: 'banobagips' }
 * - "@banobagips"                              → { type: 'handle', value: 'banobagips' }
 * - "UC" followed by 19+ more characters       → { type: 'channelId', value: <input> }
 * - "banobagips"                               → { type: 'username', value: 'banobagips' }
 * - "https://www.youtube.com/", "/watch?v=…",
 *   "https://youtu.be/<videoId>", "@", ""      → null
 *
 * @param raw - URL, @handle, channel ID, or legacy username; may be null/undefined.
 * @returns The structured identifier, or `null` if no channel identifier can be extracted.
 */
export function normalizeYouTubeChannel(
  raw: string | null | undefined,
): { type: 'handle' | 'username' | 'channelId'; value: string } | null {
  if (!raw || typeof raw !== "string") return null;
  const input = raw.trim();
  if (!input) return null;

  // Bare "@handle" — checked first so that handles which merely contain the
  // text "youtube.com" are never mistaken for URLs. Anything after a "/"
  // (e.g. "@name/videos") is a sub-page, not part of the handle.
  if (input.startsWith("@")) {
    const handle = input.slice(1).split("/")[0];
    return handle ? { type: "handle", value: handle } : null;
  }

  if (/youtube\.com|youtu\.be/i.test(input)) {
    let url: URL;
    try {
      url = new URL(/^https?:\/\//i.test(input) ? input : `https://${input}`);
    } catch {
      // Looks like a URL but is malformed: returning the raw string as a
      // "username" would only produce a bogus API lookup.
      return null;
    }

    const host = url.hostname.toLowerCase();

    // youtu.be short links always point at videos, never at channels.
    if (host === "youtu.be" || host.endsWith(".youtu.be")) return null;

    if (host === "youtube.com" || host.endsWith(".youtube.com")) {
      // First path segments that are videos/utility pages rather than channels.
      const nonChannelPaths = new Set([
        "watch",
        "playlist",
        "shorts",
        "embed",
        "live",
        "results",
        "feed",
        "hashtag",
        "clip",
        "v",
        "redirect",
        "attribution_link",
      ]);

      const [first, second] = url.pathname.split("/").filter(Boolean);
      if (!first) return null; // bare domain, no channel information

      if (first === "channel") {
        return second?.startsWith("UC") ? { type: "channelId", value: second } : null;
      }
      if (first === "c" || first === "user") {
        return second ? { type: "username", value: second } : null;
      }
      if (first.startsWith("@")) {
        const handle = first.slice(1);
        return handle ? { type: "handle", value: handle } : null;
      }
      if (nonChannelPaths.has(first.toLowerCase())) return null;

      // Legacy custom URL: youtube.com/<name>
      return { type: "username", value: first };
    }
    // Host is not YouTube (the text only happened to contain "youtube.com"):
    // fall through and treat the input as a bare identifier.
  }

  // A bare identifier can never contain a path separator; this is some other URL.
  if (input.includes("/")) return null;

  // Channel IDs are "UC" + a long opaque suffix; the length guard keeps short
  // usernames that happen to start with "UC" from being misclassified.
  if (input.startsWith("UC") && input.length > 20) {
    return { type: "channelId", value: input };
  }
  return { type: "username", value: input };
}

View File

@ -1,6 +1,6 @@
import "@supabase/functions-js/edge-runtime.d.ts";
import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
import { normalizeInstagramHandle } from "../_shared/normalizeHandles.ts";
import { normalizeInstagramHandle, normalizeYouTubeChannel } from "../_shared/normalizeHandles.ts";
const corsHeaders = {
"Access-Control-Allow-Origin": "*",
@ -58,18 +58,33 @@ Deno.serve(async (req) => {
// Run all enrichment tasks in parallel
const tasks = [];
// 1. Instagram Profile
// 1. Instagram Profile — with fallback for wrong handle
const cleanIgHandle = normalizeInstagramHandle(instagramHandle);
if (cleanIgHandle) {
tasks.push(
(async () => {
// Try the given handle first, then common clinic variants
const handleCandidates = [
cleanIgHandle,
`${cleanIgHandle}_ps`, // banobagi → banobagi_ps
`${cleanIgHandle}.ps`, // banobagi → banobagi.ps
`${cleanIgHandle}_clinic`, // banobagi → banobagi_clinic
`${cleanIgHandle}_official`, // banobagi → banobagi_official
];
for (const handle of handleCandidates) {
const items = await runApifyActor(
"apify~instagram-profile-scraper",
{ usernames: [cleanIgHandle], resultsLimit: 12 },
{ usernames: [handle], resultsLimit: 12 },
APIFY_TOKEN
);
const profile = (items as Record<string, unknown>[])[0];
if (profile && !profile.error) {
const followers = (profile.followersCount as number) || 0;
// Accept if: has meaningful followers OR is a business account with posts
if (followers >= 100 || ((profile.isBusinessAccount as boolean) && (profile.postsCount as number) > 10)) {
enrichment.instagram = {
username: profile.username,
followers: profile.followersCount,
@ -88,6 +103,9 @@ Deno.serve(async (req) => {
timestamp: p.timestamp,
})),
};
break; // Found a valid account
}
}
}
})()
);
@ -97,17 +115,27 @@ Deno.serve(async (req) => {
if (clinicName || address) {
tasks.push(
(async () => {
const searchQuery = `${clinicName} ${address || "강남"}`;
const items = await runApifyActor(
// Try multiple search queries for better hit rate
const queries = [
`${clinicName} 성형외과`,
clinicName,
`${clinicName} ${address || "강남"}`,
];
let items: unknown[] = [];
for (const query of queries) {
items = await runApifyActor(
"compass~crawler-google-places",
{
searchStringsArray: [searchQuery],
maxCrawledPlacesPerSearch: 1,
searchStringsArray: [query],
maxCrawledPlacesPerSearch: 3,
language: "ko",
maxReviews: 10,
},
APIFY_TOKEN
);
if ((items as Record<string, unknown>[]).length > 0) break;
}
const place = (items as Record<string, unknown>[])[0];
if (place) {
enrichment.googleMaps = {
@ -140,17 +168,24 @@ Deno.serve(async (req) => {
(async () => {
const YT_BASE = "https://www.googleapis.com/youtube/v3";
// Resolve handle/username to channel ID
let channelId = youtubeChannelId;
if (channelId.startsWith("@") || !channelId.startsWith("UC")) {
// Use forHandle for @handles, forUsername for legacy usernames
const param = channelId.startsWith("@") ? "forHandle" : "forUsername";
const handle = channelId.startsWith("@") ? channelId.slice(1) : channelId;
// Normalize YouTube URL/handle to structured identifier
const ytNormalized = normalizeYouTubeChannel(youtubeChannelId);
if (!ytNormalized) return;
let channelId = "";
if (ytNormalized.type === "channelId") {
channelId = ytNormalized.value;
} else {
// Try forHandle first, then forUsername as fallback
for (const param of ["forHandle", "forUsername"]) {
const lookupRes = await fetch(
`${YT_BASE}/channels?part=id&${param}=${handle}&key=${YOUTUBE_API_KEY}`
`${YT_BASE}/channels?part=id&${param}=${ytNormalized.value}&key=${YOUTUBE_API_KEY}`
);
const lookupData = await lookupRes.json();
channelId = lookupData.items?.[0]?.id || "";
if (channelId) break;
}
}
if (!channelId) return;

View File

@ -92,7 +92,13 @@ ${JSON.stringify(analyzeResult.data?.analysis || {}, null, 2)}
"address": "주소",
"phone": "전화번호",
"services": ["시술1", "시술2"],
"doctors": [{"name": "의사명", "specialty": "전문분야"}]
"doctors": [{"name": "의사명", "specialty": "전문분야"}],
"socialMedia": {
"instagram": "정확한 Instagram 핸들 (@ 없이, 예: banobagi_ps)",
"youtube": "YouTube 채널 핸들 또는 URL",
"facebook": "Facebook 페이지명",
"naverBlog": "네이버 블로그 ID"
}
},
"executiveSummary": "경영진 요약 (3-5문장)",
"overallScore": 0-100,
@ -165,13 +171,14 @@ ${JSON.stringify(analyzeResult.data?.analysis || {}, null, 2)}
report = { raw: reportText, parseError: true };
}
// Normalize social handles from scrape data
const socialMedia = clinic.socialMedia || {};
// Merge social handles: AI-found (more accurate) > Firecrawl-extracted (fallback)
const scrapeSocial = clinic.socialMedia || {};
const aiSocial = report?.clinicInfo?.socialMedia || {};
const normalizedHandles = {
instagram: normalizeInstagramHandle(socialMedia.instagram),
youtube: socialMedia.youtube || null,
facebook: socialMedia.facebook || null,
blog: socialMedia.blog || null,
instagram: normalizeInstagramHandle(aiSocial.instagram) || normalizeInstagramHandle(scrapeSocial.instagram),
youtube: aiSocial.youtube || scrapeSocial.youtube || null,
facebook: aiSocial.facebook || scrapeSocial.facebook || null,
blog: aiSocial.naverBlog || scrapeSocial.blog || null,
};
// Embed normalized handles in report for DB persistence