fix: Instagram data collection pipeline — handle normalization + DB persistence
- enrich-channels: Instagram fallback — auto-try _ps, .ps, _clinic, _official suffixes when <100 followers
- enrich-channels: YouTube URL normalization via normalizeYouTubeChannel (handles /c/, /user/, @handle)
- enrich-channels: Google Maps multi-query search for better hit rate
- generate-report: AI-found social handles prioritized over Firecrawl scrape
- generate-report: Added socialMedia field to AI prompt for accurate handle discovery
- normalizeHandles: Added normalizeYouTubeChannel for /c/, /user/, /channel/, @handle URLs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

claude/bold-hawking
parent
9bf47f7d93
commit
e5399486f7
|
|
@ -46,3 +46,62 @@ export function normalizeInstagramHandle(
|
||||||
|
|
||||||
return handle || null;
|
return handle || null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalize a YouTube channel identifier from the various forms it is
 * written in (full URL, bare domain URL, `@handle`, channel ID, plain name).
 *
 * Returns the identifier together with the lookup type it should be used with:
 * - "https://www.youtube.com/@banobagips"        → { type: 'handle',    value: 'banobagips' }
 * - "https://youtube.com/c/banobagips"           → { type: 'username',  value: 'banobagips' }
 * - "https://youtube.com/user/banobagi"          → { type: 'username',  value: 'banobagi' }
 * - "https://youtube.com/channel/UCxxxx"         → { type: 'channelId', value: 'UCxxxx' }
 * - "https://youtube.com/banobagips"             → { type: 'username',  value: 'banobagips' }
 * - "@banobagips"                                → { type: 'handle',    value: 'banobagips' }
 * - "UC…" (bare string longer than 20 chars)     → { type: 'channelId', value: 'UC…' }
 * - "banobagips"                                 → { type: 'username',  value: 'banobagips' }
 *
 * Returns `null` when no channel identifier can be extracted:
 * - null / undefined / non-string / blank input
 * - a YouTube URL that does not point at a channel (bare domain, `/watch`,
 *   `/playlist`, `/shorts`, `youtu.be/<videoId>`, `/channel/` without a
 *   `UC…` id, `/c` or `/user` without a name)
 * - an empty handle ("@")
 *
 * Path segments taken from URLs are percent-decoded, so a handle written with
 * non-ASCII characters yields the same value whether it arrives as a URL or
 * as a bare `@handle`.
 *
 * @param raw - User- or scraper-supplied channel reference.
 * @returns The typed identifier, or `null` if none could be determined.
 */
export function normalizeYouTubeChannel(
  raw: string | null | undefined,
): { type: 'handle' | 'username' | 'channelId'; value: string } | null {
  if (!raw || typeof raw !== "string") return null;

  const input = raw.trim();
  if (!input) return null;

  // First path segments that identify videos/navigation pages, never a channel.
  const nonChannelPaths = [
    "watch",
    "playlist",
    "shorts",
    "embed",
    "live",
    "results",
    "feed",
    "hashtag",
  ];

  // URL.pathname is always percent-encoded; decode, but tolerate malformed escapes.
  const decodeSegment = (segment: string): string => {
    try {
      return decodeURIComponent(segment);
    } catch {
      return segment;
    }
  };

  // Parse YouTube URLs
  if (/youtube\.com|youtu\.be/i.test(input)) {
    let url: URL | null = null;
    try {
      const withScheme = /^https?:\/\//i.test(input)
        ? input
        : `https://${input.replace(/^\/+/, "")}`;
      url = new URL(withScheme);
    } catch {
      // Not parseable as a URL — treat it as a plain identifier below.
      url = null;
    }

    if (url) {
      const host = url.hostname;

      // youtu.be short links always reference a video, not a channel.
      if (host === "youtu.be") return null;

      // Only trust the path when the *host* is YouTube; "youtube.com" appearing
      // elsewhere in the string (query, path) falls through to the plain formats.
      if (host === "youtube.com" || host.endsWith(".youtube.com")) {
        const segments = url.pathname.split("/").filter(Boolean).map(decodeSegment);
        const head = segments[0];
        const next = segments[1];

        if (!head) return null; // bare domain, nothing to identify

        if (head.startsWith("@")) {
          const handle = head.slice(1);
          return handle ? { type: "handle", value: handle } : null;
        }
        if (head === "channel") {
          return next?.startsWith("UC") ? { type: "channelId", value: next } : null;
        }
        if (head === "c" || head === "user") {
          return next ? { type: "username", value: next } : null;
        }
        if (nonChannelPaths.includes(head)) return null;

        // Legacy custom URL: youtube.com/<name>
        return { type: "username", value: head };
      }
    }
  }

  // Non-URL formats
  if (input.startsWith("@")) {
    const handle = input.slice(1);
    return handle ? { type: "handle", value: handle } : null;
  }
  if (input.startsWith("UC") && input.length > 20) {
    return { type: "channelId", value: input };
  }

  return { type: "username", value: input };
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import "@supabase/functions-js/edge-runtime.d.ts";
|
import "@supabase/functions-js/edge-runtime.d.ts";
|
||||||
import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
|
import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
|
||||||
import { normalizeInstagramHandle } from "../_shared/normalizeHandles.ts";
|
import { normalizeInstagramHandle, normalizeYouTubeChannel } from "../_shared/normalizeHandles.ts";
|
||||||
|
|
||||||
const corsHeaders = {
|
const corsHeaders = {
|
||||||
"Access-Control-Allow-Origin": "*",
|
"Access-Control-Allow-Origin": "*",
|
||||||
|
|
@ -58,18 +58,33 @@ Deno.serve(async (req) => {
|
||||||
// Run all enrichment tasks in parallel
|
// Run all enrichment tasks in parallel
|
||||||
const tasks = [];
|
const tasks = [];
|
||||||
|
|
||||||
// 1. Instagram Profile
|
// 1. Instagram Profile — with fallback for wrong handle
|
||||||
const cleanIgHandle = normalizeInstagramHandle(instagramHandle);
|
const cleanIgHandle = normalizeInstagramHandle(instagramHandle);
|
||||||
if (cleanIgHandle) {
|
if (cleanIgHandle) {
|
||||||
tasks.push(
|
tasks.push(
|
||||||
(async () => {
|
(async () => {
|
||||||
|
// Try the given handle first, then common clinic variants
|
||||||
|
const handleCandidates = [
|
||||||
|
cleanIgHandle,
|
||||||
|
`${cleanIgHandle}_ps`, // banobagi → banobagi_ps
|
||||||
|
`${cleanIgHandle}.ps`, // banobagi → banobagi.ps
|
||||||
|
`${cleanIgHandle}_clinic`, // banobagi → banobagi_clinic
|
||||||
|
`${cleanIgHandle}_official`, // banobagi → banobagi_official
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const handle of handleCandidates) {
|
||||||
const items = await runApifyActor(
|
const items = await runApifyActor(
|
||||||
"apify~instagram-profile-scraper",
|
"apify~instagram-profile-scraper",
|
||||||
{ usernames: [cleanIgHandle], resultsLimit: 12 },
|
{ usernames: [handle], resultsLimit: 12 },
|
||||||
APIFY_TOKEN
|
APIFY_TOKEN
|
||||||
);
|
);
|
||||||
const profile = (items as Record<string, unknown>[])[0];
|
const profile = (items as Record<string, unknown>[])[0];
|
||||||
|
|
||||||
if (profile && !profile.error) {
|
if (profile && !profile.error) {
|
||||||
|
const followers = (profile.followersCount as number) || 0;
|
||||||
|
|
||||||
|
// Accept if: has meaningful followers OR is a business account with posts
|
||||||
|
if (followers >= 100 || ((profile.isBusinessAccount as boolean) && (profile.postsCount as number) > 10)) {
|
||||||
enrichment.instagram = {
|
enrichment.instagram = {
|
||||||
username: profile.username,
|
username: profile.username,
|
||||||
followers: profile.followersCount,
|
followers: profile.followersCount,
|
||||||
|
|
@ -88,6 +103,9 @@ Deno.serve(async (req) => {
|
||||||
timestamp: p.timestamp,
|
timestamp: p.timestamp,
|
||||||
})),
|
})),
|
||||||
};
|
};
|
||||||
|
break; // Found a valid account
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
})()
|
})()
|
||||||
);
|
);
|
||||||
|
|
@ -97,17 +115,27 @@ Deno.serve(async (req) => {
|
||||||
if (clinicName || address) {
|
if (clinicName || address) {
|
||||||
tasks.push(
|
tasks.push(
|
||||||
(async () => {
|
(async () => {
|
||||||
const searchQuery = `${clinicName} ${address || "강남"}`;
|
// Try multiple search queries for better hit rate
|
||||||
const items = await runApifyActor(
|
const queries = [
|
||||||
|
`${clinicName} 성형외과`,
|
||||||
|
clinicName,
|
||||||
|
`${clinicName} ${address || "강남"}`,
|
||||||
|
];
|
||||||
|
|
||||||
|
let items: unknown[] = [];
|
||||||
|
for (const query of queries) {
|
||||||
|
items = await runApifyActor(
|
||||||
"compass~crawler-google-places",
|
"compass~crawler-google-places",
|
||||||
{
|
{
|
||||||
searchStringsArray: [searchQuery],
|
searchStringsArray: [query],
|
||||||
maxCrawledPlacesPerSearch: 1,
|
maxCrawledPlacesPerSearch: 3,
|
||||||
language: "ko",
|
language: "ko",
|
||||||
maxReviews: 10,
|
maxReviews: 10,
|
||||||
},
|
},
|
||||||
APIFY_TOKEN
|
APIFY_TOKEN
|
||||||
);
|
);
|
||||||
|
if ((items as Record<string, unknown>[]).length > 0) break;
|
||||||
|
}
|
||||||
const place = (items as Record<string, unknown>[])[0];
|
const place = (items as Record<string, unknown>[])[0];
|
||||||
if (place) {
|
if (place) {
|
||||||
enrichment.googleMaps = {
|
enrichment.googleMaps = {
|
||||||
|
|
@ -140,17 +168,24 @@ Deno.serve(async (req) => {
|
||||||
(async () => {
|
(async () => {
|
||||||
const YT_BASE = "https://www.googleapis.com/youtube/v3";
|
const YT_BASE = "https://www.googleapis.com/youtube/v3";
|
||||||
|
|
||||||
// Resolve handle/username to channel ID
|
// Normalize YouTube URL/handle to structured identifier
|
||||||
let channelId = youtubeChannelId;
|
const ytNormalized = normalizeYouTubeChannel(youtubeChannelId);
|
||||||
if (channelId.startsWith("@") || !channelId.startsWith("UC")) {
|
if (!ytNormalized) return;
|
||||||
// Use forHandle for @handles, forUsername for legacy usernames
|
|
||||||
const param = channelId.startsWith("@") ? "forHandle" : "forUsername";
|
let channelId = "";
|
||||||
const handle = channelId.startsWith("@") ? channelId.slice(1) : channelId;
|
|
||||||
|
if (ytNormalized.type === "channelId") {
|
||||||
|
channelId = ytNormalized.value;
|
||||||
|
} else {
|
||||||
|
// Try forHandle first, then forUsername as fallback
|
||||||
|
for (const param of ["forHandle", "forUsername"]) {
|
||||||
const lookupRes = await fetch(
|
const lookupRes = await fetch(
|
||||||
`${YT_BASE}/channels?part=id&${param}=${handle}&key=${YOUTUBE_API_KEY}`
|
`${YT_BASE}/channels?part=id&${param}=${ytNormalized.value}&key=${YOUTUBE_API_KEY}`
|
||||||
);
|
);
|
||||||
const lookupData = await lookupRes.json();
|
const lookupData = await lookupRes.json();
|
||||||
channelId = lookupData.items?.[0]?.id || "";
|
channelId = lookupData.items?.[0]?.id || "";
|
||||||
|
if (channelId) break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!channelId) return;
|
if (!channelId) return;
|
||||||
|
|
|
||||||
|
|
@ -92,7 +92,13 @@ ${JSON.stringify(analyzeResult.data?.analysis || {}, null, 2)}
|
||||||
"address": "주소",
|
"address": "주소",
|
||||||
"phone": "전화번호",
|
"phone": "전화번호",
|
||||||
"services": ["시술1", "시술2"],
|
"services": ["시술1", "시술2"],
|
||||||
"doctors": [{"name": "의사명", "specialty": "전문분야"}]
|
"doctors": [{"name": "의사명", "specialty": "전문분야"}],
|
||||||
|
"socialMedia": {
|
||||||
|
"instagram": "정확한 Instagram 핸들 (@ 없이, 예: banobagi_ps)",
|
||||||
|
"youtube": "YouTube 채널 핸들 또는 URL",
|
||||||
|
"facebook": "Facebook 페이지명",
|
||||||
|
"naverBlog": "네이버 블로그 ID"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"executiveSummary": "경영진 요약 (3-5문장)",
|
"executiveSummary": "경영진 요약 (3-5문장)",
|
||||||
"overallScore": 0-100,
|
"overallScore": 0-100,
|
||||||
|
|
@ -165,13 +171,14 @@ ${JSON.stringify(analyzeResult.data?.analysis || {}, null, 2)}
|
||||||
report = { raw: reportText, parseError: true };
|
report = { raw: reportText, parseError: true };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Normalize social handles from scrape data
|
// Merge social handles: AI-found (more accurate) > Firecrawl-extracted (fallback)
|
||||||
const socialMedia = clinic.socialMedia || {};
|
const scrapeSocial = clinic.socialMedia || {};
|
||||||
|
const aiSocial = report?.clinicInfo?.socialMedia || {};
|
||||||
const normalizedHandles = {
|
const normalizedHandles = {
|
||||||
instagram: normalizeInstagramHandle(socialMedia.instagram),
|
instagram: normalizeInstagramHandle(aiSocial.instagram) || normalizeInstagramHandle(scrapeSocial.instagram),
|
||||||
youtube: socialMedia.youtube || null,
|
youtube: aiSocial.youtube || scrapeSocial.youtube || null,
|
||||||
facebook: socialMedia.facebook || null,
|
facebook: aiSocial.facebook || scrapeSocial.facebook || null,
|
||||||
blog: socialMedia.blog || null,
|
blog: aiSocial.naverBlog || scrapeSocial.blog || null,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Embed normalized handles in report for DB persistence
|
// Embed normalized handles in report for DB persistence
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue