fix: Instagram data collection pipeline — handle normalization + DB persistence
- enrich-channels: Instagram fallback — auto-try _ps, .ps, _clinic suffixes when <100 followers
- enrich-channels: YouTube URL normalization via normalizeYouTubeChannel (handles /c/, /user/, @handle)
- enrich-channels: Google Maps multi-query search for better hit rate
- generate-report: AI-found social handles prioritized over Firecrawl scrape
- generate-report: Added socialMedia field to AI prompt for accurate handle discovery
- normalizeHandles: Added normalizeYouTubeChannel for /c/, /user/, /channel/, @handle URLs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

claude/bold-hawking
parent
9bf47f7d93
commit
e5399486f7
|
|
@ -46,3 +46,62 @@ export function normalizeInstagramHandle(
|
|||
|
||||
return handle || null;
|
||||
}
|
||||
|
||||
/**
 * First path segments on youtube.com that are site routes rather than legacy
 * custom channel names. They must never be returned as a channel username.
 */
const YOUTUBE_NON_CHANNEL_PATHS: ReadonlySet<string> = new Set([
  "channel",
  "c",
  "user",
  "watch",
  "playlist",
  "shorts",
  "embed",
  "live",
  "results",
  "feed",
  "hashtag",
  "post",
  "clip",
  "v",
]);

/**
 * Normalize a YouTube channel identifier from various URL formats.
 *
 * Returns an object with the best identifier type for API lookup, or `null`
 * when the input is empty or is a YouTube URL that carries no channel
 * identifier (e.g. the bare site root or a `/watch` video link):
 * - "https://www.youtube.com/@banobagips" → { type: 'handle', value: 'banobagips' }
 * - "https://youtube.com/c/banobagips" → { type: 'username', value: 'banobagips' }
 * - "https://youtube.com/user/banobagi" → { type: 'username', value: 'banobagi' }
 * - "https://youtube.com/channel/UCxxxx" → { type: 'channelId', value: 'UCxxxx' }
 * - "https://youtube.com/banobagips" → { type: 'username', value: 'banobagips' }
 * - "https://www.youtube.com/watch?v=abc" → null
 * - "@banobagips" → { type: 'handle', value: 'banobagips' }
 * - "UC" followed by 19+ more characters → { type: 'channelId', value: <input> }
 * - "banobagips" → { type: 'username', value: 'banobagips' }
 *
 * @param raw - URL, @handle, channel ID or plain username; surrounding
 *   whitespace is ignored.
 * @returns The identifier and its kind, or `null` if none could be extracted.
 */
export function normalizeYouTubeChannel(
  raw: string | null | undefined,
): { type: 'handle' | 'username' | 'channelId'; value: string } | null {
  if (!raw || typeof raw !== "string") return null;

  const input = raw.trim();
  if (!input) return null;

  // Parse YouTube URLs (scheme and host are matched case-insensitively).
  if (/youtube\.com|youtu\.be/i.test(input)) {
    let url: URL | null = null;
    try {
      url = new URL(/^https?:\/\//i.test(input) ? input : `https://${input}`);
    } catch {
      // Not parseable as a URL: fall through to the non-URL handling below.
      url = null;
    }

    // Decide by the parsed hostname, not by substring, so that a handle such
    // as "@youtube.comedy" is not mistaken for a YouTube URL.
    const host = url ? url.hostname.toLowerCase() : "";
    const isYouTubeHost = host === "youtube.com" || host.endsWith(".youtube.com");
    const isShortLinkHost = host === "youtu.be" || host.endsWith(".youtu.be");

    if (url && (isYouTubeHost || isShortLinkHost)) {
      const [first, second] = url.pathname.split("/").filter(Boolean);

      if (first === "channel") {
        return second?.startsWith("UC") ? { type: "channelId", value: second } : null;
      }
      if (first === "c" || first === "user") {
        return second ? { type: "username", value: second } : null;
      }
      if (first?.startsWith("@")) {
        const handle = first.slice(1);
        return handle ? { type: "handle", value: handle } : null;
      }
      // Legacy custom URL (youtube.com/<name>). Skipped for short-link hosts,
      // where the first segment is a video ID, and for known site routes.
      if (first && isYouTubeHost && !YOUTUBE_NON_CHANNEL_PATHS.has(first.toLowerCase())) {
        return { type: "username", value: first };
      }
      // A YouTube URL without a usable channel identifier.
      return null;
    }
  }

  // Non-URL formats
  if (input.startsWith("@")) {
    const handle = input.slice(1);
    return handle ? { type: "handle", value: handle } : null;
  }
  // Bare channel IDs: "UC" prefix plus a length guard so short usernames that
  // merely start with "UC" are not misclassified.
  if (input.startsWith("UC") && input.length > 20) {
    return { type: "channelId", value: input };
  }

  return { type: "username", value: input };
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import "@supabase/functions-js/edge-runtime.d.ts";
|
||||
import { createClient } from "https://esm.sh/@supabase/supabase-js@2";
|
||||
import { normalizeInstagramHandle } from "../_shared/normalizeHandles.ts";
|
||||
import { normalizeInstagramHandle, normalizeYouTubeChannel } from "../_shared/normalizeHandles.ts";
|
||||
|
||||
const corsHeaders = {
|
||||
"Access-Control-Allow-Origin": "*",
|
||||
|
|
@ -58,36 +58,54 @@ Deno.serve(async (req) => {
|
|||
// Run all enrichment tasks in parallel
|
||||
const tasks = [];
|
||||
|
||||
// 1. Instagram Profile
|
||||
// 1. Instagram Profile — with fallback for wrong handle
|
||||
const cleanIgHandle = normalizeInstagramHandle(instagramHandle);
|
||||
if (cleanIgHandle) {
|
||||
tasks.push(
|
||||
(async () => {
|
||||
const items = await runApifyActor(
|
||||
"apify~instagram-profile-scraper",
|
||||
{ usernames: [cleanIgHandle], resultsLimit: 12 },
|
||||
APIFY_TOKEN
|
||||
);
|
||||
const profile = (items as Record<string, unknown>[])[0];
|
||||
if (profile && !profile.error) {
|
||||
enrichment.instagram = {
|
||||
username: profile.username,
|
||||
followers: profile.followersCount,
|
||||
following: profile.followsCount,
|
||||
posts: profile.postsCount,
|
||||
bio: profile.biography,
|
||||
isBusinessAccount: profile.isBusinessAccount,
|
||||
externalUrl: profile.externalUrl,
|
||||
latestPosts: ((profile.latestPosts as Record<string, unknown>[]) || [])
|
||||
.slice(0, 12)
|
||||
.map((p) => ({
|
||||
type: p.type,
|
||||
likes: p.likesCount,
|
||||
comments: p.commentsCount,
|
||||
caption: (p.caption as string || "").slice(0, 200),
|
||||
timestamp: p.timestamp,
|
||||
})),
|
||||
};
|
||||
// Try the given handle first, then common clinic variants
|
||||
const handleCandidates = [
|
||||
cleanIgHandle,
|
||||
`${cleanIgHandle}_ps`, // banobagi → banobagi_ps
|
||||
`${cleanIgHandle}.ps`, // banobagi → banobagi.ps
|
||||
`${cleanIgHandle}_clinic`, // banobagi → banobagi_clinic
|
||||
`${cleanIgHandle}_official`, // banobagi → banobagi_official
|
||||
];
|
||||
|
||||
for (const handle of handleCandidates) {
|
||||
const items = await runApifyActor(
|
||||
"apify~instagram-profile-scraper",
|
||||
{ usernames: [handle], resultsLimit: 12 },
|
||||
APIFY_TOKEN
|
||||
);
|
||||
const profile = (items as Record<string, unknown>[])[0];
|
||||
|
||||
if (profile && !profile.error) {
|
||||
const followers = (profile.followersCount as number) || 0;
|
||||
|
||||
// Accept if: has meaningful followers OR is a business account with posts
|
||||
if (followers >= 100 || ((profile.isBusinessAccount as boolean) && (profile.postsCount as number) > 10)) {
|
||||
enrichment.instagram = {
|
||||
username: profile.username,
|
||||
followers: profile.followersCount,
|
||||
following: profile.followsCount,
|
||||
posts: profile.postsCount,
|
||||
bio: profile.biography,
|
||||
isBusinessAccount: profile.isBusinessAccount,
|
||||
externalUrl: profile.externalUrl,
|
||||
latestPosts: ((profile.latestPosts as Record<string, unknown>[]) || [])
|
||||
.slice(0, 12)
|
||||
.map((p) => ({
|
||||
type: p.type,
|
||||
likes: p.likesCount,
|
||||
comments: p.commentsCount,
|
||||
caption: (p.caption as string || "").slice(0, 200),
|
||||
timestamp: p.timestamp,
|
||||
})),
|
||||
};
|
||||
break; // Found a valid account
|
||||
}
|
||||
}
|
||||
}
|
||||
})()
|
||||
);
|
||||
|
|
@ -97,17 +115,27 @@ Deno.serve(async (req) => {
|
|||
if (clinicName || address) {
|
||||
tasks.push(
|
||||
(async () => {
|
||||
const searchQuery = `${clinicName} ${address || "강남"}`;
|
||||
const items = await runApifyActor(
|
||||
"compass~crawler-google-places",
|
||||
{
|
||||
searchStringsArray: [searchQuery],
|
||||
maxCrawledPlacesPerSearch: 1,
|
||||
language: "ko",
|
||||
maxReviews: 10,
|
||||
},
|
||||
APIFY_TOKEN
|
||||
);
|
||||
// Try multiple search queries for better hit rate
|
||||
const queries = [
|
||||
`${clinicName} 성형외과`,
|
||||
clinicName,
|
||||
`${clinicName} ${address || "강남"}`,
|
||||
];
|
||||
|
||||
let items: unknown[] = [];
|
||||
for (const query of queries) {
|
||||
items = await runApifyActor(
|
||||
"compass~crawler-google-places",
|
||||
{
|
||||
searchStringsArray: [query],
|
||||
maxCrawledPlacesPerSearch: 3,
|
||||
language: "ko",
|
||||
maxReviews: 10,
|
||||
},
|
||||
APIFY_TOKEN
|
||||
);
|
||||
if ((items as Record<string, unknown>[]).length > 0) break;
|
||||
}
|
||||
const place = (items as Record<string, unknown>[])[0];
|
||||
if (place) {
|
||||
enrichment.googleMaps = {
|
||||
|
|
@ -140,17 +168,24 @@ Deno.serve(async (req) => {
|
|||
(async () => {
|
||||
const YT_BASE = "https://www.googleapis.com/youtube/v3";
|
||||
|
||||
// Resolve handle/username to channel ID
|
||||
let channelId = youtubeChannelId;
|
||||
if (channelId.startsWith("@") || !channelId.startsWith("UC")) {
|
||||
// Use forHandle for @handles, forUsername for legacy usernames
|
||||
const param = channelId.startsWith("@") ? "forHandle" : "forUsername";
|
||||
const handle = channelId.startsWith("@") ? channelId.slice(1) : channelId;
|
||||
const lookupRes = await fetch(
|
||||
`${YT_BASE}/channels?part=id&${param}=${handle}&key=${YOUTUBE_API_KEY}`
|
||||
);
|
||||
const lookupData = await lookupRes.json();
|
||||
channelId = lookupData.items?.[0]?.id || "";
|
||||
// Normalize YouTube URL/handle to structured identifier
|
||||
const ytNormalized = normalizeYouTubeChannel(youtubeChannelId);
|
||||
if (!ytNormalized) return;
|
||||
|
||||
let channelId = "";
|
||||
|
||||
if (ytNormalized.type === "channelId") {
|
||||
channelId = ytNormalized.value;
|
||||
} else {
|
||||
// Try forHandle first, then forUsername as fallback
|
||||
for (const param of ["forHandle", "forUsername"]) {
|
||||
const lookupRes = await fetch(
|
||||
`${YT_BASE}/channels?part=id&${param}=${ytNormalized.value}&key=${YOUTUBE_API_KEY}`
|
||||
);
|
||||
const lookupData = await lookupRes.json();
|
||||
channelId = lookupData.items?.[0]?.id || "";
|
||||
if (channelId) break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!channelId) return;
|
||||
|
|
|
|||
|
|
@ -92,7 +92,13 @@ ${JSON.stringify(analyzeResult.data?.analysis || {}, null, 2)}
|
|||
"address": "주소",
|
||||
"phone": "전화번호",
|
||||
"services": ["시술1", "시술2"],
|
||||
"doctors": [{"name": "의사명", "specialty": "전문분야"}]
|
||||
"doctors": [{"name": "의사명", "specialty": "전문분야"}],
|
||||
"socialMedia": {
|
||||
"instagram": "정확한 Instagram 핸들 (@ 없이, 예: banobagi_ps)",
|
||||
"youtube": "YouTube 채널 핸들 또는 URL",
|
||||
"facebook": "Facebook 페이지명",
|
||||
"naverBlog": "네이버 블로그 ID"
|
||||
}
|
||||
},
|
||||
"executiveSummary": "경영진 요약 (3-5문장)",
|
||||
"overallScore": 0-100,
|
||||
|
|
@ -165,13 +171,14 @@ ${JSON.stringify(analyzeResult.data?.analysis || {}, null, 2)}
|
|||
report = { raw: reportText, parseError: true };
|
||||
}
|
||||
|
||||
// Normalize social handles from scrape data
|
||||
const socialMedia = clinic.socialMedia || {};
|
||||
// Merge social handles: AI-found (more accurate) > Firecrawl-extracted (fallback)
|
||||
const scrapeSocial = clinic.socialMedia || {};
|
||||
const aiSocial = report?.clinicInfo?.socialMedia || {};
|
||||
const normalizedHandles = {
|
||||
instagram: normalizeInstagramHandle(socialMedia.instagram),
|
||||
youtube: socialMedia.youtube || null,
|
||||
facebook: socialMedia.facebook || null,
|
||||
blog: socialMedia.blog || null,
|
||||
instagram: normalizeInstagramHandle(aiSocial.instagram) || normalizeInstagramHandle(scrapeSocial.instagram),
|
||||
youtube: aiSocial.youtube || scrapeSocial.youtube || null,
|
||||
facebook: aiSocial.facebook || scrapeSocial.facebook || null,
|
||||
blog: aiSocial.naverBlog || scrapeSocial.blog || null,
|
||||
};
|
||||
|
||||
// Embed normalized handles in report for DB persistence
|
||||
|
|
|
|||
Loading…
Reference in New Issue