o2o-infinith-demo/supabase/functions/_shared/extractSocialLinks.ts

138 lines
3.9 KiB
TypeScript

/**
* Extract social media handles from a list of URLs.
* Parses known platform patterns deterministically — no AI guessing.
*/

/**
* Social handles/IDs grouped by platform.
* Every key is always present; a platform with no hits holds an empty array.
* The per-field notes describe what the URL patterns in this file capture.
*/
export interface ExtractedSocialLinks {
/** Instagram handles: the single path segment of an instagram.com profile URL. */
instagram: string[];
/** YouTube identifiers: `@handle` (with the `@`), a `UC…` channel ID, or a `/c/` custom name. */
youtube: string[];
/** Facebook page names: the single path segment of a facebook.com URL. */
facebook: string[];
/** Naver Blog IDs taken from blog.naver.com URLs. */
naverBlog: string[];
/** TikTok handles, stored without the leading `@`. */
tiktok: string[];
/** KakaoTalk Channel IDs taken from pf.kakao.com URLs. */
kakao: string[];
}
/**
 * URL matchers, one per supported platform.
 *
 * - `regex` is searched for anywhere inside the URL string.
 * - `extract` converts the resulting match into the handle/ID to store.
 *
 * Every regex starts with `(?:^|[^a-zA-Z0-9-])` so that the domain has to begin
 * at a hostname-label boundary: `www.instagram.com` and `m.facebook.com` still
 * match, while look-alike hosts such as `notinstagram.com` or `my-facebook.com`
 * do not.
 */
const PATTERNS: { platform: keyof ExtractedSocialLinks; regex: RegExp; extract: (m: RegExpMatchArray) => string }[] = [
  // Instagram: instagram.com/{handle}. The handle must be the only path segment
  // (followed by an optional "/", then "?", "#" or end of string), so
  // instagram.com/p/{postId} and other multi-segment URLs never match.
  {
    platform: 'instagram',
    regex: /(?:^|[^a-zA-Z0-9-])instagram\.com\/([a-zA-Z0-9._]+)\/?(?:[?#]|$)/,
    extract: (m) => m[1],
  },
  // YouTube: youtube.com/@{handle}, youtube.com/channel/{UC…id} or youtube.com/c/{custom}.
  // Handles keep their "@" prefix so they can be told apart from channel IDs and custom names.
  {
    platform: 'youtube',
    regex: /(?:^|[^a-zA-Z0-9-])youtube\.com\/(?:@([a-zA-Z0-9._-]+)|channel\/(UC[a-zA-Z0-9_-]+)|c\/([a-zA-Z0-9._-]+))/,
    extract: (m) => m[1] ? `@${m[1]}` : m[2] || m[3] || '',
  },
  // Facebook: facebook.com/{page}. Same single-segment rule as Instagram;
  // reserved paths are filtered by the caller via FB_SKIP.
  {
    platform: 'facebook',
    regex: /(?:^|[^a-zA-Z0-9-])facebook\.com\/([a-zA-Z0-9._-]+)\/?(?:[?#]|$)/,
    extract: (m) => m[1],
  },
  // Naver Blog, two URL shapes:
  //   1. blog.naver.com/PostView.naver?blogId={blogId}&…  (also PostList, and the legacy .nhn suffix)
  //      -> the ID comes from the blogId query parameter (group 1)
  //   2. blog.naver.com/{blogId}[/{postNo}]               -> first path segment (group 2)
  // The negative lookahead stops shape 2 from capturing a servlet name such as
  // "PostView" when the URL is a *.naver / *.nhn endpoint without a blogId.
  {
    platform: 'naverBlog',
    regex: /(?:^|[^a-zA-Z0-9-])blog\.naver\.com\/(?:[a-zA-Z]+\.(?:naver|nhn)\?(?:[^#\s]*&)?blogId=([a-zA-Z0-9_-]+)|([a-zA-Z0-9_-]+)(?![a-zA-Z0-9_-]*\.(?:naver|nhn)))/,
    extract: (m) => m[1] || m[2] || '',
  },
  // TikTok: tiktok.com/@{handle} (stored without the "@").
  {
    platform: 'tiktok',
    regex: /(?:^|[^a-zA-Z0-9-])tiktok\.com\/@([a-zA-Z0-9._-]+)/,
    extract: (m) => m[1],
  },
  // KakaoTalk Channel: pf.kakao.com/{id}
  {
    platform: 'kakao',
    regex: /(?:^|[^a-zA-Z0-9-])pf\.kakao\.com\/([a-zA-Z0-9_-]+)/,
    extract: (m) => m[1],
  },
];
// Common Facebook paths that are NOT page names.
// Entries are lowercase because callers look up the lowercased path segment.
const FB_SKIP = new Set([
  // Reserved top-level sections.
  'sharer', 'share', 'login', 'help', 'pages', 'events', 'groups',
  'marketplace', 'watch', 'gaming', 'privacy', 'policies', 'tr',
  'dialog', 'plugins', 'photo', 'video', 'reel',
  // Single-segment .php endpoints. The Facebook URL pattern allows "." inside the
  // captured segment, so without these entries a link such as
  // facebook.com/profile.php?id=123 or facebook.com/sharer.php?u=… would be
  // reported as a page literally named "profile.php" / "sharer.php".
  'profile.php', 'sharer.php', 'share.php', 'l.php', 'login.php',
  'home.php', 'photo.php', 'story.php', 'permalink.php',
]);
// Reserved instagram.com path segments (content routes, account pages,
// legal/info pages) that must never be reported as a profile handle.
// Entries are lowercase; lookups are done on the lowercased segment.
const IG_SKIP = new Set<string>([
  'p',
  'reel',
  'reels',
  'stories',
  'explore',
  'accounts',
  'about',
  'developer',
  'legal',
  'privacy',
  'terms',
]);
/**
 * Scan a list of URLs and collect the social media handles they point at.
 *
 * Each URL is tested against every entry in `PATTERNS`, so a single URL can
 * contribute to more than one platform. Handles are de-duplicated per platform
 * case-insensitively; the casing and position of the first occurrence win.
 *
 * @param urls - Candidate URLs. Non-string entries are coerced with `String()`,
 *   surrounding whitespace is ignored, and entries shorter than 5 characters
 *   are skipped.
 * @returns Handles grouped by platform. Every platform key is present, with an
 *   empty array when nothing was found.
 */
export function extractSocialLinks(urls: string[]): ExtractedSocialLinks {
  const result: ExtractedSocialLinks = {
    instagram: [],
    youtube: [],
    facebook: [],
    naverBlog: [],
    tiktok: [],
    kakao: [],
  };

  // Lowercased handles already emitted, per platform.
  const seen: Record<string, Set<string>> = {};
  for (const key of Object.keys(result)) {
    seen[key] = new Set();
  }

  for (const rawUrl of urls) {
    // Coerce defensively (input may come from untyped JSON), then trim:
    // the Instagram/Facebook patterns end in `$`, so a trailing space or
    // newline would otherwise make a perfectly valid profile URL miss.
    const url = (typeof rawUrl === 'string' ? rawUrl : String(rawUrl || '')).trim();
    if (url.length < 5) continue;

    for (const { platform, regex, extract } of PATTERNS) {
      const match = url.match(regex);
      if (!match) continue;

      const handle = extract(match);
      if (!handle || handle.length < 2) continue;

      const normalized = handle.toLowerCase();

      // Skip known non-handle paths
      if (platform === 'facebook' && FB_SKIP.has(normalized)) continue;
      if (platform === 'instagram' && IG_SKIP.has(normalized)) continue;

      if (seen[platform].has(normalized)) continue;
      seen[platform].add(normalized);
      result[platform].push(handle);
    }
  }

  return result;
}
/**
 * Merge social links from multiple sources, deduplicating.
 *
 * Sources are processed in argument order. Within each platform a value is kept
 * only the first time it is seen (compared case-insensitively, after trimming),
 * so earlier sources win on casing. Values that are not strings, or that are
 * shorter than 2 characters after trimming, are dropped. Sources that are
 * `null`/`undefined` or not objects are ignored rather than throwing.
 *
 * @param sources - Partial link maps; missing or non-array platform entries are skipped.
 * @returns A complete map with every platform key present.
 */
export function mergeSocialLinks(...sources: Partial<ExtractedSocialLinks>[]): ExtractedSocialLinks {
  const merged: ExtractedSocialLinks = {
    instagram: [],
    youtube: [],
    facebook: [],
    naverBlog: [],
    tiktok: [],
    kakao: [],
  };
  const platforms = Object.keys(merged) as (keyof ExtractedSocialLinks)[];

  // Lowercased values already merged, per platform — avoids rescanning the
  // output array (and re-lowercasing every element) for each candidate.
  const seen: Record<string, Set<string>> = {};
  for (const key of platforms) {
    seen[key] = new Set();
  }

  for (const source of sources) {
    // Callers may forward optional data straight from untyped JSON.
    if (source == null || typeof source !== 'object') continue;

    for (const key of platforms) {
      const vals = source[key];
      if (!Array.isArray(vals)) continue;

      for (const rawV of vals) {
        const v = typeof rawV === 'string' ? rawV.trim() : '';
        if (v.length < 2) continue;

        const normalized = v.toLowerCase();
        if (seen[key].has(normalized)) continue;
        seen[key].add(normalized);
        merged[key].push(v);
      }
    }
  }

  return merged;
}