fix: pipeline P0/P1 — rating bug, retry, health score, blog scrape
## P0 버그 수정 (즉시 영향) ### fix(collect-channel-data): 강남언니 rating 오변환 제거 - 기존: `rating ≤ 5 → ×2` 로직으로 4.8/10을 9.6/10으로 잘못 변환 - Firecrawl 프롬프트가 이미 0-10 반환 지시 → rawValue 직접 신뢰 ### fix(generate-report): Perplexity 단일 fetch → fetchWithRetry - maxRetries:2, backoffMs:[5000,15000], timeoutMs:90s 설정 - 기존: 일시적 429/타임아웃 시 리포트 생성 전체 실패 ## P1 기능 추가 (데이터 품질) ### feat(collect-channel-data): channel_snapshots health_score 계산 - `computeHealthScore(channel, data)` 함수 추가 (채널별 0-100 스코어) - Instagram: followers 기반 선형 보간 + posts bonus - YouTube: subscribers 기반 + video count bonus - 강남언니: rating×7 + reviews bonus (max 30pt) - Google Maps: rating×12 + reviews bonus (max 40pt) - Naver Blog: presence (50pt) + 언급 수 bonus (max 30pt) - 모든 channel_snapshots INSERT에 health_score 포함 ### feat(collect-channel-data): 네이버 블로그 공식 컨텐츠 스크랩 추가 - 기존: Naver Search API로 3rd-party 언급만 수집 - 추가: Registry에서 확인된 공식 블로그 URL을 Firecrawl로 직접 스크랩 - 총 게시글 수, 최근 게시물 (제목/날짜/요약), 카테고리 추출 - 실패 시 non-critical — 기존 Naver Search 결과는 항상 유지 ## docs: PIPELINE_IMPROVEMENT_PLAN 감사 결과 반영 - Sprint 0 (Vision), Sprint 1, Sprint 2 완료 표시 - WP-10, WP-11 완료 표시 - 2026-04-07 전수 감사 섹션 추가 (구현 완료/수정/남은 Gap 표) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>claude/bold-hawking
parent
d5f7f24e0a
commit
bcc0b6ea5e
|
|
@ -52,6 +52,98 @@ async function runApifyActor(actorId: string, input: Record<string, unknown>, to
|
|||
return itemsRes.json();
|
||||
}
|
||||
|
||||
// ─── Health Score Calculator ───────────────────────────────────────────────
|
||||
// Computes a 0-100 health score per channel based on follower/rating/review
|
||||
// benchmarks for Korean plastic surgery clinics. Used in channel_snapshots.
|
||||
//
|
||||
// Design: linear interpolation between (floor, floorScore) and (ceil, ceilScore).
|
||||
// E.g., Instagram at 5K followers → mid-range between 1K=40 and 10K=70.
|
||||
|
||||
/**
 * Maps `value` onto a score by linear interpolation between two anchors.
 *
 * @param value     The measured quantity (e.g. a follower count).
 * @param low       Lower anchor; any `value` at or below it yields `scoreLow`.
 * @param high      Upper anchor; any `value` at or above it yields `scoreHigh`.
 * @param scoreLow  Score returned at (or below) `low`.
 * @param scoreHigh Score returned at (or above) `high`.
 * @returns `scoreLow`/`scoreHigh` unchanged at the clamped ends, otherwise the
 *          interpolated score rounded to the nearest integer.
 */
function lerp(value: number, low: number, high: number, scoreLow: number, scoreHigh: number): number {
  // Clamp first: outside the [low, high] window the anchor score is returned as-is.
  if (value <= low) {
    return scoreLow;
  }
  if (value >= high) {
    return scoreHigh;
  }

  // Fraction of the way from `low` to `high`, then scaled onto the score span.
  const fraction = (value - low) / (high - low);
  const scoreSpan = scoreHigh - scoreLow;
  return Math.round(scoreLow + fraction * scoreSpan);
}
|
||||
|
||||
/**
 * Computes a 0-100 health score for a single channel snapshot.
 *
 * Audience-based channels (instagram, youtube, facebook) are scored by
 * piecewise-linear interpolation over follower/subscriber anchors, with a
 * small activity bonus where one is defined. Review-based channels
 * (gangnamUnni, googleMaps, naverPlace) add rating points to review-volume
 * points. naverBlog is presence plus third-party mention volume.
 *
 * @param channel Channel key; unrecognised keys receive a neutral 50.
 * @param data    Raw channel payload. Metric fields may be numbers or numeric
 *                strings; missing or unparseable values are treated as 0.
 * @returns An integer score in the range 0-100.
 */
function computeHealthScore(channel: string, data: Record<string, unknown>): number {
  // Coerce a loosely-typed metric to a finite number.
  // - parseFloat (not parseInt) so string ratings such as "4.8" keep their decimals.
  // - Commas are stripped so "12,345" is read as 12345 rather than 12.
  // - NaN/Infinity collapse to 0 so they can never leak into the stored score.
  const n = (v: unknown): number => {
    if (typeof v === 'number') return Number.isFinite(v) ? v : 0;
    const parsed = Number.parseFloat(String(v ?? '').replace(/,/g, ''));
    return Number.isFinite(parsed) ? parsed : 0;
  };

  // Piecewise-linear score over ascending [threshold, score] anchors.
  // Below the first anchor the first score applies; at or above the last
  // anchor the last score applies; in between, lerp() joins adjacent anchors.
  const tiered = (value: number, anchors: ReadonlyArray<readonly [number, number]>): number => {
    let prev: readonly [number, number] | undefined;
    for (const anchor of anchors) {
      if (prev !== undefined && value < anchor[0]) {
        return lerp(value, prev[0], anchor[0], prev[1], anchor[1]);
      }
      prev = anchor;
    }
    return prev !== undefined ? prev[1] : 0;
  };

  // Rating contribution: rating/scale clamped to [0, 1], scaled to maxPoints.
  // The lower clamp keeps a bogus negative rating from subtracting points.
  const ratingPoints = (rating: number, scale: number, maxPoints: number): number =>
    Math.round(Math.min(Math.max(rating / scale, 0), 1) * maxPoints);

  switch (channel) {
    case 'instagram': {
      // followers: 0→20, 1K→40, 10K→70, 50K→90, 100K+→100
      const followerScore = tiered(n(data.followers), [
        [0, 20], [1_000, 40], [10_000, 70], [50_000, 90], [100_000, 100],
      ]);
      // posts bonus: +5 if active (≥ 50 posts)
      const activityBonus = n(data.posts) >= 50 ? 5 : 0;
      return Math.min(followerScore + activityBonus, 100);
    }
    case 'youtube': {
      // subscribers: 0→20, 500→40, 5K→65, 50K→85, 200K+→100
      const subscriberScore = tiered(n(data.subscribers), [
        [0, 20], [500, 40], [5_000, 65], [50_000, 85], [200_000, 100],
      ]);
      // video count bonus: +5 if ≥ 20 videos
      const videoBonus = n(data.totalVideos) >= 20 ? 5 : 0;
      return Math.min(subscriberScore + videoBonus, 100);
    }
    case 'facebook': {
      // followers: 0→20, 500→35, 5K→60, 20K→80, 50K+→100
      return tiered(n(data.followers), [
        [0, 20], [500, 35], [5_000, 60], [20_000, 80], [50_000, 100],
      ]);
    }
    case 'gangnamUnni': {
      // rating /10: max 70pts. reviews: 0→0, 100→10, 1000→20, 10000→30
      const rScore = ratingPoints(n(data.rating), 10, 70);
      const rvScore = tiered(n(data.totalReviews), [
        [0, 0], [100, 10], [1_000, 20], [10_000, 30],
      ]);
      return Math.min(rScore + rvScore, 100);
    }
    case 'googleMaps': {
      // rating /5: max 60pts. reviews: 0→0, 50→10, 500→25, 5000→40
      const rScore = ratingPoints(n(data.rating), 5, 60);
      const rvScore = tiered(n(data.reviewCount), [
        [0, 0], [50, 10], [500, 25], [5_000, 40],
      ]);
      return Math.min(rScore + rvScore, 100);
    }
    case 'naverBlog': {
      // Presence-based: official handle = 50 (otherwise 20), plus a mention-count
      // bonus of up to +30 (0→0, 100→15, 1000→30). Maximum reachable score is 80;
      // no separate activity bonus is applied here.
      const presence = Boolean(data.officialBlogHandle) ? 50 : 20;
      const mentionScore = tiered(n(data.totalResults), [
        [0, 0], [100, 15], [1000, 30],
      ]);
      return Math.min(presence + mentionScore, 100);
    }
    case 'naverPlace': {
      // rating /5: max 60pts. reviews: 0→0, 100→15, 1000→30, 10000→40
      const rScore = ratingPoints(n(data.rating), 5, 60);
      // Payloads may carry the count under either key; prefer reviewCount.
      const reviews = n(data.reviewCount) || n(data.reviews);
      const rvScore = tiered(reviews, [
        [0, 0], [100, 15], [1_000, 30], [10_000, 40],
      ]);
      return Math.min(rScore + rvScore, 100);
    }
    default:
      return 50; // Unknown channel — neutral score
  }
}
|
||||
|
||||
/**
|
||||
* Phase 2: Collect Channel Data
|
||||
*
|
||||
|
|
@ -342,17 +434,59 @@ Deno.serve(async (req) => {
|
|||
// Get verified Naver Blog handle from Phase 1 for official blog URL
|
||||
const nbVerified = verified.naverBlog as Record<string, unknown> | null;
|
||||
const officialBlogHandle = nbVerified?.handle ? String(nbVerified.handle) : null;
|
||||
const officialBlogUrl = officialBlogHandle ? `https://blog.naver.com/${officialBlogHandle}` : null;
|
||||
|
||||
// ─── 5a. Naver Search: 3rd-party blog mentions ───
|
||||
const query = encodeURIComponent(`${clinicName} 후기`);
|
||||
const res = await fetchWithRetry(`https://openapi.naver.com/v1/search/blog.json?query=${query}&display=10&sort=sim`, { headers: naverHeaders }, { label: "naver-blog" });
|
||||
if (!res.ok) throw new Error(`Naver Blog API returned ${res.status}`);
|
||||
const data = await res.json();
|
||||
|
||||
// ─── 5b. Firecrawl: Official blog recent posts ───
|
||||
// Registry always provides the official blog URL — scrape it for real content metrics.
|
||||
let officialBlogContent: Record<string, unknown> | null = null;
|
||||
if (officialBlogUrl) {
|
||||
const FIRECRAWL_KEY = Deno.env.get("FIRECRAWL_API_KEY");
|
||||
if (FIRECRAWL_KEY) {
|
||||
try {
|
||||
const blogScrape = await fetchWithRetry(`https://api.firecrawl.dev/v1/scrape`, {
|
||||
method: "POST",
|
||||
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_KEY}` },
|
||||
body: JSON.stringify({
|
||||
url: officialBlogUrl,
|
||||
formats: ["json"],
|
||||
jsonOptions: {
|
||||
prompt: "Extract the blog's recent posts: title, date, excerpt. Also total post count visible on the page, and the blog category/tag list.",
|
||||
schema: {
|
||||
type: "object",
|
||||
properties: {
|
||||
totalPosts: { type: "number" },
|
||||
recentPosts: { type: "array", items: { type: "object", properties: { title: { type: "string" }, date: { type: "string" }, excerpt: { type: "string" } } } },
|
||||
categories: { type: "array", items: { type: "string" } },
|
||||
},
|
||||
},
|
||||
},
|
||||
waitFor: 3000,
|
||||
}),
|
||||
}, { label: "firecrawl-naver-blog", timeoutMs: 45000 });
|
||||
if (blogScrape.ok) {
|
||||
const blogData = await blogScrape.json();
|
||||
officialBlogContent = blogData.data?.json || null;
|
||||
console.log(`[naverBlog] Official blog scraped: ${officialBlogContent?.totalPosts ?? 0} posts`);
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn(`[naverBlog] Official blog Firecrawl failed (non-critical):`, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
channelData.naverBlog = {
|
||||
totalResults: data.total || 0, searchQuery: `${clinicName} 후기`,
|
||||
// Official blog URL from Phase 1 verified handle
|
||||
officialBlogUrl: officialBlogHandle ? `https://blog.naver.com/${officialBlogHandle}` : null,
|
||||
officialBlogHandle: officialBlogHandle,
|
||||
// Blog mentions (third-party posts, NOT the official blog)
|
||||
officialBlogUrl,
|
||||
officialBlogHandle,
|
||||
// Official blog content (from Firecrawl — actual blog data)
|
||||
officialContent: officialBlogContent,
|
||||
// Blog mentions (third-party posts via Naver Search)
|
||||
posts: (data.items || []).slice(0, 10).map((item: Record<string, string>) => ({
|
||||
title: (item.title || "").replace(/<[^>]*>/g, ""),
|
||||
description: (item.description || "").replace(/<[^>]*>/g, ""),
|
||||
|
|
@ -558,6 +692,7 @@ Deno.serve(async (req) => {
|
|||
snapshotInserts.push({
|
||||
clinic_id: clinicId, run_id: runId, channel: 'instagram',
|
||||
handle: igData.username, followers: igData.followers, posts: igData.posts,
|
||||
health_score: computeHealthScore('instagram', igData),
|
||||
details: igData,
|
||||
});
|
||||
}
|
||||
|
|
@ -568,6 +703,7 @@ Deno.serve(async (req) => {
|
|||
clinic_id: clinicId, run_id: runId, channel: 'youtube',
|
||||
handle: ytData.handle || ytData.channelName, followers: ytData.subscribers,
|
||||
posts: ytData.totalVideos, total_views: ytData.totalViews,
|
||||
health_score: computeHealthScore('youtube', ytData),
|
||||
details: ytData,
|
||||
});
|
||||
}
|
||||
|
|
@ -577,6 +713,7 @@ Deno.serve(async (req) => {
|
|||
snapshotInserts.push({
|
||||
clinic_id: clinicId, run_id: runId, channel: 'facebook',
|
||||
handle: fbData.pageName, followers: fbData.followers,
|
||||
health_score: computeHealthScore('facebook', fbData),
|
||||
details: fbData,
|
||||
});
|
||||
}
|
||||
|
|
@ -586,7 +723,9 @@ Deno.serve(async (req) => {
|
|||
snapshotInserts.push({
|
||||
clinic_id: clinicId, run_id: runId, channel: 'gangnamUnni',
|
||||
handle: guData.name, rating: guData.rating, rating_scale: 10,
|
||||
reviews: guData.totalReviews, details: guData,
|
||||
reviews: guData.totalReviews,
|
||||
health_score: computeHealthScore('gangnamUnni', guData),
|
||||
details: guData,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
@ -595,7 +734,9 @@ Deno.serve(async (req) => {
|
|||
snapshotInserts.push({
|
||||
clinic_id: clinicId, run_id: runId, channel: 'googleMaps',
|
||||
handle: gmData.name, rating: gmData.rating, rating_scale: 5,
|
||||
reviews: gmData.reviewCount, details: gmData,
|
||||
reviews: gmData.reviewCount,
|
||||
health_score: computeHealthScore('googleMaps', gmData),
|
||||
details: gmData,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
@ -603,7 +744,9 @@ Deno.serve(async (req) => {
|
|||
if (nbData) {
|
||||
snapshotInserts.push({
|
||||
clinic_id: clinicId, run_id: runId, channel: 'naverBlog',
|
||||
handle: nbData.officialBlogHandle, details: nbData,
|
||||
handle: nbData.officialBlogHandle,
|
||||
health_score: computeHealthScore('naverBlog', nbData),
|
||||
details: nbData,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue