fix: naverPlace 오매핑 수정 + naverBlog RSS 공식 블로그 스크래핑

- naverPlace: 매칭 우선순위 재정의 — 전체 클리닉명 정확 매칭 > 짧은명+성형 카테고리 > 성형 카테고리 순으로 변경 (기존 로직은 피부과까지 매칭되어 데이뷰의원 오매핑 발생)
- naverBlog: Firecrawl 공식 블로그 스크래핑 → Naver RSS 피드로 교체 (blog.naver.com은 Firecrawl 차단, RSS는 공개 엔드포인트)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
claude/bold-hawking
Haewon Kam 2026-04-07 17:03:36 +09:00
parent 2027ae9b64
commit 5ed35bc4cd
1 changed files with 51 additions and 40 deletions

View File

@ -473,41 +473,39 @@ Deno.serve(async (req) => {
if (!res.ok) throw new Error(`Naver Blog API returned ${res.status}`);
const data = await res.json();
// ─── 5b. Firecrawl: Official blog recent posts ───
// Registry always provides the official blog URL — scrape it for real content metrics.
// ─── 5b. Naver RSS: Official blog recent posts ───
// blog.naver.com is blocked by Firecrawl. Use the public RSS feed instead:
// https://rss.blog.naver.com/{blogId}.xml — no auth required.
let officialBlogContent: Record<string, unknown> | null = null;
if (officialBlogUrl) {
const FIRECRAWL_KEY = Deno.env.get("FIRECRAWL_API_KEY");
if (FIRECRAWL_KEY) {
try {
const blogScrape = await fetchWithRetry(`https://api.firecrawl.dev/v1/scrape`, {
method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_KEY}` },
body: JSON.stringify({
url: officialBlogUrl,
formats: ["json"],
jsonOptions: {
prompt: "Extract the blog's recent posts: title, date, excerpt. Also total post count visible on the page, and the blog category/tag list.",
schema: {
type: "object",
properties: {
totalPosts: { type: "number" },
recentPosts: { type: "array", items: { type: "object", properties: { title: { type: "string" }, date: { type: "string" }, excerpt: { type: "string" } } } },
categories: { type: "array", items: { type: "string" } },
},
},
},
waitFor: 3000,
}),
}, { label: "firecrawl-naver-blog", timeoutMs: 45000 });
if (blogScrape.ok) {
const blogData = await blogScrape.json();
officialBlogContent = blogData.data?.json || null;
console.log(`[naverBlog] Official blog scraped: ${officialBlogContent?.totalPosts ?? 0} posts`);
if (officialBlogHandle) {
try {
const rssRes = await fetchWithRetry(
`https://rss.blog.naver.com/${officialBlogHandle}.xml`,
{},
{ label: "naver-rss", timeoutMs: 15000 }
);
if (rssRes.ok) {
const xml = await rssRes.text();
// Parse RSS items: <item><title>...</title><link>...</link><pubDate>...</pubDate><description>...</description></item>
const items: Array<{ title: string; link: string; date: string; excerpt: string }> = [];
const itemMatches = xml.matchAll(/<item>([\s\S]*?)<\/item>/g);
for (const m of itemMatches) {
const block = m[1];
const title = (block.match(/<title><!\[CDATA\[(.*?)\]\]><\/title>/) || block.match(/<title>(.*?)<\/title>/))?.[1] || "";
const link = (block.match(/<link>(.*?)<\/link>/))?.[1] || "";
const date = (block.match(/<pubDate>(.*?)<\/pubDate>/))?.[1] || "";
const desc = (block.match(/<description><!\[CDATA\[(.*?)\]\]><\/description>/) || block.match(/<description>(.*?)<\/description>/))?.[1] || "";
items.push({ title, link, date, excerpt: desc.replace(/<[^>]*>/g, "").trim().slice(0, 150) });
}
} catch (e) {
console.warn(`[naverBlog] Official blog Firecrawl failed (non-critical):`, e);
const totalPosts = (xml.match(/<totalCount>(\d+)<\/totalCount>/) || xml.match(/<managedCount>(\d+)<\/managedCount>/))?.[1];
officialBlogContent = {
totalPosts: totalPosts ? Number(totalPosts) : items.length,
recentPosts: items.slice(0, 10),
};
console.log(`[naverBlog] RSS fetched: ${items.length} posts from ${officialBlogHandle}`);
}
} catch (e) {
console.warn(`[naverBlog] RSS fetch failed (non-critical):`, e);
}
}
@ -533,23 +531,36 @@ Deno.serve(async (req) => {
`${clinicName} 성형`,
clinicName,
];
// Core name with clinic-type suffixes (성형외과/병원/의원) stripped, trimmed and lowercased (e.g. "뷰성형외과" → "뷰"); can be very short for such names
const cleanedName = clinicName.replace(/성형외과|병원|의원/g, '').trim().toLowerCase();
for (const q of queries) {
const query = encodeURIComponent(q);
const res = await fetchWithRetry(`https://openapi.naver.com/v1/search/local.json?query=${query}&display=5&sort=comment`, { headers: naverHeaders }, { label: "naver-place" });
if (!res.ok) continue;
const data = await res.json();
// Find the best match: prefer category containing 성형 or 피부
const items = (data.items || []) as Record<string, string>[];
const match = items.find(i =>
(i.category || '').includes('성형') || (i.category || '').includes('피부')
) || items.find(i => {
const name = (i.title || '').replace(/<[^>]*>/g, '').toLowerCase();
return name.includes(clinicName.replace(/성형외과|병원|의원/g, '').trim().toLowerCase());
}) || null;
// Strips HTML tags from a string and lowercases it; falsy input becomes ''.
const normalize = (s: string) => {
  const text = s ? s : '';
  const withoutTags = text.replace(/<[^>]*>/g, '');
  return withoutTags.toLowerCase();
};
// Priority 1: title contains the full clinicName (case-insensitive substring match, HTML tags stripped)
let match = items.find(i => normalize(i.title).includes(clinicName.toLowerCase())) || null;
// Priority 2: name contains cleaned short name AND category is 성형 (plastic surgery only)
if (!match && cleanedName.length >= 2) {
match = items.find(i =>
normalize(i.title).includes(cleanedName) && (i.category || '').includes('성형')
) || null;
}
// Priority 3: category is 성형 (plastic surgery) — not 피부 to avoid skin clinics
if (!match) {
match = items.find(i => (i.category || '').includes('성형')) || null;
}
if (match) {
channelData.naverPlace = {
name: (match.title || "").replace(/<[^>]*>/g, ""),
name: normalize(match.title),
category: match.category, address: match.roadAddress || match.address,
telephone: match.telephone, link: match.link, mapx: match.mapx, mapy: match.mapy,
};