diff --git a/supabase/functions/collect-channel-data/index.ts b/supabase/functions/collect-channel-data/index.ts index d99e66b..9bdea42 100644 --- a/supabase/functions/collect-channel-data/index.ts +++ b/supabase/functions/collect-channel-data/index.ts @@ -473,41 +473,39 @@ Deno.serve(async (req) => { if (!res.ok) throw new Error(`Naver Blog API returned ${res.status}`); const data = await res.json(); - // ─── 5b. Firecrawl: Official blog recent posts ─── - // Registry always provides the official blog URL — scrape it for real content metrics. + // ─── 5b. Naver RSS: Official blog recent posts ─── + // blog.naver.com is blocked by Firecrawl. Use the public RSS feed instead: + // https://rss.blog.naver.com/{blogId}.xml — no auth required. let officialBlogContent: Record | null = null; - if (officialBlogUrl) { - const FIRECRAWL_KEY = Deno.env.get("FIRECRAWL_API_KEY"); - if (FIRECRAWL_KEY) { - try { - const blogScrape = await fetchWithRetry(`https://api.firecrawl.dev/v1/scrape`, { - method: "POST", - headers: { "Content-Type": "application/json", Authorization: `Bearer ${FIRECRAWL_KEY}` }, - body: JSON.stringify({ - url: officialBlogUrl, - formats: ["json"], - jsonOptions: { - prompt: "Extract the blog's recent posts: title, date, excerpt. Also total post count visible on the page, and the blog category/tag list.", - schema: { - type: "object", - properties: { - totalPosts: { type: "number" }, - recentPosts: { type: "array", items: { type: "object", properties: { title: { type: "string" }, date: { type: "string" }, excerpt: { type: "string" } } } }, - categories: { type: "array", items: { type: "string" } }, - }, - }, - }, - waitFor: 3000, - }), - }, { label: "firecrawl-naver-blog", timeoutMs: 45000 }); - if (blogScrape.ok) { - const blogData = await blogScrape.json(); - officialBlogContent = blogData.data?.json || null; - console.log(`[naverBlog] Official blog scraped: ${officialBlogContent?.totalPosts ?? 
0} posts`); + if (officialBlogHandle) { + try { + const rssRes = await fetchWithRetry( + `https://rss.blog.naver.com/${officialBlogHandle}.xml`, + {}, + { label: "naver-rss", timeoutMs: 15000 } + ); + if (rssRes.ok) { + const xml = await rssRes.text(); + // Parse RSS items: each item element holds title, link, pubDate and description. + const items: Array<{ title: string; link: string; date: string; excerpt: string }> = []; + const itemMatches = xml.matchAll(/([\s\S]*?)<\/item>/g); + for (const m of itemMatches) { + const block = m[1]; + const title = (block.match(/<!\[CDATA\[(.*?)\]\]><\/title>/) || block.match(/<title>(.*?)<\/title>/))?.[1] || ""; + const link = (block.match(/<link>(.*?)<\/link>/))?.[1] || ""; + const date = (block.match(/<pubDate>(.*?)<\/pubDate>/))?.[1] || ""; + const desc = (block.match(/<description><!\[CDATA\[(.*?)\]\]><\/description>/) || block.match(/<description>(.*?)<\/description>/))?.[1] || ""; + items.push({ title, link, date, excerpt: desc.replace(/<[^>]*>/g, "").trim().slice(0, 150) }); } - } catch (e) { - console.warn(`[naverBlog] Official blog Firecrawl failed (non-critical):`, e); + const totalPosts = (xml.match(/<totalCount>(\d+)<\/totalCount>/) || xml.match(/<managedCount>(\d+)<\/managedCount>/))?.[1]; + officialBlogContent = { + totalPosts: totalPosts ? Number(totalPosts) : items.length, + recentPosts: items.slice(0, 10), + }; + console.log(`[naverBlog] RSS fetched: ${items.length} posts from ${officialBlogHandle}`); } + } catch (e) { + console.warn(`[naverBlog] RSS fetch failed (non-critical):`, e); } } @@ -533,23 +531,36 @@ Deno.serve(async (req) => { `${clinicName} 성형`, clinicName, ]; + // Core name with the type suffixes 성형외과/병원/의원 removed, trimmed and lowercased (e.g.
"뷰성형외과" → "뷰"; Priority 2 skips it when shorter than 2 chars) + const cleanedName = clinicName.replace(/성형외과|병원|의원/g, '').trim().toLowerCase(); + for (const q of queries) { const query = encodeURIComponent(q); const res = await fetchWithRetry(`https://openapi.naver.com/v1/search/local.json?query=${query}&display=5&sort=comment`, { headers: naverHeaders }, { label: "naver-place" }); if (!res.ok) continue; const data = await res.json(); - // Find the best match: prefer category containing 성형 or 피부 const items = (data.items || []) as Record<string, string>[]; - const match = items.find(i => - (i.category || '').includes('성형') || (i.category || '').includes('피부') - ) || items.find(i => { - const name = (i.title || '').replace(/<[^>]*>/g, '').toLowerCase(); - return name.includes(clinicName.replace(/성형외과|병원|의원/g, '').trim().toLowerCase()); - }) || null; + + const normalize = (s: string) => (s || '').replace(/<[^>]*>/g, '').toLowerCase(); + + // Priority 1: name contains full clinicName (case-insensitive substring match, HTML tags stripped) + let match = items.find(i => normalize(i.title).includes(clinicName.toLowerCase())) || null; + + // Priority 2: name contains cleaned short name AND category is 성형 (plastic surgery only) + if (!match && cleanedName.length >= 2) { + match = items.find(i => + normalize(i.title).includes(cleanedName) && (i.category || '').includes('성형') + ) || null; + } + + // Priority 3: category is 성형 (plastic surgery) — not 피부 to avoid skin clinics + if (!match) { + match = items.find(i => (i.category || '').includes('성형')) || null; + } if (match) { channelData.naverPlace = { - name: (match.title || "").replace(/<[^>]*>/g, ""), + name: normalize(match.title), category: match.category, address: match.roadAddress || match.address, telephone: match.telephone, link: match.link, mapx: match.mapx, mapy: match.mapy, };