/**
 * URL Classification Guard — Harness 1
 *
 * Keeps URL classification in a pure, testable function with an embedded
 * regression corpus. Guards against misclassification bugs such as
 * `/about/meet-our-doctors.php` being tagged as an "about" page.
 *
 * Priority: doctor > surgery > about (specific → general)
 */

// ─── Pattern Definitions ───

/**
 * Substring patterns per page category. A URL belongs to a category when its
 * lower-cased form contains any `positive` pattern. `about.negative` lists
 * substrings that veto an "about" match.
 */
const PAGE_PATTERNS = {
  doctor: {
    positive: [
      "/doctor",
      "/doctors",
      "/team",
      "/staff",
      "/specialist",
      "/professor",
      "/의료진",
      "/원장",
      "meet-our-doctor",
      "/physicians",
      "/surgeon",
      "our-team",
      "our-doctors",
    ],
  },
  surgery: {
    positive: [
      "/surgery",
      "/service",
      "/procedure",
      "/treatment",
      "/시술",
      "/수술",
      "/procedures",
      "/treatments",
      "/services",
      "/진료",
    ],
  },
  about: {
    positive: [
      "/about",
      "/intro",
      "/소개",
      "/greeting",
      "/인사말",
      "/history",
      "/연혁",
      "/company",
      "/clinic-info",
    ],
    // Every entry except "meet-our" also appears in a doctor/surgery positive
    // list above, so those URLs are already claimed by a higher priority
    // before this veto is consulted.
    negative: [
      "/doctor",
      "/doctors",
      "/procedure",
      "/surgery",
      "/service",
      "/treatment",
      "meet-our",
      "/team",
      "/staff",
      "/specialist",
      "/의료진",
      "/원장",
      "/시술",
      "/수술",
      "/진료",
      "our-team",
      "our-doctors",
    ],
  },
} as const;

/** The page categories the classifier can assign. */
export type PageCategory = "doctor" | "surgery" | "about";

// ─── Classification Function ───

/**
 * Decode percent-escape runs (e.g. `%EC%9D%98` → `의`) so that encoded
 * non-ASCII paths can match the literal patterns.
 *
 * Uses `decodeURI`, which leaves escapes of reserved characters such as
 * `%2F` untouched. A run that is not valid UTF-8 is kept as-is instead of
 * throwing.
 */
function decodePercentEscapes(url: string): string {
  return url.replace(/(?:%[0-9a-f]{2})+/gi, (run) => {
    try {
      return decodeURI(run);
    } catch {
      return run; // malformed escape sequence: fall back to the raw text
    }
  });
}

/** True when `haystack` contains at least one of `patterns` as a substring. */
function matchesAny(haystack: string, patterns: readonly string[]): boolean {
  return patterns.some((pattern) => haystack.includes(pattern));
}

/**
 * Classify a URL into one of: 'doctor' | 'surgery' | 'about' | null.
 *
 * Uses priority ordering (doctor > surgery > about) so that compound URLs
 * like `/about/meet-our-doctors.php` resolve to the more specific category.
 * Matching is case-insensitive substring matching over the whole string,
 * after percent-escapes have been decoded.
 *
 * @param url - Path or full URL to classify.
 * @returns The matched category, or `null` when nothing matches or when an
 *   "about" match is vetoed by a negative pattern.
 */
export function classifyPageUrl(url: string): "doctor" | "surgery" | "about" | null {
  const normalized = decodePercentEscapes(url).toLowerCase();

  // Priority 1: doctor pages
  if (matchesAny(normalized, PAGE_PATTERNS.doctor.positive)) {
    return "doctor";
  }

  // Priority 2: surgery/service pages
  if (matchesAny(normalized, PAGE_PATTERNS.surgery.positive)) {
    return "surgery";
  }

  // Priority 3: about pages, unless a negative pattern marks the URL ambiguous
  if (matchesAny(normalized, PAGE_PATTERNS.about.positive)) {
    return matchesAny(normalized, PAGE_PATTERNS.about.negative) ? null : "about";
  }

  return null;
}

// ─── Test Corpus ───

/**
 * Known failure cases and expected classifications, as `[url, expected]`
 * pairs. Used by validateClassifier() for regression prevention.
 */
export const CLASSIFICATION_TEST_CORPUS: ReadonlyArray<
  readonly [url: string, expected: PageCategory | null]
> = [
  // Bug cases (historically misclassified)
  ["/about/meet-our-doctors.php", "doctor"], // bug seen on the Grand Plastic Surgery site
  ["/about/procedures.php", "surgery"], // Grand Plastic Surgery site

  // Standard doctor pages
  ["/doctors", "doctor"],
  ["/의료진", "doctor"],
  ["/team/professor-kim", "doctor"],
  ["/about/our-team", "doctor"],

  // Standard surgery pages
  ["/surgery/rhinoplasty", "surgery"],
  ["/시술안내", "surgery"],
  ["/services/breast", "surgery"],

  // Standard about pages
  ["/about/", "about"],
  ["/about/greeting.php", "about"],
  ["/about/intro", "about"],
  ["/about/history", "about"],
  ["/소개", "about"],
  ["/인사말", "about"],

  // Null cases
  ["/gallery", null],
  ["/contact", null],
  ["/blog/post-123", null],
];

// ─── Self-Test ───

/**
 * Run the classifier against CLASSIFICATION_TEST_CORPUS.
 * Intended to be called during Edge Function cold-start for automatic
 * regression detection.
 *
 * @returns `pass` is true when every corpus entry classifies as expected;
 *   `failures` holds one human-readable message per mismatch.
 */
export function validateClassifier(): { pass: boolean; failures: string[] } {
  const failures: string[] = [];

  for (const [url, expected] of CLASSIFICATION_TEST_CORPUS) {
    const result = classifyPageUrl(url);
    if (result !== expected) {
      failures.push(`${url}: expected '${expected}', got '${result}'`);
    }
  }

  return { pass: failures.length === 0, failures };
}