o2o-infinith-demo/supabase/functions/_shared/urlClassifier.ts

121 lines
3.7 KiB
TypeScript

/**
* URL Classification Guard — Harness 1
*
* Separates URL classification into a testable pure function with
* an embedded test corpus. Prevents misclassification bugs like
* `/about/meet-our-doctors.php` being tagged as an "about" page.
*
* Priority: doctor > surgery > about (specific → general)
*/
// ─── Pattern Definitions ───
/**
 * Substring patterns per page category. Matching is done with
 * `String.prototype.includes` against the lower-cased URL path, so a
 * shorter pattern (e.g. "/doctor") also covers its longer variants
 * (e.g. "/doctors").
 */
const PAGE_PATTERNS = {
  // Medical staff pages ("/의료진" = medical staff, "/원장" = director).
  doctor: {
    positive: [
      "/doctor", "/doctors", "/team", "/staff", "/specialist", "/professor",
      "/의료진", "/원장", "meet-our-doctor", "/physicians", "/surgeon",
      "our-team", "our-doctors",
    ],
  },
  // Procedure / service pages ("/시술" = procedure, "/수술" = surgery, "/진료" = medical care).
  surgery: {
    positive: [
      "/surgery", "/service", "/procedure", "/treatment", "/시술", "/수술",
      "/procedures", "/treatments", "/services", "/진료",
    ],
  },
  // Clinic introduction pages ("/소개" = introduction, "/인사말" = greeting, "/연혁" = history).
  about: {
    positive: [
      "/about", "/intro", "/소개", "/greeting", "/인사말", "/history", "/연혁",
      "/company", "/clinic-info",
    ],
    // Keywords that veto an "about" match. Every entry except "meet-our" is
    // already caught by the higher-priority doctor/surgery checks, so in
    // practice only "meet-our" (e.g. "/about/meet-our-nurses") reaches this veto.
    negative: [
      "/doctor", "/doctors", "/procedure", "/surgery", "/service",
      "/treatment", "meet-our", "/team", "/staff", "/specialist",
      "/의료진", "/원장", "/시술", "/수술", "/진료",
      "our-team", "our-doctors",
    ],
  },
} as const;

// ─── Classification Function ───

/**
 * Classify a URL into one of: 'doctor' | 'surgery' | 'about' | null.
 *
 * Categories are tested in priority order (doctor > surgery > about) so that
 * compound URLs like `/about/meet-our-doctors.php` resolve to the more
 * specific category.
 *
 * Before matching, the input is normalized:
 *  1. A leading scheme + authority (`https://host`, or protocol-relative
 *     `//host`) is removed, so a hostname such as `doctorkim.com` or
 *     `surgeryclinic.com` can never trigger a path pattern.
 *  2. Percent-encoding is decoded with `decodeURI`, so encoded Korean paths
 *     match the Korean patterns. Malformed escapes leave the text as-is.
 *  3. The result is lower-cased.
 * The query string and fragment are intentionally kept in the searched text
 * so hash- or query-routed pages (e.g. `/#/doctors`) still classify.
 *
 * @param url Absolute URL, protocol-relative URL, or bare path.
 * @returns The detected category, or `null` when nothing matches or when an
 *          "about" match is vetoed by a negative keyword.
 */
export function classifyPageUrl(url: string): "doctor" | "surgery" | "about" | null {
  // Drop "<scheme>://<authority>" or "//<authority>"; bare paths are untouched.
  const withoutAuthority = url.replace(/^(?:[a-z][a-z0-9+.-]*:)?\/\/[^/?#]*/i, "");

  // decodeURI throws URIError on malformed or non-UTF-8 escapes; in that case
  // fall back to the undecoded text rather than failing classification.
  let decoded = withoutAuthority;
  try {
    decoded = decodeURI(withoutAuthority);
  } catch {
    // keep the raw text
  }

  const path = decoded.toLowerCase();
  const matchesAny = (patterns: readonly string[]): boolean =>
    patterns.some((pattern) => path.includes(pattern));

  // Priority 1: Doctor pages
  if (matchesAny(PAGE_PATTERNS.doctor.positive)) {
    return "doctor";
  }

  // Priority 2: Surgery/service pages
  if (matchesAny(PAGE_PATTERNS.surgery.positive)) {
    return "surgery";
  }

  // Priority 3: About pages (with negative exclusion)
  if (matchesAny(PAGE_PATTERNS.about.positive)) {
    // about + doctor/surgery-like keyword → ambiguous, skip
    return matchesAny(PAGE_PATTERNS.about.negative) ? null : "about";
  }

  return null;
}
// ─── Test Corpus ───
/**
 * Regression corpus: `[url, expectedCategory]` pairs covering previously
 * misclassified URLs plus representative pages of each category.
 * Consumed by validateClassifier().
 */
export const CLASSIFICATION_TEST_CORPUS: readonly (readonly [
  string,
  "doctor" | "surgery" | "about" | null,
])[] = [
  // Historically misclassified URLs (both from the Grand Plastic Surgery site)
  ["/about/meet-our-doctors.php", "doctor"], // the original bug report
  ["/about/procedures.php", "surgery"],

  // Doctor pages
  ["/doctors", "doctor"],
  ["/의료진", "doctor"],
  ["/team/professor-kim", "doctor"],
  ["/about/our-team", "doctor"],

  // Surgery pages
  ["/surgery/rhinoplasty", "surgery"],
  ["/시술안내", "surgery"],
  ["/services/breast", "surgery"],

  // About pages
  ["/about/", "about"],
  ["/about/greeting.php", "about"],
  ["/about/intro", "about"],
  ["/about/history", "about"],
  ["/소개", "about"],
  ["/인사말", "about"],

  // Pages that belong to no category
  ["/gallery", null],
  ["/contact", null],
  ["/blog/post-123", null],
];
// ─── Self-Test ───
/**
 * Self-test: classify every corpus URL and report the mismatches.
 * Intended to be called during Edge Function cold-start so regressions in
 * the classifier surface immediately.
 *
 * @returns `pass` is true when every corpus entry classifies as expected;
 *          `failures` holds one message per mismatch, in corpus order.
 */
export function validateClassifier(): { pass: boolean; failures: string[] } {
  const failures = CLASSIFICATION_TEST_CORPUS
    .map(([url, expected]) => ({ url, expected, actual: classifyPageUrl(url) }))
    .filter(({ expected, actual }) => actual !== expected)
    .map(({ url, expected, actual }) => `${url}: expected '${expected}', got '${actual}'`);

  return { pass: failures.length === 0, failures };
}