/**
 * URL Classification Guard — Harness 1
 *
 * Separates URL classification into a testable pure function with
 * an embedded test corpus. Prevents misclassification bugs like
 * `/about/meet-our-doctors.php` being tagged as an "about" page.
 *
 * Priority: doctor > surgery > about (specific → general)
 */

// ─── Pattern Definitions ───

/**
 * Substring patterns used by `classifyPageUrl`.
 *
 * Every entry is matched with a plain, case-insensitive `includes` test
 * against the URL, so all entries must be lower-case. `positive` patterns
 * select a category; `negative` patterns (about only) veto an "about" match.
 */
const PAGE_PATTERNS = {
  doctor: {
    positive: [
      "/doctor", "/doctors", "/team", "/staff", "/specialist", "/professor",
      // Korean: "medical staff", "director"
      "/의료진", "/원장", "meet-our-doctor", "/physicians", "/surgeon",
      "our-team", "our-doctors",
    ],
  },
  surgery: {
    positive: [
      // Korean: "procedure", "surgery"
      "/surgery", "/service", "/procedure", "/treatment", "/시술", "/수술",
      // Korean: "medical care"
      "/procedures", "/treatments", "/services", "/진료",
    ],
  },
  about: {
    positive: [
      // Korean: "introduction", "greeting", "history"
      "/about", "/intro", "/소개", "/greeting", "/인사말", "/history", "/연혁",
      "/company", "/clinic-info",
    ],
    // Because doctor and surgery patterns are tested first, every entry here
    // except "meet-our" is already handled before the about check runs; the
    // rest are kept as a safety net should the priority order ever change.
    negative: [
      "/doctor", "/doctors", "/procedure", "/surgery", "/service",
      "/treatment", "meet-our", "/team", "/staff", "/specialist",
      "/의료진", "/원장", "/시술", "/수술", "/진료",
      "our-team", "our-doctors",
    ],
  },
} as const;

// ─── Classification Function ───

/**
 * Decode percent-encoded runs (e.g. `%EC%9D%98`) so that non-ASCII path
 * segments can be compared with the literal patterns.
 *
 * Each run of `%XX` escapes is decoded independently with `decodeURI`, which
 * leaves reserved characters such as `%2F` encoded. A run that is not valid
 * UTF-8 is left untouched instead of throwing, and a stray `%` elsewhere in
 * the string does not prevent other runs from being decoded.
 */
function decodePercentRuns(url: string): string {
  return url.replace(/(?:%[0-9a-fA-F]{2})+/g, (run) => {
    try {
      return decodeURI(run);
    } catch {
      return run;
    }
  });
}

/** Return true when `haystack` contains at least one of `needles`. */
function containsAny(haystack: string, needles: readonly string[]): boolean {
  return needles.some((needle) => haystack.includes(needle));
}

/**
 * Classify a URL into one of: 'doctor' | 'surgery' | 'about' | null.
 *
 * Uses priority ordering (doctor > surgery > about) so that compound URLs
 * like `/about/meet-our-doctors.php` resolve to the more specific category.
 * Matching is case-insensitive and is performed on the percent-decoded URL,
 * so an encoded path such as `/%EC%9D%98%EB%A3%8C%EC%A7%84` is treated the
 * same as its decoded Korean form.
 *
 * @param url - Absolute or relative URL (or path) to classify.
 * @returns The matched category, or `null` when nothing matches or when an
 *   "about" URL also contains one of the about-negative patterns.
 */
export function classifyPageUrl(url: string): "doctor" | "surgery" | "about" | null {
  const lower = decodePercentRuns(url).toLowerCase();

  // Priority 1: Doctor pages
  if (containsAny(lower, PAGE_PATTERNS.doctor.positive)) {
    return "doctor";
  }

  // Priority 2: Surgery/service pages
  if (containsAny(lower, PAGE_PATTERNS.surgery.positive)) {
    return "surgery";
  }

  // Priority 3: About pages (with negative exclusion)
  if (containsAny(lower, PAGE_PATTERNS.about.positive)) {
    // Contains about + doctor/surgery keyword → ambiguous, skip
    return containsAny(lower, PAGE_PATTERNS.about.negative) ? null : "about";
  }

  return null;
}

// ─── Test Corpus ───

/**
 * Known failure cases and expected classifications.
 *
 * Each entry is a `[url, expectedCategory]` pair; `null` means the URL must
 * not be assigned to any category. Used by validateClassifier() for
 * regression prevention.
 */
export const CLASSIFICATION_TEST_CORPUS: ReadonlyArray<readonly [string, "doctor" | "surgery" | "about" | null]> = [
  // Bug cases (historically misclassified)
  ["/about/meet-our-doctors.php", "doctor"], // Grand Plastic Surgery site bug
  ["/about/procedures.php", "surgery"], // Grand Plastic Surgery site
  // Standard doctor pages
  ["/doctors", "doctor"],
  ["/의료진", "doctor"],
  ["/team/professor-kim", "doctor"],
  ["/about/our-team", "doctor"],
  // Standard surgery pages
  ["/surgery/rhinoplasty", "surgery"],
  ["/시술안내", "surgery"],
  ["/services/breast", "surgery"],
  // Standard about pages
  ["/about/", "about"],
  ["/about/greeting.php", "about"],
  ["/about/intro", "about"],
  ["/about/history", "about"],
  ["/소개", "about"],
  ["/인사말", "about"],
  // Null cases
  ["/gallery", null],
  ["/contact", null],
  ["/blog/post-123", null],
  // About URL vetoed by the "meet-our" negative pattern → ambiguous, skipped
  ["/about/meet-our-staff", null],
];

// ─── Self-Test ───

/**
 * Run classification against the test corpus.
 *
 * Feeds every URL in CLASSIFICATION_TEST_CORPUS through classifyPageUrl()
 * and records each mismatch. Intended to be called during Edge Function
 * cold-start for automatic regression detection.
 *
 * @returns `pass` is true when every corpus entry classified as expected;
 *   `failures` holds one human-readable message per mismatching entry.
 */
export function validateClassifier(): { pass: boolean; failures: string[] } {
  const failures: string[] = [];

  for (const [url, expected] of CLASSIFICATION_TEST_CORPUS) {
    const result = classifyPageUrl(url);
    if (result !== expected) {
      failures.push(`${url}: expected '${expected}', got '${result}'`);
    }
  }

  return { pass: failures.length === 0, failures };
}