o2o-infinith-demo/supabase/functions/_shared/foundingYearExtractor.ts

101 lines
3.7 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/**
* Founding Year Extraction Guard — Harness 2
*
* Regex-based founding year extraction to complement Gemini Vision.
* Acts as a fallback when the LLM misses patterns like "2004년개원 이래".
*
* Defense layers:
* 1. Gemini Vision (primary)
* 2. This module on scraped text (secondary)
* 3. generate-report post-processing on all channel_data text (tertiary)
*/
// ─── Patterns ───
/**
 * Regex patterns for founding year detection, listed in priority order.
 *
 * Every pattern exposes exactly one capture group, which holds either:
 * - a 4-digit year (the extractor accepts 1950–currentYear), or
 * - a 1–2 digit number of years to subtract from currentYear.
 *
 * Patterns whose match begins with the digits are prefixed with `(?:^|\D)`
 * so the capture can never start in the middle of a longer number. Without
 * that guard "2023년 동안" was read as "23년 동안" and "140년 역사" as
 * "40년 역사", each yielding a bogus founding year.
 */
const FOUNDING_PATTERNS: RegExp[] = [
  // Direct year expressions
  /(?:^|\D)(\d{4})년\s*개원/, // "2004년개원", "2004년 개원"
  /(?:^|\D)(\d{4})년개원\s*이래/, // "2004년개원 이래" — the "Grand" pattern; also matched by the entry above
  /개원\s*(\d{4})년/, // "개원 2004년"
  /설립\s*(\d{4})년/, // "설립 2004년"
  /(?:^|\D)(\d{4})년\s*설립/, // "2004년 설립"
  /since\s*(\d{4})/i, // "SINCE 2004"
  /established\s*(?:in\s*)?(\d{4})/i, // "Established in 2004"
  /(?:^|\D)(\d{4})년\s*오픈/, // "2004년 오픈"
  /(?:^|\D)(\d{4})년\s*개업/, // "2004년 개업"
  /개원일?\s*:\s*(\d{4})/, // "개원: 2004", "개원일: 2004"
  // Anniversary / relative year expressions
  /(?:^|\D)(\d{1,2})주년/, // "22주년" → currentYear-22
  /(?:^|\D)(\d{1,2})년\s*전통/, // "20년 전통" → currentYear-20
  /(?:^|\D)(\d{1,2})년\s*동안/, // "22년 동안" → currentYear-22
  /개원\s*(\d{1,2})주년/, // "개원 15주년" → currentYear-15
  /(?:^|\D)(\d{1,2})년\s*역사/, // "20년 역사" → currentYear-20
];
// ─── Extraction Function ───
/**
 * Extract a founding year from arbitrary text.
 *
 * The patterns in FOUNDING_PATTERNS are tried in order. For each pattern
 * every occurrence in the text is inspected, so an implausible first hit
 * (for example a year before 1950 or in the future) does not hide a
 * plausible later hit of the same pattern.
 *
 * @param text - Any text (HTML markdown, scraped content, vision output)
 * @param currentYear - Reference year for the range check and for relative
 *   expressions (defaults to the current calendar year)
 * @returns The founding year, or null when the text is empty or no pattern
 *   yields a plausible value.
 */
export function extractFoundingYear(
  text: string,
  currentYear: number = new Date().getFullYear(),
): number | null {
  // Scraped input can be empty; there is nothing to scan in that case.
  if (!text) return null;

  for (const pattern of FOUNDING_PATTERNS) {
    // matchAll requires a global regex; clone the pattern with "g" added
    // so the shared pattern object is never mutated (no lastIndex state).
    const flags = pattern.flags.includes("g") ? pattern.flags : `${pattern.flags}g`;
    const scanner = new RegExp(pattern.source, flags);

    for (const match of text.matchAll(scanner)) {
      const digits = match[1];
      if (digits === undefined) continue;
      const num = parseInt(digits, 10);
      // 4-digit: direct year
      if (num >= 1950 && num <= currentYear) return num;
      // 1-2 digit: years ago → subtract from currentYear
      if (num >= 1 && num <= 80) return currentYear - num;
    }
  }
  return null;
}
// ─── Test Corpus ───
/**
 * Regression corpus for the founding-year extractor.
 *
 * Each entry is an `[input, expected]` pair; `expected` is the year the
 * extractor should return when currentYear is fixed at 2026 (the value
 * validateFoundingYearExtractor uses), or null when nothing should match.
 */
export const FOUNDING_YEAR_TEST_CORPUS: ReadonlyArray<readonly [string, number | null]> = [
  ["2004년개원 이래 중국, 베트남 환자들이 찾아오고 있습니다", 2004],
  ["SINCE 2004", 2004],
  ["22주년 기념 이벤트", 2004], // 2026-22
  ["개원 15주년을 맞이하여", 2011], // 2026-15
  ["20년 전통의 성형외과", 2006], // 2026-20
  ["2005년 설립된 뷰성형외과", 2005],
  ["설립 2010년, 강남에 위치한", 2010],
  ["Established in 2003", 2003],
  ["2018년 오픈한 신규 클리닉", 2018],
  ["개원: 2015", 2015],
  ["아무 관련 없는 텍스트입니다", null],
  ["전화번호 02-1234-5678", null],
  // Patterns that previously had no corpus coverage
  ["개원 2004년부터 진료해 온 병원", 2004],
  ["2009년 개업한 치과", 2009],
  ["개원일: 2012", 2012],
  ["12년 동안 한자리에서", 2014], // 2026-12
  ["30년 역사를 자랑하는", 1996], // 2026-30
  // A year later than the reference year must be rejected
  ["2030년 개원 예정", null],
];
/**
 * Self-test: run extractFoundingYear over FOUNDING_YEAR_TEST_CORPUS.
 * Uses fixed currentYear=2026 for deterministic results.
 *
 * @returns `pass` is true when every corpus entry produced its expected
 *   value; `failures` holds one human-readable line per mismatch, quoting at
 *   most the first 30 characters of the input.
 */
export function validateFoundingYearExtractor(): { pass: boolean; failures: string[] } {
  const previewLength = 30;
  const failures: string[] = [];

  for (const [text, expected] of FOUNDING_YEAR_TEST_CORPUS) {
    const result = extractFoundingYear(text, 2026);
    if (result === expected) continue;

    // Only mark the preview as truncated when characters were actually dropped.
    const preview = text.length > previewLength ? `${text.slice(0, previewLength)}...` : text;
    failures.push(`"${preview}": expected ${expected}, got ${result}`);
  }

  return { pass: failures.length === 0, failures };
}