/**
 * Missing Value Guard & Report Quality Validator — Harness 3
 *
 * Detects all known variants of "no data" from LLM outputs and
 * provides a quality score for generated reports.
 *
 * Usage:
 *   - isMissingValue(val) → boolean (use everywhere you check for missing data)
 *   - validateReportQuality(report) → DataQualityReport (use before saving reports)
 */

// ─── Missing Value Detection ───

/**
 * All known LLM expressions for "no data available".
 *
 * Entries are stored in lowercase only. Candidates are trimmed and lowercased
 * before comparison, so "N/A", " n/a " and "n/a" all match the single "n/a"
 * entry — do not add upper-case duplicates here.
 */
const MISSING_PATTERNS: ReadonlyArray<string> = [
  // Korean
  "데이터 없음", "데이터없음", "데이터 미확인", "데이터미확인",
  "정보없음", "정보 없음", "정보 미제공", "미제공",
  "확인불가", "확인 불가", "미확인", "미발견",
  "알 수 없음", "알수없음", "해당 없음", "해당없음",
  "없음", "미정",
  // English
  "n/a", "na", "none", "null", "undefined",
  "not available", "unknown", "not found", "no data",
  // Symbols
  "-", "—", "–", ".", "...",
];

/**
 * Lookup set built once from MISSING_PATTERNS. Lowercasing again here keeps the
 * comparison correct even if a mixed-case entry is added to the list later.
 */
const MISSING_PATTERN_SET: ReadonlySet<string> = new Set(
  MISSING_PATTERNS.map((pattern) => pattern.toLowerCase()),
);

/**
 * Check if a value represents missing/unavailable data.
 *
 * Treated as missing:
 *   - `null` and `undefined`
 *   - the numbers `0` and `NaN`
 *   - any value whose string form, trimmed and lowercased, is empty, is "0",
 *     or equals one of MISSING_PATTERNS
 *
 * Non-number values are compared through `String(val)`, so objects and arrays
 * are judged by their default string conversion (e.g. `[]` becomes "").
 *
 * @param val - Value of any type, typically a field taken from LLM output.
 * @returns `true` when the value should be treated as "no data".
 */
export function isMissingValue(val: unknown): boolean {
  if (val == null) return true;

  // Numbers never reach the string comparison: only 0 and NaN count as missing.
  if (typeof val === "number") return val === 0 || Number.isNaN(val);

  const normalized = String(val).trim().toLowerCase();
  if (normalized === "" || normalized === "0") return true;

  return MISSING_PATTERN_SET.has(normalized);
}

/**
 * Return `val` when it carries real data, otherwise return `fallback`.
 *
 * "Missing" is decided by `isMissingValue`, so `null`, `undefined`, `0`, `NaN`,
 * blank strings and the known "no data" phrases are all replaced by the fallback.
 *
 * Note: the declared return type `T | unknown` collapses to `unknown` for the
 * type checker, so callers must narrow the result before using it.
 *
 * @param val - Candidate value of any type.
 * @param fallback - Value returned in place of a missing `val`.
 * @returns `fallback` if `val` is missing, otherwise `val` unchanged.
 */
export function cleanValue<T>(val: unknown, fallback: T): T | unknown {
  if (isMissingValue(val)) {
    return fallback;
  }
  return val;
}

// ─── Report Quality Validation ───

/** Outcome of a report quality check: a score plus the fields found missing. */
export interface DataQualityReport {
  /** Overall quality score, 0-100. */
  score: number;
  /** Dot-separated paths of missing critical fields. */
  missingCritical: string[];
  /** Dot-separated paths of missing important fields. */
  missingImportant: string[];
  /** Dot-separated paths of missing optional fields. */
  missingOptional: string[];
  /** Human-readable warnings. */
  warnings: string[];
}

/**
 * Fields that MUST be present for a valid report.
 * Each entry is a dot-separated path into the report object.
 */
const CRITICAL_FIELDS: ReadonlyArray<string> = [
  "clinicInfo.name",
  "clinicInfo.established",
];

/**
 * Fields that significantly impact report quality.
 * Each entry is a dot-separated path into the report object.
 */
const IMPORTANT_FIELDS: ReadonlyArray<string> = [
  "clinicInfo.doctors",
  "channelAnalysis.youtube",
  "channelAnalysis.instagram",
  "channelAnalysis.naverBlog",
  "channelAnalysis.gangnamUnni",
];

/**
 * Nice-to-have fields.
 * Each entry is a dot-separated path into the report object.
 */
const OPTIONAL_FIELDS: ReadonlyArray<string> = [
  "channelAnalysis.facebook",
  "channelAnalysis.tiktok",
  "channelAnalysis.naverPlace",
  "channelAnalysis.googleMaps",
  "clinicInfo.location",
];

/**
 * Validate report data quality and return a score with details.
 *
 * Scoring (starting from 100, never below 0):
 *   - Each critical field missing: -20 points
 *   - Each important field missing: -5 points
 *   - Each optional field missing: -2 points
 *
 * Missing critical and important fields also add a human-readable warning;
 * missing optional fields are only listed in `missingOptional`.
 *
 * @param report - Report object; fields are looked up by dot-separated path.
 * @returns The score, the missing paths grouped by tier, and the warnings.
 */
export function validateReportQuality(
  report: Record<string, unknown>,
): DataQualityReport {
  const result: DataQualityReport = {
    score: 100,
    missingCritical: [],
    missingImportant: [],
    missingOptional: [],
    warnings: [],
  };

  // One row per tier, processed in order so warnings list critical fields
  // first. A tier without `warningLabel` is penalised silently.
  const tiers: ReadonlyArray<{
    fields: ReadonlyArray<string>;
    penalty: number;
    missing: string[];
    warningLabel?: string;
  }> = [
    { fields: CRITICAL_FIELDS, penalty: 20, missing: result.missingCritical, warningLabel: "❌ Critical" },
    { fields: IMPORTANT_FIELDS, penalty: 5, missing: result.missingImportant, warningLabel: "⚠️ Important" },
    { fields: OPTIONAL_FIELDS, penalty: 2, missing: result.missingOptional },
  ];

  for (const tier of tiers) {
    for (const path of tier.fields) {
      if (!isMissingValue(getNestedValue(report, path))) continue;

      tier.missing.push(path);
      result.score -= tier.penalty;
      if (tier.warningLabel !== undefined) {
        result.warnings.push(`${tier.warningLabel}: '${path}' is missing`);
      }
    }
  }

  // Clamp so the score stays within 0-100 even if more fields are added later.
  result.score = Math.max(0, result.score);
  return result;
}

// ─── Helpers ───

/**
 * Traverse a nested object by dot-separated path.
 * e.g., getNestedValue({ a: { b: 1 } }, "a.b") → 1
 *
 * @param obj - Root object to read from.
 * @param path - Dot-separated list of property keys, e.g. "clinicInfo.name".
 * @returns The value found at the end of the path, or `undefined` as soon as
 *   a step has to be taken on `null`, `undefined` or a non-object value.
 */
function getNestedValue(obj: Record<string, unknown>, path: string): unknown {
  let current: unknown = obj;

  for (const key of path.split(".")) {
    // Cannot descend into null/undefined or primitives; stop early.
    if (current == null || typeof current !== "object") return undefined;
    current = (current as Record<string, unknown>)[key];
  }

  return current;
}

// ─── Self-Test ───

/**
 * Self-test corpus for `isMissingValue`: `[input, expected result]` pairs.
 * `true` means the input must be reported as missing.
 */
const MISSING_VALUE_TEST_CORPUS: ReadonlyArray<readonly [unknown, boolean]> = [
  // Missing values
  [null, true],
  [undefined, true],
  ["", true],
  [" ", true],
  [0, true],
  [NaN, true],
  ["0", true],
  ["데이터 없음", true],
  ["데이터없음", true],
  ["N/A", true],
  ["n/a", true],
  ["  N/A  ", true], // surrounding whitespace is trimmed
  ["확인 불가", true],
  ["미확인", true],
  ["unknown", true],
  ["UNKNOWN", true], // comparison is case-insensitive
  ["-", true],
  ["—", true],
  ["–", true],
  ["...", true],
  ["none", true],
  // Valid values
  ["뷰성형외과", false],
  [4.5, false],
  ["2004", false],
  [387, false],
  ["https://example.com", false],
];

/**
 * Self-test: run `isMissingValue` over MISSING_VALUE_TEST_CORPUS and collect
 * every case whose result differs from the expected one.
 *
 * @returns `pass` is `true` when no case failed; `failures` holds one
 *   description per failing case.
 */
export function validateDataQuality(): { pass: boolean; failures: string[] } {
  const failures: string[] = [];

  for (const [val, expected] of MISSING_VALUE_TEST_CORPUS) {
    const actual = isMissingValue(val);
    if (actual === expected) continue;

    // JSON.stringify renders NaN/Infinity as "null"; print numbers directly so
    // the failing input is identifiable. Finite numbers format the same way.
    const shown = typeof val === "number" ? String(val) : JSON.stringify(val);
    failures.push(`isMissingValue(${shown}): expected ${expected}, got ${actual}`);
  }

  return { pass: failures.length === 0, failures };
}