"""10종 메타 태그 분류체계 + 38개 케이스 로더. PDF IV/IX장 그대로 JSON으로 보관 (data/taxonomy/). 부팅 시 1회 로드. """ from __future__ import annotations import json import logging from dataclasses import dataclass, field from pathlib import Path from typing import Iterable from app.api.schemas import LegalTag logger = logging.getLogger(__name__) @dataclass(frozen=True) class MetaTagDef: id: str label_ko: str category: str law_ref: str scope: str description: str @dataclass(frozen=True) class CaseDef: case_id: str # A1, A7, B1, ... old_no: int subgroup: str title: str actor: str primary_tags: tuple[str, ...] secondary_tags: tuple[str, ...] = field(default_factory=tuple) detectable_internal: bool = False high_risk: bool = False note: str | None = None @dataclass(frozen=True) class Taxonomy: meta_tags_version: str cases_version: str meta_tags: tuple[MetaTagDef, ...] cases: tuple[CaseDef, ...] def tag_label(self, tag_id: str) -> str: for t in self.meta_tags: if t.id == tag_id: return t.label_ko return tag_id def find_case(self, primary_tags: Iterable[str]) -> CaseDef | None: """주 태그 조합이 가장 잘 매칭되는 케이스 추정. 완벽 일치 우선, 없으면 부분 일치 (jaccard). 내부 검출 가능 케이스(A1~A5, A24, A25, B1, B2, D1)에 가중치. """ target = set(primary_tags) if not target: return None best: CaseDef | None = None best_score = -1.0 for c in self.cases: primary = set(c.primary_tags) if not primary: continue inter = len(target & primary) union = len(target | primary) score = inter / max(1, union) if c.detectable_internal: score += 0.1 if score > best_score: best_score = score best = c return best if best_score > 0.3 else None def load_taxonomy(taxonomy_dir: Path) -> Taxonomy | None: mt_path = taxonomy_dir / "meta_tags_v1.0.json" cs_path = taxonomy_dir / "cases_v1.2.json" if not mt_path.exists() or not cs_path.exists(): logger.warning("Taxonomy files not found in %s", taxonomy_dir) return None mt_data = json.loads(mt_path.read_text(encoding="utf-8")) cs_data = json.loads(cs_path.read_text(encoding="utf-8")) meta_tags = tuple( MetaTagDef( id=t["id"], label_ko=t["label_ko"], category=t["category"], law_ref=t["law_ref"], scope=t["scope"], description=t["description"], ) for t in mt_data["tags"] ) cases = tuple( CaseDef( case_id=c["case_id"], old_no=c.get("old_no", 0), subgroup=c.get("subgroup", ""), title=c["title"], actor=c.get("actor", ""), primary_tags=tuple(c.get("primary_tags", [])), secondary_tags=tuple(c.get("secondary_tags", [])), detectable_internal=c.get("detectable_internal", False), high_risk=c.get("high_risk", False), note=c.get("note"), ) for c in cs_data["cases"] ) tax = Taxonomy( meta_tags_version=mt_data["version"], cases_version=cs_data["version"], meta_tags=meta_tags, cases=cases, ) logger.info("Taxonomy loaded: tags=%s cases=%s (%d tags, %d cases)", tax.meta_tags_version, tax.cases_version, len(meta_tags), len(cases)) return tax