124 lines
3.6 KiB
Python
124 lines
3.6 KiB
Python
"""Loader for the 10-kind meta-tag taxonomy and the 38 cases.

Chapters IV/IX of the PDF are kept as-is in JSON (data/taxonomy/) and loaded once at boot.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Iterable
|
|
|
|
from app.api.schemas import LegalTag
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass(frozen=True)
class MetaTagDef:
    """One meta-tag definition record.

    Instances are immutable (frozen dataclass). Every field is a plain
    string; `load_taxonomy` fills them from the same-named keys of each
    entry in the meta-tag JSON file.
    """

    # Identifier that `Taxonomy.tag_label` matches against.
    id: str
    # Label returned by `Taxonomy.tag_label` for this tag.
    label_ko: str
    category: str
    # NOTE(review): presumably a reference to the governing legal provision -- confirm.
    law_ref: str
    scope: str
    description: str
|
|
|
|
|
|
@dataclass(frozen=True)
class CaseDef:
    """One case record of the taxonomy.

    Instances are immutable (frozen dataclass). The first six fields are
    required; the remaining four have defaults, so a case can be built
    from its identifying data and primary tags alone.
    """

    # Case identifier, e.g. A1, A7, B1, ...
    case_id: str
    # NOTE(review): presumably the case's number in an earlier numbering -- confirm.
    old_no: int
    subgroup: str
    title: str
    actor: str
    # Tag ids compared against the query in `Taxonomy.find_case`.
    primary_tags: tuple[str, ...]
    # Optional extra tag ids; defaults to an empty tuple.
    secondary_tags: tuple[str, ...] = field(default_factory=tuple)
    # When True, `Taxonomy.find_case` adds a score bonus for this case.
    detectable_internal: bool = False
    high_risk: bool = False
    # Free-form remark, or None when there is none.
    note: str | None = None
|
|
|
|
|
|
@dataclass(frozen=True)
class Taxonomy:
    """Immutable bundle of the meta-tag definitions and the case catalogue.

    Attributes:
        meta_tags_version: Version string of the meta-tag data.
        cases_version: Version string of the case data.
        meta_tags: Meta-tag definitions, searched by `tag_label`.
        cases: Case definitions, scanned in order by `find_case`.
    """

    meta_tags_version: str
    cases_version: str
    meta_tags: tuple[MetaTagDef, ...]
    cases: tuple[CaseDef, ...]

    def tag_label(self, tag_id: str) -> str:
        """Return the ``label_ko`` of the meta tag whose id is *tag_id*.

        Falls back to returning *tag_id* itself when no meta tag has that id.
        """
        return next(
            (tag.label_ko for tag in self.meta_tags if tag.id == tag_id),
            tag_id,
        )

    def find_case(self, primary_tags: Iterable[str]) -> CaseDef | None:
        """Estimate the case whose primary tags best match *primary_tags*.

        Each case is scored by the Jaccard similarity between its primary
        tag set and the query set; cases flagged ``detectable_internal``
        (A1~A5, A24, A25, B1, B2, D1) receive a +0.1 bonus. A case whose
        primary tags equal the query exactly always outranks any partial
        match, regardless of the bonus. Ties go to the earliest case in
        ``self.cases``.

        Args:
            primary_tags: Tag ids to match; duplicates are ignored.

        Returns:
            The best-matching case, or None when the query is empty or the
            best score (bonus included) does not exceed 0.3.
        """
        target = set(primary_tags)
        if not target:
            return None

        best: CaseDef | None = None
        # Ranking key is (is_exact_match, score): comparing the exactness flag
        # first keeps the bonus from lifting a partial match above an exact one.
        best_key = (False, -1.0)
        for case in self.cases:
            primary = set(case.primary_tags)
            if not primary:
                continue
            # The union is never empty here because `target` is non-empty.
            score = len(target & primary) / len(target | primary)
            if case.detectable_internal:
                score += 0.1
            key = (primary == target, score)
            if key > best_key:
                best_key = key
                best = case
        return best if best_key[1] > 0.3 else None
|
|
|
|
|
|
def load_taxonomy(taxonomy_dir: Path) -> Taxonomy | None:
    """Build a `Taxonomy` from the two JSON files inside *taxonomy_dir*.

    Reads ``meta_tags_v1.0.json`` and ``cases_v1.2.json`` as UTF-8. The
    meta-tag file must provide ``version`` and a ``tags`` list whose
    entries carry every `MetaTagDef` field. The case file must provide
    ``version`` and a ``cases`` list whose entries carry at least
    ``case_id`` and ``title``; the other case keys are optional and fall
    back to defaults.

    Args:
        taxonomy_dir: Directory expected to contain both JSON files.

    Returns:
        The loaded taxonomy, or None (after logging a warning) when either
        file does not exist.

    Raises:
        KeyError: If a required key is absent from the parsed JSON.
        json.JSONDecodeError: If a file is not valid JSON.
    """
    tags_file = taxonomy_dir / "meta_tags_v1.0.json"
    cases_file = taxonomy_dir / "cases_v1.2.json"
    if not (tags_file.exists() and cases_file.exists()):
        logger.warning("Taxonomy files not found in %s", taxonomy_dir)
        return None

    tags_doc = json.loads(tags_file.read_text(encoding="utf-8"))
    cases_doc = json.loads(cases_file.read_text(encoding="utf-8"))

    def _to_tag(raw):
        # Every meta-tag key is mandatory; a missing one raises KeyError.
        return MetaTagDef(
            id=raw["id"],
            label_ko=raw["label_ko"],
            category=raw["category"],
            law_ref=raw["law_ref"],
            scope=raw["scope"],
            description=raw["description"],
        )

    def _to_case(raw):
        # Only case_id and title are mandatory; the rest fall back to defaults.
        return CaseDef(
            case_id=raw["case_id"],
            old_no=raw.get("old_no", 0),
            subgroup=raw.get("subgroup", ""),
            title=raw["title"],
            actor=raw.get("actor", ""),
            primary_tags=tuple(raw.get("primary_tags", [])),
            secondary_tags=tuple(raw.get("secondary_tags", [])),
            detectable_internal=raw.get("detectable_internal", False),
            high_risk=raw.get("high_risk", False),
            note=raw.get("note"),
        )

    meta_tags = tuple(map(_to_tag, tags_doc["tags"]))
    cases = tuple(map(_to_case, cases_doc["cases"]))

    tax = Taxonomy(
        meta_tags_version=tags_doc["version"],
        cases_version=cases_doc["version"],
        meta_tags=meta_tags,
        cases=cases,
    )
    logger.info(
        "Taxonomy loaded: tags=%s cases=%s (%d tags, %d cases)",
        tax.meta_tags_version,
        tax.cases_version,
        len(meta_tags),
        len(cases),
    )
    return tax
|