# Source: o2o-plagiarism-ai/app/engine/taxonomy.py (124 lines, 3.6 KiB, Python at capture time)

"""10종 메타 태그 분류체계 + 38개 케이스 로더.
PDF IV/IX장 그대로 JSON으로 보관 (data/taxonomy/). 부팅 시 1회 로드.
"""
from __future__ import annotations
import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable
from app.api.schemas import LegalTag
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
@dataclass(frozen=True)
class MetaTagDef:
    """One meta-tag entry of the taxonomy, immutable once constructed.

    Every field is a plain string. The precise meaning of ``category``,
    ``law_ref``, ``scope`` and ``description`` is defined by the taxonomy
    data itself, not by this class.
    """

    # Tag identifier.
    id: str
    # Display label (presumably Korean, going by the ``_ko`` suffix).
    label_ko: str
    # Grouping of the tag.
    category: str
    # NOTE(review): looks like a legal-provision reference -- confirm.
    law_ref: str
    # Applicability scope of the tag.
    scope: str
    # Free-text description of the tag.
    description: str
@dataclass(frozen=True)
class CaseDef:
    """One case entry of the taxonomy, immutable once constructed.

    The first six fields are required; the remaining four are optional and
    default to "empty" values.
    """

    # Case identifier, e.g. A1, A7, B1, ...
    case_id: str
    # NOTE(review): presumably the case number in an older numbering -- confirm.
    old_no: int
    subgroup: str
    title: str
    actor: str
    # Ids of the primary tags attached to this case.
    primary_tags: tuple[str, ...]
    # Ids of the secondary tags; empty tuple when none are given.
    secondary_tags: tuple[str, ...] = field(default_factory=tuple)
    # NOTE(review): presumably marks cases detectable by internal checks -- confirm.
    detectable_internal: bool = False
    high_risk: bool = False
    # Optional free-text remark.
    note: str | None = None
@dataclass(frozen=True)
class Taxonomy:
    """Immutable bundle of the meta-tag definitions and the case catalogue.

    Holds the version string of each source document together with the
    parsed tag and case definitions.
    """

    meta_tags_version: str
    cases_version: str
    meta_tags: tuple[MetaTagDef, ...]
    cases: tuple[CaseDef, ...]

    def tag_label(self, tag_id: str) -> str:
        """Return the ``label_ko`` of the tag whose id is ``tag_id``.

        Falls back to returning ``tag_id`` unchanged when no tag matches.
        """
        return next((t.label_ko for t in self.meta_tags if t.id == tag_id), tag_id)

    def find_case(self, primary_tags: Iterable[str]) -> CaseDef | None:
        """Guess the case whose primary tags best match ``primary_tags``.

        Ranking rules, in order:

        1. A case whose primary-tag set equals the requested set always
           beats a partial match.
        2. Otherwise the higher score wins, where the score is the Jaccard
           similarity of the two tag sets plus a 0.1 bonus for cases
           flagged ``detectable_internal``.
        3. On a full tie the case that appears first in ``self.cases`` wins.

        Args:
            primary_tags: Tag ids to match. A bare ``str`` is treated as a
                single tag id rather than being split into characters.

        Returns:
            The best-ranked case, or ``None`` when ``primary_tags`` is empty
            or the best score (bonus included) does not exceed 0.3.
        """
        # Guard against the classic Iterable[str] pitfall: set("AB") would
        # yield {"A", "B"} instead of {"AB"}.
        if isinstance(primary_tags, str):
            primary_tags = (primary_tags,)
        target = frozenset(primary_tags)
        if not target:
            return None

        best: CaseDef | None = None
        # Rank is (is_exact_match, score); tuple comparison makes exactness
        # dominate, so a bonus-boosted partial match can never displace it.
        best_rank = (False, -1.0)
        for case in self.cases:
            primary = frozenset(case.primary_tags)
            if not primary:
                # Cases without primary tags can never be matched.
                continue
            # The union is non-empty here because ``target`` is non-empty.
            score = len(target & primary) / len(target | primary)
            if case.detectable_internal:
                score += 0.1
            rank = (primary == target, score)
            # Strict comparison keeps the earliest case on ties.
            if rank > best_rank:
                best, best_rank = case, rank
        return best if best_rank[1] > 0.3 else None
def load_taxonomy(
    taxonomy_dir: Path,
    *,
    meta_tags_file: str = "meta_tags_v1.0.json",
    cases_file: str = "cases_v1.2.json",
) -> Taxonomy | None:
    """Load the meta-tag and case definitions from two JSON files.

    Args:
        taxonomy_dir: Directory that contains both JSON files.
        meta_tags_file: File name of the meta-tag document. It must contain
            a ``"version"`` value and a ``"tags"`` list whose entries carry
            every ``MetaTagDef`` field.
        cases_file: File name of the case document. It must contain a
            ``"version"`` value and a ``"cases"`` list; each entry needs at
            least ``"case_id"`` and ``"title"``, other keys are optional.

    Returns:
        The parsed ``Taxonomy``, or ``None`` (after logging a warning) when
        either file is missing or is not a regular file.

    Raises:
        json.JSONDecodeError: If a file does not contain valid JSON.
        KeyError: If a required key is absent from the parsed data.
    """
    taxonomy_dir = Path(taxonomy_dir)
    mt_path = taxonomy_dir / meta_tags_file
    cs_path = taxonomy_dir / cases_file
    # is_file() rather than exists(): a directory with the expected name
    # would pass exists() and then blow up in read_text().
    if not (mt_path.is_file() and cs_path.is_file()):
        logger.warning("Taxonomy files not found in %s", taxonomy_dir)
        return None

    mt_data = json.loads(mt_path.read_text(encoding="utf-8"))
    cs_data = json.loads(cs_path.read_text(encoding="utf-8"))

    # Every meta-tag key is mandatory; a missing one raises KeyError.
    meta_tags = tuple(
        MetaTagDef(
            id=raw["id"],
            label_ko=raw["label_ko"],
            category=raw["category"],
            law_ref=raw["law_ref"],
            scope=raw["scope"],
            description=raw["description"],
        )
        for raw in mt_data["tags"]
    )
    # Only case_id and title are mandatory for a case; the rest fall back
    # to neutral defaults.
    cases = tuple(
        CaseDef(
            case_id=raw["case_id"],
            old_no=raw.get("old_no", 0),
            subgroup=raw.get("subgroup", ""),
            title=raw["title"],
            actor=raw.get("actor", ""),
            primary_tags=tuple(raw.get("primary_tags", [])),
            secondary_tags=tuple(raw.get("secondary_tags", [])),
            detectable_internal=raw.get("detectable_internal", False),
            high_risk=raw.get("high_risk", False),
            note=raw.get("note"),
        )
        for raw in cs_data["cases"]
    )

    tax = Taxonomy(
        meta_tags_version=mt_data["version"],
        cases_version=cs_data["version"],
        meta_tags=meta_tags,
        cases=cases,
    )
    logger.info(
        "Taxonomy loaded: tags=%s cases=%s (%d tags, %d cases)",
        tax.meta_tags_version, tax.cases_version, len(meta_tags), len(cases),
    )
    return tax