# o2o-plagiarism-ai/app/engine/corpus.py

"""레퍼런스 코퍼스 로더 + 관리(CRUD).

저장 형식: data/reference/<doc_id>__<title>.txt
컴북스가 자서전을 업로드하면 이 디렉토리에 파일 생성 → detector 재빌드 트리거.
"""

from __future__ import annotations

import logging
import re
import uuid
from dataclasses import dataclass
from pathlib import Path

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class ReferenceDoc:
    """Immutable record describing one reference document of the corpus."""

    # Identifier; load_corpus derives it from the file-name part before the first "__".
    doc_id: str
    # Title; load_corpus derives it from the file-name part after the first "__"
    # (or reuses the whole stem when the name contains no "__").
    title: str
    # Body text of the document.
    text: str


def load_corpus(directory: Path) -> list[ReferenceDoc]:
    """Load every ``*.txt`` reference document found directly in *directory*.

    File names are expected to look like ``<doc_id>__<title>.txt``; a stem
    without ``"__"`` is used as both the id and the title. Files are visited
    in sorted path order.

    Loading is best-effort: an entry that cannot be read (not valid UTF-8, a
    directory that happens to match ``*.txt``, a permission problem, ...) or
    whose content is blank after stripping is skipped instead of aborting the
    whole load.

    Args:
        directory: Directory holding the reference corpus.

    Returns:
        The loaded documents; an empty list when *directory* does not exist.
    """
    if not directory.exists():
        logger.warning("Reference corpus dir %s does not exist; running with empty corpus", directory)
        return []

    docs: list[ReferenceDoc] = []
    for path in sorted(directory.glob("*.txt")):
        try:
            text = path.read_text(encoding="utf-8").strip()
        except UnicodeDecodeError:
            logger.warning("Skipping non-utf8 file: %s", path)
            continue
        except OSError as exc:
            # One unreadable entry must not take the whole corpus down.
            logger.warning("Skipping unreadable file: %s (%s)", path, exc)
            continue
        if not text:
            continue
        # partition() returns an empty separator when "__" is absent.
        doc_id, sep, title = path.stem.partition("__")
        if not sep:
            title = doc_id
        docs.append(ReferenceDoc(doc_id=doc_id, title=title, text=text))

    logger.info("Loaded %d reference docs from %s", len(docs), directory)
    return docs


# ---------- CRUD ----------

# Matches every character that is not allowed in a generated file name.
# NOTE: in a str pattern ``\w`` is Unicode-aware, so word characters of any
# script (not only Hangul and ASCII alphanumerics) are left untouched.
_INVALID_FN = re.compile(r"[^\w가-힣\-_.]")


def _safe_filename(s: str) -> str:
    """Return *s* made safe for use as a file-name component.

    Every character matched by ``_INVALID_FN`` is replaced with ``"_"``;
    afterwards leading and trailing ``"."`` and ``"_"`` characters are removed.
    """
    replaced = re.sub(_INVALID_FN, "_", s)
    return replaced.strip("._")


def _path_for(directory: Path, doc_id: str, title: str) -> Path:
    """Build the storage path ``<directory>/<doc_id>__<title>.txt``.

    Both name parts are passed through ``_safe_filename`` before being joined.
    """
    safe_id = _safe_filename(doc_id)
    safe_title = _safe_filename(title)
    return directory.joinpath(f"{safe_id}__{safe_title}.txt")


def add_document(directory: Path, doc_id: str | None, title: str, text: str) -> ReferenceDoc:
    """Store a new reference document as a UTF-8 text file in *directory*.

    Args:
        directory: Corpus directory; created (including parents) if missing.
        doc_id: Identifier of the document. ``None`` or a blank string means
            "generate one" (``corpus-`` followed by 8 hex characters).
        title: Title of the document. A missing or blank title falls back to
            the document id.
        text: Document body; surrounding whitespace is stripped before saving.

    Returns:
        A ``ReferenceDoc`` holding the stripped id, title and text.

    Raises:
        ValueError: If *text* is empty after stripping.
        FileExistsError: If a file for the same (sanitised) id already exists.
    """
    directory.mkdir(parents=True, exist_ok=True)

    if not doc_id or not doc_id.strip():
        doc_id = f"corpus-{uuid.uuid4().hex[:8]}"
    doc_id = doc_id.strip()
    # Strip before falling back, so a whitespace-only title does not end up as
    # an empty title (and an "<id>__.txt" file name).
    title = (title or "").strip() or doc_id
    text = text.strip()
    if not text:
        raise ValueError("text is empty")

    # Duplicate check: any existing file whose name starts with "<safe id>__".
    existing = next(directory.glob(f"{_safe_filename(doc_id)}__*.txt"), None)
    if existing is not None:
        raise FileExistsError(f"doc_id '{doc_id}' already exists at {existing}")

    target = _path_for(directory, doc_id, title)
    target.write_text(text, encoding="utf-8")
    logger.info("Added corpus doc: %s (%s) → %s", doc_id, title, target.name)
    return ReferenceDoc(doc_id=doc_id, title=title, text=text)


def delete_document(directory: Path, doc_id: str) -> bool:
    """Remove every stored file that belongs to *doc_id*.

    Files are matched inside *directory* by the pattern
    ``<sanitised doc_id>__*.txt``.

    Returns:
        ``True`` if at least one file was removed, ``False`` otherwise.
    """
    pattern = f"{_safe_filename(doc_id)}__*.txt"
    removed = 0
    for match in directory.glob(pattern):
        match.unlink()
        removed += 1
        logger.info("Deleted corpus doc: %s", match.name)
    return removed > 0


def list_documents(directory: Path) -> list[dict]:
    """Return lightweight metadata (no body text) for each ``*.txt`` file.

    Entries are ordered by sorted path. Each entry is a dict with the keys
    ``doc_id``, ``title``, ``size_bytes`` and ``filename``. The id and title
    come from splitting the file stem at the first ``"__"``; a stem without
    it is used for both. A file whose size cannot be read is reported with
    ``size_bytes`` 0.

    Returns:
        The metadata list; empty when *directory* does not exist.
    """
    if not directory.exists():
        return []

    def _describe(path: Path) -> dict:
        # partition() returns an empty separator when "__" is absent.
        doc_id, sep, title = path.stem.partition("__")
        if not sep:
            title = doc_id
        try:
            size = path.stat().st_size
        except OSError:
            size = 0
        return {"doc_id": doc_id, "title": title, "size_bytes": size, "filename": path.name}

    return [_describe(path) for path in sorted(directory.glob("*.txt"))]