# o2o-plagiarism-ai/app/engine/corpus.py
#
# 115 lines
# 3.4 KiB
# Python

"""레퍼런스 코퍼스 로더 + 관리(CRUD).
저장 형식: data/reference/<doc_id>__<title>.txt
컴북스가 자서전을 업로드하면 이 디렉토리에 파일 생성 → detector 재빌드 트리거.
"""
from __future__ import annotations
import logging
import re
import uuid
from dataclasses import dataclass
from pathlib import Path
logger = logging.getLogger(__name__)
@dataclass(frozen=True)
class ReferenceDoc:
    """Immutable record for one reference document of the corpus."""

    # Identifier of the document (the part of the file stem before the first "__").
    doc_id: str
    # Title of the document (the part of the file stem after the first "__").
    title: str
    # Document body.
    text: str
def load_corpus(directory: Path) -> list[ReferenceDoc]:
    """Load every non-empty ``*.txt`` reference document found in ``directory``.

    File names are expected to look like ``<doc_id>__<title>.txt``: the stem is
    split on the first ``__``. A stem without ``__`` is used as both the
    ``doc_id`` and the ``title``.

    Files are visited in sorted path order. A file that is not valid UTF-8 or
    cannot be read at all is skipped with a warning, so that a single bad entry
    does not abort the whole load. A file whose content is empty after
    stripping surrounding whitespace is skipped silently.

    Args:
        directory: Directory that holds the reference ``*.txt`` files.

    Returns:
        The loaded documents; an empty list when ``directory`` does not exist.
    """
    if not directory.exists():
        logger.warning("Reference corpus dir %s does not exist; running with empty corpus", directory)
        return []

    docs: list[ReferenceDoc] = []
    for path in sorted(directory.glob("*.txt")):
        doc_id, sep, title = path.stem.partition("__")
        if not sep:
            # No "__" separator: the whole stem serves as id and title.
            title = doc_id
        try:
            text = path.read_text(encoding="utf-8").strip()
        except UnicodeDecodeError:
            logger.warning("Skipping non-utf8 file: %s", path)
            continue
        except OSError as exc:
            # E.g. permission denied, or a directory whose name ends in ".txt".
            logger.warning("Skipping unreadable file: %s (%s)", path, exc)
            continue
        if not text:
            continue
        docs.append(ReferenceDoc(doc_id=doc_id, title=title, text=text))

    logger.info("Loaded %d reference docs from %s", len(docs), directory)
    return docs
# ---------- CRUD ----------
# Matches every character that is NOT a word character, a Hangul syllable,
# a hyphen, an underscore or a dot.
_INVALID_FN = re.compile(r"[^\w가-힣\-_.]")


def _safe_filename(s: str) -> str:
    """Return ``s`` reduced to characters that are safe inside a file name.

    Each character matched by ``_INVALID_FN`` is replaced with ``_``; leading
    and trailing dots and underscores are then stripped. The result may be an
    empty string when nothing else is left.
    """
    replaced = _INVALID_FN.sub("_", s)
    return replaced.strip("._")
def _path_for(directory: Path, doc_id: str, title: str) -> Path:
    """Build the on-disk path ``<directory>/<safe doc_id>__<safe title>.txt``."""
    safe_id = _safe_filename(doc_id)
    safe_title = _safe_filename(title)
    return directory.joinpath(f"{safe_id}__{safe_title}.txt")
def add_document(directory: Path, doc_id: str | None, title: str, text: str) -> ReferenceDoc:
    """Add a new reference document to the corpus directory.

    The document is written as ``<safe doc_id>__<safe title>.txt`` (UTF-8)
    inside ``directory``, which is created if necessary.

    Args:
        directory: Corpus directory.
        doc_id: Document identifier. When ``None`` or blank, an id of the form
            ``corpus-<8 hex chars>`` is generated.
        title: Document title. When empty or blank, ``doc_id`` is used instead.
        text: Document body; surrounding whitespace is stripped.

    Returns:
        A ``ReferenceDoc`` holding the stripped ``doc_id``, ``title`` and ``text``.

    Raises:
        ValueError: If ``text`` is empty after stripping, or if ``doc_id``
            contains no character that survives file-name sanitisation.
        FileExistsError: If a file for the same ``doc_id`` already exists.
    """
    if not doc_id or not doc_id.strip():
        doc_id = f"corpus-{uuid.uuid4().hex[:8]}"
    doc_id = doc_id.strip()
    # A whitespace-only title falls back to the id instead of becoming "".
    title = (title or "").strip() or doc_id
    text = text.strip()
    # Validate before touching the file system so bad input leaves no trace.
    if not text:
        raise ValueError("text is empty")

    safe_id = _safe_filename(doc_id)
    if not safe_id:
        # Would produce "__<title>.txt", which is read back with an empty doc_id.
        raise ValueError(f"doc_id '{doc_id}' has no filename-safe characters")

    directory.mkdir(parents=True, exist_ok=True)

    # Duplicate check: "<id>__*.txt", plus a bare "<id>.txt", because a stem
    # without "__" is listed and loaded under that same doc_id.
    existing = next(iter(directory.glob(f"{safe_id}__*.txt")), None)
    if existing is None:
        legacy = directory / f"{safe_id}.txt"
        if legacy.exists():
            existing = legacy
    if existing is not None:
        raise FileExistsError(f"doc_id '{doc_id}' already exists at {existing}")

    target = _path_for(directory, doc_id, title)
    target.write_text(text, encoding="utf-8")
    logger.info("Added corpus doc: %s (%s) → %s", doc_id, title, target.name)
    return ReferenceDoc(doc_id=doc_id, title=title, text=text)
def delete_document(directory: Path, doc_id: str) -> bool:
    """Delete every corpus file that belongs to ``doc_id``.

    Matches ``<safe doc_id>__*.txt`` and also a bare ``<safe doc_id>.txt``,
    because a file whose stem has no ``__`` is listed and loaded with the stem
    as its ``doc_id`` and would otherwise be impossible to delete.

    Args:
        directory: Corpus directory.
        doc_id: Identifier of the document to remove.

    Returns:
        ``True`` if at least one file was deleted, ``False`` otherwise.
    """
    safe = _safe_filename(doc_id)

    # Materialise the matches first so nothing is unlinked mid-iteration.
    targets = list(directory.glob(f"{safe}__*.txt"))
    if safe:
        legacy = directory / f"{safe}.txt"
        if legacy.is_file():
            targets.append(legacy)

    deleted = False
    for path in targets:
        path.unlink()
        deleted = True
        logger.info("Deleted corpus doc: %s", path.name)
    return deleted
def list_documents(directory: Path) -> list[dict]:
    """Return a lightweight listing of the corpus (document bodies excluded).

    Each ``*.txt`` file in ``directory`` yields one dict with the keys
    ``doc_id``, ``title``, ``size_bytes`` and ``filename``, in sorted path
    order. The stem is split on the first ``__``; a stem without ``__`` is
    used as both id and title. ``size_bytes`` is 0 when the file cannot be
    stat'ed. A missing directory yields an empty list.
    """
    if not directory.exists():
        return []

    listing = []
    for entry in sorted(directory.glob("*.txt")):
        ident, sep, name = entry.stem.partition("__")
        if not sep:
            # No separator present: the whole stem doubles as the title.
            name = ident
        try:
            n_bytes = entry.stat().st_size
        except OSError:
            n_bytes = 0
        listing.append(
            {"doc_id": ident, "title": name, "size_bytes": n_bytes, "filename": entry.name}
        )
    return listing