"""Reference corpus loader and management (CRUD).

Storage format: ``data/reference/{doc_id}__{title}.txt``

When ComBooks uploads an autobiography, a file is created in this directory,
which triggers a detector rebuild.
"""
from __future__ import annotations

import logging
import re
import uuid
from dataclasses import dataclass
from pathlib import Path

logger = logging.getLogger(__name__)

# Separator between the doc_id part and the title part of a corpus filename.
_ID_TITLE_SEP = "__"


@dataclass(frozen=True)
class ReferenceDoc:
    """One reference document: its id, its title and its stripped body text."""

    doc_id: str
    title: str
    text: str


def _split_stem(stem: str) -> tuple[str, str]:
    """Split a filename stem into ``(doc_id, title)``.

    The stem is split on the first ``__``. A stem without the separator is
    used as both the id and the title.
    """
    doc_id, sep, title = stem.partition(_ID_TITLE_SEP)
    if not sep:
        return stem, stem
    return doc_id, title


def load_corpus(directory: Path) -> list[ReferenceDoc]:
    """Load every ``*.txt`` file in *directory* as a :class:`ReferenceDoc`.

    Files are visited in sorted path order. Files that are not valid UTF-8,
    cannot be read, or are empty after stripping whitespace are skipped.

    Returns:
        The loaded documents; an empty list when *directory* does not exist.
    """
    if not directory.exists():
        logger.warning("Reference corpus dir %s does not exist; running with empty corpus", directory)
        return []

    docs: list[ReferenceDoc] = []
    for path in sorted(directory.glob("*.txt")):
        try:
            text = path.read_text(encoding="utf-8").strip()
        except UnicodeDecodeError:
            logger.warning("Skipping non-utf8 file: %s", path)
            continue
        except OSError as exc:
            # e.g. a directory named "*.txt" or a permission problem: one bad
            # entry should not abort loading the rest of the corpus.
            logger.warning("Skipping unreadable file %s: %s", path, exc)
            continue
        if not text:
            continue
        doc_id, title = _split_stem(path.stem)
        docs.append(ReferenceDoc(doc_id=doc_id, title=title, text=text))

    logger.info("Loaded %d reference docs from %s", len(docs), directory)
    return docs


# ---------- CRUD ----------

_INVALID_FN = re.compile(r"[^\w가-힣\-_.]")
_UNDERSCORE_RUN = re.compile(r"_{2,}")


def _safe_filename(s: str) -> str:
    """Make *s* filename-safe: only Korean/alphanumerics and a few symbols.

    Every other character becomes ``_``; leading and trailing ``.`` and ``_``
    are stripped.
    """
    return _INVALID_FN.sub("_", s).strip("._")


def _safe_doc_id(doc_id: str) -> str:
    """Return the on-disk form of *doc_id*.

    Same as :func:`_safe_filename`, but runs of underscores are collapsed to a
    single ``_``. This keeps the id free of the ``__`` separator, so the first
    ``__`` in a filename is always the id/title boundary and a glob for one id
    cannot match a file that belongs to another id.
    """
    return _UNDERSCORE_RUN.sub("_", _safe_filename(doc_id))


def _path_for(directory: Path, doc_id: str, title: str) -> Path:
    """Build the storage path ``{doc_id}__{title}.txt`` inside *directory*."""
    fn = f"{_safe_doc_id(doc_id)}{_ID_TITLE_SEP}{_safe_filename(title)}.txt"
    return directory / fn


def add_document(directory: Path, doc_id: str | None, title: str, text: str) -> ReferenceDoc:
    """Add a new autobiography to the corpus directory.

    Args:
        directory: Corpus directory; created (with parents) if missing.
        doc_id: Document id. When ``None`` or blank, an id of the form
            ``corpus-<8 hex chars>`` is generated.
        title: Document title; falls back to the id when empty or blank.
        text: Document body; stored stripped of surrounding whitespace.

    Returns:
        A ``ReferenceDoc`` holding the stripped id, title and text as given by
        the caller. Note that the filename uses the sanitized forms, so
        ``load_corpus`` / ``list_documents`` report the sanitized id and title.

    Raises:
        ValueError: If *text* is empty after stripping, or *doc_id* contains
            no filename-safe characters.
        FileExistsError: If a document with the same id already exists.
    """
    directory.mkdir(parents=True, exist_ok=True)
    if not doc_id or not doc_id.strip():
        doc_id = f"corpus-{uuid.uuid4().hex[:8]}"
    doc_id = doc_id.strip()
    # A whitespace-only title is treated like a missing one.
    title = (title or "").strip() or doc_id
    text = text.strip()
    if not text:
        raise ValueError("text is empty")

    safe_id = _safe_doc_id(doc_id)
    if not safe_id:
        # Would otherwise produce "__title.txt", i.e. a document with no id.
        raise ValueError(f"doc_id '{doc_id}' has no filename-safe characters")

    # Duplicate check: any file already stored under this id.
    existing = next(directory.glob(f"{safe_id}{_ID_TITLE_SEP}*.txt"), None)
    if existing is not None:
        raise FileExistsError(f"doc_id '{doc_id}' already exists at {existing}")

    target = _path_for(directory, doc_id, title)
    # Exclusive create: never overwrite a file that appeared after the check.
    with target.open("x", encoding="utf-8") as fh:
        fh.write(text)
    logger.info("Added corpus doc: %s (%s) → %s", doc_id, title, target.name)
    return ReferenceDoc(doc_id=doc_id, title=title, text=text)


def delete_document(directory: Path, doc_id: str) -> bool:
    """Delete the file(s) stored under *doc_id*.

    Returns:
        ``True`` if at least one file was deleted, ``False`` otherwise.
    """
    safe = _safe_doc_id(doc_id)
    deleted = False
    # Materialize the matches first so files are not unlinked while the
    # directory is still being scanned.
    for path in sorted(directory.glob(f"{safe}{_ID_TITLE_SEP}*.txt")):
        path.unlink()
        deleted = True
        logger.info("Deleted corpus doc: %s", path.name)
    return deleted


def list_documents(directory: Path) -> list[dict]:
    """Return a lightweight listing of the corpus (body text not included).

    Each entry has the keys ``doc_id``, ``title``, ``size_bytes`` and
    ``filename``. ``size_bytes`` is 0 when the file cannot be stat'ed.
    Returns an empty list when *directory* does not exist.
    """
    if not directory.exists():
        return []

    out = []
    for path in sorted(directory.glob("*.txt")):
        doc_id, title = _split_stem(path.stem)
        try:
            size = path.stat().st_size
        except OSError:
            size = 0
        out.append({"doc_id": doc_id, "title": title, "size_bytes": size, "filename": path.name})
    return out