115 lines
3.4 KiB
Python
115 lines
3.4 KiB
Python
"""Reference corpus loader and management (CRUD).

Storage format: data/reference/<doc_id>__<title>.txt
When Combooks uploads an autobiography, a file is created in this directory,
which triggers a detector rebuild.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
import uuid
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass(frozen=True)
class ReferenceDoc:
    """Immutable record describing one document of the reference corpus."""

    # Document identifier.
    doc_id: str
    # Display title of the document.
    title: str
    # Body text of the document.
    text: str
|
|
|
|
|
|
def load_corpus(directory: Path) -> list[ReferenceDoc]:
    """Load every non-empty UTF-8 ``*.txt`` document found directly in *directory*.

    File names follow ``<doc_id>__<title>.txt`` (split on the first ``__``);
    a stem without ``__`` is used as both the id and the title.

    Args:
        directory: Folder holding the reference corpus.

    Returns:
        The documents in sorted path order. The list is empty when the
        directory does not exist. Entries that are not regular files, files
        that are not valid UTF-8, and files that are blank after stripping
        are skipped.
    """
    if not directory.exists():
        logger.warning("Reference corpus dir %s does not exist; running with empty corpus", directory)
        return []

    docs: list[ReferenceDoc] = []
    for path in sorted(directory.glob("*.txt")):
        # A directory (or dangling symlink) named "*.txt" is not a document;
        # reading it would raise OSError and abort the whole load.
        if not path.is_file():
            logger.warning("Skipping non-file corpus entry: %s", path)
            continue

        try:
            text = path.read_text(encoding="utf-8").strip()
        except UnicodeDecodeError:
            logger.warning("Skipping non-utf8 file: %s", path)
            continue
        if not text:
            continue

        # Split on the first "__" only, so titles may themselves contain "__".
        doc_id, sep, title = path.stem.partition("__")
        if not sep:
            title = doc_id
        docs.append(ReferenceDoc(doc_id=doc_id, title=title, text=text))

    logger.info("Loaded %d reference docs from %s", len(docs), directory)
    return docs
|
|
|
|
|
|
# ---------- CRUD ----------
|
|
|
|
_INVALID_FN = re.compile(r"[^\w가-힣\-_.]")
|
|
|
|
|
|
def _safe_filename(s: str) -> str:
|
|
"""파일명 안전화: 한글/영숫자/일부 기호만 허용."""
|
|
return _INVALID_FN.sub("_", s).strip("._")
|
|
|
|
|
|
def _path_for(directory: Path, doc_id: str, title: str) -> Path:
    """Build ``<directory>/<doc_id>__<title>.txt`` from sanitized id and title."""
    id_part = _safe_filename(doc_id)
    title_part = _safe_filename(title)
    return directory / f"{id_part}__{title_part}.txt"
|
|
|
|
|
|
def add_document(directory: Path, doc_id: str | None, title: str, text: str) -> ReferenceDoc:
    """Store a new reference document as ``<doc_id>__<title>.txt`` in *directory*.

    Args:
        directory: Corpus folder; created (including parents) when missing.
        doc_id: Desired identifier. ``None`` or a blank string selects a
            generated id of the form ``corpus-<8 hex chars>``.
        title: Document title. ``None`` or a blank string falls back to the
            document id.
        text: Document body; surrounding whitespace is stripped before saving.

    Returns:
        A ``ReferenceDoc`` carrying the stripped (not file-name-sanitized)
        id and title together with the stripped text.

    Raises:
        ValueError: If the text is empty after stripping, or if the id has no
            characters left after file-name sanitizing.
        FileExistsError: If a file for the same sanitized id already exists.
    """
    # Validate inputs before touching the filesystem.
    text = (text or "").strip()
    if not text:
        raise ValueError("text is empty")

    if not doc_id or not doc_id.strip():
        doc_id = f"corpus-{uuid.uuid4().hex[:8]}"
    doc_id = doc_id.strip()
    # A whitespace-only title would yield "<doc_id>__.txt"; treat it as missing.
    title = (title or "").strip() or doc_id

    safe_id = _safe_filename(doc_id)
    if not safe_id:
        # An empty prefix would turn the duplicate pattern into "__*.txt",
        # which matches files unrelated to this id.
        raise ValueError(f"doc_id '{doc_id}' has no characters usable in a file name")

    directory.mkdir(parents=True, exist_ok=True)

    # Duplicate check: a separator-less "<id>.txt" is listed under the same
    # doc_id by load_corpus/list_documents, so it counts as a duplicate too.
    existing = next(directory.glob(f"{safe_id}__*.txt"), None)
    if existing is None:
        bare = directory / f"{safe_id}.txt"
        if bare.exists():
            existing = bare
    if existing is not None:
        raise FileExistsError(f"doc_id '{doc_id}' already exists at {existing}")

    target = _path_for(directory, doc_id, title)
    target.write_text(text, encoding="utf-8")
    logger.info("Added corpus doc: %s (%s) → %s", doc_id, title, target.name)
    return ReferenceDoc(doc_id=doc_id, title=title, text=text)
|
|
|
|
|
|
def delete_document(directory: Path, doc_id: str) -> bool:
    """Delete the file(s) in *directory* that store the document *doc_id*.

    Both ``<doc_id>__*.txt`` and the separator-less ``<doc_id>.txt`` are
    considered, because load_corpus/list_documents report the latter with
    ``doc_id`` equal to the file stem.

    Args:
        directory: Corpus folder.
        doc_id: Identifier of the document to remove; it is sanitized the
            same way as when file names are built.

    Returns:
        True if at least one file was removed, otherwise False.
    """
    safe = _safe_filename(doc_id)
    if not safe:
        # An empty prefix would turn the pattern into "__*.txt" and delete
        # files that do not belong to the requested id.
        return False

    # Materialize the matches first so the directory is not modified while
    # it is still being scanned.
    candidates = [*directory.glob(f"{safe}__*.txt"), directory / f"{safe}.txt"]

    deleted = False
    for path in candidates:
        # Skip missing entries and directories; unlink() only handles files.
        if not path.is_file():
            continue
        path.unlink()
        deleted = True
        logger.info("Deleted corpus doc: %s", path.name)
    return deleted
|
|
|
|
|
|
def list_documents(directory: Path) -> list[dict]:
    """Return lightweight metadata for each ``*.txt`` entry (bodies are not read).

    Each item has the keys ``doc_id``, ``title``, ``size_bytes`` and
    ``filename``. The id and title come from splitting the file stem on the
    first ``__``; a stem without ``__`` serves as both. ``size_bytes`` is 0
    when the entry cannot be stat'ed. A missing directory gives an empty list.
    """
    if not directory.exists():
        return []

    def _describe(entry: Path) -> dict:
        head, sep, tail = entry.stem.partition("__")
        if sep:
            doc_id, title = head, tail
        else:
            doc_id = title = entry.stem
        try:
            size = entry.stat().st_size
        except OSError:
            size = 0
        return {"doc_id": doc_id, "title": title, "size_bytes": size, "filename": entry.name}

    return [_describe(entry) for entry in sorted(directory.glob("*.txt"))]
|