241 lines
7.6 KiB
Python
241 lines
7.6 KiB
Python
from __future__ import annotations
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, Request, UploadFile, status
|
|
|
|
from app.api.schemas import (
|
|
BatchCreatedResponse,
|
|
BatchRequest,
|
|
BatchStatusResponse,
|
|
CorpusItem,
|
|
CorpusListResponse,
|
|
CorpusUploadRequest,
|
|
CorpusUploadResponse,
|
|
DetectRequest,
|
|
DetectResponse,
|
|
HealthResponse,
|
|
TaxonomyResponse,
|
|
)
|
|
from app.core.config import get_settings
|
|
from app.engine.corpus import add_document, delete_document, list_documents
|
|
from app.engine.detector import PlagiarismDetector
|
|
from app.jobs.store import JobStore
|
|
|
|
# Router for this module; every route registered on it is served under the /v1 prefix.
router = APIRouter(prefix="/v1")
|
|
|
|
|
|
def _detector(request: Request) -> PlagiarismDetector:
    """Return the detector instance stored on the application state.

    Args:
        request: Incoming request; only ``request.app.state.detector`` is read.
    """
    app_state = request.app.state
    return app_state.detector
|
|
|
|
|
|
def _job_store(request: Request) -> JobStore:
    """Return the job store instance stored on the application state.

    Args:
        request: Incoming request; only ``request.app.state.job_store`` is read.
    """
    app_state = request.app.state
    return app_state.job_store
|
|
|
|
|
|
@router.get("/health", response_model=HealthResponse, tags=["meta"])
async def health(request: Request) -> HealthResponse:
    """Report service status together with engine, corpus and taxonomy information."""
    settings = get_settings()
    det: PlagiarismDetector = request.app.state.detector

    # The version string stays None when the detector has no taxonomy loaded.
    if det.taxonomy:
        meta_ver = det.taxonomy.meta_tags_version
        cases_ver = det.taxonomy.cases_version
        taxonomy_version = f"meta_tags_{meta_ver}, cases_{cases_ver}"
    else:
        taxonomy_version = None

    payload = {
        "status": "ok",
        "engine_version": settings.engine_version,
        "corpus_size": det.corpus_size,
        "taxonomy_version": taxonomy_version,
        "autobiography_mode": settings.autobiography_mode,
    }
    return HealthResponse(**payload)
|
|
|
|
|
|
@router.get("/taxonomy", response_model=TaxonomyResponse, tags=["meta"])
async def taxonomy(request: Request) -> TaxonomyResponse:
    """Return the classification taxonomy so that Combooks and Baikal share the same labeling."""
    det: PlagiarismDetector = request.app.state.detector
    tax = det.taxonomy
    if not tax:
        # Nothing to serve until a taxonomy has been loaded into the detector.
        raise HTTPException(status_code=503, detail="Taxonomy not loaded")

    # Flatten each meta tag into a plain dict for the response model.
    meta_tag_rows = []
    for tag in tax.meta_tags:
        meta_tag_rows.append(
            {
                "id": tag.id,
                "label_ko": tag.label_ko,
                "category": tag.category,
                "law_ref": tag.law_ref,
                "scope": tag.scope,
                "description": tag.description,
            }
        )

    # Flatten each case; the tag collections are copied into lists.
    case_rows = []
    for case in tax.cases:
        case_rows.append(
            {
                "case_id": case.case_id,
                "old_no": case.old_no,
                "subgroup": case.subgroup,
                "title": case.title,
                "actor": case.actor,
                "primary_tags": list(case.primary_tags),
                "secondary_tags": list(case.secondary_tags),
                "detectable_internal": case.detectable_internal,
                "high_risk": case.high_risk,
                "note": case.note,
            }
        )

    return TaxonomyResponse(
        meta_tags_version=tax.meta_tags_version,
        cases_version=tax.cases_version,
        meta_tags=meta_tag_rows,
        cases=case_rows,
    )
|
|
|
|
|
|
@router.post("/plagiarism/detect", response_model=DetectResponse, tags=["plagiarism"])
async def detect(req: DetectRequest, request: Request) -> DetectResponse:
    """Run plagiarism detection for a single request and return its result."""
    # NOTE(review): detect_request is called synchronously inside this async
    # handler; if it is CPU-heavy it will stall the event loop — confirm.
    engine = _detector(request)
    return engine.detect_request(req)
|
|
|
|
|
|
@router.post(
    "/plagiarism/batch",
    response_model=BatchCreatedResponse,
    status_code=status.HTTP_202_ACCEPTED,
    tags=["plagiarism"],
)
async def batch_create(
    req: BatchRequest,
    request: Request,
    background_tasks: BackgroundTasks,
) -> BatchCreatedResponse:
    """Register a batch detection job and schedule it as a background task."""
    job_store = _job_store(request)
    engine = _detector(request)

    # The job is created up front so the caller receives an id it can poll.
    new_job = job_store.create(total=len(req.items))
    background_tasks.add_task(_run_batch, job_store, engine, new_job.job_id, req)

    response_fields = {
        "job_id": new_job.job_id,
        "status": new_job.status,
        "total": new_job.total,
        "created_at": new_job.created_at,
    }
    return BatchCreatedResponse(**response_fields)
|
|
|
|
|
|
@router.get("/plagiarism/batch/{job_id}", response_model=BatchStatusResponse, tags=["plagiarism"])
async def batch_status(job_id: str, request: Request) -> BatchStatusResponse:
    """Return the progress of a batch job; results are included only once it has completed."""
    job = _job_store(request).get(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    # Results are withheld for any status other than "completed".
    if job.status == "completed":
        visible_results = job.results
    else:
        visible_results = None

    return BatchStatusResponse(
        job_id=job.job_id,
        status=job.status,
        total=job.total,
        processed=job.processed,
        created_at=job.created_at,
        finished_at=job.finished_at,
        results=visible_results,
        error=job.error,
    )
|
|
|
|
|
|
# ---------- Corpus management ----------
|
|
|
|
def _rebuild(request: Request) -> int:
    """Rebuild the detector for this application via ``app.main.rebuild_detector``.

    Returns:
        The integer returned by ``rebuild_detector``; callers in this module
        report it as the corpus size after the rebuild.
    """
    # Imported at call time rather than module import time
    # (presumably to avoid a circular import with app.main — TODO confirm).
    from app.main import rebuild_detector as rebuild

    application = request.app
    return rebuild(application)
|
|
|
|
|
|
@router.get("/corpus", response_model=CorpusListResponse, tags=["corpus"])
async def corpus_list(request: Request) -> CorpusListResponse:
    """List the documents currently stored in the corpus."""
    corpus_path = get_settings().corpus_path
    records = list_documents(corpus_path)
    total = len(records)

    # Each record is expanded as keyword arguments into a CorpusItem.
    items = []
    for record in records:
        items.append(CorpusItem(**record))

    return CorpusListResponse(total=total, docs=items)
|
|
|
|
|
|
@router.post(
    "/corpus",
    response_model=CorpusUploadResponse,
    status_code=status.HTTP_201_CREATED,
    tags=["corpus"],
)
async def corpus_upload_json(req: CorpusUploadRequest, request: Request) -> CorpusUploadResponse:
    """Upload one autobiography as JSON; the index is rebuilt automatically.

    Responds with 409 when the document already exists and 400 when the
    submitted document is rejected as invalid.
    """
    settings = get_settings()
    try:
        doc = add_document(settings.corpus_path, req.doc_id, req.title, req.text)
    except FileExistsError as exc:
        # Chain the cause explicitly so the original traceback stays attached
        # to the HTTP error instead of appearing as an unrelated second failure.
        raise HTTPException(status_code=409, detail=str(exc)) from exc
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # The detector is rebuilt only after the document was stored successfully.
    new_size = _rebuild(request)
    return CorpusUploadResponse(
        doc_id=doc.doc_id,
        title=doc.title,
        size_bytes=len(doc.text.encode("utf-8")),
        corpus_size_after=new_size,
        rebuilt=True,
    )
|
|
|
|
|
|
@router.post(
    "/corpus/file",
    response_model=CorpusUploadResponse,
    status_code=status.HTTP_201_CREATED,
    tags=["corpus"],
)
async def corpus_upload_file(
    request: Request,
    title: str = Form(..., description="자서전 제목"),
    doc_id: str | None = Form(default=None, description="비우면 자동 생성"),
    file: UploadFile = File(..., description=".txt 파일"),
) -> CorpusUploadResponse:
    """Upload a .txt file via multipart (intended for large autobiography files).

    The file must be UTF-8 text; a leading byte-order mark is stripped.
    Responds with 400 for non-UTF-8 or otherwise rejected content and 409
    when the document already exists.
    """
    settings = get_settings()
    raw = await file.read()
    try:
        # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM, so
        # files saved by BOM-writing editors do not carry U+FEFF into the corpus.
        text = raw.decode("utf-8-sig")
    except UnicodeDecodeError as exc:
        raise HTTPException(
            status_code=400,
            detail="UTF-8 인코딩 텍스트 파일만 업로드 가능합니다.",
        ) from exc

    try:
        doc = add_document(settings.corpus_path, doc_id, title, text)
    except FileExistsError as exc:
        # Chain the cause explicitly so the original traceback stays attached.
        raise HTTPException(status_code=409, detail=str(exc)) from exc
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # The detector is rebuilt only after the document was stored successfully.
    new_size = _rebuild(request)
    return CorpusUploadResponse(
        doc_id=doc.doc_id,
        title=doc.title,
        size_bytes=len(doc.text.encode("utf-8")),
        corpus_size_after=new_size,
        rebuilt=True,
    )
|
|
|
|
|
|
@router.delete("/corpus/{doc_id}", status_code=status.HTTP_204_NO_CONTENT, tags=["corpus"])
async def corpus_delete(doc_id: str, request: Request) -> None:
    """Delete a corpus document and rebuild the index; responds 404 for an unknown doc_id."""
    corpus_path = get_settings().corpus_path
    removed = delete_document(corpus_path, doc_id)
    if not removed:
        raise HTTPException(status_code=404, detail=f"doc_id '{doc_id}' not found")

    # Rebuild only when something was actually deleted.
    _rebuild(request)
|
|
|
|
|
|
def _run_batch(store: JobStore, detector: PlagiarismDetector, job_id: str, req: BatchRequest) -> None:
    """Process every item of a batch request and record the outcome in the job store.

    The job is marked "running", each item is passed to ``detector.detect`` and
    its result appended to the job, and the job is finally marked "completed".
    Any exception marks the job "failed" instead of propagating.

    Args:
        store: Job store receiving status updates and per-item results.
        detector: Detector used for every item in the batch.
        job_id: Identifier of the job being processed.
        req: Batch request providing ``items`` and the shared ``options``.
    """
    import logging  # local import keeps this block self-contained

    store.update(job_id, status="running")
    try:
        for item in req.items:
            result = detector.detect(
                doc_id=item.doc_id,
                text=item.text,
                metadata=item.metadata,
                options=req.options,
            )
            store.append_result(job_id, result)
        store.update(job_id, status="completed", finished_at=datetime.now(timezone.utc))
    except Exception as exc:
        # This is the outermost frame of the background task, so nothing else
        # will report the failure: log the traceback before recording it.
        logging.getLogger(__name__).exception("Batch job %s failed", job_id)
        store.update(
            job_id,
            status="failed",
            finished_at=datetime.now(timezone.utc),
            # Some exceptions carry no message; fall back to the class name so
            # a failed job never ends up with an empty error string.
            error=str(exc) or type(exc).__name__,
        )
|