o2o-plagiarism-ai/app/core/config.py

68 lines
2.1 KiB
Python

from functools import lru_cache
from pathlib import Path
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application settings for the plagiarism engine.

    Each field below carries a default value. The model configuration
    names a ``.env`` file (UTF-8) as a settings source and tells the
    model to ignore keys that do not correspond to a declared field.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # --- Server binding ---
    host: str = "0.0.0.0"
    port: int = 8000
    log_level: str = "info"  # debug / info / warning / error
    reload: bool = False  # auto-restart, intended for development
    root_path: str = ""  # reverse-proxy sub-path (e.g. /plagiarism)

    # --- Engine identity and data locations ---
    engine_version: str = "o2o-plagiarism-2.0.0-pdf-v1.2"
    reference_corpus_dir: str = "./data/reference"
    taxonomy_dir: str = "./data/taxonomy"
    autobiography_patterns_path: str = "./data/autobiography/common_patterns.txt"

    # PDF VII-4 recommendation: conservative, precision-first threshold.
    similarity_threshold: float = 0.85

    # KoSimCSE / KoSBERT (recommended by PDF VII-3): Korean open-source
    # embeddings.
    use_kosimcse: bool = True
    kosimcse_model: str = "BM-K/KoSimCSE-roberta-multitask"
    kosimcse_max_length: int = 512

    # OpenAI (optional: fallback when no in-house model is available).
    openai_api_key: str = ""
    openai_extraction_model: str = "gpt-4o-mini"
    openai_embedding_model: str = "text-embedding-3-small"
    use_llm_extractor: bool = False
    use_embedding_similarity: bool = False

    # Similarity weights (based on measurements). The defaults sum to 1.0.
    # NOTE(review): the original comment called this a "triple" similarity,
    # but four weights are declared here; confirm which is intended.
    weight_text_sim: float = 0.30
    weight_lemma_sim: float = 0.45
    weight_char_sim: float = 0.15
    weight_motif_sim: float = 0.10

    # PDF VII-3 cascading.
    use_lsh_filter: bool = True
    lsh_threshold: float = 0.3  # first-stage filter kept loose (recall first)
    lsh_top_k: int = 50

    # PDF VII-4 autobiography mode.
    autobiography_mode: bool = True
    enable_entity_masking: bool = True

    @property
    def corpus_path(self) -> Path:
        """Return ``reference_corpus_dir`` as a resolved ``Path``."""
        configured_dir = Path(self.reference_corpus_dir)
        return configured_dir.resolve()

    @property
    def taxonomy_path(self) -> Path:
        """Return ``taxonomy_dir`` as a resolved ``Path``."""
        configured_dir = Path(self.taxonomy_dir)
        return configured_dir.resolve()

    @property
    def has_openai(self) -> bool:
        """Return True when ``openai_api_key`` is non-empty after stripping whitespace."""
        trimmed_key = self.openai_api_key.strip()
        return trimmed_key != ""
@lru_cache
def get_settings() -> Settings:
    """Return the memoized ``Settings`` instance.

    The function takes no arguments and is wrapped in ``lru_cache``, so
    ``Settings()`` is constructed on the first call and the same object
    is returned on later calls until the cache is cleared.
    """
    settings = Settings()
    return settings