67 lines
2.1 KiB
Python
67 lines
2.1 KiB
Python
from functools import lru_cache
|
|
from pathlib import Path
|
|
|
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
|
|
|
|
class Settings(BaseSettings):
    # Runtime configuration for the service. Values are loaded through
    # pydantic-settings; the defaults below apply when nothing overrides them.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # --- Server binding ---
    host: str = "0.0.0.0"
    port: int = 8000
    log_level: str = "info"  # debug / info / warning / error
    reload: bool = False  # auto-restart, intended for development

    # --- Engine identity and data locations ---
    engine_version: str = "o2o-plagiarism-2.0.0-pdf-v1.2"
    reference_corpus_dir: str = "./data/reference"
    taxonomy_dir: str = "./data/taxonomy"
    autobiography_patterns_path: str = "./data/autobiography/common_patterns.txt"

    # --- PDF VII-4 recommendation: conservative threshold, precision first ---
    similarity_threshold: float = 0.85

    # --- KoSimCSE / KoSBERT (recommended in PDF VII-3): open-source Korean embeddings ---
    use_kosimcse: bool = True
    kosimcse_model: str = "BM-K/KoSimCSE-roberta-multitask"
    kosimcse_max_length: int = 512

    # --- OpenAI (optional: fallback when no in-house model is available) ---
    openai_api_key: str = ""
    openai_extraction_model: str = "gpt-4o-mini"
    openai_embedding_model: str = "text-embedding-3-small"
    use_llm_extractor: bool = False
    use_embedding_similarity: bool = False

    # --- Similarity weights (noted as measurement-based) ---
    # NOTE(review): the original comment called these "triple" similarity
    # weights, yet four weights are declared here -- confirm which is intended.
    weight_text_sim: float = 0.30
    weight_lemma_sim: float = 0.45
    weight_char_sim: float = 0.15
    weight_motif_sim: float = 0.10

    # --- PDF VII-3 cascading ---
    use_lsh_filter: bool = True
    lsh_threshold: float = 0.3  # first-stage filter is kept loose (recall first)
    lsh_top_k: int = 50

    # --- PDF VII-4 autobiography mode ---
    autobiography_mode: bool = True
    enable_entity_masking: bool = True

    @property
    def corpus_path(self) -> Path:
        """Return ``reference_corpus_dir`` as a resolved ``Path``."""
        corpus_dir = Path(self.reference_corpus_dir)
        return corpus_dir.resolve()

    @property
    def taxonomy_path(self) -> Path:
        """Return ``taxonomy_dir`` as a resolved ``Path``."""
        taxonomy_root = Path(self.taxonomy_dir)
        return taxonomy_root.resolve()

    @property
    def has_openai(self) -> bool:
        """Return True when ``openai_api_key`` is non-empty after stripping whitespace."""
        stripped_key = self.openai_api_key.strip()
        return len(stripped_key) > 0
|
|
|
|
|
|
@lru_cache
def get_settings() -> Settings:
    """Return the shared ``Settings`` instance.

    The first call constructs ``Settings()``; because the function takes no
    arguments and is wrapped in ``lru_cache``, later calls get that same
    object back instead of building a new one.
    """
    settings = Settings()
    return settings
|