"""삼중 유사도(text + lemma + element) 알고리즘 단위테스트.""" from __future__ import annotations from app.api.schemas import ExtractedElements from app.core.config import Settings from app.engine.corpus import ReferenceDoc from app.engine.detector import _classify from app.engine.extractor import RuleExtractor from app.engine.similarity import ( DualSimilarityIndex, SimilarityHit, TfidfBackend, _element_similarities, _jaccard, ) from app.engine.structural import extract_lemmas def _settings() -> Settings: return Settings( weight_text_sim=0.35, weight_lemma_sim=0.30, weight_char_sim=0.20, weight_motif_sim=0.15, ) def _build_index(docs): ex = RuleExtractor() elements = [ex.extract(d.text) for d in docs] lemmas = [extract_lemmas(d.text) for d in docs] return DualSimilarityIndex(docs, elements, lemmas, _settings(), TfidfBackend(docs)), ex def test_jaccard_basic(): assert _jaccard([], []) == 0.0 assert _jaccard(["a", "b"], ["a", "b"]) == 1.0 assert _jaccard(["a", "b"], ["b", "c"]) == 1 / 3 assert _jaccard(["A"], ["a"]) == 1.0 def test_element_similarities(): q = ExtractedElements(characters=["앤", "마릴라"], motifs=["성장"], keywords=["고아"], genre="소설") c = ExtractedElements(characters=["앤", "다이애나"], motifs=["성장"], keywords=["고아", "친구"], genre="소설") sims = _element_similarities(q, c) assert sims["characters"] == 1 / 3 assert sims["motifs"] == 1.0 assert sims["genre"] == 1.0 def test_dual_index_identical_text(): docs = [ReferenceDoc(doc_id="d1", title="원본", text="어린왕자는 작은 별에서 온 소년이다 여우와 친구가 된다")] idx, ex = _build_index(docs) q_text = docs[0].text hits = idx.query(q_text, ex.extract(q_text), top_k=3) assert hits assert hits[0].text_sim > 0.9 assert hits[0].lemma_sim > 0.9 assert hits[0].score > 0.5 def test_dual_index_ending_change_caught_by_lemma(): """전임자 핵심: 어미만 바꾼 표절은 lemma 점수가 결정적으로 잡아냄.""" original = "홍길동은 활빈당을 만들어 탐관오리의 재물을 빼앗아 가난한 백성에게 나누어 준다" plagiarized = "홍길동이 활빈당을 만들고 탐관오리의 재물을 빼앗으며 가난한 백성에게 나누어 주었다" # 어미만 변경 docs = [ReferenceDoc(doc_id="d1", title="홍길동전", text=original)] idx, ex = _build_index(docs) hits = idx.query(plagiarized, ex.extract(plagiarized), top_k=3) assert hits # lemma 유사도가 매우 높아야 함 — 전임자가 강조한 신호 assert hits[0].lemma_sim >= 0.70, f"어미 변경 표절을 lemma가 못 잡음: {hits[0].lemma_sim}" assert hits[0].score >= 0.40 def test_dual_index_character_swap_detection(): """인물만 치환한 표절도 lemma + 모티프로 점수 확보.""" original = "홍길동은 서자로 태어나 활빈당을 만들어 탐관오리의 재물을 빼앗아 가난한 백성에게 나누어 준다" plagiarized = "김민수는 서자로 태어나 정의단을 만들어 부패한 관리의 재물을 빼앗아 가난한 백성에게 나누어 준다" docs = [ReferenceDoc(doc_id="d1", title="홍길동전", text=original)] idx, ex = _build_index(docs) hits = idx.query(plagiarized, ex.extract(plagiarized), top_k=3) assert hits assert hits[0].score >= 0.30, f"인물 치환 표절을 못 잡음: {hits[0]}" def _mk_hit(text_sim=0.0, lemma_sim=0.0, char=0.0, motif=0.0, score=None): return SimilarityHit( doc_id="d1", title="t", score=score if score is not None else (text_sim + lemma_sim + char + motif) / 4, text_sim=text_sim, lemma_sim=lemma_sim, element_sim={"characters": char, "motifs": motif, "keywords": 0.0, "genre": 0.0}, evidence=[], ) def test_classify_pure_copy_by_text(): assert _classify(_mk_hit(text_sim=0.95, lemma_sim=0.95, char=1.0, motif=1.0, score=0.95)) == "copy" def test_classify_copy_by_lemma_only(): """전임자 시나리오: 어미만 바꾼 복붙. text 점수는 낮을 수 있지만 lemma가 높음.""" assert _classify(_mk_hit(text_sim=0.30, lemma_sim=0.85, char=0.5, score=0.55)) == "copy" def test_classify_plot_borrow(): assert _classify(_mk_hit(text_sim=0.15, lemma_sim=0.45, char=0.05, motif=0.7, score=0.40)) == "plot" def test_classify_character_borrow(): assert _classify(_mk_hit(text_sim=0.10, lemma_sim=0.20, char=0.6, motif=0.1, score=0.30)) == "character"