# o2o-plagiarism-ai/tests/test_dual_similarity.py
#
# 113 lines
# 4.5 KiB
# Python

"""삼중 유사도(text + lemma + element) 알고리즘 단위테스트."""
from __future__ import annotations

import math

from app.api.schemas import ExtractedElements
from app.core.config import Settings
from app.engine.corpus import ReferenceDoc
from app.engine.detector import _classify
from app.engine.extractor import RuleExtractor
from app.engine.similarity import (
    DualSimilarityIndex,
    SimilarityHit,
    TfidfBackend,
    _element_similarities,
    _jaccard,
)
from app.engine.structural import extract_lemmas
def _settings() -> Settings:
    """Return the ``Settings`` used by every index built in this module.

    Only the four similarity weights are supplied (text, lemma, character,
    motif); they sum to 1.0. Any other field keeps whatever default
    ``Settings`` defines.
    """
    weights = {
        "weight_text_sim": 0.35,
        "weight_lemma_sim": 0.30,
        "weight_char_sim": 0.20,
        "weight_motif_sim": 0.15,
    }
    return Settings(**weights)
def _build_index(docs):
    """Build a ``DualSimilarityIndex`` over *docs* for use in a test.

    Each document's text is passed through a fresh ``RuleExtractor`` and then
    through ``extract_lemmas``. The index is constructed from the documents,
    those per-document results, the module's test settings and a
    ``TfidfBackend`` built from the same documents.

    Returns:
        An ``(index, extractor)`` tuple, so the caller can extract query
        elements with the same extractor instance.
    """
    extractor = RuleExtractor()

    # Two separate passes, in this order, mirror how the inputs are prepared:
    # all element extractions first, then all lemma extractions.
    doc_elements = []
    for doc in docs:
        doc_elements.append(extractor.extract(doc.text))

    doc_lemmas = []
    for doc in docs:
        doc_lemmas.append(extract_lemmas(doc.text))

    settings = _settings()
    backend = TfidfBackend(docs)
    index = DualSimilarityIndex(docs, doc_elements, doc_lemmas, settings, backend)
    return index, extractor
def test_jaccard_basic():
    """Check ``_jaccard`` on empty, identical, overlapping and case-differing inputs."""
    # Two empty inputs are expected to score 0.0.
    assert _jaccard([], []) == 0.0
    # Identical inputs are expected to score exactly 1.0.
    assert _jaccard(["a", "b"], ["a", "b"]) == 1.0
    # One shared item out of three distinct ones. Compare with a tolerance so
    # the test does not hinge on the exact float arithmetic used internally.
    assert math.isclose(_jaccard(["a", "b"], ["b", "c"]), 1 / 3)
    # Items differing only by letter case are expected to match fully.
    assert _jaccard(["A"], ["a"]) == 1.0
def test_element_similarities():
    """Check the per-element scores for two partially overlapping element sets."""
    query_elements = ExtractedElements(characters=["", "마릴라"], motifs=["성장"], keywords=["고아"], genre="소설")
    candidate_elements = ExtractedElements(characters=["", "다이애나"], motifs=["성장"], keywords=["고아", "친구"], genre="소설")
    sims = _element_similarities(query_elements, candidate_elements)
    # The character lists share one entry out of three distinct ones, hence
    # the expected 1/3. Compare with a tolerance instead of exact float
    # equality so the test does not depend on how the ratio is computed.
    assert math.isclose(sims["characters"], 1 / 3)
    # Identical motif lists and identical genres are expected to score 1.0.
    assert sims["motifs"] == 1.0
    assert sims["genre"] == 1.0
def test_dual_index_identical_text():
    """Querying with the exact reference text must give a near-perfect top hit."""
    reference = ReferenceDoc(doc_id="d1", title="원본", text="어린왕자는 작은 별에서 온 소년이다 여우와 친구가 된다")
    index, extractor = _build_index([reference])

    query_text = reference.text
    query_elements = extractor.extract(query_text)
    results = index.query(query_text, query_elements, top_k=3)

    # At least one hit must come back before the top one is inspected.
    assert results
    assert results[0].text_sim > 0.9
    assert results[0].lemma_sim > 0.9
    assert results[0].score > 0.5
def test_dual_index_ending_change_caught_by_lemma():
    """Predecessor's key point: plagiarism that only changes word endings is decisively caught by the lemma score."""
    original = "홍길동은 활빈당을 만들어 탐관오리의 재물을 빼앗아 가난한 백성에게 나누어 준다"
    # Same sentence with only the endings changed.
    plagiarized = "홍길동이 활빈당을 만들고 탐관오리의 재물을 빼앗으며 가난한 백성에게 나누어 주었다"

    reference = ReferenceDoc(doc_id="d1", title="홍길동전", text=original)
    index, extractor = _build_index([reference])
    query_elements = extractor.extract(plagiarized)
    hits = index.query(plagiarized, query_elements, top_k=3)

    assert hits
    # Lemma similarity must be very high: the signal the predecessor emphasized.
    assert hits[0].lemma_sim >= 0.70, f"어미 변경 표절을 lemma가 못 잡음: {hits[0].lemma_sim}"
    assert hits[0].score >= 0.40
def test_dual_index_character_swap_detection():
    """Plagiarism that only swaps the characters must still score via lemma + motif."""
    original = "홍길동은 서자로 태어나 활빈당을 만들어 탐관오리의 재물을 빼앗아 가난한 백성에게 나누어 준다"
    plagiarized = "김민수는 서자로 태어나 정의단을 만들어 부패한 관리의 재물을 빼앗아 가난한 백성에게 나누어 준다"

    reference = ReferenceDoc(doc_id="d1", title="홍길동전", text=original)
    index, extractor = _build_index([reference])
    query_elements = extractor.extract(plagiarized)
    hits = index.query(plagiarized, query_elements, top_k=3)

    assert hits
    # Only the combined score of the top hit is checked here.
    assert hits[0].score >= 0.30, f"인물 치환 표절을 못 잡음: {hits[0]}"
def _mk_hit(text_sim=0.0, lemma_sim=0.0, char=0.0, motif=0.0, score=None):
    """Build a ``SimilarityHit`` fixture for the ``_classify`` tests.

    ``char`` and ``motif`` fill the "characters" and "motifs" entries of
    ``element_sim``; "keywords" and "genre" are always 0.0. When ``score`` is
    None it defaults to the plain mean of the four similarity arguments.
    The hit always has ``doc_id="d1"``, ``title="t"`` and no evidence.
    """
    if score is None:
        score = (text_sim + lemma_sim + char + motif) / 4

    per_element = {"characters": char, "motifs": motif, "keywords": 0.0, "genre": 0.0}
    return SimilarityHit(
        doc_id="d1",
        title="t",
        score=score,
        text_sim=text_sim,
        lemma_sim=lemma_sim,
        element_sim=per_element,
        evidence=[],
    )
def test_classify_pure_copy_by_text():
    """A hit that is high on every similarity signal must classify as "copy"."""
    hit = _mk_hit(text_sim=0.95, lemma_sim=0.95, char=1.0, motif=1.0, score=0.95)
    label = _classify(hit)
    assert label == "copy"
def test_classify_copy_by_lemma_only():
    """Predecessor scenario: copy-paste with only the endings changed; the text score may be low but the lemma score is high."""
    hit = _mk_hit(text_sim=0.30, lemma_sim=0.85, char=0.5, score=0.55)
    label = _classify(hit)
    assert label == "copy"
def test_classify_plot_borrow():
    """Low text/character similarity with a high motif score must classify as "plot"."""
    hit = _mk_hit(text_sim=0.15, lemma_sim=0.45, char=0.05, motif=0.7, score=0.40)
    label = _classify(hit)
    assert label == "plot"
def test_classify_character_borrow():
    """Low text/lemma/motif similarity with a high character score must classify as "character"."""
    hit = _mk_hit(text_sim=0.10, lemma_sim=0.20, char=0.6, motif=0.1, score=0.30)
    label = _classify(hit)
    assert label == "character"