from __future__ import annotations

from collections import defaultdict

from eval_ppt2html.models import EvalUnit, SamplePair
from eval_ppt2html.utils.logging import get_logger

logger = get_logger(__name__)


def build_eval_units(sample: SamplePair) -> None:
    """Create one evaluation unit per page and append it to ``sample.eval_units``.

    Simple rule: the input page and the output page that share the same page
    index are paired 1:1. If a more elaborate matching strategy is ever
    needed, only this function has to be replaced.

    The set of pages is the union of every ``page_index`` seen on the input
    text elements (``sample.T_i``), input layout elements (``sample.L_i``),
    output text elements (``sample.T_o``) and output layout elements
    (``sample.L_o``). Snapshots (``sample.S_i`` / ``sample.S_o``) are only
    looked up for those pages; a page that appears solely in the snapshots
    does not produce a unit.

    Args:
        sample: The sample whose elements are grouped. It is mutated in
            place: new units are appended to ``sample.eval_units`` in
            ascending page order. Existing entries are left untouched, so
            calling this twice on the same sample appends the units twice.

    Returns:
        None.
    """

    def ids_by_page(elements) -> dict[int, list[str]]:
        """Group element ids under the page index each element reports."""
        grouped: dict[int, list[str]] = defaultdict(list)
        for element in elements:
            grouped[element.page_index].append(element.id)
        return grouped

    input_text = ids_by_page(sample.T_i)
    input_layout = ids_by_page(sample.L_i)
    output_text = ids_by_page(sample.T_o)
    output_layout = ids_by_page(sample.L_o)

    # If several snapshots share a page index, the last one listed wins.
    input_snapshots = {snap.page_index: snap.file_path for snap in sample.S_i}
    output_snapshots = {snap.page_index: snap.file_path for snap in sample.S_o}

    # Only text/layout elements decide which pages exist; snapshots do not.
    pages = sorted({*input_text, *input_layout, *output_text, *output_layout})
    logger.info("Building eval units for pages: %s", pages)

    for page_index in pages:
        # .get() (rather than indexing) avoids inserting empty entries into
        # the defaultdicts and hands each unit its own fresh empty list.
        unit = EvalUnit(
            unit_id=f"{sample.sample_id}_page{page_index}",
            input_page_index=page_index,
            output_page_index=page_index,
            input_text_ids=input_text.get(page_index, []),
            output_text_ids=output_text.get(page_index, []),
            input_layout_ids=input_layout.get(page_index, []),
            output_layout_ids=output_layout.get(page_index, []),
            input_snapshot=input_snapshots.get(page_index),
            output_snapshot=output_snapshots.get(page_index),
        )
        sample.eval_units.append(unit)