# ppt-eval-benchmark/src/eval_ppt2html/eval_units/builder.py
#
# 59 lines
# 2.1 KiB
# Python

from __future__ import annotations
from collections import defaultdict
from eval_ppt2html.models import EvalUnit, SamplePair
from eval_ppt2html.utils.logging import get_logger
logger = get_logger(__name__)
def build_eval_units(sample: SamplePair) -> None:
    """Create one EvalUnit per page index and append it to ``sample.eval_units``.

    Simple rule: the page with the same index on the input side and on the
    output side is matched 1:1 (``input_page_index == output_page_index``).
    If a more elaborate matching strategy is ever needed, only this function
    has to be replaced.

    The page indices are the union of the ``page_index`` values found on the
    text elements (``T_i``, ``T_o``) and layout elements (``L_i``, ``L_o``),
    processed in ascending order. Snapshots (``S_i``, ``S_o``) are looked up
    per page but do not contribute page indices of their own, so a page that
    only has a snapshot produces no unit.

    Args:
        sample: The sample pair whose elements are grouped. It is mutated in
            place: every new unit is appended to ``sample.eval_units``.

    Returns:
        None.
    """

    def ids_by_page(elements) -> dict[int, list[str]]:
        """Group the ``id`` of each element under its ``page_index``."""
        grouped: dict[int, list[str]] = defaultdict(list)
        for element in elements:
            grouped[element.page_index].append(element.id)
        return grouped

    input_text = ids_by_page(sample.T_i)
    input_layout = ids_by_page(sample.L_i)
    output_text = ids_by_page(sample.T_o)
    output_layout = ids_by_page(sample.L_o)

    # When several snapshots share a page index, the last one wins.
    input_snapshots = {snap.page_index: snap.file_path for snap in sample.S_i}
    output_snapshots = {snap.page_index: snap.file_path for snap in sample.S_o}

    # Iterating a dict yields its keys, so union() collects every page index
    # seen in any of the four groupings.
    page_indices = sorted(
        set().union(input_text, input_layout, output_text, output_layout)
    )
    logger.info("Building eval units for pages: %s", page_indices)

    for page in page_indices:
        # .get() is used instead of indexing so that looking up a page that is
        # absent on one side does not insert an empty entry into the grouping.
        sample.eval_units.append(
            EvalUnit(
                unit_id=f"{sample.sample_id}_page{page}",
                input_page_index=page,
                output_page_index=page,
                input_text_ids=input_text.get(page, []),
                output_text_ids=output_text.get(page, []),
                input_layout_ids=input_layout.get(page, []),
                output_layout_ids=output_layout.get(page, []),
                input_snapshot=input_snapshots.get(page),
                output_snapshot=output_snapshots.get(page),
            )
        )