59 lines
2.1 KiB
Python
59 lines
2.1 KiB
Python
from __future__ import annotations
|
|
|
|
from collections import defaultdict
|
|
|
|
from eval_ppt2html.models import EvalUnit, SamplePair
|
|
from eval_ppt2html.utils.logging import get_logger
|
|
|
|
# Module-level logger, obtained from the project logging helper under this module's name.
logger = get_logger(__name__)
|
|
|
|
|
|
def build_eval_units(sample: SamplePair) -> None:
    """Create one evaluation unit per page and append it to ``sample.eval_units``.

    Simple rule:
    - Pages are matched 1:1 by index: every unit uses the same page index for
      its input side and its output side.
    - If more complex matching logic becomes necessary, only this function
      needs to be replaced.

    A unit is produced for every page index that appears on at least one text
    or layout element, on either the input or the output side. Snapshots alone
    never introduce a page. A page without a snapshot gets ``None`` for that
    field, and when several snapshots share a page index the last one listed
    is used.

    Args:
        sample: The sample whose ``T_i``/``L_i``/``S_i`` (input) and
            ``T_o``/``L_o``/``S_o`` (output) collections are read. Its
            ``eval_units`` list is extended in place.

    Returns:
        None. The result is the mutation of ``sample.eval_units``.
    """

    def ids_by_page(elements) -> dict[int, list[str]]:
        # Group element ids under the page index each element reports.
        grouped: dict[int, list[str]] = defaultdict(list)
        for element in elements:
            grouped[element.page_index].append(element.id)
        return grouped

    # Same traversal order as the collections are declared: input text,
    # input layout, output text, output layout.
    input_text_ids = ids_by_page(sample.T_i)
    input_layout_ids = ids_by_page(sample.L_i)
    output_text_ids = ids_by_page(sample.T_o)
    output_layout_ids = ids_by_page(sample.L_o)

    input_snapshots = {snap.page_index: snap.file_path for snap in sample.S_i}
    output_snapshots = {snap.page_index: snap.file_path for snap in sample.S_o}

    # Only text/layout elements decide which pages exist; snapshots are looked
    # up afterwards and may be missing.
    pages = sorted(
        set().union(
            input_text_ids,
            input_layout_ids,
            output_text_ids,
            output_layout_ids,
        )
    )

    logger.info("Building eval units for pages: %s", pages)

    for page in pages:
        # .get() is used instead of indexing so that looking up an absent page
        # does not insert an empty entry into the defaultdicts.
        unit = EvalUnit(
            unit_id=f"{sample.sample_id}_page{page}",
            input_page_index=page,
            output_page_index=page,
            input_text_ids=input_text_ids.get(page, []),
            output_text_ids=output_text_ids.get(page, []),
            input_layout_ids=input_layout_ids.get(page, []),
            output_layout_ids=output_layout_ids.get(page, []),
            input_snapshot=input_snapshots.get(page),
            output_snapshot=output_snapshots.get(page),
        )
        sample.eval_units.append(unit)
|