# ppt-eval-benchmark/src/eval_ppt2html/utils/io.py
#
# 44 lines
# 1.2 KiB
# Python

from __future__ import annotations
import csv
import json
from pathlib import Path
from typing import Dict, List
from eval_ppt2html.models import SamplePair
def load_samples_from_csv(csv_path: Path) -> List[Dict[str, str]]:
    """Read the list of samples (id, pptx_path, html_path) from a CSV file.

    The first row of the file is used as the header; every following row
    becomes one dict keyed by the header names. Example file::

        id,pptx_path,html_path
        sample_001,data/raw/pptx/input.pptx,data/raw/html/output_001.html

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        One dict per data row, in file order. The list is empty when the
        file contains no data rows.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" strips a leading UTF-8 byte-order mark when one is present
    # and otherwise behaves like plain UTF-8. Without it, a BOM-prefixed file
    # would produce "\ufeffid" as its first column name instead of "id".
    # newline="" lets the csv module handle line endings itself, including
    # newlines embedded in quoted fields.
    with csv_path.open(newline="", encoding="utf-8-sig") as f:
        return list(csv.DictReader(f))
def save_sample_to_json(sample: SamplePair, work_dir: Path) -> Path:
    """Save a SamplePair object as a JSON file.

    The file is written to ``<work_dir>/processed/json/<sample_id>.json``;
    missing parent directories are created. An existing file with the same
    name is overwritten.

    Args:
        sample: Object to store. Its ``sample_id`` attribute names the file
            and the result of its ``to_dict()`` method is what gets
            serialized.
        work_dir: Base working directory under which ``processed/json`` is
            created.

    Returns:
        Path of the JSON file that was written.

    Raises:
        TypeError: If ``sample.to_dict()`` contains a value that the
            ``json`` module cannot serialize.
        OSError: If the directory or file cannot be created or written.
    """
    json_dir = work_dir / "processed" / "json"
    json_dir.mkdir(parents=True, exist_ok=True)
    json_path = json_dir / f"{sample.sample_id}.json"

    # Serialize before opening the target: opening with "w" truncates the
    # file immediately, so a serialization error raised mid-stream would
    # otherwise leave a half-written file and destroy any previous output.
    # ensure_ascii=False keeps non-ASCII text readable in the file.
    payload = json.dumps(sample.to_dict(), ensure_ascii=False, indent=2)

    with json_path.open("w", encoding="utf-8") as f:
        f.write(payload)
    return json_path
def load_sample_json(json_path: Path) -> Dict:
    """Load a preprocessed SamplePair JSON file and return its parsed content.

    Args:
        json_path: Path of the JSON file to read; it is decoded as UTF-8.

    Returns:
        The value parsed from the file. This is a dict when the file holds
        a JSON object at the top level; no check is made here.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with json_path.open(encoding="utf-8") as f:
        return json.load(f)