from __future__ import annotations

import csv
import json
from pathlib import Path
from typing import Dict, List

from eval_ppt2html.models import SamplePair


def load_samples_from_csv(csv_path: Path) -> List[Dict[str, str]]:
    """Read the list of samples from a CSV file.

    The first row is treated as the header; each following row becomes a
    dict keyed by the header names. The expected layout is::

        id,pptx_path,html_path
        sample_001,data/raw/pptx/input.pptx,data/raw/html/output_001.html

    Column names are not validated here; callers get whatever the header
    row declares.

    Args:
        csv_path: Path to the CSV file to read.

    Returns:
        One dict per data row, in file order (empty list for an empty file).
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but also strips a leading
    # byte-order mark (written e.g. by spreadsheet "CSV UTF-8" exports).
    # With plain "utf-8" the BOM would be glued onto the first header name,
    # turning the "id" key into "\ufeffid".
    # newline="" is what the csv module requires so that it can handle
    # line endings embedded in quoted fields itself.
    with csv_path.open(newline="", encoding="utf-8-sig") as f:
        return list(csv.DictReader(f))


def save_sample_to_json(sample: SamplePair, work_dir: Path) -> Path:
    """Serialize a SamplePair to a JSON file.

    The file is written to ``<work_dir>/processed/json/<sample_id>.json``;
    missing parent directories are created, and an existing file with the
    same name is overwritten.

    Args:
        sample: Object to store. Its ``sample_id`` attribute names the file
            and the result of its ``to_dict()`` method is what gets dumped
            (presumably JSON-serializable; verify against SamplePair).
        work_dir: Base working directory under which ``processed/json`` lives.

    Returns:
        Path of the JSON file that was written.
    """
    json_dir = work_dir / "processed" / "json"
    json_dir.mkdir(parents=True, exist_ok=True)

    json_path = json_dir / f"{sample.sample_id}.json"
    with json_path.open("w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output
        # instead of escaping it as \uXXXX sequences.
        json.dump(sample.to_dict(), f, ensure_ascii=False, indent=2)
    return json_path


def load_sample_json(json_path: Path) -> Dict:
    """Load a preprocessed SamplePair JSON file.

    Args:
        json_path: Path to a UTF-8 encoded JSON file.

    Returns:
        The parsed JSON content as returned by ``json.load`` (annotated as
        a dict; the top-level type is not checked here).
    """
    with json_path.open(encoding="utf-8") as f:
        return json.load(f)