44 lines
1.2 KiB
Python
44 lines
1.2 KiB
Python
from __future__ import annotations
|
|
|
|
import csv
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Dict, List
|
|
|
|
from eval_ppt2html.models import SamplePair
|
|
|
|
|
|
def load_samples_from_csv(csv_path: Path) -> List[Dict[str, str]]:
    """Read the sample list (id, pptx_path, html_path) from a CSV file.

    The first row of the file is treated as the header; each following row
    becomes one dict keyed by the header names.

    Example file::

        id,pptx_path,html_path
        sample_001,data/raw/pptx/input.pptx,data/raw/html/output_001.html

    Args:
        csv_path: Location of the CSV file to read.

    Returns:
        One dict per data row, in file order. A file that contains only the
        header row yields an empty list.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but also strips a leading
    # byte-order mark (e.g. from a spreadsheet "CSV UTF-8" export). With plain
    # "utf-8" the BOM would be glued onto the first header name, producing the
    # key "\ufeffid" instead of "id".
    # newline="" lets the csv module do its own line-ending handling.
    with csv_path.open(newline="", encoding="utf-8-sig") as f:
        return list(csv.DictReader(f))
|
|
|
|
|
|
def save_sample_to_json(sample: SamplePair, work_dir: Path) -> Path:
    """Write a SamplePair to disk as JSON and return the written file's path.

    Output location: ``<work_dir>/processed/json/<sample_id>.json``. The
    output directory is created (including parents) when it is missing.

    Args:
        sample: Object to persist; its ``sample_id`` names the file and its
            ``to_dict()`` result is what gets serialized.
        work_dir: Root working directory under which the file is stored.

    Returns:
        Path of the JSON file that was written.
    """
    out_dir = work_dir.joinpath("processed", "json")
    out_dir.mkdir(parents=True, exist_ok=True)

    destination = out_dir.joinpath(f"{sample.sample_id}.json")
    with destination.open("w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(sample.to_dict(), handle, ensure_ascii=False, indent=2)

    return destination
|
|
|
|
|
|
def load_sample_json(json_path: Path) -> Dict:
    """Load a preprocessed SamplePair JSON file and return its parsed content.

    Args:
        json_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The decoded JSON document.
    """
    raw_text = json_path.read_text(encoding="utf-8")
    return json.loads(raw_text)
|