o2o-infinith-demo/scripts/update-csv-gangnamunni.py

172 lines
6.5 KiB
Python

#!/usr/bin/env python3
"""
update-csv-gangnamunni.py
강남언니 스크래핑 데이터를 두 CSV에 추가합니다:
- data/clinic-registry/clinic_registry_working.csv
- data/clinic-registry/INFINITH_Outbound_List.csv
추가 컬럼:
clinic_registry_working.csv → gangnam_unni_rating, gangnam_unni_reviews, lead_doctor
INFINITH_Outbound_List.csv → 강남언니 평점, 강남언니 리뷰수, 대표원장
"""
import csv
import re
import os
import sys
# ── Scraped data (gangnamunni hospital ID → rating, reviews, lead_doctor) ─────
# Keys are the integer hospital IDs that extract_id() parses out of a
# "/hospitals/<id>" URL segment. Each value is a
# (rating, review_count, lead_doctor) tuple; None marks a field that could not
# be determined, and lookup() turns it into an empty string.
SCRAPED = {
23: (9.2, 6843, "반재상"),
189: (9.1, 18840, "최순우"),
257: (9.5, 14933, "박상훈"),
62: (9.8, 1531, "이세환"),
729: (9.1, 3185, "이강우"),
250: (9.6, 69859, "이상우"),
2500: (9.3, 11789, "강문석"),
215: (9.3, 12735, "김태규"),
926: (9.6, 1696, "조정남"),
938: (9.6, 63166, "황동연"),
2186: (9.8, 9209, "유지한"),
3004: (9.5, 23631, "김주연"),
139: (9.5, 1934, "박형준"),
141: (9.3, 15207, "서일범"),
55: (9.5, 2560, "윤용일"),
116: (9.5, 12506, "이상균"),
248: (9.6, 9227, "김신영"),
1449: (9.9, 150, "최민"),
2196: (9.6, 812, "최동훈"),
398: (9.3, 21088, "송훈"),
1515: (None, 419, None), # Face Plastic Surgery — rating and lead doctor unconfirmed
213: (9.4, 8529, "이민구"),
623: (7.8, 44, "조배정"),
331: (8.3, 658, "이성준"),
2122: (9.6, 9179, "신종인"),
166: (9.5, 23378, "김한조"),
108: (9.1, 9214, "김석한"),
69: (8.8, 5411, "김정배"),
660: (9.5, 9022, "박일"),
231: (9.6, 342, "신동진"),
300: (9.3, 13769, "이한정"),
563: (9.2, 9172, "김준호"),
58: (9.0, 20810, None), # Iwell — lead doctor unconfirmed
54: (9.4, 8416, "노봉일"),
1181: (9.2, 915, "김흥규"),
66: (9.5, 5134, "심재선"),
431: (9.6, 4487, "노경환"),
339: (9.2, 207, "박재현"),
912: (9.4, 2329, "이용우"),
839: (9.4, 86, "최규진"),
369: (9.5, 5710, "윤석호"),
450: (9.4, 351, "나민화"),
413: (9.6, 883, "이무영"),
3000: (9.5, 2490, "오재윤"),
4749: (9.7, 48, "김승준"),
5500: (9.0, 225, "정진욱"),
2052: (9.2, 8606, "우경식"),
4459: (8.9, 738, "허찬"),
1265: (9.5, 587, "김진우"),
5554: (9.7, 274, "박영규"),
4212: (9.4, 3352, "민성기"),
2414: (9.8, 571, "정태원"),
3569: (9.8, 2053, "권순범"),
6680: (10.0, 216, "권영훈"),
6204: (None, None, None), # Avenue — newly registered, no data yet
1178: (9.0, 624, "정창호"),
3429: (9.5, 1034, "정태광"),
5636: (10.0, 1, "방난석"),
2991: (9.3, 10278, "손유석"),
4154: (9.7, 1368, "김의건"),
6597: (10.0, 11, "이석준"),
4244: (9.6, 92, "엄수진"),
5870: (9.3, 1636, "주락균"),
}
def extract_id(url: str):
    """Extract the numeric hospital ID from a Gangnam Unni URL.

    Args:
        url: Text expected to contain a ``/hospitals/<digits>`` segment.
            Falsy values (``""`` or ``None``) are accepted.

    Returns:
        The digits following ``/hospitals/`` converted to ``int``, or ``None``
        when ``url`` is falsy or contains no such segment.
    """
    if not url:
        return None
    match = re.search(r'/hospitals/(\d+)', url)
    if match is None:
        return None
    return int(match.group(1))
def lookup(url: str):
    """Return the scraped values for the hospital referenced by ``url``.

    Args:
        url: A Gangnam Unni hospital URL; may be empty or ``None``.

    Returns:
        A ``(rating, reviews, doctor)`` tuple of strings ready to be written
        to a CSV cell. Every element is ``""`` when no hospital ID can be
        extracted from ``url`` or the ID is not in ``SCRAPED``; an individual
        element is ``""`` when the scraped field is ``None``.
    """
    hid = extract_id(url)
    if hid is None or hid not in SCRAPED:
        return ("", "", "")
    rating, reviews, doctor = SCRAPED[hid]
    # Compare against None explicitly so a legitimate 0 would still be
    # rendered as "0" rather than as an empty cell.
    rating_text = "" if rating is None else str(rating)
    reviews_text = "" if reviews is None else str(reviews)
    return (rating_text, reviews_text, doctor or "")
# ── 1. clinic_registry_working.csv ──────────────────────────────────────────
# Resolve the data directory relative to this script so the result does not
# depend on the current working directory.
BASE = os.path.join(os.path.dirname(__file__), '..', 'data', 'clinic-registry')
reg_path = os.path.join(BASE, 'clinic_registry_working.csv')

# utf-8-sig drops a leading BOM on read and emits one again on write.
with open(reg_path, newline='', encoding='utf-8-sig') as f:
    reader = csv.DictReader(f)
    orig_fields = reader.fieldnames or []
    rows = list(reader)

# Columns filled from lookup(), in the same order as its return tuple.
reg_columns = ['gangnam_unni_rating', 'gangnam_unni_reviews', 'lead_doctor']

# Insert every column that is still missing right after gangnam_unni_note
# (or at the end when that column is absent). Each column is checked
# individually: the writer below uses extrasaction='ignore', so a column left
# out of the header would have its values silently dropped.
new_fields = list(orig_fields)
insert_after = 'gangnam_unni_note'
missing_columns = [col for col in reg_columns if col not in new_fields]
if missing_columns:
    idx = new_fields.index(insert_after) + 1 if insert_after in new_fields else len(new_fields)
    new_fields[idx:idx] = missing_columns

for row in rows:
    scraped_values = lookup(row.get('gangnam_unni_url', ''))
    for column, value in zip(reg_columns, scraped_values):
        # Never overwrite a non-empty cell, but do fill blanks: on a re-run
        # the columns already exist with "" (or None for short rows), which
        # must not count as an existing value.
        if not row.get(column):
            row[column] = value

with open(reg_path, 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=new_fields, extrasaction='ignore')
    writer.writeheader()
    writer.writerows(rows)
print(f"✅ clinic_registry_working.csv 업데이트 완료 ({len(rows)}행)")
# ── 2. INFINITH_Outbound_List.csv ────────────────────────────────────────────
out_path = os.path.join(BASE, 'INFINITH_Outbound_List.csv')

# utf-8-sig drops a leading BOM on read and emits one again on write.
with open(out_path, newline='', encoding='utf-8-sig') as f:
    reader = csv.DictReader(f)
    orig_fields2 = reader.fieldnames or []
    rows2 = list(reader)

# Columns filled from lookup(), in the same order as its return tuple.
out_columns = ['강남언니 평점', '강남언니 리뷰수', '대표원장']

# Insert every column that is still missing right after '강남언니 비고' (or at
# the end when that column is absent). Each column is checked individually:
# the writer below uses extrasaction='ignore', so a column left out of the
# header would have its values silently dropped.
new_fields2 = list(orig_fields2)
insert_after2 = '강남언니 비고'
missing_columns2 = [col for col in out_columns if col not in new_fields2]
if missing_columns2:
    idx2 = new_fields2.index(insert_after2) + 1 if insert_after2 in new_fields2 else len(new_fields2)
    new_fields2[idx2:idx2] = missing_columns2

for row in rows2:
    scraped_values = lookup(row.get('강남언니', ''))
    for column, value in zip(out_columns, scraped_values):
        # Never overwrite a non-empty cell, but do fill blanks: on a re-run
        # the columns already exist with "" (or None for short rows), which
        # must not count as an existing value.
        if not row.get(column):
            row[column] = value

with open(out_path, 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=new_fields2, extrasaction='ignore')
    writer.writeheader()
    writer.writerows(rows2)
print(f"✅ INFINITH_Outbound_List.csv 업데이트 완료 ({len(rows2)}행)")
print()
print("샘플 확인:")
for row in rows2[:5]:
    # DictReader yields None for cells missing from a short row, and None
    # rejects the alignment format specs below, so coalesce to ''.
    name = row.get('병원명') or ''
    rating = row.get('강남언니 평점') or ''
    reviews = row.get('강남언니 리뷰수') or ''
    doctor = row.get('대표원장') or ''
    print(f" {name:<16} 평점={rating:>4} 리뷰={reviews:>6} 원장={doctor}")