172 lines
6.5 KiB
Python
172 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
update-csv-gangnamunni.py
|
|
|
|
강남언니 스크래핑 데이터를 두 CSV에 추가합니다:
|
|
- data/clinic-registry/clinic_registry_working.csv
|
|
- data/clinic-registry/INFINITH_Outbound_List.csv
|
|
|
|
추가 컬럼:
|
|
clinic_registry_working.csv → gangnam_unni_rating, gangnam_unni_reviews, lead_doctor
|
|
INFINITH_Outbound_List.csv → 강남언니 평점, 강남언니 리뷰수, 대표원장
|
|
"""
|
|
|
|
import csv
|
|
import re
|
|
import os
|
|
import sys
|
|
|
|
# ── Scraped data (gangnamunni hospital ID → rating, reviews, lead_doctor) ─────
# Each value is a (rating, review_count, lead_doctor) tuple. A None element
# means that field was not available; lookup() renders it as an empty string.
SCRAPED = {
    23: (9.2, 6843, "반재상"),
    189: (9.1, 18840, "최순우"),
    257: (9.5, 14933, "박상훈"),
    62: (9.8, 1531, "이세환"),
    729: (9.1, 3185, "이강우"),
    250: (9.6, 69859, "이상우"),
    2500: (9.3, 11789, "강문석"),
    215: (9.3, 12735, "김태규"),
    926: (9.6, 1696, "조정남"),
    938: (9.6, 63166, "황동연"),
    2186: (9.8, 9209, "유지한"),
    3004: (9.5, 23631, "김주연"),
    139: (9.5, 1934, "박형준"),
    141: (9.3, 15207, "서일범"),
    55: (9.5, 2560, "윤용일"),
    116: (9.5, 12506, "이상균"),
    248: (9.6, 9227, "김신영"),
    1449: (9.9, 150, "최민"),
    2196: (9.6, 812, "최동훈"),
    398: (9.3, 21088, "송훈"),
    1515: (None, 419, None),  # Face Plastic Surgery (페이스성형외과) — rating / lead doctor unconfirmed
    213: (9.4, 8529, "이민구"),
    623: (7.8, 44, "조배정"),
    331: (8.3, 658, "이성준"),
    2122: (9.6, 9179, "신종인"),
    166: (9.5, 23378, "김한조"),
    108: (9.1, 9214, "김석한"),
    69: (8.8, 5411, "김정배"),
    660: (9.5, 9022, "박일"),
    231: (9.6, 342, "신동진"),
    300: (9.3, 13769, "이한정"),
    563: (9.2, 9172, "김준호"),
    58: (9.0, 20810, None),  # Iwell (아이웰) — lead doctor unconfirmed
    54: (9.4, 8416, "노봉일"),
    1181: (9.2, 915, "김흥규"),
    66: (9.5, 5134, "심재선"),
    431: (9.6, 4487, "노경환"),
    339: (9.2, 207, "박재현"),
    912: (9.4, 2329, "이용우"),
    839: (9.4, 86, "최규진"),
    369: (9.5, 5710, "윤석호"),
    450: (9.4, 351, "나민화"),
    413: (9.6, 883, "이무영"),
    3000: (9.5, 2490, "오재윤"),
    4749: (9.7, 48, "김승준"),
    5500: (9.0, 225, "정진욱"),
    2052: (9.2, 8606, "우경식"),
    4459: (8.9, 738, "허찬"),
    1265: (9.5, 587, "김진우"),
    5554: (9.7, 274, "박영규"),
    4212: (9.4, 3352, "민성기"),
    2414: (9.8, 571, "정태원"),
    3569: (9.8, 2053, "권순범"),
    6680: (10.0, 216, "권영훈"),
    6204: (None, None, None),  # Avenue (에비뉴) — newly registered, no data
    1178: (9.0, 624, "정창호"),
    3429: (9.5, 1034, "정태광"),
    5636: (10.0, 1, "방난석"),
    2991: (9.3, 10278, "손유석"),
    4154: (9.7, 1368, "김의건"),
    6597: (10.0, 11, "이석준"),
    4244: (9.6, 92, "엄수진"),
    5870: (9.3, 1636, "주락균"),
}
|
|
|
|
|
|
def extract_id(url: str):
    """Extract the numeric hospital ID from a Gangnam Unni URL.

    Args:
        url: A URL (or path) expected to contain ``/hospitals/<digits>``.
            May be empty or ``None``.

    Returns:
        The hospital ID as an ``int``, or ``None`` when ``url`` is falsy or
        contains no ``/hospitals/<digits>`` segment.
    """
    if url:
        found = re.search(r'/hospitals/(\d+)', url)
        if found is not None:
            return int(found.group(1))
    # Falsy input or no matching path segment.
    return None
|
|
|
|
|
|
def lookup(url: str):
    """Map a Gangnam Unni URL to CSV-ready strings.

    Args:
        url: Hospital URL; passed to ``extract_id`` to obtain the hospital ID.

    Returns:
        A ``(rating, reviews, doctor)`` tuple of strings. All three are empty
        when the URL yields no ID or the ID is not a key of ``SCRAPED``; an
        individual element is empty when the scraped value for it is missing.
    """
    hospital_id = extract_id(url)
    if hospital_id is not None and hospital_id in SCRAPED:
        score, review_count, doctor_name = SCRAPED[hospital_id]
        rating_text = "" if score is None else str(score)
        reviews_text = "" if review_count is None else str(review_count)
        # Any falsy doctor value (None or "") is rendered as an empty cell.
        doctor_text = doctor_name or ""
        return (rating_text, reviews_text, doctor_text)
    return ("", "", "")
|
|
|
|
|
|
# ── CSV updates ──────────────────────────────────────────────────────────────
# Directory holding both CSV files, resolved relative to this script.
BASE = os.path.join(os.path.dirname(__file__), '..', 'data', 'clinic-registry')


def _merge_scraped_columns(path, url_column, insert_after, new_columns, *, overwrite=False):
    """Add the scraped rating / review-count / lead-doctor columns to one CSV.

    The file is read completely, updated in memory, and rewritten in place
    using the same ``utf-8-sig`` encoding it was read with.

    Args:
        path: CSV file to update.
        url_column: Name of the column holding the Gangnam Unni hospital URL.
        insert_after: Existing column after which new columns are inserted.
            If it is not present, new columns are appended at the end.
        new_columns: Names of the three target columns, in the order returned
            by ``lookup`` (rating, reviews, lead doctor).
        overwrite: When true, replace existing non-empty cell values with the
            looked-up ones. By default only empty or missing cells are filled.

    Returns:
        The list of row dicts that was written to the file.
    """
    with open(path, newline='', encoding='utf-8-sig') as f:
        reader = csv.DictReader(f)
        fields = list(reader.fieldnames or [])
        rows = list(reader)

    # Add every target column that is missing, not only when the first one is
    # absent; otherwise a partially-updated header would silently drop values
    # on write (extrasaction='ignore').
    missing = [col for col in new_columns if col not in fields]
    if missing:
        # Insert after the right-most of the anchor column and any target
        # columns already present, so the group stays together.
        anchors = [fields.index(col) for col in (insert_after, *new_columns) if col in fields]
        idx = max(anchors) + 1 if anchors else len(fields)
        fields[idx:idx] = missing

    for row in rows:
        values = lookup(row.get(url_column, ''))
        for col, value in zip(new_columns, values):
            # DictReader always supplies the key once the column exists, so
            # test for an empty cell rather than a missing key. This lets a
            # re-run fill blanks while leaving existing values untouched.
            if overwrite or not row.get(col):
                row[col] = value

    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=fields, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(rows)

    return rows


# ── 1. clinic_registry_working.csv ──────────────────────────────────────────
reg_path = os.path.join(BASE, 'clinic_registry_working.csv')
rows = _merge_scraped_columns(
    reg_path,
    'gangnam_unni_url',
    'gangnam_unni_note',
    ('gangnam_unni_rating', 'gangnam_unni_reviews', 'lead_doctor'),
)

print(f"✅ clinic_registry_working.csv 업데이트 완료 ({len(rows)}행)")


# ── 2. INFINITH_Outbound_List.csv ────────────────────────────────────────────
out_path = os.path.join(BASE, 'INFINITH_Outbound_List.csv')
rows2 = _merge_scraped_columns(
    out_path,
    '강남언니',
    '강남언니 비고',
    ('강남언니 평점', '강남언니 리뷰수', '대표원장'),
)

print(f"✅ INFINITH_Outbound_List.csv 업데이트 완료 ({len(rows2)}행)")
print()
print("샘플 확인:")
for row in rows2[:5]:
    # A short CSV row yields None for absent cells; None cannot take a format
    # spec, so normalise the name to a string first.
    name = row.get('병원명') or ''
    print(f" {name:<16} 평점={row.get('강남언니 평점',''):>4} 리뷰={row.get('강남언니 리뷰수',''):>6} 원장={row.get('대표원장','')}")
|