fix(url): _with_scheme 강화 — www 자동 보강 + 중첩 https:// 정리 + API 입력 적용

문제 1: gangnamunni.com 의 SSL 인증서가 www.gangnamunni.com 에만 유효 →
  사용자가 'gangnamunni.com/hospitals/189' 같이 줬을 때 클릭 시 브라우저 SSL warning.
문제 2: LLM 출력에 'https://www.facebook.com/https://facebook.com/X' 같이 중첩된
  URL이 가끔 박힘.

수정 (_with_scheme):
- 중첩된 'http(s)://' 발견 시 마지막 URL 만 잘라 사용
- _WWW_REQUIRED 도메인 (gangnamunni / facebook / instagram) 은 bare 도메인이면
  www. 자동 보강

api/analysis.py: main 채널(instagram/facebook/naver_blog/youtube/gangnam_unni)
URL 도 _with_scheme 적용해서 DB에 정규화된 형태로 저장. 이전엔 extra channels
(tiktok/EN/카카오톡/카페) 에만 적용돼있어서 강남언니 같은 main 채널이 빠져있었음.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
channel-brand
Mina Choi 2026-05-29 10:44:53 +09:00
parent db42805fdb
commit dca0c78860
2 changed files with 33 additions and 7 deletions

View File

@ -27,6 +27,8 @@ def _extra_channels_from_mockurls(homepage_url: str) -> dict:
"tiktok": _with_scheme(urls.get("tiktok")),
"instagram_en": _with_scheme(urls.get("instagramEn")),
"facebook_en": _with_scheme(urls.get("facebookEn")),
"kakao_talk": _with_scheme(urls.get("kakaoTalk")),
"naver_cafe": _with_scheme(urls.get("naverCafe")),
}
return {}
@ -45,11 +47,12 @@ async def start_analysis(body: AnalysisCreate, background_tasks: BackgroundTasks
if not hospital:
raise HTTPException(status_code=409, detail="Clinic not found")
ig_id = await insert_instagram_row(hospital_id, body.channels.instagram) if body.channels.instagram else None
fb_id = await insert_facebook_row(hospital_id, body.channels.facebook) if body.channels.facebook else None
nb_id = await insert_naver_blog_row(hospital_id, body.channels.naver_blog) if body.channels.naver_blog else None
yt_id = await insert_youtube_row(hospital_id, body.channels.youtube) if body.channels.youtube else None
gu_id = await insert_gangnam_unni_row(hospital_id, body.channels.gangnam_unni) if body.channels.gangnam_unni else None
# 사용자가 'gangnamunni.com/...' 같이 scheme/www 없이 줘도 _with_scheme이 https://www. 보강.
ig_id = await insert_instagram_row(hospital_id, _with_scheme(body.channels.instagram)) if body.channels.instagram else None
fb_id = await insert_facebook_row(hospital_id, _with_scheme(body.channels.facebook)) if body.channels.facebook else None
nb_id = await insert_naver_blog_row(hospital_id, _with_scheme(body.channels.naver_blog)) if body.channels.naver_blog else None
yt_id = await insert_youtube_row(hospital_id, _with_scheme(body.channels.youtube)) if body.channels.youtube else None
gu_id = await insert_gangnam_unni_row(hospital_id, _with_scheme(body.channels.gangnam_unni)) if body.channels.gangnam_unni else None
analysis_run_id = await insert_analysis_run(
analysis_run_id, hospital_id, hospital["owner_user_id"],
@ -62,6 +65,8 @@ async def start_analysis(body: AnalysisCreate, background_tasks: BackgroundTasks
"tiktok": body.channels.tiktok or mock_extra.get("tiktok"),
"instagram_en": body.channels.instagram_en or mock_extra.get("instagram_en"),
"facebook_en": body.channels.facebook_en or mock_extra.get("facebook_en"),
"kakao_talk": body.channels.kakao_talk or mock_extra.get("kakao_talk"),
"naver_cafe": body.channels.naver_cafe or mock_extra.get("naver_cafe"),
}
logger.info("[analysis] extra_channels=%s (mock_matched=%s)", extra_channels, bool(mock_extra))
background_tasks.add_task(run_pipeline, analysis_run_id, extra_channels)

View File

@ -61,6 +61,27 @@ def _normalize_homepage(url: str) -> str:
return u.rstrip("/")
# Domains whose TLS certificate is only valid for the "www." host: linking to
# the bare domain makes the browser show an SSL warning when the user clicks.
_WWW_REQUIRED = ("gangnamunni.com", "facebook.com", "instagram.com")


def _with_scheme(u: str | None) -> str | None:
    """Normalize a possibly scheme-less URL for collectors and link display.

    Steps, in order:

    1. Empty, ``None`` or whitespace-only input yields ``None``.
    2. If another ``http://`` / ``https://`` is embedded after the start
       (e.g. ``'https://www.X/https://Y'``), only the last embedded URL is
       kept.
    3. ``https://`` is prepended when no scheme is present.
    4. When the host is exactly one of ``_WWW_REQUIRED``, ``www.`` is
       inserted in front of it.

    Args:
        u: Raw URL text, or ``None``.

    Returns:
        The normalized URL, or ``None`` when there is nothing to normalize.
    """
    if not u:
        return None
    u = u.strip()
    if not u:
        # Whitespace-only input must not turn into a bare "https://".
        return None

    # Nested URL: keep everything from the last "http(s)://" onwards.
    # A match at index 0 is just the URL's own scheme, hence "> 0".
    last = max(u.rfind("https://"), u.rfind("http://"))
    if last > 0:
        u = u[last:]

    if "://" not in u:
        u = "https://" + u

    scheme, sep, rest = u.partition("://")
    # The host ends at the first path, query, fragment or port delimiter.
    host_end = next(
        (i for i, ch in enumerate(rest) if ch in "/?#:"),
        len(rest),
    )
    host = rest[:host_end].lower()
    # Compare the whole host rather than a prefix, so look-alike hosts such
    # as "facebook.community" or "facebook.com.evil.example" stay untouched.
    if scheme.lower() in ("https", "http") and host in _WWW_REQUIRED:
        u = f"{scheme}{sep}www.{rest}"
    return u