"""Fetch-only module that retrieves a homepage's HTML and its external CSS.

Many older Korean medical sites are rejected by a standard fetch because of
TLS problems (DH_KEY_TOO_SMALL, weak ciphers, host mismatch, ...), so requests
are retried with progressively weaker SSL settings.

No parsing or domain logic lives here — only raw HTTP response bodies are
returned.
"""

import logging
import re
import ssl
from urllib.parse import urljoin

import httpx

logger = logging.getLogger(__name__)

# Captures the href of <link ... rel="stylesheet" ... href="..."> tags.
# Only matches when `rel` appears before `href` inside the tag.
CSS_LINK = re.compile(
    r'<link[^>]+rel=["\']stylesheet["\'][^>]+href=["\']([^"\']+)["\']',
    re.IGNORECASE,
)

# Failures that plausibly stem from the TLS handshake and therefore justify
# retrying with a weaker SSL configuration.
_SSL_FALLBACK_ERRORS = (httpx.ConnectError, httpx.ReadError, ssl.SSLError)


def _make_ssl_context() -> ssl.SSLContext:
    """Return a default SSL context with the OpenSSL security level lowered to 1.

    Certificate verification stays enabled; only the cipher policy is relaxed
    so that servers offering weak ciphers can still complete the handshake.
    """
    ctx = ssl.create_default_context()
    try:
        ctx.set_ciphers("DEFAULT@SECLEVEL=1")
    except ssl.SSLError as e:
        # Best effort: if the cipher string is rejected, keep the default
        # context instead of failing.
        logger.debug("[fetch] could not lower SSL security level: %s", e)
    return ctx


async def _get_text(
    url: str,
    timeout: float,
    headers: dict[str, str],
    verify: "ssl.SSLContext | bool",
) -> tuple[int, str]:
    """Issue one GET (following redirects) and return (status_code, body text)."""
    async with httpx.AsyncClient(
        timeout=timeout,
        follow_redirects=True,
        headers=headers,
        verify=verify,
    ) as client:
        response = await client.get(url)
        return response.status_code, response.text


async def fetch_html(url: str, timeout: float = 20.0) -> tuple[int, str]:
    """Fetch a response body, falling back through weaker SSL settings.

    Attempts, in order:
      1. default certificate verification,
      2. verification kept, cipher security level lowered to 1,
      3. verification disabled.

    A later attempt is made only when the previous one failed with an error in
    ``_SSL_FALLBACK_ERRORS``. Any other failure (timeout, too many redirects,
    invalid URL, ...) is not retried with weaker TLS.

    Args:
        url: Absolute URL to request.
        timeout: Timeout passed to ``httpx.AsyncClient``.

    Returns:
        ``(status_code, text)`` of the final response, or ``(0, "")`` when the
        request could not be completed.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}

    try:
        return await _get_text(url, timeout, headers, True)
    except _SSL_FALLBACK_ERRORS as e:
        logger.info("[fetch] %s standard SSL failed: %s — fallback to weak cipher", url, e)
    except Exception as e:
        # Not an SSL-shaped failure: weakening TLS would not help, and the
        # documented contract is to return (0, "") rather than raise.
        logger.warning("[fetch] %s request failed: %s", url, e)
        return 0, ""

    try:
        return await _get_text(url, timeout, headers, _make_ssl_context())
    except _SSL_FALLBACK_ERRORS as e:
        logger.info("[fetch] %s weak cipher failed: %s — fallback to verify=False", url, e)
    except Exception as e:
        logger.warning("[fetch] %s request failed: %s", url, e)
        return 0, ""

    try:
        # SECURITY: certificate verification is disabled here as a last
        # resort, so the response is not authenticated. Treat the body as
        # untrusted content.
        return await _get_text(url, timeout, headers, False)
    except Exception as e:
        logger.warning("[fetch] %s all fallbacks failed: %s", url, e)
        return 0, ""


async def fetch_html_and_css(homepage_url: str, max_css_files: int = 8) -> tuple[str, list[str]]:
    """Fetch the homepage HTML plus its first ``max_css_files`` external stylesheets.

    Stylesheet URLs are taken from ``CSS_LINK`` matches in document order and
    resolved against ``homepage_url``. Stylesheets are fetched one at a time;
    those that do not return status 200 with a non-empty body are skipped.

    Args:
        homepage_url: URL of the page to fetch.
        max_css_files: Maximum number of stylesheet links to follow. Values
            below zero are treated as zero.

    Returns:
        ``(html, css_texts)``; ``("", [])`` when the homepage itself could not
        be fetched with status 200 and a non-empty body.
    """
    status, html = await fetch_html(homepage_url)
    if status != 200 or not html:
        logger.warning("[fetch] homepage fetch failed status=%s url=%s", status, homepage_url)
        return "", []

    # Clamp so a negative limit cannot turn into a slice from the end.
    limit = max(max_css_files, 0)
    css_texts: list[str] = []
    for css_href in CSS_LINK.findall(html)[:limit]:
        cstatus, ctext = await fetch_html(urljoin(homepage_url, css_href), timeout=15.0)
        if cstatus == 200 and ctext:
            css_texts.append(ctext)
    return html, css_texts