67 lines
2.8 KiB
Python
67 lines
2.8 KiB
Python
"""홈페이지 HTML + 외부 CSS 를 가져오는 fetch 전용 모듈.
|
|
|
|
오래된 한국 의료 사이트들이 SSL DH_KEY_TOO_SMALL / cipher 약함 / host mismatch 등으로
|
|
표준 fetch 에 차단되는 케이스가 많아 단계별 SSL fallback 으로 받는다.
|
|
파싱·도메인 로직은 들어가지 않음 — 순수 HTTP 응답 본문 반환.
|
|
"""
|
|
import logging
|
|
import re
|
|
import ssl
|
|
from urllib.parse import urljoin
|
|
import httpx
|
|
|
|
logger = logging.getLogger(__name__)

# Matches a <link ... rel="stylesheet" ... href="..."> tag and captures the
# href value (single capture group, so ``findall`` returns a list of strings).
# The lookahead checks for rel="stylesheet" anywhere inside the tag, so the
# pattern works regardless of whether ``rel`` or ``href`` comes first.
# ``(?<![\w-])`` keeps attributes such as ``data-href`` / ``data-rel`` from
# being mistaken for the real ``href`` / ``rel``.
CSS_LINK = re.compile(
    r'<link\b(?=[^>]*(?<![\w-])rel=["\']stylesheet["\'])'
    r'[^>]*?(?<![\w-])href=["\']([^"\']+)["\']',
    re.IGNORECASE,
)
|
|
|
|
|
|
def _make_ssl_context() -> ssl.SSLContext:
|
|
"""보안 등급 1로 낮춤 + cert 검증 유지 (옛 한국 의료 사이트 cipher 약함 회피)."""
|
|
ctx = ssl.create_default_context()
|
|
try:
|
|
ctx.set_ciphers("DEFAULT@SECLEVEL=1")
|
|
except ssl.SSLError:
|
|
pass
|
|
return ctx
|
|
|
|
|
|
async def fetch_html(url: str, timeout: float = 20.0) -> tuple[int, str]:
    """Fetch ``url`` and return its status code and body text, relaxing TLS step by step.

    Up to three attempts are made, each with a fresh ``httpx.AsyncClient``
    that follows redirects, ordered from strictest to most permissive:

    1. httpx's default TLS verification.
    2. The relaxed context built by ``_make_ssl_context()``.
    3. ``verify=False`` (certificate verification disabled).

    The next tier is tried only after a connection-level failure
    (``httpx.ConnectError``, ``httpx.ReadError`` or ``ssl.SSLError``). Any
    other error (timeout, unsupported scheme, too many redirects, ...) ends
    the attempt immediately, since relaxing TLS would not help.

    Args:
        url: Absolute URL to request with GET.
        timeout: Timeout in seconds passed to each ``httpx.AsyncClient``.

    Returns:
        ``(status_code, text)`` of the response (any status code, not only
        200), or ``(0, "")`` if the request could not be completed.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}
    retryable = (httpx.ConnectError, httpx.ReadError, ssl.SSLError)

    # (tier label, factory for the ``verify`` argument). Factories are used so
    # the relaxed SSL context is only built if the first tier actually fails.
    tiers = (
        ("standard SSL", lambda: True),
        ("weak cipher", _make_ssl_context),
        # NOTE: responses obtained through this tier are unauthenticated.
        ("verify=False", lambda: False),
    )
    last = len(tiers) - 1

    for index, (label, make_verify) in enumerate(tiers):
        try:
            async with httpx.AsyncClient(
                timeout=timeout,
                follow_redirects=True,
                headers=headers,
                verify=make_verify(),
            ) as client:
                response = await client.get(url)
                return response.status_code, response.text
        except Exception as exc:  # boundary: this function reports failure as (0, "")
            if index < last and isinstance(exc, retryable):
                logger.info(
                    "[fetch] %s %s failed: %s — fallback to %s",
                    url, label, exc, tiers[index + 1][0],
                )
                continue
            if index < last:
                # Previously such errors escaped to the caller, contradicting
                # the documented "(0, "") on failure" contract.
                logger.warning("[fetch] %s %s failed with non-retryable error: %s", url, label, exc)
            else:
                logger.warning("[fetch] %s all fallbacks failed: %s", url, exc)
            return 0, ""

    return 0, ""  # not reached; every loop iteration returns or continues
|
|
|
|
|
|
async def fetch_html_and_css(homepage_url: str, max_css_files: int = 8) -> tuple[str, list[str]]:
    """Fetch a homepage's HTML together with its first external stylesheets.

    Stylesheet URLs are taken from ``CSS_LINK`` matches in the HTML, with
    HTML character references decoded, and resolved against ``homepage_url``.
    Duplicates and non-http(s) URLs (e.g. ``data:``) are skipped. Stylesheets
    are fetched one after another via ``fetch_html``.

    Args:
        homepage_url: URL of the page to fetch.
        max_css_files: Maximum number of distinct stylesheets to request;
            values below zero are treated as zero.

    Returns:
        ``(html, css_texts)`` where ``css_texts`` holds the bodies of the
        stylesheets that came back with status 200 and non-empty text, in
        document order. ``("", [])`` if the homepage itself did not return
        status 200 with a non-empty body.
    """
    # Local import of the function only: ``html`` is used as a variable below.
    from html import unescape

    status, html = await fetch_html(homepage_url)
    if status != 200 or not html:
        logger.warning("[fetch] homepage fetch failed status=%s url=%s", status, homepage_url)
        return "", []

    # dict keys give order-preserving de-duplication.
    css_urls: dict[str, None] = {}
    for raw_href in CSS_LINK.findall(html):
        try:
            # Attribute values are HTML-escaped in markup ("&amp;" -> "&").
            css_url = urljoin(homepage_url, unescape(raw_href).strip())
        except ValueError:
            # urljoin rejects some malformed URLs; skip them.
            continue
        if css_url.lower().startswith(("http://", "https://")):
            css_urls.setdefault(css_url)

    css_texts: list[str] = []
    for css_url in list(css_urls)[: max(max_css_files, 0)]:
        cstatus, ctext = await fetch_html(css_url, timeout=15.0)
        if cstatus == 200 and ctext:
            css_texts.append(ctext)
    return html, css_texts
|