diff --git a/app/integrations/color_extractor.py b/app/integrations/color_extractor.py new file mode 100644 index 0000000..806d8cd --- /dev/null +++ b/app/integrations/color_extractor.py @@ -0,0 +1,250 @@ +"""홈페이지 HTML/CSS에서 hex 색상 직접 추출 + 빈도 기반 brand palette 산출. + +Vision LLM에 의존하지 않고 페이지의 실제 CSS 값을 정규식으로 잡음. +로고만 분석하는 Vision보다 사이트 전체 컬러 시스템 (primary/secondary/background/text)을 더 정확히 추출. +""" +import logging +import re +import ssl +from collections import Counter +from urllib.parse import urljoin, urlparse +import httpx + +logger = logging.getLogger(__name__) + + +def _make_ssl_context() -> ssl.SSLContext: + """오래된 한국 의료 사이트들이 SSL DH_KEY_TOO_SMALL / cipher 약함 등으로 차단되는 문제 우회. + 보안 등급 1로 낮춤 + cert 검증 유지.""" + ctx = ssl.create_default_context() + try: + ctx.set_ciphers("DEFAULT@SECLEVEL=1") + except ssl.SSLError: + pass + return ctx + + +async def _fetch_html(url: str, timeout: float = 20.0) -> tuple[int, str]: + """SSL/검증 단계별 fallback으로 HTML 받기. 그랜드/톡스앤필 같은 oldsite 대응.""" + headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"} + # 1차: 표준 검증 + try: + async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers) as c: + r = await c.get(url) + return r.status_code, r.text + except (httpx.ConnectError, httpx.ReadError, ssl.SSLError) as e: + logger.info("[fetch] %s standard SSL failed: %s — fallback to weak cipher", url, e) + # 2차: 약한 cipher 허용 + try: + async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers, verify=_make_ssl_context()) as c: + r = await c.get(url) + return r.status_code, r.text + except (httpx.ConnectError, httpx.ReadError, ssl.SSLError) as e: + logger.info("[fetch] %s weak cipher failed: %s — fallback to verify=False", url, e) + # 3차: SSL 검증 끔 (host mismatch 등) + try: + async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers, verify=False) as c: + r = await c.get(url) + return r.status_code, r.text + except Exception as e: + logger.warning("[fetch] %s all fallbacks failed: %s", url, e) + return 0, "" + +LOGO_IMG_PATTERNS = [ + # 1) + re.compile(r']*\bclass=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), + # 2) + re.compile(r']*\bsrc=["\']([^"\']+)["\'][^>]*\bclass=["\'][^"\']*\blogo\b[^"\']*["\']', re.IGNORECASE), + # 3) + re.compile(r']*\bid=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), + # 4) ...logo... + re.compile(r']*\balt=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), + # 5)