From 843ccdb806055b9139240b95cf49d8654554f21a Mon Sep 17 00:00:00 2001
From: Mina Choi
Date: Wed, 27 May 2026 13:27:39 +0900
Subject: [PATCH] =?UTF-8?q?=EB=B8=8C=EB=9E=9C=EB=93=9C=20=EC=9E=90?=
 =?UTF-8?q?=EC=82=B0(=EB=A1=9C=EA=B3=A0/=EC=83=89=EC=83=81)=C2=B7=EC=B1=84?=
 =?UTF-8?q?=EB=84=90=20=EB=A1=9C=EA=B3=A0=20Vision=20=EB=B6=84=EC=84=9D=20?=
 =?UTF-8?q?=EC=B6=94=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- color_extractor: 홈페이지 HTML/CSS에서 로고 URL·브랜드 hex 추출
- vision: Gemini Vision 로고 묘사·채널 로고 일치 평가
- youtube: 채널 profileImage 추출 / firecrawl: clinic_info 추출 보정

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 app/integrations/color_extractor.py | 250 ++++++++++++++++++++++++++++
 app/integrations/firecrawl.py       |   3 +-
 app/integrations/vision.py          | 171 +++++++++++++++++++
 app/integrations/youtube.py         |   2 +
 4 files changed, 425 insertions(+), 1 deletion(-)
 create mode 100644 app/integrations/color_extractor.py
 create mode 100644 app/integrations/vision.py

diff --git a/app/integrations/color_extractor.py b/app/integrations/color_extractor.py
new file mode 100644
index 0000000..806d8cd
--- /dev/null
+++ b/app/integrations/color_extractor.py
@@ -0,0 +1,250 @@
+"""Extract hex colors directly from homepage HTML/CSS and derive a frequency-based brand palette.
+
+Reads the page's actual CSS values with regular expressions instead of relying on a Vision LLM.
+Aims to capture the site-wide color system (primary/secondary/background/text) more accurately than logo-only Vision analysis.
+"""
+import logging
+import re
+import ssl
+from collections import Counter
+from urllib.parse import urljoin, urlparse
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+def _make_ssl_context() -> ssl.SSLContext:
+    """Build an SSL context for old Korean medical sites that are rejected with DH_KEY_TOO_SMALL / weak-cipher errors.
+    Lowers the OpenSSL security level to 1 while keeping the default context's certificate verification."""
+    ctx = ssl.create_default_context()
+    try:
+        ctx.set_ciphers("DEFAULT@SECLEVEL=1")
+    except ssl.SSLError:
+        pass  # best effort: if this cipher string is rejected, return the context with its default ciphers
+    return ctx
+
+
+async def _fetch_html(url: str, timeout: float = 20.0) -> tuple[int, str]:
+    """Fetch *url* through staged SSL fallbacks; return (status_code, body_text), or (0, "") when the last stage fails.
+    Targets legacy sites (the original note cites "그랜드" and "톡스앤필" as examples)."""
+    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}
+    # Stage 1: default TLS settings and certificate verification
+    try:
+        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers) as c:
+            r = await c.get(url)
+            return r.status_code, r.text
+    except (httpx.ConnectError, httpx.ReadError, ssl.SSLError) as e:  # anything else raised here propagates to the caller
+        logger.info("[fetch] %s standard SSL failed: %s — fallback to weak cipher", url, e)
+    # Stage 2: allow weak ciphers via _make_ssl_context()
+    try:
+        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers, verify=_make_ssl_context()) as c:
+            r = await c.get(url)
+            return r.status_code, r.text
+    except (httpx.ConnectError, httpx.ReadError, ssl.SSLError) as e:
+        logger.info("[fetch] %s weak cipher failed: %s — fallback to verify=False", url, e)
+    # Stage 3: SSL verification turned off (host mismatch etc.)
+    try:
+        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers, verify=False) as c:
+            r = await c.get(url)
+            return r.status_code, r.text
+    except Exception as e:
+        logger.warning("[fetch] %s all fallbacks failed: %s", url, e)
+        return 0, ""
+
+LOGO_IMG_PATTERNS = [
+    # 1)
+    re.compile(r']*\bclass=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE),
+    # 2)
+    re.compile(r']*\bsrc=["\']([^"\']+)["\'][^>]*\bclass=["\'][^"\']*\blogo\b[^"\']*["\']', re.IGNORECASE),
+    # 3)
+    re.compile(r']*\bid=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE),
+    # 4) ...logo...
+    re.compile(r']*\balt=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE),
+    # 5)