From af617136979f89e77f20bfbf0632de78bbd597e8 Mon Sep 17 00:00:00 2001 From: Mina Choi Date: Tue, 2 Jun 2026 11:08:38 +0900 Subject: [PATCH] =?UTF-8?q?refactor(branding):=20collect/report=20?= =?UTF-8?q?=EB=8B=A8=EA=B3=84=20=EB=B6=84=EB=A6=AC=20+=20Vision=20logo=20h?= =?UTF-8?q?ex=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - integrations/color_extractor → integrations/site_fetcher (HTTP) + services/brand_parser (파싱) 분리 - integrations/vision → integrations/llm/gemini_vision 이동 - services/collect_extras → services/collect.collect_brand_basics (collect) + services/branding (report) 분리 - Vision prompt 에 logo_colors_hex 5개 강제 + 길이 fallback (4·6개 들어와도 5개로 정규화) - branding 단계: HTML parser canonical logo URL 을 Vision 에 1순위 전달 → firecrawl 가 잘못된 이미지 (마케팅 배너 등) 를 logo 로 잡는 케이스 회피 - select_run 에서 큰 JSON 컬럼 (report_data/plan_data) 빼서 meta only → generate_plan 만 select_run_report_data 별도 조회. 4군데 호출자는 가벼워짐 Co-Authored-By: Claude Opus 4.7 (1M context) --- app/common/db/__init__.py | 4 +- app/common/db/run.py | 12 + app/common/db/source.py | 9 + app/integrations/color_extractor.py | 275 ------------------ .../{vision.py => llm/gemini_vision.py} | 13 +- app/integrations/site_fetcher.py | 66 +++++ app/services/analysis.py | 8 +- app/services/brand_parser.py | 172 +++++++++++ app/services/branding.py | 89 ++++++ app/services/collect.py | 34 ++- app/services/collect_extras.py | 125 -------- 11 files changed, 396 insertions(+), 411 deletions(-) delete mode 100644 app/integrations/color_extractor.py rename app/integrations/{vision.py => llm/gemini_vision.py} (94%) create mode 100644 app/integrations/site_fetcher.py create mode 100644 app/services/brand_parser.py create mode 100644 app/services/branding.py delete mode 100644 app/services/collect_extras.py diff --git a/app/common/db/__init__.py b/app/common/db/__init__.py index 8f20d0d..6ea6cf7 100644 --- a/app/common/db/__init__.py +++ b/app/common/db/__init__.py @@ -3,14 +3,14 @@ from common.db.hospital import select_hospital, update_hospital_status, insert_h from common.db.source import ( insert_source, select_source_mainpage, select_source_by_type, insert_raw_info, update_raw_info_status, update_raw_info, update_raw_info_merge, - update_raw_info_logo_url, select_branding_logo_url, + update_raw_info_logo_url, select_branding_logo_url, select_branding_info_id, select_raw_info_data, select_run_sources, select_run_raw_data, select_run_source_raw, select_run_mainpage_url, ) from common.db.run import ( insert_run, select_run, select_run_status, update_run_status, - update_run_report, update_run_plan, select_run_with_clinic, + update_run_report, update_run_plan, select_run_with_clinic, select_run_report_data, ) from common.db.market import upsert_market_status, upsert_market_result, select_market from common.db.file_data import insert_file, select_run_files, select_file, delete_file diff --git a/app/common/db/run.py b/app/common/db/run.py index cbf188c..4169f5f 100644 --- a/app/common/db/run.py +++ b/app/common/db/run.py @@ -22,6 +22,18 @@ async def select_run(analysis_run_id: str) -> dict | None: ) +async def select_run_report_data(analysis_run_id: str) -> dict | None: + """report 결과가 필요할 때만 호출. raw JSON 파싱해서 dict 반환.""" + import json + row = await fetchone( + "SELECT report_data FROM analysis_runs WHERE analysis_run_id = %s", + (analysis_run_id,), + ) + if not row or not row["report_data"]: + return None + return json.loads(row["report_data"]) if isinstance(row["report_data"], str) else row["report_data"] + + async def select_run_status(analysis_run_id: str) -> str | None: row = await fetchone( "SELECT status FROM analysis_runs WHERE analysis_run_id = %s", diff --git a/app/common/db/source.py b/app/common/db/source.py index 3084685..871778f 100644 --- a/app/common/db/source.py +++ b/app/common/db/source.py @@ -106,6 +106,15 @@ async def update_raw_info_logo_url(info_id: int, logo_url: str) -> None: ) +async def select_branding_info_id(analysis_run_id: str) -> int | None: + row = await fetchone( + "SELECT ri.info_id FROM raw_info ri JOIN remote_source rs USING (source_id)" + " WHERE ri.analysis_run_id = %s AND rs.source_type = 'branding' LIMIT 1", + (analysis_run_id,), + ) + return (row or {}).get("info_id") + + async def select_branding_logo_url(analysis_run_id: str) -> str | None: row = await fetchone( "SELECT ri.logo_url FROM raw_info ri JOIN remote_source rs USING (source_id)" diff --git a/app/integrations/color_extractor.py b/app/integrations/color_extractor.py deleted file mode 100644 index 6419061..0000000 --- a/app/integrations/color_extractor.py +++ /dev/null @@ -1,275 +0,0 @@ -"""홈페이지 HTML/CSS에서 hex 색상 직접 추출 + 빈도 기반 brand palette 산출. - -Vision LLM에 의존하지 않고 페이지의 실제 CSS 값을 정규식으로 잡음. -로고만 분석하는 Vision보다 사이트 전체 컬러 시스템 (primary/secondary/background/text)을 더 정확히 추출. -""" -import logging -import re -import ssl -from collections import Counter -from urllib.parse import urljoin, urlparse -import httpx - -logger = logging.getLogger(__name__) - - -def _make_ssl_context() -> ssl.SSLContext: - """오래된 한국 의료 사이트들이 SSL DH_KEY_TOO_SMALL / cipher 약함 등으로 차단되는 문제 우회. - 보안 등급 1로 낮춤 + cert 검증 유지.""" - ctx = ssl.create_default_context() - try: - ctx.set_ciphers("DEFAULT@SECLEVEL=1") - except ssl.SSLError: - pass - return ctx - - -async def _fetch_html(url: str, timeout: float = 20.0) -> tuple[int, str]: - """SSL/검증 단계별 fallback으로 HTML 받기. 그랜드/톡스앤필 같은 oldsite 대응.""" - headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"} - # 1차: 표준 검증 - try: - async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers) as c: - r = await c.get(url) - return r.status_code, r.text - except (httpx.ConnectError, httpx.ReadError, ssl.SSLError) as e: - logger.info("[fetch] %s standard SSL failed: %s — fallback to weak cipher", url, e) - # 2차: 약한 cipher 허용 - try: - async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers, verify=_make_ssl_context()) as c: - r = await c.get(url) - return r.status_code, r.text - except (httpx.ConnectError, httpx.ReadError, ssl.SSLError) as e: - logger.info("[fetch] %s weak cipher failed: %s — fallback to verify=False", url, e) - # 3차: SSL 검증 끔 (host mismatch 등) - try: - async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers, verify=False) as c: - r = await c.get(url) - return r.status_code, r.text - except Exception as e: - logger.warning("[fetch] %s all fallbacks failed: %s", url, e) - return 0, "" - -LOGO_IMG_PATTERNS = [ - # 1) - re.compile(r']*\bclass=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), - # 2) - re.compile(r']*\bsrc=["\']([^"\']+)["\'][^>]*\bclass=["\'][^"\']*\blogo\b[^"\']*["\']', re.IGNORECASE), - # 3) - re.compile(r']*\bid=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), - # 4) ...logo... - re.compile(r']*\balt=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), - # 5)