diff --git a/app/common/db/__init__.py b/app/common/db/__init__.py index 8f20d0d..6ea6cf7 100644 --- a/app/common/db/__init__.py +++ b/app/common/db/__init__.py @@ -3,14 +3,14 @@ from common.db.hospital import select_hospital, update_hospital_status, insert_h from common.db.source import ( insert_source, select_source_mainpage, select_source_by_type, insert_raw_info, update_raw_info_status, update_raw_info, update_raw_info_merge, - update_raw_info_logo_url, select_branding_logo_url, + update_raw_info_logo_url, select_branding_logo_url, select_branding_info_id, select_raw_info_data, select_run_sources, select_run_raw_data, select_run_source_raw, select_run_mainpage_url, ) from common.db.run import ( insert_run, select_run, select_run_status, update_run_status, - update_run_report, update_run_plan, select_run_with_clinic, + update_run_report, update_run_plan, select_run_with_clinic, select_run_report_data, ) from common.db.market import upsert_market_status, upsert_market_result, select_market from common.db.file_data import insert_file, select_run_files, select_file, delete_file diff --git a/app/common/db/run.py b/app/common/db/run.py index cbf188c..4169f5f 100644 --- a/app/common/db/run.py +++ b/app/common/db/run.py @@ -22,6 +22,18 @@ async def select_run(analysis_run_id: str) -> dict | None: ) +async def select_run_report_data(analysis_run_id: str) -> dict | None: + """report 결과가 필요할 때만 호출. raw JSON 파싱해서 dict 반환.""" + import json + row = await fetchone( + "SELECT report_data FROM analysis_runs WHERE analysis_run_id = %s", + (analysis_run_id,), + ) + if not row or not row["report_data"]: + return None + return json.loads(row["report_data"]) if isinstance(row["report_data"], str) else row["report_data"] + + async def select_run_status(analysis_run_id: str) -> str | None: row = await fetchone( "SELECT status FROM analysis_runs WHERE analysis_run_id = %s", diff --git a/app/common/db/source.py b/app/common/db/source.py index 3084685..871778f 100644 --- a/app/common/db/source.py +++ b/app/common/db/source.py @@ -106,6 +106,15 @@ async def update_raw_info_logo_url(info_id: int, logo_url: str) -> None: ) +async def select_branding_info_id(analysis_run_id: str) -> int | None: + row = await fetchone( + "SELECT ri.info_id FROM raw_info ri JOIN remote_source rs USING (source_id)" + " WHERE ri.analysis_run_id = %s AND rs.source_type = 'branding' LIMIT 1", + (analysis_run_id,), + ) + return (row or {}).get("info_id") + + async def select_branding_logo_url(analysis_run_id: str) -> str | None: row = await fetchone( "SELECT ri.logo_url FROM raw_info ri JOIN remote_source rs USING (source_id)" diff --git a/app/integrations/color_extractor.py b/app/integrations/color_extractor.py deleted file mode 100644 index 6419061..0000000 --- a/app/integrations/color_extractor.py +++ /dev/null @@ -1,275 +0,0 @@ -"""홈페이지 HTML/CSS에서 hex 색상 직접 추출 + 빈도 기반 brand palette 산출. - -Vision LLM에 의존하지 않고 페이지의 실제 CSS 값을 정규식으로 잡음. -로고만 분석하는 Vision보다 사이트 전체 컬러 시스템 (primary/secondary/background/text)을 더 정확히 추출. -""" -import logging -import re -import ssl -from collections import Counter -from urllib.parse import urljoin, urlparse -import httpx - -logger = logging.getLogger(__name__) - - -def _make_ssl_context() -> ssl.SSLContext: - """오래된 한국 의료 사이트들이 SSL DH_KEY_TOO_SMALL / cipher 약함 등으로 차단되는 문제 우회. - 보안 등급 1로 낮춤 + cert 검증 유지.""" - ctx = ssl.create_default_context() - try: - ctx.set_ciphers("DEFAULT@SECLEVEL=1") - except ssl.SSLError: - pass - return ctx - - -async def _fetch_html(url: str, timeout: float = 20.0) -> tuple[int, str]: - """SSL/검증 단계별 fallback으로 HTML 받기. 그랜드/톡스앤필 같은 oldsite 대응.""" - headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"} - # 1차: 표준 검증 - try: - async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers) as c: - r = await c.get(url) - return r.status_code, r.text - except (httpx.ConnectError, httpx.ReadError, ssl.SSLError) as e: - logger.info("[fetch] %s standard SSL failed: %s — fallback to weak cipher", url, e) - # 2차: 약한 cipher 허용 - try: - async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers, verify=_make_ssl_context()) as c: - r = await c.get(url) - return r.status_code, r.text - except (httpx.ConnectError, httpx.ReadError, ssl.SSLError) as e: - logger.info("[fetch] %s weak cipher failed: %s — fallback to verify=False", url, e) - # 3차: SSL 검증 끔 (host mismatch 등) - try: - async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers, verify=False) as c: - r = await c.get(url) - return r.status_code, r.text - except Exception as e: - logger.warning("[fetch] %s all fallbacks failed: %s", url, e) - return 0, "" - -LOGO_IMG_PATTERNS = [ - # 1) - re.compile(r']*\bclass=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), - # 2) - re.compile(r']*\bsrc=["\']([^"\']+)["\'][^>]*\bclass=["\'][^"\']*\blogo\b[^"\']*["\']', re.IGNORECASE), - # 3) - re.compile(r']*\bid=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), - # 4) ...logo... - re.compile(r']*\balt=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), - # 5)