refactor(branding): collect/report 단계 분리 + Vision logo hex 추가

- integrations/color_extractor → integrations/site_fetcher (HTTP) + services/brand_parser (파싱) 분리
- integrations/vision → integrations/llm/gemini_vision 이동
- services/collect_extras → services/collect.collect_brand_basics (collect) + services/branding (report) 분리
- Vision prompt 에 logo_colors_hex 5개 강제 + 길이 fallback (4·6개 들어와도 5개로 정규화)
- branding 단계: HTML parser canonical logo URL 을 Vision 에 1순위 전달
  → firecrawl 가 잘못된 이미지 (마케팅 배너 등) 를 logo 로 잡는 케이스 회피
- select_run 에서 큰 JSON 컬럼 (report_data/plan_data) 빼서 meta only
  → generate_plan 만 select_run_report_data 별도 조회. 4군데 호출자는 가벼워짐

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
main
Mina Choi 2026-06-02 11:08:38 +09:00
parent b844951ad8
commit af61713697
11 changed files with 396 additions and 411 deletions

View File

@ -3,14 +3,14 @@ from common.db.hospital import select_hospital, update_hospital_status, insert_h
from common.db.source import (
insert_source, select_source_mainpage, select_source_by_type,
insert_raw_info, update_raw_info_status, update_raw_info, update_raw_info_merge,
update_raw_info_logo_url, select_branding_logo_url,
update_raw_info_logo_url, select_branding_logo_url, select_branding_info_id,
select_raw_info_data,
select_run_sources, select_run_raw_data, select_run_source_raw,
select_run_mainpage_url,
)
from common.db.run import (
insert_run, select_run, select_run_status, update_run_status,
update_run_report, update_run_plan, select_run_with_clinic,
update_run_report, update_run_plan, select_run_with_clinic, select_run_report_data,
)
from common.db.market import upsert_market_status, upsert_market_result, select_market
from common.db.file_data import insert_file, select_run_files, select_file, delete_file

View File

@ -22,6 +22,18 @@ async def select_run(analysis_run_id: str) -> dict | None:
)
async def select_run_report_data(analysis_run_id: str) -> dict | None:
    """Load only the ``report_data`` column of one analysis run.

    Call this only when the report payload itself is needed.

    Returns:
        The report as a parsed object, or ``None`` when the run does not
        exist or its ``report_data`` is empty.
    """
    import json  # kept local: this module's top-level imports are not visible from here

    row = await fetchone(
        "SELECT report_data FROM analysis_runs WHERE analysis_run_id = %s",
        (analysis_run_id,),
    )
    if not row:
        return None
    report_data = row["report_data"]
    if not report_data:
        return None
    # A JSON column can come back as text or as raw bytes depending on the
    # driver; json.loads accepts both. Anything else is assumed to be
    # already decoded and is returned untouched.
    if isinstance(report_data, (str, bytes, bytearray)):
        return json.loads(report_data)
    return report_data
async def select_run_status(analysis_run_id: str) -> str | None:
row = await fetchone(
"SELECT status FROM analysis_runs WHERE analysis_run_id = %s",

View File

@ -106,6 +106,15 @@ async def update_raw_info_logo_url(info_id: int, logo_url: str) -> None:
)
async def select_branding_info_id(analysis_run_id: str) -> int | None:
    """Return the ``info_id`` of the run's ``branding`` raw_info row, or ``None`` if there is none."""
    query = (
        "SELECT ri.info_id FROM raw_info ri JOIN remote_source rs USING (source_id)"
        " WHERE ri.analysis_run_id = %s AND rs.source_type = 'branding' LIMIT 1"
    )
    row = await fetchone(query, (analysis_run_id,))
    if not row:
        return None
    return row.get("info_id")
async def select_branding_logo_url(analysis_run_id: str) -> str | None:
row = await fetchone(
"SELECT ri.logo_url FROM raw_info ri JOIN remote_source rs USING (source_id)"

View File

@ -1,275 +0,0 @@
"""홈페이지 HTML/CSS에서 hex 색상 직접 추출 + 빈도 기반 brand palette 산출.
Vision LLM에 의존하지 않고 페이지의 실제 CSS 값을 정규식으로 잡음.
로고만 분석하는 Vision보다 사이트 전체 컬러 시스템 (primary/secondary/background/text) 정확히 추출.
"""
import logging
import re
import ssl
from collections import Counter
from urllib.parse import urljoin, urlparse
import httpx
logger = logging.getLogger(__name__)
def _make_ssl_context() -> ssl.SSLContext:
    """Build a TLS context that tolerates weak server ciphers.

    Starts from ``ssl.create_default_context()`` (certificate verification
    stays on) and lowers the OpenSSL security level to 1 through the cipher
    string ``DEFAULT@SECLEVEL=1``. This works around older sites that fail
    with errors such as DH_KEY_TOO_SMALL.

    Returns:
        The relaxed context, or the unmodified default context when the
        cipher string is rejected.
    """
    ctx = ssl.create_default_context()
    try:
        ctx.set_ciphers("DEFAULT@SECLEVEL=1")
    except ssl.SSLError as e:
        # Best effort: keep the stricter default, but leave a trace so it is
        # visible that the weak-cipher fallback is not actually in effect.
        logger.debug("[fetch] DEFAULT@SECLEVEL=1 rejected, keeping default ciphers: %s", e)
    return ctx
async def _fetch_html(url: str, timeout: float = 20.0) -> tuple[int, str]:
    """Fetch ``url`` and return ``(status_code, body_text)``, relaxing TLS checks step by step.

    Attempts, in order:
      1. httpx default certificate verification,
      2. the context from ``_make_ssl_context()`` (weak ciphers allowed),
      3. ``verify=False`` (e.g. for host-name mismatches).

    A connect/read/SSL error moves on to the next attempt. Any other httpx
    error (timeout, too many redirects, ...) is not TLS related, so weaker
    TLS would only repeat the wait; it ends the fetch right away instead of
    escaping to the caller.

    Returns:
        ``(status_code, text)`` of the first attempt that produced a
        response (whatever its HTTP status), or ``(0, "")`` on failure.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}
    # (attempt label, factory for extra AsyncClient kwargs, label of the fallback that follows)
    tls_stages = (
        ("standard SSL", dict, "weak cipher"),
        ("weak cipher", lambda: {"verify": _make_ssl_context()}, "verify=False"),
    )
    for label, extra_kwargs, next_label in tls_stages:
        try:
            async with httpx.AsyncClient(
                timeout=timeout, follow_redirects=True, headers=headers, **extra_kwargs()
            ) as c:
                r = await c.get(url)
                return r.status_code, r.text
        except (httpx.ConnectError, httpx.ReadError, ssl.SSLError) as e:
            logger.info("[fetch] %s %s failed: %s — fallback to %s", url, label, e, next_label)
        except httpx.HTTPError as e:
            logger.warning("[fetch] %s %s failed without TLS cause: %s", url, label, e)
            return 0, ""
    # Last resort: certificate verification switched off entirely.
    try:
        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers, verify=False) as c:
            r = await c.get(url)
            return r.status_code, r.text
    except Exception as e:
        logger.warning("[fetch] %s all fallbacks failed: %s", url, e)
        return 0, ""
LOGO_IMG_PATTERNS = [
# 1) <img class="...logo..." src="...">
re.compile(r'<img[^>]*\bclass=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE),
# 2) <img src="..." class="...logo...">
re.compile(r'<img[^>]*\bsrc=["\']([^"\']+)["\'][^>]*\bclass=["\'][^"\']*\blogo\b[^"\']*["\']', re.IGNORECASE),
# 3) <img id="...logo..." src="...">
re.compile(r'<img[^>]*\bid=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE),
# 4) <img alt="...logo..." src="...">
re.compile(r'<img[^>]*\balt=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE),
# 5) <a/h1 class="logo"><...nested...><img src="...">
re.compile(r'<(?:a|h[1-6]|div|span)[^>]*\b(?:class|id)=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*>(?:[^<]|<(?!img))*<img[^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE | re.DOTALL),
# 6) inline background-image: <a/div class="logo" style="background-image: url(...)">
re.compile(r'<(?:a|div|span|h[1-6])[^>]*\b(?:class|id)=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bstyle=["\'][^"\']*background(?:-image)?\s*:\s*url\(\s*["\']?([^"\')\s]+)', re.IGNORECASE),
# 7) inline background-image: <a/div style="background-image: url(...)" class="logo"> (속성 순서 반대)
re.compile(r'<(?:a|div|span|h[1-6])[^>]*\bstyle=["\'][^"\']*background(?:-image)?\s*:\s*url\(\s*["\']?([^"\')\s]+)[^"\']*["\'][^>]*\b(?:class|id)=["\'][^"\']*\blogo\b', re.IGNORECASE),
# 8) src 자체에 "logo" 포함 (header_logo.png, brand-logo.svg 등)
re.compile(r'<img[^>]*\bsrc=["\']([^"\']*\blogo\b[^"\']*\.(?:png|svg|jpe?g|webp)[^"\']*)["\']', re.IGNORECASE),
# 9) <header>...<img src="..."> (헤더 영역 첫 img)
re.compile(r'<header\b[^>]*>(?:[^<]|<(?!img))*<img[^>]*\bsrc=["\']([^"\']+\.(?:png|svg|jpe?g|webp)[^"\']*)["\']', re.IGNORECASE | re.DOTALL),
# 10) <nav>...<img src="..."> (nav 영역 첫 img)
re.compile(r'<nav\b[^>]*>(?:[^<]|<(?!img))*<img[^>]*\bsrc=["\']([^"\']+\.(?:png|svg|jpe?g|webp)[^"\']*)["\']', re.IGNORECASE | re.DOTALL),
# 11) Open Graph image (대표 이미지) - 최후 fallback
re.compile(r'<meta[^>]*\bproperty=["\']og:image["\'][^>]*\bcontent=["\']([^"\']+)["\']', re.IGNORECASE),
re.compile(r'<meta[^>]*\bcontent=["\']([^"\']+)["\'][^>]*\bproperty=["\']og:image["\']', re.IGNORECASE),
]
# CSS 파일에서 .logo { background-image: url(...) } 추출용
LOGO_CSS_PATTERN = re.compile(
r'\.[\w-]*\blogo\b[\w-]*\s*(?:,\s*\.[\w-]+\s*)*\{[^}]*background(?:-image)?\s*:\s*url\(\s*["\']?([^"\')\s]+)',
re.IGNORECASE | re.DOTALL,
)
def find_logo_url_in_html(html: str, base_url: str, css_texts: list[str] | None = None) -> str | None:
    """Find the most plausible logo image URL for a page.

    Candidates are tried from most to least specific; the first one that is
    not filtered out as noise wins:
      1. ``LOGO_IMG_PATTERNS[:8]`` - elements whose class/id/alt/src or
         inline background mention "logo",
      2. ``LOGO_CSS_PATTERN`` - ``.logo`` background images in ``css_texts``,
      3. ``LOGO_IMG_PATTERNS[8:]`` - first image in <header>/<nav>, then
         og:image (generic, most likely to be wrong).

    Args:
        html: Page markup to scan.
        base_url: Base used to resolve relative URLs.
        css_texts: Contents of external stylesheets; ``None`` means none.

    Returns:
        The absolute URL (``urljoin(base_url, src)``), or ``None``.
    """
    def _is_noise(src: str) -> bool:
        """True for URLs unlikely to be a logo: data URIs, spacers, language flags, icons, arrows."""
        if not src or src.startswith("data:"):
            return True
        if re.search(r"(blank|spacer|pixel|transparent|1x1)\b", src, re.IGNORECASE):
            return True
        # The first header image is often a language flag, search icon or nav arrow.
        if re.search(r"(lang[-_]?(kor|eng|chn|jpn|rus|jp|en|ko|cn|ar|in)|flag|country|icon-|btn-|arrow|prev|next|search)\b", src, re.IGNORECASE):
            return True
        return False

    def _candidates():
        """Yield regex matches lazily, in priority order."""
        for pat in LOGO_IMG_PATTERNS[:8]:
            yield from pat.finditer(html)
        for css in css_texts or []:
            # finditer, not search: a noisy first .logo rule must not hide a
            # valid later rule in the same stylesheet.
            yield from LOGO_CSS_PATTERN.finditer(css)
        for pat in LOGO_IMG_PATTERNS[8:]:
            yield from pat.finditer(html)

    for m in _candidates():
        src = m.group(1)
        if not _is_noise(src):
            return urljoin(base_url, src)
    return None
HEX6 = re.compile(r"#([0-9a-fA-F]{6})\b")
HEX3 = re.compile(r"#([0-9a-fA-F]{3})\b(?![0-9a-fA-F])")
RGB = re.compile(r"rgba?\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*(?:,\s*[\d.]+\s*)?\)")
CSS_VAR_HEX = re.compile(r"--[\w-]+\s*:\s*(#[0-9a-fA-F]{3,8})", re.IGNORECASE)
CSS_LINK = re.compile(r'<link[^>]+rel=["\']stylesheet["\'][^>]+href=["\']([^"\']+)["\']', re.IGNORECASE)
STYLE_BLOCK = re.compile(r"<style[^>]*>(.*?)</style>", re.IGNORECASE | re.DOTALL)
# 무채색·아주 흔한 노이즈 컬러 (이런 건 brand color로 잡지 않음)
NOISE = {
"#ffffff", "#000000", "#fff", "#000",
"#333", "#222", "#111", "#444", "#555", "#666", "#777", "#888", "#999",
"#aaa", "#bbb", "#ccc", "#ddd", "#eee", "#f0f0f0", "#f5f5f5", "#fafafa",
}
def _normalize(hex_str: str) -> str:
    """Canonicalize a CSS hex color to lowercase ``#rrggbb``.

    Shorthand forms (``#rgb``, ``#rgba``) are expanded by doubling each
    digit, and an alpha channel (``#rgba``, ``#rrggbbaa``) is dropped.
    Input of any other length is only lowercased.
    """
    h = hex_str.lstrip("#").lower()
    if len(h) in (3, 4):
        # Shorthand: every digit stands for a doubled pair (f -> ff).
        h = "".join(c * 2 for c in h)
    if len(h) == 8:
        h = h[:6]  # strip alpha
    return f"#{h}"
def _rgb_to_hex(r: int, g: int, b: int) -> str:
    """Format three channel values as a lowercase ``#rrggbb`` string."""
    return "#" + "".join(format(channel, "02x") for channel in (r, g, b))
def _hex_to_rgb(h: str) -> tuple[int, int, int]:
    """Split a 6-digit hex color (leading ``#`` optional) into ``(r, g, b)`` integers."""
    digits = h.lstrip("#")
    red = int(digits[0:2], 16)
    green = int(digits[2:4], 16)
    blue = int(digits[4:6], 16)
    return red, green, blue
def _distance(a: str, b: str) -> float:
    """Euclidean distance between two hex colors in RGB space."""
    first = _hex_to_rgb(a)
    second = _hex_to_rgb(b)
    squared = sum((x - y) ** 2 for x, y in zip(first, second))
    return squared ** 0.5
def _is_grayscale(h: str, tol: int = 12) -> bool:
    """True when the spread between the largest and smallest RGB channel is below ``tol``."""
    channels = _hex_to_rgb(h)
    spread = max(channels) - min(channels)
    return spread < tol
def _extract_hex(text: str) -> list[str]:
    """Collect every color literal found in ``text`` as normalized hex strings.

    Output order is: 6-digit hex matches, then 3-digit hex matches, then
    ``rgb()``/``rgba()`` values (alpha ignored). Duplicates are kept so the
    caller can count frequencies.

    A ``#...`` directly preceded by ``&`` is an HTML numeric character
    reference (``&#169;``, ``&#128512;``), not a color, and is skipped.
    """
    out: list[str] = []
    for pattern in (HEX6, HEX3):
        out.extend(
            _normalize(m.group(0))
            for m in pattern.finditer(text)
            # The slice is "" at position 0, so a match at the very start is kept.
            if text[m.start() - 1:m.start()] != "&"
        )
    for m in RGB.finditer(text):
        r, g, b = int(m.group(1)), int(m.group(2)), int(m.group(3))
        # The pattern only captures digits, so only the upper bound needs checking.
        if max(r, g, b) <= 255:
            out.append(_rgb_to_hex(r, g, b))
    return out
def _cluster(colors: Counter, threshold: float = 25.0) -> list[tuple[str, int]]:
    """Greedily merge similar colors; the most frequent color of each group represents it.

    Colors are visited from most to least frequent. A color joins the first
    existing group whose representative is closer than ``threshold``,
    otherwise it starts a new group. The result keeps group creation order
    and is not re-sorted by the merged counts.
    """
    groups: list[tuple[str, int]] = []
    for hex_color, hits in colors.most_common():
        for idx, (leader, total) in enumerate(groups):
            if _distance(hex_color, leader) < threshold:
                groups[idx] = (leader, total + hits)
                break
        else:
            groups.append((hex_color, hits))
    return groups
async def _fetch_html_and_css(homepage_url: str, max_css_files: int = 8) -> tuple[str, list[str]]:
    """Download the homepage and up to ``max_css_files`` linked stylesheets in one pass.

    Logo and color extraction share this result so the site is not fetched
    twice. Stylesheets that do not answer 200 with a body are skipped.

    Returns:
        ``(html, css_texts)``, or ``("", [])`` when the homepage itself
        could not be fetched with status 200.
    """
    page_status, page_html = await _fetch_html(homepage_url)
    if not (page_status == 200 and page_html):
        logger.warning("[color_extractor] homepage fetch failed status=%s url=%s", page_status, homepage_url)
        return "", []

    stylesheets: list[str] = []
    hrefs = CSS_LINK.findall(page_html)
    for href in hrefs[:max_css_files]:
        sheet_status, sheet_text = await _fetch_html(urljoin(homepage_url, href), timeout=15.0)
        if sheet_status != 200 or not sheet_text:
            continue
        stylesheets.append(sheet_text)
    return page_html, stylesheets
def _colors_from_text(html: str, css_texts: list[str], source_url: str = "") -> dict:
    """Derive brand colors from already-downloaded HTML and CSS (pure computation, no fetch).

    Colors are counted across the ``<style>`` blocks, the full HTML and the
    external CSS, then clustered. ``<style>`` contents are scanned twice
    (as extracted blocks and again inside the full HTML), so they weigh double.

    NOTE(review): colors arrive normalized to 6 digits, so the 3-digit
    entries of NOISE (e.g. "#333") never match; only its 6-digit entries
    filter anything. Confirm whether that is intended.

    Returns:
        ``{}`` when no color survives, otherwise a dict with
        ``brand_colors`` (primary and accent are the first two chromatic
        clusters, text is the first grayscale cluster; each may be None),
        ``color_palette`` (first 8 clusters) and ``extracted_from``.
    """
    chunks = [*STYLE_BLOCK.findall(html), html, *css_texts]
    tally: Counter = Counter(
        color
        for chunk in chunks
        for color in _extract_hex(chunk)
        if color not in NOISE
    )
    if not tally:
        logger.info("[color_extractor] no colors extracted from %s", source_url)
        return {}

    groups = _cluster(tally)
    chromatic: list[str] = []
    grayscale: list[str] = []
    for representative, _ in groups:
        (grayscale if _is_grayscale(representative) else chromatic).append(representative)

    palette = [
        {"name": f"색상 {rank}", "hex": representative, "usage": f"빈도 {total}"}
        for rank, (representative, total) in enumerate(groups[:8], start=1)
    ]
    brand_colors = {
        "primary": chromatic[0] if chromatic else None,
        "accent": chromatic[1] if len(chromatic) > 1 else None,
        "text": grayscale[0] if grayscale else None,
    }
    return {
        "brand_colors": brand_colors,
        "color_palette": palette,
        "extracted_from": "html+css",
    }
async def extract_brand_colors_from_site(homepage_url: str, max_css_files: int = 8) -> dict:
    """Fetch the homepage plus its stylesheets and return the color analysis, or ``{}`` if the fetch failed."""
    html, css_texts = await _fetch_html_and_css(homepage_url, max_css_files)
    return _colors_from_text(html, css_texts, homepage_url) if html else {}
async def extract_brand_assets_from_site(homepage_url: str, max_css_files: int = 8) -> dict:
    """Fetch the site once and extract both the logo URL and the brand colors.

    Returns:
        ``{"logo_url": str | None, "colors": dict}``; both are empty
        (``None`` / ``{}``) when the homepage could not be fetched.
    """
    html, css_texts = await _fetch_html_and_css(homepage_url, max_css_files)
    assets: dict = {"logo_url": None, "colors": {}}
    if html:
        assets["logo_url"] = find_logo_url_in_html(html, homepage_url, css_texts=css_texts)
        assets["colors"] = _colors_from_text(html, css_texts, homepage_url)
    return assets

View File

@ -218,9 +218,10 @@ class VisionClient:
' "has_symbol": "심볼/아이콘이 있으면 true, 글자만 있으면 false (boolean)",\n'
' "logo_symbol": "심볼이 묘사하는 대상 (예: \'잎사귀\', \'추상 곡선\'). 없으면 빈 문자열",\n'
' "logo_text": "로고에 보이는 워드마크 텍스트 그대로 (한글/영문). 없으면 빈 문자열",\n'
' "logo_colors_desc": "로고에 쓰인 색감을 사람이 부르는 이름으로 서술 (예: \'딥네이비 + 골드\'). 정확한 hex는 출력하지 말 것"\n'
' "logo_colors_desc": "로고에 쓰인 색감을 사람이 부르는 이름으로 서술 (예: \'딥네이비 + 골드\')",\n'
' "logo_colors_hex": ["로고에서 시각적으로 두드러진 색 정확히 5개의 hex 근사값 배열. 예: [\'#1A2B3C\', \'#D4A017\', \'#FFFFFF\', \'#9E5C2A\', \'#1F1F1F\']. 강한 색이 5개 안 되면 음영/명도 차이로 5개 채울 것. 빈 배열 금지."]\n'
"}\n"
"주의: 색상 hex 값이나 logo URL 같은 필드는 출력하지 마세요 (별도 추출 로직이 처리).\n"
"주의: logo_colors_hex 는 시각 추정이라 정확도 떨어질 수 있음. CSS 추출이 우선이고 이건 fallback/보완 용.\n"
"모든 설명/텍스트 값은 반드시 한국어로 작성하세요 (영어 금지)."
)
result = await self._ask(urls, prompt)
@ -228,6 +229,14 @@ class VisionClient:
return {}
# logo_images는 우리가 직접 채움 (Vision은 묘사만)
result["logo_images"] = {"circle": None, "horizontal": logo_url, "korean": None}
# logo_colors_hex 5개 강제 정규화 — LLM 이 4개나 6개 줄 수도 있어서 길이 fallback.
hex_list = [h for h in (result.get("logo_colors_hex") or []) if isinstance(h, str) and h.startswith("#")]
if hex_list:
while len(hex_list) < 5:
hex_list.append(hex_list[-1]) # 마지막 색 복제로 패딩
result["logo_colors_hex"] = hex_list[:5]
else:
result["logo_colors_hex"] = []
return result
async def describe_channel_logos(

View File

@ -0,0 +1,66 @@
"""홈페이지 HTML + 외부 CSS 를 가져오는 fetch 전용 모듈.
오래된 한국 의료 사이트들이 SSL DH_KEY_TOO_SMALL / cipher 약함 / host mismatch 등으로
표준 fetch 차단되는 케이스가 많아 단계별 SSL fallback 으로 받는다.
파싱·도메인 로직은 들어가지 않음 순수 HTTP 응답 본문 반환.
"""
import logging
import re
import ssl
from urllib.parse import urljoin
import httpx
logger = logging.getLogger(__name__)
CSS_LINK = re.compile(
r'<link[^>]+rel=["\']stylesheet["\'][^>]+href=["\']([^"\']+)["\']',
re.IGNORECASE,
)
def _make_ssl_context() -> ssl.SSLContext:
    """Build a TLS context that tolerates weak server ciphers.

    Starts from ``ssl.create_default_context()`` (certificate verification
    stays on) and lowers the OpenSSL security level to 1 through the cipher
    string ``DEFAULT@SECLEVEL=1``.

    Returns:
        The relaxed context, or the unmodified default context when the
        cipher string is rejected.
    """
    ctx = ssl.create_default_context()
    try:
        ctx.set_ciphers("DEFAULT@SECLEVEL=1")
    except ssl.SSLError as e:
        # Best effort: keep the stricter default, but leave a trace so it is
        # visible that the weak-cipher fallback is not actually in effect.
        logger.debug("[fetch] DEFAULT@SECLEVEL=1 rejected, keeping default ciphers: %s", e)
    return ctx
async def fetch_html(url: str, timeout: float = 20.0) -> tuple[int, str]:
    """Fetch ``url`` and return ``(status_code, body_text)``, relaxing TLS checks step by step.

    Attempts, in order:
      1. httpx default certificate verification,
      2. the context from ``_make_ssl_context()`` (weak ciphers allowed),
      3. ``verify=False``.

    A connect/read/SSL error moves on to the next attempt. Any other httpx
    error (timeout, too many redirects, ...) is not TLS related, so weaker
    TLS would only repeat the wait; it ends the fetch right away instead of
    escaping to the caller.

    Returns:
        ``(status_code, text)`` of the first attempt that produced a
        response (whatever its HTTP status), or ``(0, "")`` on failure.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}
    # (attempt label, factory for extra AsyncClient kwargs, label of the fallback that follows)
    tls_stages = (
        ("standard SSL", dict, "weak cipher"),
        ("weak cipher", lambda: {"verify": _make_ssl_context()}, "verify=False"),
    )
    for label, extra_kwargs, next_label in tls_stages:
        try:
            async with httpx.AsyncClient(
                timeout=timeout, follow_redirects=True, headers=headers, **extra_kwargs()
            ) as c:
                r = await c.get(url)
                return r.status_code, r.text
        except (httpx.ConnectError, httpx.ReadError, ssl.SSLError) as e:
            logger.info("[fetch] %s %s failed: %s — fallback to %s", url, label, e, next_label)
        except httpx.HTTPError as e:
            logger.warning("[fetch] %s %s failed without TLS cause: %s", url, label, e)
            return 0, ""
    # Last resort: certificate verification switched off entirely.
    try:
        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers, verify=False) as c:
            r = await c.get(url)
            return r.status_code, r.text
    except Exception as e:
        logger.warning("[fetch] %s all fallbacks failed: %s", url, e)
        return 0, ""
async def fetch_html_and_css(homepage_url: str, max_css_files: int = 8) -> tuple[str, list[str]]:
    """Download the homepage and up to ``max_css_files`` linked stylesheets in one pass.

    Stylesheets that do not answer 200 with a body are skipped.

    Returns:
        ``(html, css_texts)``, or ``("", [])`` when the homepage itself
        could not be fetched with status 200.
    """
    page_status, page_html = await fetch_html(homepage_url)
    if not (page_status == 200 and page_html):
        logger.warning("[fetch] homepage fetch failed status=%s url=%s", page_status, homepage_url)
        return "", []

    stylesheets: list[str] = []
    hrefs = CSS_LINK.findall(page_html)
    for href in hrefs[:max_css_files]:
        sheet_status, sheet_text = await fetch_html(urljoin(homepage_url, href), timeout=15.0)
        if sheet_status != 200 or not sheet_text:
            continue
        stylesheets.append(sheet_text)
    return page_html, stylesheets

View File

@ -3,12 +3,13 @@ import logging
import re
from datetime import datetime
from urllib.parse import urlparse
from common.db.run import select_run, update_run_report, update_run_plan
from common.db.run import update_run_report, update_run_plan, select_run_report_data
from common.db.source import select_run_raw_data, select_branding_logo_url
from common.db.market import select_market
from integrations.llm.llm_service import LLMService
from integrations.llm.prompt import report_prompt, plan_prompt, youtube_diagnosis_prompt
from integrations.llm.schemas.report import ReportOutput, ClinicSnapshot, YouTubeAudit
from services.branding import analyze_branding
from services.instagram_audit import build_instagram_accounts
from services.facebook_audit import build_facebook_pages
from services.kpi_dashboard import build_kpi_dashboard
@ -63,12 +64,10 @@ async def generate_report(analysis_run_id: str) -> ReportOutput:
async def generate_plan(analysis_run_id: str) -> PlanOutput:
run = await select_run(analysis_run_id)
raw = await select_run_raw_data(analysis_run_id)
clinic = raw.get("mainpage") or {}
branding = raw.get("branding") or {}
report_data = run["report_data"]
report = json.loads(report_data) if isinstance(report_data, str) else report_data
report = await select_run_report_data(analysis_run_id)
market = await select_market(analysis_run_id)
def _json(v) -> str | None:
@ -341,6 +340,7 @@ def _patch_report(result: ReportOutput, overrides: dict) -> ReportOutput:
async def run_report_task(analysis_run_id: str) -> None:
logger.info("[report] start run=%s", analysis_run_id)
await analyze_branding(analysis_run_id)
result = await generate_report(analysis_run_id)
result = _patch_report(result, await _build_overrides(analysis_run_id))
await update_run_report(analysis_run_id, result.model_dump())

View File

@ -0,0 +1,172 @@
"""collect 단계 - HTML/CSS 텍스트에서 brand 로고 URL + 색상 추출"""
import logging
import re
from collections import Counter
from urllib.parse import urljoin
logger = logging.getLogger(__name__)
# ── 로고 URL 추출 ─────────────────────────────────────────────────────────────
LOGO_IMG_PATTERNS = [
re.compile(r'<img[^>]*\bclass=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE),
re.compile(r'<img[^>]*\bsrc=["\']([^"\']+)["\'][^>]*\bclass=["\'][^"\']*\blogo\b[^"\']*["\']', re.IGNORECASE),
re.compile(r'<img[^>]*\bid=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE),
re.compile(r'<img[^>]*\balt=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE),
re.compile(r'<(?:a|h[1-6]|div|span)[^>]*\b(?:class|id)=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*>(?:[^<]|<(?!img))*<img[^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE | re.DOTALL),
re.compile(r'<(?:a|div|span|h[1-6])[^>]*\b(?:class|id)=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bstyle=["\'][^"\']*background(?:-image)?\s*:\s*url\(\s*["\']?([^"\')\s]+)', re.IGNORECASE),
re.compile(r'<(?:a|div|span|h[1-6])[^>]*\bstyle=["\'][^"\']*background(?:-image)?\s*:\s*url\(\s*["\']?([^"\')\s]+)[^"\']*["\'][^>]*\b(?:class|id)=["\'][^"\']*\blogo\b', re.IGNORECASE),
re.compile(r'<img[^>]*\bsrc=["\']([^"\']*\blogo\b[^"\']*\.(?:png|svg|jpe?g|webp)[^"\']*)["\']', re.IGNORECASE),
re.compile(r'<header\b[^>]*>(?:[^<]|<(?!img))*<img[^>]*\bsrc=["\']([^"\']+\.(?:png|svg|jpe?g|webp)[^"\']*)["\']', re.IGNORECASE | re.DOTALL),
re.compile(r'<nav\b[^>]*>(?:[^<]|<(?!img))*<img[^>]*\bsrc=["\']([^"\']+\.(?:png|svg|jpe?g|webp)[^"\']*)["\']', re.IGNORECASE | re.DOTALL),
re.compile(r'<meta[^>]*\bproperty=["\']og:image["\'][^>]*\bcontent=["\']([^"\']+)["\']', re.IGNORECASE),
re.compile(r'<meta[^>]*\bcontent=["\']([^"\']+)["\'][^>]*\bproperty=["\']og:image["\']', re.IGNORECASE),
]
LOGO_CSS_PATTERN = re.compile(
r'\.[\w-]*\blogo\b[\w-]*\s*(?:,\s*\.[\w-]+\s*)*\{[^}]*background(?:-image)?\s*:\s*url\(\s*["\']?([^"\')\s]+)',
re.IGNORECASE | re.DOTALL,
)
def find_logo_url_in_html(html: str, base_url: str, css_texts: list[str] | None = None) -> str | None:
"""HTML 에서 logo URL 찾기. 우선순위: 1) class/id/alt 명시 img 2) 외부 CSS .logo bg 3) header/nav 첫 img."""
def _is_noise(src: str) -> bool:
if not src or src.startswith("data:"):
return True
if re.search(r"(blank|spacer|pixel|transparent|1x1)\b", src, re.IGNORECASE):
return True
if re.search(r"(lang[-_]?(kor|eng|chn|jpn|rus|jp|en|ko|cn|ar|in)|flag|country|icon-|btn-|arrow|prev|next|search)\b", src, re.IGNORECASE):
return True
return False
for pat in LOGO_IMG_PATTERNS[:8]:
for m in pat.finditer(html):
src = m.group(1)
if _is_noise(src):
continue
return urljoin(base_url, src)
for css in (css_texts or []):
m = LOGO_CSS_PATTERN.search(css)
if m:
src = m.group(1)
if not _is_noise(src):
return urljoin(base_url, src)
for pat in LOGO_IMG_PATTERNS[8:]:
for m in pat.finditer(html):
src = m.group(1)
if _is_noise(src):
continue
return urljoin(base_url, src)
return None
# ── 색상 추출 ────────────────────────────────────────────────────────────────
HEX6 = re.compile(r"#([0-9a-fA-F]{6})\b")
HEX3 = re.compile(r"#([0-9a-fA-F]{3})\b(?![0-9a-fA-F])")
RGB = re.compile(r"rgba?\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*(?:,\s*[\d.]+\s*)?\)")
STYLE_BLOCK = re.compile(r"<style[^>]*>(.*?)</style>", re.IGNORECASE | re.DOTALL)
NOISE = {
"#ffffff", "#000000", "#fff", "#000",
"#333", "#222", "#111", "#444", "#555", "#666", "#777", "#888", "#999",
"#aaa", "#bbb", "#ccc", "#ddd", "#eee", "#f0f0f0", "#f5f5f5", "#fafafa",
}
def _normalize(hex_str: str) -> str:
    """Canonicalize a CSS hex color to lowercase ``#rrggbb``.

    Shorthand forms (``#rgb``, ``#rgba``) are expanded by doubling each
    digit, and an alpha channel (``#rgba``, ``#rrggbbaa``) is dropped.
    Input of any other length is only lowercased.
    """
    h = hex_str.lstrip("#").lower()
    if len(h) in (3, 4):
        # Shorthand: every digit stands for a doubled pair (f -> ff).
        h = "".join(c * 2 for c in h)
    if len(h) == 8:
        h = h[:6]  # strip alpha
    return f"#{h}"
def _rgb_to_hex(r: int, g: int, b: int) -> str:
    """Format three channel values as a lowercase ``#rrggbb`` string."""
    return "#{:02x}{:02x}{:02x}".format(r, g, b)
def _hex_to_rgb(h: str) -> tuple[int, int, int]:
    """Split a 6-digit hex color (leading ``#`` optional) into ``(r, g, b)`` integers."""
    digits = h.lstrip("#")
    red, green, blue = (int(digits[start:start + 2], 16) for start in (0, 2, 4))
    return red, green, blue
def _distance(a: str, b: str) -> float:
    """Euclidean distance between two hex colors in RGB space."""
    first = _hex_to_rgb(a)
    second = _hex_to_rgb(b)
    squared = sum((x - y) ** 2 for x, y in zip(first, second))
    return squared ** 0.5
def _is_grayscale(h: str, tol: int = 12) -> bool:
    """True when the spread between the largest and smallest RGB channel is below ``tol``."""
    channels = _hex_to_rgb(h)
    spread = max(channels) - min(channels)
    return spread < tol
def _extract_hex(text: str) -> list[str]:
    """Collect every color literal found in ``text`` as normalized hex strings.

    Output order is: 6-digit hex matches, then 3-digit hex matches, then
    ``rgb()``/``rgba()`` values (alpha ignored). Duplicates are kept so the
    caller can count frequencies.

    A ``#...`` directly preceded by ``&`` is an HTML numeric character
    reference (``&#169;``, ``&#128512;``), not a color, and is skipped.
    """
    out: list[str] = []
    for pattern in (HEX6, HEX3):
        out.extend(
            _normalize(m.group(0))
            for m in pattern.finditer(text)
            # The slice is "" at position 0, so a match at the very start is kept.
            if text[m.start() - 1:m.start()] != "&"
        )
    for m in RGB.finditer(text):
        r, g, b = int(m.group(1)), int(m.group(2)), int(m.group(3))
        # The pattern only captures digits, so only the upper bound needs checking.
        if max(r, g, b) <= 255:
            out.append(_rgb_to_hex(r, g, b))
    return out
def _cluster(colors: Counter, threshold: float = 25.0) -> list[tuple[str, int]]:
    """Greedily merge similar colors; the most frequent color of each group represents it.

    Colors are visited from most to least frequent. A color joins the first
    existing group whose representative is closer than ``threshold``,
    otherwise it starts a new group. The result keeps group creation order
    and is not re-sorted by the merged counts.
    """
    groups: list[tuple[str, int]] = []
    for hex_color, hits in colors.most_common():
        for idx, (leader, total) in enumerate(groups):
            if _distance(hex_color, leader) < threshold:
                groups[idx] = (leader, total + hits)
                break
        else:
            groups.append((hex_color, hits))
    return groups
def extract_brand_colors_from_text(html: str, css_texts: list[str], source_url: str = "") -> dict:
    """Derive brand colors from already-downloaded HTML and CSS (no fetch).

    Colors are counted across the ``<style>`` blocks, the full HTML and the
    external CSS, then clustered. ``<style>`` contents are scanned twice
    (as extracted blocks and again inside the full HTML), so they weigh double.

    NOTE(review): colors arrive normalized to 6 digits, so the 3-digit
    entries of NOISE (e.g. "#333") never match; only its 6-digit entries
    filter anything. Confirm whether that is intended.

    Returns:
        ``{}`` when no color survives, otherwise a dict with
        ``brand_colors`` (primary and accent are the first two chromatic
        clusters, text is the first grayscale cluster; each may be None),
        ``color_palette`` (first 8 clusters) and ``extracted_from``.
    """
    chunks = [*STYLE_BLOCK.findall(html), html, *css_texts]
    tally: Counter = Counter(
        color
        for chunk in chunks
        for color in _extract_hex(chunk)
        if color not in NOISE
    )
    if not tally:
        logger.info("[brand_parser] no colors extracted from %s", source_url)
        return {}

    groups = _cluster(tally)
    chromatic: list[str] = []
    grayscale: list[str] = []
    for representative, _ in groups:
        (grayscale if _is_grayscale(representative) else chromatic).append(representative)

    palette = [
        {"name": f"색상 {rank}", "hex": representative, "usage": f"빈도 {total}"}
        for rank, (representative, total) in enumerate(groups[:8], start=1)
    ]
    brand_colors = {
        "primary": chromatic[0] if chromatic else None,
        "accent": chromatic[1] if len(chromatic) > 1 else None,
        "text": grayscale[0] if grayscale else None,
    }
    return {
        "brand_colors": brand_colors,
        "color_palette": palette,
        "extracted_from": "html+css",
    }

89
app/services/branding.py Normal file
View File

@ -0,0 +1,89 @@
"""report 단계 - Gemini Vision 으로 로고 묘사 + 채널 로고 매칭."""
import logging
import os
from urllib.parse import urlparse
from common.db.source import (
select_run_raw_data, update_raw_info_merge,
select_branding_info_id, select_branding_logo_url,
)
from common.utils import _run_optional_step
from integrations.llm.gemini_vision import VisionClient
logger = logging.getLogger(__name__)
async def _describe_logo(analysis_run_id: str, info_id: int, vc: VisionClient) -> None:
    """Ask Vision to describe the official logo and merge the answer into ``brandAssets``.

    Candidate image URLs are tried in this order until one yields a result:
    the ``logo_url`` column, then the mainpage ``branding`` metadata
    (``logoUrl``, ``ogImage``, ``faviconUrl``), then ``/favicon.ico`` on the
    homepage origin.

    NOTE(review): collect_brand_basics also merges a "brandAssets" payload
    (colors). Confirm update_raw_info_merge merges nested keys rather than
    replacing the whole "brandAssets" object.
    """
    raw = await select_run_raw_data(analysis_run_id)
    mainpage = raw.get("mainpage") or {}
    homepage_url = mainpage.get("sourceUrl") or ""
    branding_meta = mainpage.get("branding") or {}
    column_logo = await select_branding_logo_url(analysis_run_id)

    ranked = [
        column_logo,
        branding_meta.get("logoUrl"),
        branding_meta.get("ogImage"),
        branding_meta.get("faviconUrl"),
    ]
    if homepage_url:
        parsed = urlparse(homepage_url)
        if parsed.scheme and parsed.netloc:
            ranked.append(f"{parsed.scheme}://{parsed.netloc}/favicon.ico")

    # The logo_url column is itself filled from logoUrl/ogImage when the HTML
    # parser finds nothing, so the same URL can appear more than once. Drop
    # repeats (keeping priority order) so a failing image is not re-sent.
    candidates: list = []
    for url in ranked:
        if url and url not in candidates:
            candidates.append(url)

    if not candidates:
        logger.info("[brand_logo] skip — no candidates")
        return
    logger.info("[brand_logo] start run=%s candidates=%d", analysis_run_id, len(candidates))

    result: dict = {}
    for cand in candidates:
        # "or {}" keeps result a dict even if the client returns None.
        result = await vc.analyze_brand_assets(logo_url=cand, homepage_url=homepage_url) or {}
        if result:
            break
    result.pop("logo_images", None)  # logo images live in the column now, not in the JSON
    if result:
        await update_raw_info_merge(info_id, {"brandAssets": result})
    logger.info("[brand_logo] done keys=%s", list(result.keys()) if result else None)
async def _describe_channel_logos(analysis_run_id: str, info_id: int, vc: VisionClient) -> None:
    """Compare channel profile images with the official logo and merge the answer into ``channelLogos``.

    Does nothing when no channel in the raw data has a ``profileImage``.
    """
    raw = await select_run_raw_data(analysis_run_id)
    official = await select_branding_logo_url(analysis_run_id)

    # (raw_data key, display label), in reporting order
    channels = (
        ("instagram", "Instagram"),
        ("facebook", "Facebook"),
        ("youtube", "YouTube"),
        ("instagram_en", "Instagram EN"),
        ("facebook_en", "Facebook EN"),
        ("tiktok", "TikTok"),
    )
    logos = []
    for key, label in channels:
        profile_image = (raw.get(key) or {}).get("profileImage")
        if profile_image:
            logos.append({"channel": label, "url": profile_image})

    if not logos:
        logger.info("[channel_logos] skip — no channel profileImages")
        return

    channel_names = [entry["channel"] for entry in logos]
    logger.info("[channel_logos] start run=%s channels=%s official=%s",
                analysis_run_id, channel_names, bool(official))
    result = await vc.describe_channel_logos(official, logos)
    if result:
        # Keep every URL, including channels Vision could not read, for front-end display.
        result["logos"] = logos
        await update_raw_info_merge(info_id, {"channelLogos": result})
    logger.info("[channel_logos] done keys=%s", list(result.keys()) if result else None)
async def analyze_branding(analysis_run_id: str) -> None:
    """Run the Gemini branding steps (logo description, channel logo matching) before the report is built.

    Skipped when ``GEMINI_API_KEY`` is unset or the run has no branding
    source. Each step goes through ``_run_optional_step`` separately.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        logger.info("[branding] skip — GEMINI_API_KEY 없음")
        return

    branding_info_id = await select_branding_info_id(analysis_run_id)
    if branding_info_id is None:
        logger.info("[branding] skip — branding source 없음 run=%s", analysis_run_id)
        return

    vc = VisionClient(api_key)
    logger.info("[branding] start run=%s", analysis_run_id)
    steps = (
        (_describe_logo, "brand_logo"),
        (_describe_channel_logos, "channel_logos"),
    )
    for step, label in steps:
        await _run_optional_step(step(analysis_run_id, branding_info_id, vc), label)
    logger.info("[branding] done run=%s", analysis_run_id)

View File

@ -8,7 +8,9 @@ from integrations.naver import NaverClient
from integrations.youtube import YouTubeClient
from integrations.firecrawl import FirecrawlClient
from models.status import SourceType
from services.collect_extras import collect_brand_assets, collect_channel_logos
from integrations.site_fetcher import fetch_html_and_css
from services.brand_parser import find_logo_url_in_html, extract_brand_colors_from_text
from common.db.source import update_raw_info_merge, update_raw_info_logo_url, select_run_raw_data
from services.facebook_audit import transform_for_storage as transform_facebook
logger = logging.getLogger(__name__)
@ -122,6 +124,33 @@ async def collect_kakaotalk(analysis_run_id: str, info_id: int, url: str) -> Non
await update_raw_info(info_id, {"url": url})
async def collect_brand_basics(analysis_run_id: str, info_id: int) -> None:
    """Collect-stage branding: fetch HTML/CSS once, store the logo URL (column) and brand colors (JSON).

    Reads the mainpage raw data, so it must run after the main collection
    wave. The logo URL falls back to the mainpage ``branding`` metadata
    (``logoUrl``, then ``ogImage``) when the HTML parser finds nothing.
    """
    logger.info("[brand_basics] start run=%s info=%s", analysis_run_id, info_id)
    raw = await select_run_raw_data(analysis_run_id)
    mainpage = raw.get("mainpage") or {}
    homepage_url = mainpage.get("sourceUrl") or ""
    branding_meta = mainpage.get("branding") or {}

    html, css_texts = "", []
    if homepage_url:
        html, css_texts = await fetch_html_and_css(homepage_url)

    html_logo_url = None
    css_colors: dict = {}
    if html:
        html_logo_url = find_logo_url_in_html(html, homepage_url, css_texts)
        css_colors = extract_brand_colors_from_text(html, css_texts, homepage_url)

    logo_url = html_logo_url or branding_meta.get("logoUrl") or branding_meta.get("ogImage")
    if logo_url:
        await update_raw_info_logo_url(info_id, logo_url)

    payload: dict = {}
    if css_colors:
        for key in ("brand_colors", "color_palette"):
            if css_colors.get(key):
                payload[key] = css_colors[key]
        payload["color_source"] = "html+css"
    if payload:
        await update_raw_info_merge(info_id, {"brandAssets": payload})
    logger.info("[brand_basics] done logo_url=%s colors=%s", bool(logo_url), bool(payload))
async def collect_all(analysis_run_id: str, hospital_id: str) -> None:
rows = await select_run_sources(analysis_run_id)
@ -156,5 +185,4 @@ async def collect_all(analysis_run_id: str, hospital_id: str) -> None:
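# Stage 2: branding basics (logo URL column + brandAssets colors). Depends on the mainpage raw_data, so it runs after the main wave.
# Logo description and channelLogos are no longer collected here; services.branding.analyze_branding runs them at the report stage.
# Optional feature: a failure must not block the report, so it is isolated with _run_optional_step.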
if branding_info_id is not None:
await _run_optional_step(collect_brand_assets(analysis_run_id, branding_info_id), "brand_assets")
await _run_optional_step(collect_channel_logos(analysis_run_id, branding_info_id), "channel_logos")
await _run_optional_step(collect_brand_basics(analysis_run_id, branding_info_id), "brand_basics")

View File

@ -1,125 +0,0 @@
import logging
import os
from urllib.parse import urlparse
from common.db.source import select_run_raw_data, update_raw_info_merge, update_raw_info_logo_url
from integrations.vision import VisionClient
from integrations.color_extractor import extract_brand_assets_from_site
logger = logging.getLogger(__name__)
async def collect_brand_assets(analysis_run_id: str, info_id: int) -> None:
    """Extract the homepage logo URL and brand colours and merge them into the branding raw_info.

    Colours and the logo URL come from the HTML/CSS of the homepage
    (``extract_brand_assets_from_site``). The qualitative logo description
    comes from Gemini Vision and is attempted only when ``GEMINI_API_KEY`` is
    set; without the key only the colours are stored.

    Args:
        analysis_run_id: Run whose collected raw data is read.
        info_id: raw_info row that receives the ``brandAssets`` payload and,
            when a real logo was recognised, the ``logo_url`` column.

    Returns:
        None. Results are persisted through ``update_raw_info_merge`` and
        ``update_raw_info_logo_url``.
    """
    logger.info("[brand_assets] start run=%s info=%s", analysis_run_id, info_id)

    raw = await select_run_raw_data(analysis_run_id)
    mainpage = raw.get("mainpage") or {}
    homepage_url = mainpage.get("sourceUrl") or ""
    branding = mainpage.get("branding") or {}

    # 0~1. One fetch of the site yields both the logo URL and the brand hex colours.
    # A falsy return from the extractor (e.g. None) is treated as "nothing found".
    site = (await extract_brand_assets_from_site(homepage_url) if homepage_url else None) or {}
    html_logo_url = site.get("logo_url")
    css_colors = site.get("colors") or {}
    if html_logo_url:
        logger.info("[brand_assets] HTML logo found: %s", html_logo_url)
    if css_colors:
        logger.info("[brand_assets] css colors: %s", css_colors.get("brand_colors"))

    # 2. Image candidates in priority order: logo -> og:image -> favicon -> /favicon.ico.
    logo_url = html_logo_url or branding.get("logoUrl")
    default_favicon = None
    if homepage_url:
        parsed = urlparse(homepage_url)
        if parsed.scheme and parsed.netloc:
            default_favicon = f"{parsed.scheme}://{parsed.netloc}/favicon.ico"

    candidates: list[tuple[str, str]] = []
    seen_urls: set[str] = set()
    for kind, url in (
        ("logo", logo_url),
        ("og", branding.get("ogImage")),
        ("favicon", branding.get("faviconUrl")),
        ("favicon", default_favicon),
    ):
        # Skip empty entries and URLs already queued, so Vision is never asked
        # about the same image twice (declared favicon == derived /favicon.ico).
        if url and url not in seen_urls:
            seen_urls.add(url)
            candidates.append((kind, url))

    if not candidates and not css_colors:
        logger.info("[brand_assets] skip — no logo/og/favicon candidates and no CSS colors")
        return

    # 3. Vision only supplies the qualitative logo description; hex values from
    # CSS take precedence below. Per the original note, SVG inputs are rasterised
    # inside the Vision client, so no special-casing is needed here.
    result: dict = {}
    used_kind: str | None = None
    api_key = os.getenv("GEMINI_API_KEY")
    if api_key and candidates:
        vc = VisionClient(api_key)
        for kind, cand in candidates:
            described = await vc.analyze_brand_assets(logo_url=cand, homepage_url=homepage_url)
            # Only adopt a truthy answer: a failed attempt (None / empty) must not
            # overwrite `result`, which is mutated as a dict further down.
            if described:
                result = described
                used_kind = kind
                break
    elif not api_key:
        logger.info("[brand_assets] GEMINI_API_KEY not set — 색상만 저장, Vision 묘사 skip")

    # 4. Colours extracted from CSS override whatever Vision reported.
    if css_colors:
        if css_colors.get("brand_colors"):
            result["brand_colors"] = css_colors["brand_colors"]
        if css_colors.get("color_palette"):
            result["color_palette"] = css_colors["color_palette"]
        result["color_source"] = "html+css"
    elif result:
        result["color_source"] = "vision"

    # 5. The logo URL is kept in the raw_info.logo_url column, not in the JSON payload.
    # A favicon-only match is not a real logo, so the column is left untouched then.
    result.pop("logo_images", None)
    column_logo_url = logo_url if used_kind in ("logo", "og") and logo_url else None
    if column_logo_url:
        await update_raw_info_logo_url(info_id, column_logo_url)

    if result:
        result["logo_source"] = used_kind or "none"
        await update_raw_info_merge(info_id, {"brandAssets": result})

    logger.info(
        "[brand_assets] done logo_url=%s keys=%s",
        bool(column_logo_url),
        list(result.keys()) if result else None,
    )
async def collect_channel_logos(analysis_run_id: str, info_id: int) -> None:
    """Describe each channel's profile image with Gemini Vision and store the result.

    Gathers the ``profileImage`` of every known channel from the run's raw
    data, passes them (together with the official logo reference, if any) to
    ``VisionClient.describe_channel_logos`` and merges a truthy answer into
    the branding raw_info under ``channelLogos``. Does nothing when
    ``GEMINI_API_KEY`` is unset or no channel has a profile image.

    Per the original note, this must run after brand assets and the channel
    raw_info rows have been filled in.

    Args:
        analysis_run_id: Run whose collected raw data is read.
        info_id: raw_info row that receives the ``channelLogos`` payload.
    """
    gemini_key = os.getenv("GEMINI_API_KEY")
    if not gemini_key:
        logger.info("[channel_logos] skip — GEMINI_API_KEY 없음")
        return

    run_raw = await select_run_raw_data(analysis_run_id)
    brand_assets = (run_raw.get("branding") or {}).get("brandAssets") or {}
    # NOTE(review): collect_brand_assets pops "logo_images" before saving brandAssets,
    # so this lookup presumably always yields None — confirm where the official logo
    # reference is meant to come from.
    official = (brand_assets.get("logo_images") or {}).get("horizontal")

    # raw-data key -> display label; KR main channels first, then EN / TikTok extras.
    channel_labels = (
        ("instagram", "Instagram"),
        ("facebook", "Facebook"),
        ("youtube", "YouTube"),
        ("instagram_en", "Instagram EN"),
        ("facebook_en", "Facebook EN"),
        ("tiktok", "TikTok"),
    )
    logos: list[dict] = [
        {"channel": label, "url": image}
        for key, label in channel_labels
        if (image := (run_raw.get(key) or {}).get("profileImage"))
    ]
    if not logos:
        logger.info("[channel_logos] skip — 채널 프로필 이미지 없음")
        return

    logger.info(
        "[channel_logos] start run=%s channels=%s official=%s",
        analysis_run_id,
        [entry["channel"] for entry in logos],
        bool(official),
    )

    described = await VisionClient(gemini_key).describe_channel_logos(official, logos)
    stored_keys = None
    if described:
        # Keep every collected URL, including channels Vision could not describe,
        # so the frontend can still render the images.
        described["logos"] = logos
        await update_raw_info_merge(info_id, {"channelLogos": described})
        stored_keys = list(described.keys())

    logger.info("[channel_logos] done run=%s keys=%s", analysis_run_id, stored_keys)