"""collect 단계 - HTML/CSS 텍스트에서 brand 로고 URL + 색상 추출""" import logging import re from collections import Counter from urllib.parse import urljoin logger = logging.getLogger(__name__) # ── 로고 URL 추출 ───────────────────────────────────────────────────────────── LOGO_IMG_PATTERNS = [ re.compile(r']*\bclass=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), re.compile(r']*\bsrc=["\']([^"\']+)["\'][^>]*\bclass=["\'][^"\']*\blogo\b[^"\']*["\']', re.IGNORECASE), re.compile(r']*\bid=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), re.compile(r']*\balt=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), re.compile(r'<(?:a|h[1-6]|div|span)[^>]*\b(?:class|id)=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*>(?:[^<]|<(?!img))*]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE | re.DOTALL), re.compile(r'<(?:a|div|span|h[1-6])[^>]*\b(?:class|id)=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bstyle=["\'][^"\']*background(?:-image)?\s*:\s*url\(\s*["\']?([^"\')\s]+)', re.IGNORECASE), re.compile(r'<(?:a|div|span|h[1-6])[^>]*\bstyle=["\'][^"\']*background(?:-image)?\s*:\s*url\(\s*["\']?([^"\')\s]+)[^"\']*["\'][^>]*\b(?:class|id)=["\'][^"\']*\blogo\b', re.IGNORECASE), re.compile(r']*\bsrc=["\']([^"\']*\blogo\b[^"\']*\.(?:png|svg|jpe?g|webp)[^"\']*)["\']', re.IGNORECASE), re.compile(r']*>(?:[^<]|<(?!img))*]*\bsrc=["\']([^"\']+\.(?:png|svg|jpe?g|webp)[^"\']*)["\']', re.IGNORECASE | re.DOTALL), re.compile(r']*>(?:[^<]|<(?!img))*]*\bsrc=["\']([^"\']+\.(?:png|svg|jpe?g|webp)[^"\']*)["\']', re.IGNORECASE | re.DOTALL), re.compile(r']*\bproperty=["\']og:image["\'][^>]*\bcontent=["\']([^"\']+)["\']', re.IGNORECASE), re.compile(r']*\bcontent=["\']([^"\']+)["\'][^>]*\bproperty=["\']og:image["\']', re.IGNORECASE), ] LOGO_CSS_PATTERN = re.compile( r'\.[\w-]*\blogo\b[\w-]*\s*(?:,\s*\.[\w-]+\s*)*\{[^}]*background(?:-image)?\s*:\s*url\(\s*["\']?([^"\')\s]+)', re.IGNORECASE | re.DOTALL, ) def find_logo_url_in_html(html: str, base_url: str, css_texts: list[str] | None = None) -> str | None: """HTML 에서 logo URL 찾기. 우선순위: 1) class/id/alt 명시 img 2) 외부 CSS .logo bg 3) header/nav 첫 img.""" def _is_noise(src: str) -> bool: if not src or src.startswith("data:"): return True if re.search(r"(blank|spacer|pixel|transparent|1x1)\b", src, re.IGNORECASE): return True if re.search(r"(lang[-_]?(kor|eng|chn|jpn|rus|jp|en|ko|cn|ar|in)|flag|country|icon-|btn-|arrow|prev|next|search)\b", src, re.IGNORECASE): return True return False for pat in LOGO_IMG_PATTERNS[:8]: for m in pat.finditer(html): src = m.group(1) if _is_noise(src): continue return urljoin(base_url, src) for css in (css_texts or []): m = LOGO_CSS_PATTERN.search(css) if m: src = m.group(1) if not _is_noise(src): return urljoin(base_url, src) for pat in LOGO_IMG_PATTERNS[8:]: for m in pat.finditer(html): src = m.group(1) if _is_noise(src): continue return urljoin(base_url, src) return None # ── 색상 추출 ──────────────────────────────────────────────────────────────── HEX6 = re.compile(r"#([0-9a-fA-F]{6})\b") HEX3 = re.compile(r"#([0-9a-fA-F]{3})\b(?![0-9a-fA-F])") RGB = re.compile(r"rgba?\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*(?:,\s*[\d.]+\s*)?\)") STYLE_BLOCK = re.compile(r"]*>(.*?)", re.IGNORECASE | re.DOTALL) NOISE = { "#ffffff", "#000000", "#fff", "#000", "#333", "#222", "#111", "#444", "#555", "#666", "#777", "#888", "#999", "#aaa", "#bbb", "#ccc", "#ddd", "#eee", "#f0f0f0", "#f5f5f5", "#fafafa", } def _normalize(hex_str: str) -> str: h = hex_str.lstrip("#").lower() if len(h) == 3: h = "".join(c * 2 for c in h) if len(h) == 8: h = h[:6] return f"#{h}" def _rgb_to_hex(r: int, g: int, b: int) -> str: return f"#{r:02x}{g:02x}{b:02x}" def _hex_to_rgb(h: str) -> tuple[int, int, int]: h = h.lstrip("#") return int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16) def _distance(a: str, b: str) -> float: ar, ag, ab = _hex_to_rgb(a) br, bg, bb = _hex_to_rgb(b) return ((ar - br) ** 2 + (ag - bg) ** 2 + (ab - bb) ** 2) ** 0.5 def _is_grayscale(h: str, tol: int = 12) -> bool: r, g, b = _hex_to_rgb(h) return max(r, g, b) - min(r, g, b) < tol def _extract_hex(text: str) -> list[str]: out: list[str] = [] out.extend(_normalize(m.group(0)) for m in HEX6.finditer(text)) out.extend(_normalize(m.group(0)) for m in HEX3.finditer(text)) for m in RGB.finditer(text): r, g, b = int(m.group(1)), int(m.group(2)), int(m.group(3)) if 0 <= r <= 255 and 0 <= g <= 255 and 0 <= b <= 255: out.append(_rgb_to_hex(r, g, b)) return out def _cluster(colors: Counter, threshold: float = 25.0) -> list[tuple[str, int]]: ranked = colors.most_common() clusters: list[tuple[str, int]] = [] for color, count in ranked: merged = False for i, (rep, rep_count) in enumerate(clusters): if _distance(color, rep) < threshold: clusters[i] = (rep, rep_count + count) merged = True break if not merged: clusters.append((color, count)) return clusters def extract_brand_colors_from_text(html: str, css_texts: list[str], source_url: str = "") -> dict: """HTML + CSS 텍스트에서 hex 빈도 분석 → primary/accent/text + palette. (fetch 없음)""" all_text_chunks: list[str] = list(STYLE_BLOCK.findall(html)) all_text_chunks.append(html) all_text_chunks.extend(css_texts) counter: Counter = Counter() for text in all_text_chunks: for color in _extract_hex(text): if color in NOISE: continue counter[color] += 1 if not counter: logger.info("[brand_parser] no colors extracted from %s", source_url) return {} clustered = _cluster(counter) chromatic = [c for c, _ in clustered if not _is_grayscale(c)] grayscale = [c for c, _ in clustered if _is_grayscale(c)] palette_top = clustered[:8] palette = [{"name": f"색상 {i+1}", "hex": h, "usage": f"빈도 {n}"} for i, (h, n) in enumerate(palette_top)] return { "brand_colors": { "primary": chromatic[0] if chromatic else None, "accent": chromatic[1] if len(chromatic) > 1 else None, "text": grayscale[0] if grayscale else None, }, "color_palette": palette, "extracted_from": "html+css", }