# o2o-infinith-backend/app/services/brand_parser.py

"""collect 단계 - HTML/CSS 텍스트에서 brand 로고 URL + 색상 추출"""
import logging
import re
from collections import Counter
from urllib.parse import urljoin

logger = logging.getLogger(__name__)


# ── 로고 URL 추출 ─────────────────────────────────────────────────────────────


# Ordered logo-candidate patterns; group 1 of every pattern is the image URL.
# Order matters: find_logo_url_in_html() tries entries [0:8] (markup that
# explicitly mentions "logo") before the stylesheet lookup and entries [8:]
# (generic fallbacks) after it, so inserting or removing an entry requires
# updating those slice indices.
# `\blogo\b` needs word boundaries: "site-logo" matches ("-" is a non-word
# character) while "logo_main" does not ("_" is a word character).
LOGO_IMG_PATTERNS = [
    # [0] <img> whose class contains "logo" (class attribute before src).
    re.compile(r'<img[^>]*\bclass=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE),
    # [1] Same as [0] with src before class.
    re.compile(r'<img[^>]*\bsrc=["\']([^"\']+)["\'][^>]*\bclass=["\'][^"\']*\blogo\b[^"\']*["\']', re.IGNORECASE),
    # [2] <img> whose id contains "logo" (id before src).
    re.compile(r'<img[^>]*\bid=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE),
    # [3] <img> whose alt text contains "logo" (alt before src).
    re.compile(r'<img[^>]*\balt=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE),
    # [4] First <img> following an <a>/<h1-6>/<div>/<span> opening tag whose
    #     class or id contains "logo"; no other "<img" may appear in between.
    re.compile(r'<(?:a|h[1-6]|div|span)[^>]*\b(?:class|id)=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*>(?:[^<]|<(?!img))*<img[^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE | re.DOTALL),
    # [5] Element with a "logo" class/id followed by an inline style carrying
    #     background / background-image: url(...).
    re.compile(r'<(?:a|div|span|h[1-6])[^>]*\b(?:class|id)=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bstyle=["\'][^"\']*background(?:-image)?\s*:\s*url\(\s*["\']?([^"\')\s]+)', re.IGNORECASE),
    # [6] Same as [5] with the style attribute before class/id.
    re.compile(r'<(?:a|div|span|h[1-6])[^>]*\bstyle=["\'][^"\']*background(?:-image)?\s*:\s*url\(\s*["\']?([^"\')\s]+)[^"\']*["\'][^>]*\b(?:class|id)=["\'][^"\']*\blogo\b', re.IGNORECASE),
    # [7] Any <img> whose src itself contains "logo" and a png/svg/jpg/jpeg/webp
    #     extension.
    re.compile(r'<img[^>]*\bsrc=["\']([^"\']*\blogo\b[^"\']*\.(?:png|svg|jpe?g|webp)[^"\']*)["\']', re.IGNORECASE),
    # [8] Fallback: first <img> after a <header> opening tag, accepted only if
    #     its src has one of the image extensions above.
    re.compile(r'<header\b[^>]*>(?:[^<]|<(?!img))*<img[^>]*\bsrc=["\']([^"\']+\.(?:png|svg|jpe?g|webp)[^"\']*)["\']', re.IGNORECASE | re.DOTALL),
    # [9] Fallback: same as [8] for a <nav> opening tag.
    re.compile(r'<nav\b[^>]*>(?:[^<]|<(?!img))*<img[^>]*\bsrc=["\']([^"\']+\.(?:png|svg|jpe?g|webp)[^"\']*)["\']', re.IGNORECASE | re.DOTALL),
    # [10] Fallback: <meta property="og:image" content="..."> (property first).
    re.compile(r'<meta[^>]*\bproperty=["\']og:image["\'][^>]*\bcontent=["\']([^"\']+)["\']', re.IGNORECASE),
    # [11] Fallback: same as [10] with content before property.
    re.compile(r'<meta[^>]*\bcontent=["\']([^"\']+)["\'][^>]*\bproperty=["\']og:image["\']', re.IGNORECASE),
]

# Stylesheet rule whose selector is a class name containing the word "logo"
# (optionally followed by further comma-separated class selectors) and whose
# declaration block sets background / background-image to url(...).
# Group 1 is the URL inside url(), without surrounding quotes.
LOGO_CSS_PATTERN = re.compile(
    r'\.[\w-]*\blogo\b[\w-]*\s*(?:,\s*\.[\w-]+\s*)*\{[^}]*background(?:-image)?\s*:\s*url\(\s*["\']?([^"\')\s]+)',
    re.IGNORECASE | re.DOTALL,
)


def find_logo_url_in_html(html: str, base_url: str, css_texts: list[str] | None = None) -> str | None:
    """Return the absolute URL of the most likely logo image, or ``None``.

    Candidates are tried in priority order and the first non-noise hit wins:

    1. ``LOGO_IMG_PATTERNS[:8]`` - markup that explicitly mentions "logo"
       (class/id/alt on an ``<img>``, a "logo" wrapper element, inline
       background style, or a "logo" file name).
    2. ``LOGO_CSS_PATTERN`` - ``.logo``-style background rules in the supplied
       stylesheets, scanned in the order given.
    3. ``LOGO_IMG_PATTERNS[8:]`` - generic fallbacks (first header/nav image,
       ``og:image``).

    Args:
        html: Page markup to scan.
        base_url: URL that relative candidates are resolved against.
        css_texts: Optional stylesheet bodies; ``None`` is treated as empty.

    Returns:
        The resolved URL of the first acceptable candidate, or ``None`` when
        nothing matches.
    """

    def _is_noise(src: str) -> bool:
        # Empty values and inline data: URIs are never usable logo URLs.
        if not src or src.startswith("data:"):
            return True
        # Placeholder / tracking-pixel style file names.
        if re.search(r"(blank|spacer|pixel|transparent|1x1)\b", src, re.IGNORECASE):
            return True
        # Language switchers, flags and generic UI icons.
        if re.search(r"(lang[-_]?(kor|eng|chn|jpn|rus|jp|en|ko|cn|ar|in)|flag|country|icon-|btn-|arrow|prev|next|search)\b", src, re.IGNORECASE):
            return True
        return False

    def _first_usable(patterns: list[re.Pattern[str]], text: str) -> str | None:
        # Walk the patterns in priority order and every match of each pattern,
        # so a noisy first hit does not hide a later valid one.
        for pattern in patterns:
            for match in pattern.finditer(text):
                candidate = match.group(1)
                if not _is_noise(candidate):
                    return urljoin(base_url, candidate)
        return None

    explicit = _first_usable(LOGO_IMG_PATTERNS[:8], html)
    if explicit is not None:
        return explicit

    # Every ".logo" rule of a stylesheet is examined (previously only the first
    # one was, so a data: URI or blank.gif rule masked later real logos).
    # Relative url() values are resolved against base_url because the
    # stylesheet's own URL is not available here.
    for css in css_texts or []:
        from_css = _first_usable([LOGO_CSS_PATTERN], css)
        if from_css is not None:
            return from_css

    return _first_usable(LOGO_IMG_PATTERNS[8:], html)


# ── 색상 추출 ────────────────────────────────────────────────────────────────


# Six-digit hex color such as "#1a2b3c"; group 1 is the digits.
# The (?<!&) lookbehind skips HTML numeric character references such as
# "&#128512;", whose digits would otherwise be counted as a color.
HEX6 = re.compile(r"(?<!&)#([0-9a-fA-F]{6})\b")
# Three-digit shorthand such as "#abc"; group 1 is the digits.
# Without the lookbehind, very common entities like "&#039;" (apostrophe) or
# "&#169;" (copyright sign) are read as the colors #003399 / #116699.
HEX3 = re.compile(r"(?<!&)#([0-9a-fA-F]{3})\b(?![0-9a-fA-F])")
# rgb()/rgba() with comma-separated integer channels; groups 1-3 are r, g, b.
# An optional alpha component is matched but not captured.
RGB = re.compile(r"rgba?\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*(?:,\s*[\d.]+\s*)?\)")
# Body of each inline <style> element (group 1).
STYLE_BLOCK = re.compile(r"<style[^>]*>(.*?)</style>", re.IGNORECASE | re.DOTALL)

# Color strings that are skipped when counting brand-color frequencies.
# NOTE(review): _extract_hex() expands every value to six lowercase digits
# before the `color in NOISE` check, so the three-digit entries below can never
# match - confirm whether their six-digit forms were meant to be excluded too.
NOISE = {
    # Pure white / black, long and short form.
    "#ffffff", "#fff",
    "#000000", "#000",
    # Shorthand grays, dark to light.
    "#111", "#222", "#333", "#444", "#555",
    "#666", "#777", "#888", "#999",
    "#aaa", "#bbb", "#ccc", "#ddd", "#eee",
    # Near-white backgrounds.
    "#f0f0f0", "#f5f5f5", "#fafafa",
}


def _normalize(hex_str: str) -> str:
    h = hex_str.lstrip("#").lower()
    if len(h) == 3:
        h = "".join(c * 2 for c in h)
    if len(h) == 8:
        h = h[:6]
    return f"#{h}"


def _rgb_to_hex(r: int, g: int, b: int) -> str:
    return f"#{r:02x}{g:02x}{b:02x}"


def _hex_to_rgb(h: str) -> tuple[int, int, int]:
    h = h.lstrip("#")
    return int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16)


def _distance(a: str, b: str) -> float:
    """Return the Euclidean distance between two hex colors in RGB space."""
    first = _hex_to_rgb(a)
    second = _hex_to_rgb(b)
    squared = sum((x - y) * (x - y) for x, y in zip(first, second))
    return squared ** 0.5


def _is_grayscale(h: str, tol: int = 12) -> bool:
    """Return True when the color's channels differ by less than *tol*.

    The spread is the largest channel minus the smallest one.
    """
    channels = _hex_to_rgb(h)
    spread = max(channels) - min(channels)
    return spread < tol


def _extract_hex(text: str) -> list[str]:
    """Collect every color literal in *text* as a normalized hex string.

    The result lists all six-digit hex matches first, then all three-digit
    matches, then rgb()/rgba() values. An rgb()/rgba() value is kept only when
    each of its three channels is within 0-255.
    """
    found = [
        _normalize(match.group(0))
        for pattern in (HEX6, HEX3)
        for match in pattern.finditer(text)
    ]
    for match in RGB.finditer(text):
        red, green, blue = (int(value) for value in match.groups())
        in_range = all(0 <= channel <= 255 for channel in (red, green, blue))
        if in_range:
            found.append(_rgb_to_hex(red, green, blue))
    return found


def _cluster(colors: Counter, threshold: float = 25.0) -> list[tuple[str, int]]:
    """Greedily merge near-identical colors and rank clusters by total count.

    Colors are visited from most to least frequent. Each one joins the first
    existing cluster whose representative is closer than *threshold* (RGB
    Euclidean distance); otherwise it starts a new cluster and becomes its
    representative.

    Args:
        colors: Frequency count per normalized ``#rrggbb`` color.
        threshold: Maximum distance (exclusive) for joining a cluster.

    Returns:
        ``(representative, merged_count)`` pairs sorted by merged count,
        highest first. Ties keep the order in which the clusters were created.
    """
    buckets: list[list] = []  # each entry is [representative, merged_count]
    for color, count in colors.most_common():
        for bucket in buckets:
            if _distance(color, bucket[0]) < threshold:
                bucket[1] += count
                break
        else:
            buckets.append([color, count])

    # Merging can push a later cluster's total above an earlier one's, so
    # re-rank by the merged total. list.sort is stable, also with reverse=True.
    ranked = [(rep, total) for rep, total in buckets]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked


def extract_brand_colors_from_text(html: str, css_texts: list[str] | None, source_url: str = "") -> dict:
    """Derive brand colors from already-downloaded HTML and CSS text.

    Every color literal is counted, values listed in ``NOISE`` are skipped,
    similar colors are clustered, and the clusters are split into chromatic
    and grayscale groups. Nothing is fetched here.

    Args:
        html: Page markup; its inline ``<style>`` bodies are scanned too.
        css_texts: Stylesheet bodies; ``None`` is treated as an empty list,
            matching ``find_logo_url_in_html``.
        source_url: Only used in the log message when no color is found.

    Returns:
        ``{}`` when no usable color was found. Otherwise a dict with
        ``brand_colors`` (``primary`` / ``accent`` are the first two chromatic
        clusters, ``text`` is the first grayscale cluster, each ``None`` if
        missing), ``color_palette`` (the first eight clusters) and
        ``extracted_from``.
    """
    # NOTE(review): <style> bodies are scanned on their own and again as part
    # of the full ``html`` chunk, so their colors count twice - confirm whether
    # this weighting is intentional.
    chunks: list[str] = [*STYLE_BLOCK.findall(html), html, *(css_texts or [])]

    counter: Counter = Counter(
        color
        for chunk in chunks
        for color in _extract_hex(chunk)
        if color not in NOISE
    )

    if not counter:
        logger.info("[brand_parser] no colors extracted from %s", source_url)
        return {}

    clustered = _cluster(counter)

    # Classify each cluster representative once, preserving cluster order.
    chromatic: list[str] = []
    grayscale: list[str] = []
    for color, _ in clustered:
        (grayscale if _is_grayscale(color) else chromatic).append(color)

    palette = [
        {"name": f"색상 {rank}", "hex": color, "usage": f"빈도 {count}"}
        for rank, (color, count) in enumerate(clustered[:8], start=1)
    ]

    return {
        "brand_colors": {
            "primary": chromatic[0] if chromatic else None,
            "accent": chromatic[1] if len(chromatic) > 1 else None,
            "text": grayscale[0] if grayscale else None,
        },
        "color_palette": palette,
        "extracted_from": "html+css",
    }