From 9817b53be1af0a7ca1d03d13b6214fd05f64de9c Mon Sep 17 00:00:00 2001 From: Mina Choi Date: Wed, 27 May 2026 13:27:39 +0900 Subject: [PATCH 01/14] =?UTF-8?q?=ED=8B=B1=ED=86=A1=C2=B7=EC=98=81?= =?UTF-8?q?=EB=AC=B8=20=EC=9D=B8=EC=8A=A4=ED=83=80/=ED=8E=98=EB=B6=81=20?= =?UTF-8?q?=EC=B1=84=EB=84=90=20=EC=88=98=EC=A7=91=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - apify: 틱톡 프로필 액터 - mock_urls.py: 클리닉별 채널 URL 매핑 (mockUrls.json → 파이썬 모듈) - api/analysis: homepage 매칭으로 미지원 채널 보충 (추후 DB) Co-Authored-By: Claude Opus 4.7 (1M context) --- app/api/analysis.py | 28 +++++++- app/integrations/apify.py | 44 ++++++++++++ app/mock_urls.py | 141 ++++++++++++++++++++++++++++++++++++++ app/models/analysis.py | 3 + 4 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 app/mock_urls.py diff --git a/app/api/analysis.py b/app/api/analysis.py index 7e2691c..ed4b0ae 100644 --- a/app/api/analysis.py +++ b/app/api/analysis.py @@ -8,10 +8,28 @@ from models.file import FileListItem, FileType, FileUploadResponse from models.status import AnalysisStatus from services.pipeline import run_pipeline from services.file import get_analysis_files_response, handle_analysis_file_upload, soft_delete_analysis_file +from mock_urls import MOCK_CLINICS +from common.utils import _normalize_homepage, _with_scheme router = APIRouter(prefix="/api/analysis", tags=["analysis"], dependencies=[Depends(verify_api_key)]) logger = logging.getLogger(__name__) +# 추후 DB에 클리닉별로 매핑할 채널(틱톡/영문 인스타·페북). 지금은 mock_urls에서 homepage 매칭으로 보충. +def _extra_channels_from_mockurls(homepage_url: str) -> dict: + """homepage로 mock_urls에서 클리닉을 찾아 틱톡/영문 인스타·페북 URL 반환 (없으면 {}).""" + target = _normalize_homepage(homepage_url) + if not target: + return {} + for c in MOCK_CLINICS: + urls = c["urls"] + if _normalize_homepage(urls.get("homepage", "")) == target: + return { + "tiktok": _with_scheme(urls.get("tiktok")), + "instagram_en": _with_scheme(urls.get("instagramEn")), + "facebook_en": _with_scheme(urls.get("facebookEn")), + } + return {} + @router.post("", status_code=status.HTTP_202_ACCEPTED, response_model=AnalysisStartResponse) async def start_analysis(body: AnalysisCreate, background_tasks: BackgroundTasks): @@ -38,7 +56,15 @@ async def start_analysis(body: AnalysisCreate, background_tasks: BackgroundTasks ig_id, fb_id, nb_id, yt_id, gu_id, ) - background_tasks.add_task(run_pipeline, analysis_run_id) + # 클라 값 우선, 없으면 보충 (추후 DB에서 클리닉별로 가져올 값) + mock_extra = _extra_channels_from_mockurls(hospital["url"]) + extra_channels = { + "tiktok": body.channels.tiktok or mock_extra.get("tiktok"), + "instagram_en": body.channels.instagram_en or mock_extra.get("instagram_en"), + "facebook_en": body.channels.facebook_en or mock_extra.get("facebook_en"), + } + logger.info("[analysis] extra_channels=%s (mock_matched=%s)", extra_channels, bool(mock_extra)) + background_tasks.add_task(run_pipeline, analysis_run_id, extra_channels) return AnalysisStartResponse( analysis_run_id=analysis_run_id, diff --git a/app/integrations/apify.py b/app/integrations/apify.py index b5c77dd..8952993 100644 --- a/app/integrations/apify.py +++ b/app/integrations/apify.py @@ -44,6 +44,7 @@ class ApifyClient: return None return { "username": profile["username"], + "profileImage": profile.get("profilePicUrlHD") or profile.get("profilePicUrl"), "followers": profile.get("followersCount", 0), "following": profile.get("followsCount", 0), "posts": profile.get("postsCount", 0), @@ -134,6 +135,7 @@ class ApifyClient: return None return { "pageName": page.get("title") or page.get("name"), + "profileImage": page.get("profilePictureUrl") or page.get("profilePhoto") or page.get("profilePic"), "pageUrl": page.get("pageUrl", page_url), "followers": page.get("followers", 0), "likes": page.get("likes", 0), @@ -145,3 +147,45 @@ class ApifyClient: "intro": page.get("intro"), "rating": page.get("rating"), } + + async def fetch_tiktok_profile(self, url: str) -> list[dict]: + user = urlparse(url).path.strip("/").lstrip("@").split("/")[0] if "://" in url else url.lstrip("@") + return await self._run_actor("clockworks~tiktok-scraper", { + "profiles": [user], + "resultsPerPage": 10, + "profileScrapeSections": ["videos"], + "profileSorting": "latest", + "shouldDownloadVideos": False, + "shouldDownloadCovers": False, + "shouldDownloadSubtitles": False, + }) + + async def get_tiktok_profile(self, url: str) -> dict | None: + items = await self.fetch_tiktok_profile(url) + if not items: + return None + author = (items[0] or {}).get("authorMeta") or {} + videos = [ + { + "title": (v.get("text") or "")[:300], + "playCount": v.get("playCount", 0), + "diggCount": v.get("diggCount", 0), + "commentCount": v.get("commentCount", 0), + "shareCount": v.get("shareCount", 0), + "createTime": v.get("createTimeISO"), + "url": v.get("webVideoUrl"), + } + for v in items if isinstance(v, dict) + ] + return { + "handle": author.get("name"), + "profileImage": author.get("avatar"), + "nickname": author.get("nickName"), + "followers": author.get("fans", 0), + "following": author.get("following", 0), + "likes": author.get("heart", 0), + "videoCount": author.get("video", 0), + "verified": author.get("verified", False), + "bio": author.get("signature", ""), + "recentVideos": videos[:10], + } diff --git a/app/mock_urls.py b/app/mock_urls.py new file mode 100644 index 0000000..3971a8e --- /dev/null +++ b/app/mock_urls.py @@ -0,0 +1,141 @@ +# 프론트가 아직 안 보내는 채널(틱톡/영문 인스타·페북)을 homepage로 매칭해 보충하는 임시 mock 데이터. +# 기존 mockUrls.json을 파이썬 모듈로 전환 — 런타임 파일 I/O 없이 직접 import. + +MOCK_CLINICS = [ + { + "label": "뷰성형외과", + "urls": { + "homepage": "viewclinic.com", + "youtube": "youtube.com/channel/UCQqqH3Klj2HQSHNNSVug-CQ", + "instagram": "instagram.com/viewplastic", + "facebook": "facebook.com/viewps1", + "naverPlace": "https://naver.me/x9BxGXkK", + "naverBlog": "blog.naver.com/viewclinicps", + "gangnamUnni": "gangnamunni.com/hospitals/189", + "tiktok": "tiktok.com/@viewplastic", + "tiktokEn": "tiktok.com/@viewplasticsurgery", + "instagramEn": "instagram.com/view_plastic_surgery", + "facebookEn": "facebook.com/viewclinic" + } + }, + { + "label": "바노바기 성형외과", + "urls": { + "homepage": "banobagi.com", + "youtube": "youtube.com/c/banobagips", + "instagram": "instagram.com/banobagi_ps", + "facebook": "facebook.com/BanobagiPlasticSurgery", + "naverPlace": "https://naver.me/xxY2yLr5", + "naverBlog": "blog.naver.com/banobagiprs", + "gangnamUnni": "gangnamunni.com/hospitals/23", + "tiktok": "", + "instagramEn": "instagram.com/english_banobagi", + "facebookEn": "facebook.com/englishbanobagi" + } + }, + { + "label": "ID 성형외과", + "urls": { + "homepage": "idhospital.com", + "youtube": "youtube.com/user/IDhospital", + "instagram": "instagram.com/idhospital", + "facebook": "facebook.com/idhospital0050", + "naverPlace": "https://naver.me/GtURpCEn", + "naverBlog": "", + "gangnamUnni": "gangnamunni.com/hospitals/257", + "tiktok": "tiktok.com/@idhospitalkorea", + "instagramEn": "instagram.com/idhospitalkorea", + "facebookEn": "facebook.com/idhospital.eng" + } + }, + { + "label": "JK 성형외과", + "urls": { + "homepage": "jkplastic.com", + "youtube": "youtube.com/channel/UC5F8dEt32hdp3cTeFyls4qg", + "instagram": "instagram.com/jkplasticsurgery_kr", + "facebook": "facebook.com/jkmedicalgroup", + "naverPlace": "https://naver.me/x67y6cAc", + "naverBlog": "blog.naver.com/jkstory1", + "gangnamUnni": "gangnamunni.com/hospitals/858", + "tiktok": "tiktok.com/@jkplastic", + "instagramEn": "instagram.com/jkplasticsurgery", + "facebookEn": "facebook.com/jkplastic" + } + }, + { + "label": "그랜드 성형외과", + "urls": { + "homepage": "grandsurgery.com", + "youtube": "youtube.com/channel/UCU2o_aHqsNFuqwtdzVM3xbQ", + "instagram": "instagram.com/grand_korea", + "facebook": "facebook.com/grandps.korea", + "naverPlace": "https://naver.me/Fw7MYKWK", + "naverBlog": "blog.naver.com/grandprs", + "gangnamUnni": "gangnamunni.com/hospitals/62", + "tiktok": "", + "instagramEn": "instagram.com/grandps_eng", + "facebookEn": "facebook.com/grandplasticsurgery" + } + }, + { + "label": "BK 성형외과", + "urls": { + "homepage": "bkhospital.com", + "youtube": "youtube.com/channel/UChJONft3hemy5DGbXUveTFg", + "instagram": "instagram.com/bkhospital_korea", + "facebook": "", + "naverPlace": "https://naver.me/517CTH3W", + "naverBlog": "", + "gangnamUnni": "", + "tiktok": "", + "instagramEn": "instagram.com/english_bkhospital", + "facebookEn": "facebook.com/BKPSKoreaE" + } + }, + { + "label": "톡스앤필", + "urls": { + "homepage": "toxnfill.com", + "youtube": "youtube.com/channel/UCFpFZkm7mclD-z_-j7FTUag", + "instagram": "instagram.com/toxnfill_official", + "facebook": "facebook.com/toxnfill.official", + "naverPlace": "https://naver.me/FvEmJIHA", + "naverBlog": "blog.naver.com/toxnfill", + "gangnamUnni": "gangnamunni.com/hospitals/3702", + "tiktok": "tiktok.com/@toxnfillglobal", + "instagramEn": "instagram.com/toxnfill_global", + "facebookEn": "facebook.com/p/Toxnfill-Global-61557593068252" + } + }, + { + "label": "더 압구정 성형외과", + "urls": { + "homepage": "theclinic.co.kr", + "youtube": "youtube.com/user/theplasticsurgery1", + "instagram": "instagram.com/the_plasticsurgery", + "facebook": "facebook.com/THEPS16445998", + "naverPlace": "", + "naverBlog": "blog.naver.com/with_theps", + "gangnamUnni": "gangnamunni.com/hospitals/30", + "tiktok": "", + "instagramEn": "instagram.com/the_plasticsurgery.en", + "facebookEn": "facebook.com/theps.english" + } + }, + { + "label": "오라클 성형외과", + "urls": { + "homepage": "oracleclinic.com", + "youtube": "youtube.com/@oracle_medical_group", + "instagram": "instagram.com/oraclemedicalgroup", + "facebook": "facebook.com/oracleclinickr", + "naverPlace": "https://naver.me/GhbU3VtK", + "naverBlog": "", + "gangnamUnni": "gangnamunni.com/hospitals/125", + "tiktok": "tiktok.com/@oracleclinic_usa", + "instagramEn": "instagram.com/oracleclinic_global", + "facebookEn": "facebook.com/oracleclinicglobal" + } + } +] diff --git a/app/models/analysis.py b/app/models/analysis.py index 2b4e7ec..185035b 100644 --- a/app/models/analysis.py +++ b/app/models/analysis.py @@ -8,6 +8,9 @@ class Channels(BaseModel): facebook: str | None = None naver_blog: str | None = None gangnam_unni: str | None = None + tiktok: str | None = None + instagram_en: str | None = None + facebook_en: str | None = None class AnalysisOptions(BaseModel): From 843ccdb806055b9139240b95cf49d8654554f21a Mon Sep 17 00:00:00 2001 From: Mina Choi Date: Wed, 27 May 2026 13:27:39 +0900 Subject: [PATCH 02/14] =?UTF-8?q?=EB=B8=8C=EB=9E=9C=EB=93=9C=20=EC=9E=90?= =?UTF-8?q?=EC=82=B0(=EB=A1=9C=EA=B3=A0/=EC=83=89=EC=83=81)=C2=B7=EC=B1=84?= =?UTF-8?q?=EB=84=90=20=EB=A1=9C=EA=B3=A0=20Vision=20=EB=B6=84=EC=84=9D=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - color_extractor: 홈페이지 HTML/CSS에서 로고 URL·브랜드 hex 추출 - vision: Gemini Vision 로고 묘사·채널 로고 일치 평가 - youtube: 채널 profileImage 추출 / firecrawl: clinic_info 추출 보정 Co-Authored-By: Claude Opus 4.7 (1M context) --- app/integrations/color_extractor.py | 250 ++++++++++++++++++++++++++++ app/integrations/firecrawl.py | 3 +- app/integrations/vision.py | 171 +++++++++++++++++++ app/integrations/youtube.py | 2 + 4 files changed, 425 insertions(+), 1 deletion(-) create mode 100644 app/integrations/color_extractor.py create mode 100644 app/integrations/vision.py diff --git a/app/integrations/color_extractor.py b/app/integrations/color_extractor.py new file mode 100644 index 0000000..806d8cd --- /dev/null +++ b/app/integrations/color_extractor.py @@ -0,0 +1,250 @@ +"""홈페이지 HTML/CSS에서 hex 색상 직접 추출 + 빈도 기반 brand palette 산출. + +Vision LLM에 의존하지 않고 페이지의 실제 CSS 값을 정규식으로 잡음. +로고만 분석하는 Vision보다 사이트 전체 컬러 시스템 (primary/secondary/background/text)을 더 정확히 추출. +""" +import logging +import re +import ssl +from collections import Counter +from urllib.parse import urljoin, urlparse +import httpx + +logger = logging.getLogger(__name__) + + +def _make_ssl_context() -> ssl.SSLContext: + """오래된 한국 의료 사이트들이 SSL DH_KEY_TOO_SMALL / cipher 약함 등으로 차단되는 문제 우회. + 보안 등급 1로 낮춤 + cert 검증 유지.""" + ctx = ssl.create_default_context() + try: + ctx.set_ciphers("DEFAULT@SECLEVEL=1") + except ssl.SSLError: + pass + return ctx + + +async def _fetch_html(url: str, timeout: float = 20.0) -> tuple[int, str]: + """SSL/검증 단계별 fallback으로 HTML 받기. 그랜드/톡스앤필 같은 oldsite 대응.""" + headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"} + # 1차: 표준 검증 + try: + async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers) as c: + r = await c.get(url) + return r.status_code, r.text + except (httpx.ConnectError, httpx.ReadError, ssl.SSLError) as e: + logger.info("[fetch] %s standard SSL failed: %s — fallback to weak cipher", url, e) + # 2차: 약한 cipher 허용 + try: + async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers, verify=_make_ssl_context()) as c: + r = await c.get(url) + return r.status_code, r.text + except (httpx.ConnectError, httpx.ReadError, ssl.SSLError) as e: + logger.info("[fetch] %s weak cipher failed: %s — fallback to verify=False", url, e) + # 3차: SSL 검증 끔 (host mismatch 등) + try: + async with httpx.AsyncClient(timeout=timeout, follow_redirects=True, headers=headers, verify=False) as c: + r = await c.get(url) + return r.status_code, r.text + except Exception as e: + logger.warning("[fetch] %s all fallbacks failed: %s", url, e) + return 0, "" + +LOGO_IMG_PATTERNS = [ + # 1) + re.compile(r']*\bclass=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), + # 2) + re.compile(r']*\bsrc=["\']([^"\']+)["\'][^>]*\bclass=["\'][^"\']*\blogo\b[^"\']*["\']', re.IGNORECASE), + # 3) + re.compile(r']*\bid=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), + # 4) ...logo... + re.compile(r']*\balt=["\'][^"\']*\blogo\b[^"\']*["\'][^>]*\bsrc=["\']([^"\']+)["\']', re.IGNORECASE), + # 5)