인스타 highlights/계정 수집 개선 (VIEW actor + 코드로 계정 구성)

- apify: 프로필 coderx, 하이라이트 igview actor로 교체. highlights/category/following(followsCount)/profileImage(hdProfilePicUrl)/latestPosts.mediaType 수집. reel 스크래퍼 제거, post 스크래퍼 비활성화(주석)
- instagram_audit.py(신규): KR·EN 계정 hard 필드를 수집 데이터로 구성
- analysis: _build_overrides에서 위 함수로 계정 구성, _patch_report가 accounts를 코드값으로 주입 (LLM은 diagnosis만, 프롬프트에서 accounts는 []로 두게 지시)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 09:43:03 +09:00 · 2026-05-28 09:43:03 +09:00 · 4f756cf001
parent 163e9d1c02
commit 4f756cf001
4 changed files with 116 additions and 80 deletions
--- a/app/integrations/apify.py
+++ b/app/integrations/apify.py
@ -1,9 +1,18 @@
+import asyncio
 from http import HTTPMethod
 from urllib.parse import urlparse
 from common.utils import http_request

 APIFY_BASE = "https://api.apify.com/v2"

+# Instagram: profile + highlights 두 actor 직접 호출.
+IG_PROFILE_ACTOR = "coderx~instagram-profile-scraper-bio-posts"
+IG_HIGHLIGHTS_ACTOR = "igview-owner~instagram-highlights-scraper"
+
+
+def _ig_username(url: str) -> str:
+    return url.lstrip("@") if "://" not in url else urlparse(url).path.strip("/").split("/")[0]  # bare handle → drop leading "@"; full URL → first path segment
+

 class ApifyClient:
    def __init__(self, token: str, wait_for_finish: int = 120):
@ -33,27 +42,39 @@ class ApifyClient:
            return []
        return items_resp.json()

-    async def fetch_instagram_profile(self, url: str) -> dict | None:
-        username = urlparse(url).path.strip("/").split("/")[0] if "://" in url else url.lstrip("@")
-        items = await self._run_actor("apify~instagram-profile-scraper", {"usernames": [username], "resultsLimit": 12})
+    async def fetch_instagram_profile(self, username: str) -> dict | None:
+        items = await self._run_actor(IG_PROFILE_ACTOR, {"usernames": [username]})  # run the profile actor for this single username
        return items[0] if items else None

+    async def fetch_instagram_highlights(self, username: str) -> list[dict]:
+        return await self._run_actor(IG_HIGHLIGHTS_ACTOR, {"usernames": [username]})  # items returned by the highlights actor for this username
+
    async def get_instagram_profile(self, url: str) -> dict | None:
-        profile = await self.fetch_instagram_profile(url)
-        if not profile or profile.get("error"):
+        username = _ig_username(url)
+        # profile·highlights 두 actor를 병렬 호출 (highlights 실패해도 profile만 있으면 진행)
+        profile, highlights = await asyncio.gather(
+            self.fetch_instagram_profile(username),
+            self.fetch_instagram_highlights(username),
+            return_exceptions=True,
+        )
+        if isinstance(profile, Exception) or not profile or profile.get("error"):
            return None
+        if isinstance(highlights, Exception):
+            highlights = []
        return {
            "username": profile["username"],
-            "profileImage": profile.get("profilePicUrlHD") or profile.get("profilePicUrl"),
+            "profileImage": profile.get("hdProfilePicUrl") or profile.get("profilePicUrl"),
            "followers": profile.get("followersCount", 0),
            "following": profile.get("followsCount", 0),
            "posts": profile.get("postsCount", 0),
            "bio": profile.get("biography", ""),
+            "category": profile.get("businessCategoryName") or "",
            "isBusinessAccount": profile.get("isBusinessAccount", False),
            #"externalUrl": profile.get("externalUrl"), LLM에 혼동을 주는 듯 하여 비활성화
+            "highlights": [h["highlightTitle"] for h in (highlights or []) if isinstance(h, dict) and h.get("highlightTitle")],
            "latestPosts": [
                {
-                    "type": p.get("type"),
+                    "type": p.get("mediaType") or p.get("type"),
                    "likes": p.get("likesCount", 0),
                    "comments": p.get("commentsCount", 0),
                    "caption": (p.get("caption") or "")[:500],
@ -63,67 +84,36 @@ class ApifyClient:
            ],
        }

-    async def fetch_instagram_posts(self, url: str, limit: int = 20) -> list[dict]:
-        username = urlparse(url).path.strip("/").split("/")[0] if "://" in url else url.lstrip("@")
-        return await self._run_actor("apify~instagram-post-scraper", {
-            "directUrls": [f"https://www.instagram.com/{username}/"],
-            "resultsLimit": limit,
-        })
-
-    async def get_instagram_posts(self, url: str, limit: int = 20) -> dict:
-        items = await self.fetch_instagram_posts(url, limit)
-        posts = [
-            {
-                "id": p["id"],
-                "type": p.get("type"),
-                "url": p.get("url"),
-                "caption": (p.get("caption") or "")[:500],
-                "hashtags": p.get("hashtags", []),
-                "likesCount": p.get("likesCount", 0),
-                "commentsCount": p.get("commentsCount", 0),
-                "timestamp": p.get("timestamp"),
-            }
-            for p in items
-        ]
-        n = len(posts) or 1
-        return {
-            "posts": posts,
-            "totalPosts": len(posts),
-            "avgLikes": round(sum(p["likesCount"] for p in posts) / n),
-            "avgComments": round(sum(p["commentsCount"] for p in posts) / n),
-        }
-
-    async def fetch_instagram_reels(self, url: str, limit: int = 15) -> list[dict]:
-        username = urlparse(url).path.strip("/").split("/")[0] if "://" in url else url.lstrip("@")
-        return await self._run_actor("apify~instagram-reel-scraper", {
-            "directUrls": [f"https://www.instagram.com/{username}/reels/"],
-            "resultsLimit": limit,
-        })
-
-    async def get_instagram_reels(self, url: str, limit: int = 15) -> dict:
-        items = await self.fetch_instagram_reels(url, limit)
-        reels = [
-            {
-                "id": r["id"],
-                "url": r.get("url"),
-                "caption": (r.get("caption") or "")[:500],
-                "hashtags": r.get("hashtags", []),
-                "likesCount": r.get("likesCount", 0),
-                "commentsCount": r.get("commentsCount", 0),
-                "videoViewCount": r.get("videoViewCount", 0),
-                "videoPlayCount": r.get("videoPlayCount", 0),
-                "videoDuration": r.get("videoDuration", 0),
-                "timestamp": r.get("timestamp"),
-            }
-            for r in items
-        ]
-        n = len(reels) or 1
-        return {
-            "reels": reels,
-            "totalReels": len(reels),
-            "avgViews": round(sum(r["videoViewCount"] for r in reels) / n),
-            "avgPlays": round(sum(r["videoPlayCount"] for r in reels) / n),
-        }
+    # 인스타 post 스크래퍼는 현재 파이프라인 미사용 — 비활성화 (필요 시 복구)
+    # async def fetch_instagram_posts(self, url: str, limit: int = 20) -> list[dict]:
+    #     username = urlparse(url).path.strip("/").split("/")[0] if "://" in url else url.lstrip("@")
+    #     return await self._run_actor("apify~instagram-post-scraper", {
+    #         "directUrls": [f"https://www.instagram.com/{username}/"],
+    #         "resultsLimit": limit,
+    #     })
+    #
+    # async def get_instagram_posts(self, url: str, limit: int = 20) -> dict:
+    #     items = await self.fetch_instagram_posts(url, limit)
+    #     posts = [
+    #         {
+    #             "id": p["id"],
+    #             "type": p.get("type"),
+    #             "url": p.get("url"),
+    #             "caption": (p.get("caption") or "")[:500],
+    #             "hashtags": p.get("hashtags", []),
+    #             "likesCount": p.get("likesCount", 0),
+    #             "commentsCount": p.get("commentsCount", 0),
+    #             "timestamp": p.get("timestamp"),
+    #         }
+    #         for p in items
+    #     ]
+    #     n = len(posts) or 1
+    #     return {
+    #         "posts": posts,
+    #         "totalPosts": len(posts),
+    #         "avgLikes": round(sum(p["likesCount"] for p in posts) / n),
+    #         "avgComments": round(sum(p["commentsCount"] for p in posts) / n),
+    #     }

    async def fetch_facebook_page(self, page_url: str) -> dict | None:
        items = await self._run_actor("apify~facebook-pages-scraper", {"startUrls": [{"url": page_url}]})
--- a/app/integrations/llm/temp-prompt/report_prompt.txt
+++ b/app/integrations/llm/temp-prompt/report_prompt.txt
@ -67,14 +67,13 @@
 {channel_logos}
 - channel_logos.channel_logos[]에 각 채널의 로고 설명(logo_description)과 공식 로고 일치 여부(is_official)가 있습니다.
 - **facebook_audit.pages[].logo** 는 짧은 판정 타이틀로: is_official=true면 `"일치 (공식 로고)"`, false면 `"불일치 (비공식 변형)"`. 그리고 **facebook_audit.pages[].logo_description** 에 해당 채널의 logo_description(설명문)을 넣으세요.
- **instagram_audit.accounts[].profile_photo** 는 해당 채널 로고를 짧게 서술 (예: `"모델 사진 (브랜드 로고 아님)"`, `"VIEW 골드 로고"`). 긴 문장 말고 짧게.
 - 위 값들은 channel_logos 데이터 기반으로만 작성하고 추측하지 마세요.
 - 채널 간 로고 불일치(is_official=false)는 brand 일관성 진단(problem_diagnosis/weaknesses)에 반영하세요.

 ## clinic_snapshot / 채널 audit 작성 지침 (수집 데이터 그대로, 추측 금지)
 - clinic_snapshot.name 은 {clinic_name} 을 **그대로** 사용 (강남언니 표기명 '-본원' 등으로 바꾸지 말 것).
 - clinic_snapshot 의 overall_rating/total_reviews/staff_count/location/certifications/lead_doctor 는 강남언니({gangnam_unni}) 데이터의 값을 그대로 사용.
- instagram_audit.accounts: KR 인스타({instagram})·영문 인스타({instagram_en}) 데이터가 있으면 **각각 별도 계정**으로 넣고, handle/followers/posts/following 은 그 데이터 수치를 그대로. KR=language "KR"·label "인스타그램 KR", EN=language "EN"·label "인스타그램 EN".
+- **instagram_audit.accounts 는 반드시 빈 배열 []로 두세요.** 계정 정보는 시스템이 수집 데이터로 직접 채우니 LLM은 만들지 말고, instagram_audit.diagnosis(진단)만 작성하세요.
 - facebook_audit.pages: KR 페북({facebook})·영문 페북({facebook_en}) 데이터가 있으면 **각각 별도 페이지**로 넣고, url/page_name/followers 등은 그 데이터 그대로. language/label 동일 규칙.
 - 위 수치·URL·이름은 제공된 데이터에서 그대로 쓰고 절대 지어내지 마세요.

--- a/app/services/analysis.py
+++ b/app/services/analysis.py
@ -4,6 +4,7 @@ from common.db import fetchone, execute, fetch_raw, get_analysis_raw_data, save_
 from integrations.llm.llm_service import LLMService
 from integrations.llm.prompt import report_prompt, plan_prompt
 from integrations.llm.schemas.report import ReportOutput
+from services.instagram_audit import build_instagram_accounts
 from integrations.llm.schemas.plan import PlanOutput
 from models.status import AnalysisStatus

@ -134,14 +135,10 @@ async def _build_overrides(analysis_run_id: str) -> dict:
            "review_count": lead.get("reviews"),
        }

-    # ── instagram ─────────────────────────────────────────────────────────────
-    ig_patch: dict = {}
-    if instagram.get("username"):  ig_patch["handle"]       = instagram["username"]
-    if instagram.get("posts"):     ig_patch["posts"]        = instagram["posts"]
-    if instagram.get("followers"): ig_patch["followers"]    = instagram["followers"]
-    if instagram.get("following"): ig_patch["following"]    = instagram["following"]
-    if instagram.get("bio"):       ig_patch["bio"]          = instagram["bio"]
-    if instagram.get("username"):  ig_patch["profile_link"] = f"https://www.instagram.com/{instagram['username']}/"
+    # ── instagram (KR·EN 계정을 코드에서 구성 → LLM 출력 무시하고 교체) ──────────────
+    ig_patch = build_instagram_accounts(
+        instagram, hospital.get("instagramEn") or {}, hospital.get("channelLogos") or {},
+    )

    # ── facebook ──────────────────────────────────────────────────────────────
    fb_patch: dict = {}
@ -178,7 +175,7 @@ async def _build_overrides(analysis_run_id: str) -> dict:
    if snapshot:
        overrides["clinic_snapshot"] = snapshot
    if ig_patch:
-        overrides["instagram_audit"] = {"accounts": [ig_patch]}
+        overrides["instagram_audit"] = {"accounts": ig_patch}
    if fb_patch:
        overrides["facebook_audit"] = {"pages": [fb_patch]}
    if yt_patch:
@ -200,6 +197,8 @@ def _deep_merge(base: dict, overrides: dict) -> dict:

 def _patch_report(result: ReportOutput, overrides: dict) -> ReportOutput:
    merged = _deep_merge(result.model_dump(), overrides)
+    # Instagram accounts: the prompt tells the LLM to leave them as [], and code fills them from collected data (empty list when there is no data)
+    merged.setdefault("instagram_audit", {})["accounts"] = (overrides.get("instagram_audit") or {}).get("accounts") or []
    return ReportOutput(**merged)


--- a/app/services/instagram_audit.py
+++ b/app/services/instagram_audit.py
@ -0,0 +1,48 @@
+"""Build the Instagram audit accounts (KR/EN) from collected data.
+Fixed values (handle/followers/highlights/content_format, etc.) are all set in code — LLM output is ignored."""
+
+_MEDIA = {"GraphImage": "이미지", "GraphSidecar": "카드뉴스", "GraphVideo": "영상/릴스"}
+
+
+def _content_format(data: dict) -> str:
+    """Derive the content-format label from the media types in latestPosts (label order is fixed to _MEDIA's definition order)."""
+    seen = {_MEDIA.get(post.get("type")) for post in data.get("latestPosts") or []}
+    return "/".join(label for label in _MEDIA.values() if label in seen)
+
+
+def _logo_desc(channel_logos: dict, channel: str) -> str:
+    """Return the logo description for `channel` from channelLogos (vision result), or an empty string when absent."""
+    for entry in (channel_logos or {}).get("channel_logos") or []:  # `or []`: the key may be present with a null value
+        if isinstance(entry, dict) and entry.get("channel") == channel:  # ignore malformed (non-dict) entries
+            return entry.get("logo_description") or ""
+    return ""
+
+
+def _account(data: dict, language: str, label: str, channel: str, channel_logos: dict) -> dict:
+    """Build every InstagramAccount field from scraped values; null or missing values fall back to 0, empty string, or []."""
+    handle = data.get("username") or ""
+    return {
+        "handle":         handle,
+        "language":       language,
+        "label":          label,
+        "posts":          data.get("posts") or 0,
+        "followers":      data.get("followers") or 0,
+        "following":      data.get("following") or 0,
+        "category":       data.get("category") or "",
+        "profile_link":   f"https://www.instagram.com/{handle}/" if handle else "",
+        "highlights":     data.get("highlights") or [],
+        "reels_count":    0,  # reels scraper is not used
+        "content_format": _content_format(data),
+        "profile_photo":  _logo_desc(channel_logos, channel),
+        "bio":            data.get("bio") or "",
+    }
+
+
+def build_instagram_accounts(instagram: dict, instagram_en: dict, channel_logos: dict) -> list[dict]:
+    """Build the KR/EN Instagram account list (only sources that have a username are included)."""
+    specs = (
+        (instagram, "KR", "인스타그램 KR", "Instagram"),
+        (instagram_en, "EN", "인스타그램 EN", "Instagram EN"),
+    )
+    # KR is listed before EN; a source without a username is skipped.
+    return [_account(src, lang, label, ch, channel_logos) for src, lang, label, ch in specs if src.get("username")]