integration 1차 데이터 및 DB 정의, 테스트
parent
23e859217b
commit
d930679e90
|
|
@ -42,3 +42,5 @@ Thumbs.db
|
|||
|
||||
# Alembic
|
||||
alembic/versions/*.pyc
|
||||
|
||||
test_results/
|
||||
15
README.md
15
README.md
|
|
@ -0,0 +1,15 @@
|
|||
# o2o-infinith-backend
|
||||
|
||||
## 설치
|
||||
|
||||
**Docker**
|
||||
|
||||
```bash
|
||||
curl -fsSL https://get.docker.com | sh
|
||||
```
|
||||
|
||||
## 실행
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
-- 테이블 순서는 관계를 고려하여 한 번에 실행해도 에러가 발생하지 않게 정렬되었습니다.
|
||||
|
||||
-- instagram_data Table Create SQL
|
||||
-- 테이블 생성 SQL - instagram_data
|
||||
CREATE TABLE instagram_data
|
||||
(
|
||||
`id` INT NOT NULL AUTO_INCREMENT,
|
||||
`hospital_id` INT NOT NULL,
|
||||
`url` VARCHAR(500) NOT NULL,
|
||||
`raw_data` JSON NULL,
|
||||
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
PRIMARY KEY (id)
|
||||
);
|
||||
|
||||
-- Index 설정 SQL - instagram_data(hospital_id)
|
||||
CREATE INDEX IX_instagram_data_1
|
||||
ON instagram_data(hospital_id);
|
||||
|
||||
|
||||
-- facebook_data Table Create SQL
|
||||
-- 테이블 생성 SQL - facebook_data
|
||||
CREATE TABLE facebook_data
|
||||
(
|
||||
`id` INT NOT NULL AUTO_INCREMENT,
|
||||
`hospital_id` INT NOT NULL,
|
||||
`url` VARCHAR(500) NOT NULL,
|
||||
`raw_data` JSON NULL,
|
||||
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
PRIMARY KEY (id)
|
||||
);
|
||||
|
||||
-- Index 설정 SQL - facebook_data(hospital_id)
|
||||
CREATE INDEX IX_facebook_data_1
|
||||
ON facebook_data(hospital_id);
|
||||
|
||||
|
||||
-- naver_blog_data Table Create SQL
|
||||
-- 테이블 생성 SQL - naver_blog_data
|
||||
CREATE TABLE naver_blog_data
|
||||
(
|
||||
`id` INT NOT NULL AUTO_INCREMENT,
|
||||
`hospital_id` INT NOT NULL,
|
||||
`url` VARCHAR(500) NOT NULL,
|
||||
`raw_data` JSON NULL,
|
||||
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
PRIMARY KEY (id)
|
||||
);
|
||||
|
||||
-- Index 설정 SQL - naver_blog_data(hospital_id)
|
||||
CREATE INDEX IX_naver_blog_data_1
|
||||
ON naver_blog_data(hospital_id);
|
||||
|
||||
|
||||
-- hospital_baseinfo Table Create SQL
|
||||
-- 테이블 생성 SQL - hospital_baseinfo
|
||||
CREATE TABLE hospital_baseinfo
|
||||
(
|
||||
`hospital_id` INT NOT NULL AUTO_INCREMENT,
|
||||
`owner_user_id` INT NOT NULL,
|
||||
`hospital_name` VARCHAR(50) NOT NULL,
|
||||
`brn` VARCHAR(50) NOT NULL,
|
||||
`road_address` VARCHAR(100) NULL,
|
||||
`site_address` VARCHAR(100) NULL,
|
||||
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
`updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||
PRIMARY KEY (hospital_id)
|
||||
);
|
||||
|
||||
-- Index 설정 SQL - hospital_baseinfo(owner_user_id)
|
||||
CREATE INDEX IX_hospital_baseinfo_1
|
||||
ON hospital_baseinfo(owner_user_id);
|
||||
|
||||
|
||||
-- user_info Table Create SQL
|
||||
-- 테이블 생성 SQL - user_info
|
||||
CREATE TABLE user_info
|
||||
(
|
||||
`user_id` INT NOT NULL AUTO_INCREMENT,
|
||||
`username` VARCHAR(50) NOT NULL,
|
||||
`password` VARCHAR(50) NOT NULL,
|
||||
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
`updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||
PRIMARY KEY (user_id)
|
||||
);
|
||||
|
||||
-- youtube_data Table Create SQL
|
||||
CREATE TABLE youtube_data
|
||||
(
|
||||
`id` INT NOT NULL AUTO_INCREMENT,
|
||||
`hospital_id` INT NOT NULL,
|
||||
`url` VARCHAR(500) NOT NULL,
|
||||
`raw_data` JSON NULL,
|
||||
`created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
PRIMARY KEY (id)
|
||||
);
|
||||
|
||||
-- Index 설정 SQL - youtube_data(hospital_id)
|
||||
CREATE INDEX IX_youtube_data_1
|
||||
ON youtube_data(hospital_id);
|
||||
|
||||
|
|
@ -0,0 +1,82 @@
|
|||
import os
|
||||
import re
|
||||
import asyncio
|
||||
from http import HTTPMethod
|
||||
import httpx
|
||||
|
||||
REQUEST_TIMEOUT = 60
|
||||
|
||||
|
||||
def get_env(key: str) -> str:
|
||||
v = os.environ.get(key, "")
|
||||
if not v:
|
||||
raise EnvironmentError(f"Missing env: {key}")
|
||||
return v
|
||||
|
||||
async def http_request(
|
||||
method: HTTPMethod,
|
||||
url: str,
|
||||
*,
|
||||
label: str,
|
||||
headers: dict | None = None,
|
||||
params: dict | None = None,
|
||||
json_body: dict | None = None,
|
||||
timeout: int = REQUEST_TIMEOUT,
|
||||
max_retries: int = 0,
|
||||
) -> httpx.Response | None:
|
||||
async with httpx.AsyncClient() as client:
|
||||
for attempt in range(max_retries + 1):
|
||||
try:
|
||||
resp = await client.request(method, url, headers=headers, params=params, json=json_body, timeout=timeout)
|
||||
return resp
|
||||
except httpx.RequestError as e:
|
||||
if attempt < max_retries:
|
||||
print(f" [retry] {label} → {e}, attempt {attempt + 1}")
|
||||
await asyncio.sleep((attempt + 1) * 2)
|
||||
else:
|
||||
print(f" [error] {label} → {e}")
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
_SKIP_IG = {"p", "reel", "stories", "explore", "accounts", "about", "directory"}
|
||||
_SKIP_FB = {"sharer", "share", "dialog", "plugins", "groups", "events", "watch", "help"}
|
||||
|
||||
|
||||
def extract_social_handles(urls: list[str]) -> dict[str, list[str]]:
|
||||
result: dict[str, list[str]] = {"instagram": [], "youtube": [], "facebook": [], "naver_blog": [], "tiktok": []}
|
||||
|
||||
for url in urls:
|
||||
if not url:
|
||||
continue
|
||||
m = re.search(r"instagram\.com/([a-zA-Z0-9._]+)", url)
|
||||
if m and m.group(1).lower() not in _SKIP_IG:
|
||||
result["instagram"].append(m.group(1))
|
||||
|
||||
m = re.search(r"youtube\.com/(?:@([a-zA-Z0-9._-]+)|channel/(UC[a-zA-Z0-9_-]+)|c/([a-zA-Z0-9._-]+))", url)
|
||||
if m:
|
||||
result["youtube"].append(f"@{m.group(1)}" if m.group(1) else (m.group(2) or m.group(3) or ""))
|
||||
|
||||
m = re.search(r"facebook\.com/([a-zA-Z0-9._-]+)", url)
|
||||
if m and m.group(1).lower() not in _SKIP_FB:
|
||||
result["facebook"].append(m.group(1))
|
||||
|
||||
m = re.search(r"blog\.naver\.com/([a-zA-Z0-9_-]+)", url)
|
||||
if m:
|
||||
result["naver_blog"].append(m.group(1))
|
||||
|
||||
m = re.search(r"tiktok\.com/@([a-zA-Z0-9._-]+)", url)
|
||||
if m:
|
||||
result["tiktok"].append(m.group(1))
|
||||
|
||||
return {k: list(set(v)) for k, v in result.items()}
|
||||
|
||||
|
||||
def normalize_handle(platform: str, value: str) -> str:
|
||||
"""URL이 들어오면 핸들을 추출하고, 이미 핸들이면 그대로 반환."""
|
||||
if not value:
|
||||
return value
|
||||
if "://" in value or value.startswith("www."):
|
||||
handles = extract_social_handles([value]).get(platform, [])
|
||||
value = handles[0] if handles else value
|
||||
return value.lstrip("@") if platform != "youtube" else value
|
||||
|
|
@ -0,0 +1,145 @@
|
|||
from http import HTTPMethod
|
||||
from common.utils import http_request
|
||||
|
||||
APIFY_BASE = "https://api.apify.com/v2"
|
||||
|
||||
|
||||
class ApifyClient:
|
||||
def __init__(self, token: str, wait_for_finish: int = 120):
|
||||
self.token = token
|
||||
self.wait_for_finish = wait_for_finish
|
||||
|
||||
async def _run_actor(self, actor_id: str, input_data: dict) -> list[dict]:
|
||||
resp = await http_request(
|
||||
HTTPMethod.POST,
|
||||
url=f"{APIFY_BASE}/acts/{actor_id}/runs",
|
||||
params={"token": self.token, "waitForFinish": self.wait_for_finish},
|
||||
headers={"Content-Type": "application/json"},
|
||||
json_body=input_data,
|
||||
timeout=self.wait_for_finish + 10,
|
||||
label=f"apify:{actor_id.split('~')[-1]}",
|
||||
)
|
||||
if not resp or not resp.is_success:
|
||||
return []
|
||||
dataset_id = resp.json()["data"]["defaultDatasetId"]
|
||||
items_resp = await http_request(
|
||||
HTTPMethod.GET,
|
||||
url=f"{APIFY_BASE}/datasets/{dataset_id}/items",
|
||||
params={"token": self.token, "limit": 20},
|
||||
label=f"apify-dataset-{dataset_id}",
|
||||
)
|
||||
if not items_resp or not items_resp.is_success:
|
||||
return []
|
||||
return items_resp.json()
|
||||
|
||||
async def fetch_instagram_profile(self, handle: str) -> dict | None:
|
||||
items = await self._run_actor("apify~instagram-profile-scraper", {"usernames": [handle], "resultsLimit": 12})
|
||||
return items[0] if items else None
|
||||
|
||||
async def get_instagram_profile(self, handle: str) -> dict | None:
|
||||
profile = await self.fetch_instagram_profile(handle)
|
||||
if not profile or profile.get("error"):
|
||||
return None
|
||||
return {
|
||||
"username": profile["username"],
|
||||
"followers": profile.get("followersCount", 0),
|
||||
"following": profile.get("followsCount", 0),
|
||||
"posts": profile.get("postsCount", 0),
|
||||
"bio": profile.get("biography", ""),
|
||||
"isBusinessAccount": profile.get("isBusinessAccount", False),
|
||||
"externalUrl": profile.get("externalUrl"),
|
||||
"latestPosts": [
|
||||
{
|
||||
"type": p.get("type"),
|
||||
"likes": p.get("likesCount", 0),
|
||||
"comments": p.get("commentsCount", 0),
|
||||
"caption": (p.get("caption") or "")[:500],
|
||||
"timestamp": p.get("timestamp"),
|
||||
}
|
||||
for p in (profile.get("latestPosts") or [])[:12]
|
||||
],
|
||||
}
|
||||
|
||||
async def fetch_instagram_posts(self, handle: str, limit: int = 20) -> list[dict]:
|
||||
clean = handle.lstrip("@")
|
||||
return await self._run_actor("apify~instagram-post-scraper", {
|
||||
"directUrls": [f"https://www.instagram.com/{clean}/"],
|
||||
"resultsLimit": limit,
|
||||
})
|
||||
|
||||
async def get_instagram_posts(self, handle: str, limit: int = 20) -> dict:
|
||||
items = await self.fetch_instagram_posts(handle, limit)
|
||||
posts = [
|
||||
{
|
||||
"id": p["id"],
|
||||
"type": p.get("type"),
|
||||
"url": p.get("url"),
|
||||
"caption": (p.get("caption") or "")[:500],
|
||||
"hashtags": p.get("hashtags", []),
|
||||
"likesCount": p.get("likesCount", 0),
|
||||
"commentsCount": p.get("commentsCount", 0),
|
||||
"timestamp": p.get("timestamp"),
|
||||
}
|
||||
for p in items
|
||||
]
|
||||
n = len(posts) or 1
|
||||
return {
|
||||
"posts": posts,
|
||||
"totalPosts": len(posts),
|
||||
"avgLikes": round(sum(p["likesCount"] for p in posts) / n),
|
||||
"avgComments": round(sum(p["commentsCount"] for p in posts) / n),
|
||||
}
|
||||
|
||||
async def fetch_instagram_reels(self, handle: str, limit: int = 15) -> list[dict]:
|
||||
clean = handle.lstrip("@")
|
||||
return await self._run_actor("apify~instagram-reel-scraper", {
|
||||
"directUrls": [f"https://www.instagram.com/{clean}/reels/"],
|
||||
"resultsLimit": limit,
|
||||
})
|
||||
|
||||
async def get_instagram_reels(self, handle: str, limit: int = 15) -> dict:
|
||||
items = await self.fetch_instagram_reels(handle, limit)
|
||||
reels = [
|
||||
{
|
||||
"id": r["id"],
|
||||
"url": r.get("url"),
|
||||
"caption": (r.get("caption") or "")[:500],
|
||||
"hashtags": r.get("hashtags", []),
|
||||
"likesCount": r.get("likesCount", 0),
|
||||
"commentsCount": r.get("commentsCount", 0),
|
||||
"videoViewCount": r.get("videoViewCount", 0),
|
||||
"videoPlayCount": r.get("videoPlayCount", 0),
|
||||
"videoDuration": r.get("videoDuration", 0),
|
||||
"timestamp": r.get("timestamp"),
|
||||
}
|
||||
for r in items
|
||||
]
|
||||
n = len(reels) or 1
|
||||
return {
|
||||
"reels": reels,
|
||||
"totalReels": len(reels),
|
||||
"avgViews": round(sum(r["videoViewCount"] for r in reels) / n),
|
||||
"avgPlays": round(sum(r["videoPlayCount"] for r in reels) / n),
|
||||
}
|
||||
|
||||
async def fetch_facebook_page(self, page_url: str) -> dict | None:
|
||||
items = await self._run_actor("apify~facebook-pages-scraper", {"startUrls": [{"url": page_url}]})
|
||||
return items[0] if items else None
|
||||
|
||||
async def get_facebook_page(self, page_url: str) -> dict | None:
|
||||
page = await self.fetch_facebook_page(page_url)
|
||||
if not page:
|
||||
return None
|
||||
return {
|
||||
"pageName": page["title"],
|
||||
"pageUrl": page.get("pageUrl", page_url),
|
||||
"followers": page.get("followers", 0),
|
||||
"likes": page.get("likes", 0),
|
||||
"categories": page.get("categories", []),
|
||||
"email": page.get("email"),
|
||||
"phone": page.get("phone"),
|
||||
"website": page.get("website"),
|
||||
"address": page.get("address"),
|
||||
"intro": page.get("intro"),
|
||||
"rating": page.get("rating"),
|
||||
}
|
||||
|
|
@ -0,0 +1,128 @@
|
|||
from http import HTTPMethod
|
||||
from common.utils import get_env, http_request
|
||||
|
||||
FIRECRAWL_BASE = "https://api.firecrawl.dev/v1"
|
||||
|
||||
|
||||
class FirecrawlClient:
|
||||
def __init__(self, api_key: str):
|
||||
self.api_key = api_key
|
||||
|
||||
def _headers(self) -> dict:
|
||||
return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
|
||||
|
||||
async def scrape(self, url: str, json_options: dict, wait_for: int = 5000) -> dict | None:
|
||||
resp = await http_request(
|
||||
HTTPMethod.POST,
|
||||
url=f"{FIRECRAWL_BASE}/scrape",
|
||||
headers=self._headers(),
|
||||
json_body={"url": url, "formats": ["json", "links"], "jsonOptions": json_options, "waitFor": wait_for},
|
||||
label="firecrawl-scrape",
|
||||
)
|
||||
if not resp or not resp.is_success:
|
||||
return None
|
||||
return resp.json().get("data")
|
||||
|
||||
async def map(self, url: str, limit: int = 50) -> list[str]:
|
||||
resp = await http_request(
|
||||
HTTPMethod.POST,
|
||||
url=f"{FIRECRAWL_BASE}/map",
|
||||
headers=self._headers(),
|
||||
json_body={"url": url, "limit": limit},
|
||||
label="firecrawl-map",
|
||||
)
|
||||
if not resp or not resp.is_success:
|
||||
return []
|
||||
return resp.json().get("links", [])
|
||||
|
||||
async def search(self, query: str, limit: int = 5) -> list[dict]:
|
||||
resp = await http_request(
|
||||
HTTPMethod.POST,
|
||||
url=f"{FIRECRAWL_BASE}/search",
|
||||
headers=self._headers(),
|
||||
json_body={"query": query, "limit": limit},
|
||||
label="firecrawl-search",
|
||||
)
|
||||
if not resp or not resp.is_success:
|
||||
return []
|
||||
return resp.json().get("data", [])
|
||||
|
||||
async def fetch_social_buttons(self, url: str) -> list[dict]:
|
||||
data = await self.scrape(url, {
|
||||
"prompt": "Find ALL social media link URLs on this page — header, footer, sidebar, floating buttons. Extract actual href URLs for: Instagram, YouTube, Facebook, TikTok, Naver Blog, KakaoTalk.",
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"socialLinks": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {"platform": {"type": "string"}, "url": {"type": "string"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
if not data:
|
||||
return []
|
||||
return (data.get("json") or {}).get("socialLinks", [])
|
||||
|
||||
async def fetch_gangnam_unni(self, hospital_url: str) -> dict | None:
|
||||
resp = await http_request(
|
||||
HTTPMethod.POST,
|
||||
url=f"{FIRECRAWL_BASE}/scrape",
|
||||
headers=self._headers(),
|
||||
json_body={
|
||||
"url": hospital_url,
|
||||
"formats": ["json"],
|
||||
"jsonOptions": {
|
||||
"prompt": "Extract: hospital name, overall rating (out of 10), total review count, doctors with names/ratings/review counts/specialties, procedures, address, badges",
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"hospitalName": {"type": "string"},
|
||||
"rating": {"type": "number"},
|
||||
"totalReviews": {"type": "number"},
|
||||
"doctors": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string"},
|
||||
"rating": {"type": "number"},
|
||||
"reviews": {"type": "number"},
|
||||
"specialty": {"type": "string"},
|
||||
},
|
||||
},
|
||||
},
|
||||
"procedures": {"type": "array", "items": {"type": "string"}},
|
||||
"address": {"type": "string"},
|
||||
"badges": {"type": "array", "items": {"type": "string"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
"waitFor": 5000,
|
||||
},
|
||||
timeout=60,
|
||||
label="firecrawl-gangnamunni",
|
||||
)
|
||||
if not resp or not resp.is_success:
|
||||
return None
|
||||
raw = (resp.json().get("data") or {}).get("json")
|
||||
return {"sourceUrl": hospital_url, **raw} if raw else None
|
||||
|
||||
async def get_gangnam_unni(self, hospital_url: str) -> dict | None:
|
||||
raw = await self.fetch_gangnam_unni(hospital_url)
|
||||
if not raw or not raw.get("hospitalName"):
|
||||
return None
|
||||
return {
|
||||
"name": raw["hospitalName"],
|
||||
"rating": raw.get("rating"),
|
||||
"ratingScale": "/10",
|
||||
"totalReviews": raw.get("totalReviews", 0),
|
||||
"doctors": (raw.get("doctors") or [])[:10],
|
||||
"procedures": raw.get("procedures", []),
|
||||
"address": raw.get("address", ""),
|
||||
"badges": raw.get("badges", []),
|
||||
"sourceUrl": raw["sourceUrl"],
|
||||
}
|
||||
|
|
@ -0,0 +1,61 @@
|
|||
from http import HTTPMethod
|
||||
from common.utils import http_request
|
||||
|
||||
PLACES_BASE = "https://places.googleapis.com/v1"
|
||||
FIELD_MASK = ",".join([
|
||||
"places.id", "places.displayName", "places.formattedAddress",
|
||||
"places.rating", "places.userRatingCount",
|
||||
"places.internationalPhoneNumber", "places.websiteUri",
|
||||
"places.googleMapsUri", "places.primaryTypeDisplayName",
|
||||
"places.regularOpeningHours", "places.reviews",
|
||||
])
|
||||
|
||||
|
||||
class GooglePlacesClient:
|
||||
def __init__(self, api_key: str):
|
||||
self.api_key = api_key
|
||||
|
||||
def _headers(self) -> dict:
|
||||
return {
|
||||
"Content-Type": "application/json",
|
||||
"X-Goog-Api-Key": self.api_key,
|
||||
"X-Goog-FieldMask": FIELD_MASK,
|
||||
}
|
||||
|
||||
async def fetch_place(self, query: str) -> dict | None:
|
||||
resp = await http_request(
|
||||
HTTPMethod.POST,
|
||||
url=f"{PLACES_BASE}/places:searchText",
|
||||
headers=self._headers(),
|
||||
json_body={"textQuery": query, "languageCode": "ko", "regionCode": "KR", "maxResultCount": 3},
|
||||
timeout=15,
|
||||
label="google-places",
|
||||
)
|
||||
if not resp or not resp.is_success:
|
||||
return None
|
||||
places = resp.json().get("places", [])
|
||||
return places[0] if places else None
|
||||
|
||||
async def get_place(self, query: str) -> dict | None:
|
||||
p = await self.fetch_place(query)
|
||||
if not p:
|
||||
return None
|
||||
return {
|
||||
"name": (p.get("displayName") or {}).get("text", ""),
|
||||
"rating": p.get("rating"),
|
||||
"reviewCount": p.get("userRatingCount", 0),
|
||||
"address": p.get("formattedAddress", ""),
|
||||
"phone": p.get("internationalPhoneNumber", ""),
|
||||
"clinicWebsite": p.get("websiteUri", ""),
|
||||
"mapsUrl": p.get("googleMapsUri", ""),
|
||||
"placeId": p.get("id", ""),
|
||||
"category": (p.get("primaryTypeDisplayName") or {}).get("text", ""),
|
||||
"topReviews": [
|
||||
{
|
||||
"stars": r.get("rating", 0),
|
||||
"text": ((r.get("text") or {}).get("text", ""))[:500],
|
||||
"date": r.get("publishTime", ""),
|
||||
}
|
||||
for r in (p.get("reviews") or [])[:10]
|
||||
],
|
||||
}
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
from .service import LLMService
|
||||
from .prompt import Prompt
|
||||
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class Prompt:
|
||||
def __init__(
|
||||
self,
|
||||
template: str,
|
||||
model: str,
|
||||
input_class: type[BaseModel],
|
||||
output_class: type[BaseModel],
|
||||
):
|
||||
self.template = template
|
||||
self.model = model
|
||||
self.input_class = input_class
|
||||
self.output_class = output_class
|
||||
|
||||
def build(self, input_data: dict) -> str:
|
||||
verified = self.input_class(**input_data)
|
||||
return self.template.format(**verified.model_dump())
|
||||
|
|
@ -0,0 +1,61 @@
|
|||
from pydantic import BaseModel
|
||||
from openai import AsyncOpenAI
|
||||
from common.utils import get_env
|
||||
from .prompt import Prompt
|
||||
|
||||
|
||||
class LLMResponseError(Exception):
|
||||
def __init__(self, status: str, code: str = None, message: str = None):
|
||||
self.status = status
|
||||
self.code = code
|
||||
self.message = message
|
||||
super().__init__(f"LLM response failed: status={status}, code={code}, message={message}")
|
||||
|
||||
|
||||
class LLMService:
|
||||
def __init__(self, provider: str = "openai", max_retries: int = 2):
|
||||
self.max_retries = max_retries
|
||||
match provider:
|
||||
case "openai":
|
||||
self.client = AsyncOpenAI(api_key=get_env("OPENAI_API_KEY"))
|
||||
case "perplexity":
|
||||
self.client = AsyncOpenAI(
|
||||
api_key=get_env("PERPLEXITY_API_KEY"),
|
||||
base_url="https://api.perplexity.ai",
|
||||
)
|
||||
case "gemini":
|
||||
self.client = AsyncOpenAI(
|
||||
api_key=get_env("GEMINI_API_KEY"),
|
||||
base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
|
||||
)
|
||||
case _:
|
||||
raise NotImplementedError(f"Unknown provider: {provider}")
|
||||
|
||||
async def generate(
|
||||
self,
|
||||
prompt: Prompt,
|
||||
input_data: dict,
|
||||
) -> BaseModel:
|
||||
prompt_text = prompt.build(input_data)
|
||||
last_error = None
|
||||
|
||||
for attempt in range(self.max_retries + 1):
|
||||
response = await self.client.beta.chat.completions.parse(
|
||||
model=prompt.model,
|
||||
messages=[{"role": "user", "content": prompt_text}],
|
||||
response_format=prompt.output_class,
|
||||
)
|
||||
choice = response.choices[0]
|
||||
finish_reason = choice.finish_reason
|
||||
|
||||
if finish_reason == "stop":
|
||||
return choice.message.parsed
|
||||
|
||||
if finish_reason == "length":
|
||||
last_error = LLMResponseError("incomplete", finish_reason, "max tokens reached")
|
||||
elif finish_reason == "content_filter":
|
||||
last_error = LLMResponseError("failed", finish_reason, "blocked by content filter")
|
||||
else:
|
||||
last_error = LLMResponseError("failed", finish_reason, f"unexpected finish_reason: {finish_reason}")
|
||||
|
||||
raise last_error
|
||||
|
|
@ -0,0 +1,89 @@
|
|||
import re
|
||||
from http import HTTPMethod
|
||||
from common.utils import http_request
|
||||
|
||||
NAVER_BASE = "https://openapi.naver.com/v1/search"
|
||||
|
||||
|
||||
class NaverClient:
|
||||
def __init__(self, client_id: str, client_secret: str):
|
||||
self.client_id = client_id
|
||||
self.client_secret = client_secret
|
||||
|
||||
def _headers(self) -> dict:
|
||||
return {
|
||||
"X-Naver-Client-Id": self.client_id,
|
||||
"X-Naver-Client-Secret": self.client_secret,
|
||||
}
|
||||
|
||||
async def fetch_blog_search(self, query: str, display: int = 5) -> list[dict]:
|
||||
resp = await http_request(
|
||||
HTTPMethod.GET,
|
||||
url=f"{NAVER_BASE}/blog.json",
|
||||
headers=self._headers(),
|
||||
params={"query": query, "display": display, "sort": "sim"},
|
||||
label="naver-blog",
|
||||
)
|
||||
if not resp or not resp.is_success:
|
||||
return []
|
||||
return resp.json().get("items", [])
|
||||
|
||||
async def fetch_web_search(self, query: str, display: int = 10) -> list[dict]:
|
||||
resp = await http_request(
|
||||
HTTPMethod.GET,
|
||||
url=f"{NAVER_BASE}/webkr.json",
|
||||
headers=self._headers(),
|
||||
params={"query": query, "display": display},
|
||||
label="naver-web",
|
||||
)
|
||||
if not resp or not resp.is_success:
|
||||
return []
|
||||
return resp.json().get("items", [])
|
||||
|
||||
async def fetch_local_search(self, query: str, display: int = 5) -> list[dict]:
|
||||
resp = await http_request(
|
||||
HTTPMethod.GET,
|
||||
url=f"{NAVER_BASE}/local.json",
|
||||
headers=self._headers(),
|
||||
params={"query": query, "display": display, "sort": "comment"},
|
||||
label="naver-local",
|
||||
)
|
||||
if not resp or not resp.is_success:
|
||||
return []
|
||||
return resp.json().get("items", [])
|
||||
|
||||
async def fetch_blog_rss(self, blog_handle: str) -> str | None:
|
||||
resp = await http_request(
|
||||
HTTPMethod.GET,
|
||||
url=f"https://rss.blog.naver.com/{blog_handle}.xml",
|
||||
timeout=15,
|
||||
label="naver-rss",
|
||||
)
|
||||
if not resp or not resp.is_success:
|
||||
return None
|
||||
return resp.text
|
||||
|
||||
async def get_blog_rss(self, blog_handle: str) -> dict | None:
|
||||
xml = await self.fetch_blog_rss(blog_handle)
|
||||
if not xml:
|
||||
return None
|
||||
posts = []
|
||||
for m in re.finditer(r"<item>([\s\S]*?)</item>", xml):
|
||||
block = m.group(1)
|
||||
title = re.search(r"<title><!\[CDATA\[(.*?)\]\]></title>", block) or re.search(r"<title>(.*?)</title>", block)
|
||||
link = re.search(r"<link>(.*?)</link>", block)
|
||||
date = re.search(r"<pubDate>(.*?)</pubDate>", block)
|
||||
desc = re.search(r"<description><!\[CDATA\[(.*?)\]\]></description>", block) or re.search(r"<description>(.*?)</description>", block)
|
||||
posts.append({
|
||||
"title": title.group(1) if title else "",
|
||||
"link": link.group(1) if link else "",
|
||||
"postDate": date.group(1) if date else "",
|
||||
"description": re.sub(r"<[^>]*>", "", desc.group(1) if desc else "").strip()[:150],
|
||||
})
|
||||
total_match = re.search(r"<totalCount>(\d+)</totalCount>", xml)
|
||||
return {
|
||||
"officialBlogUrl": f"https://blog.naver.com/{blog_handle}",
|
||||
"officialBlogHandle": blog_handle,
|
||||
"totalResults": int(total_match.group(1)) if total_match else len(posts),
|
||||
"posts": posts[:10],
|
||||
}
|
||||
|
|
@ -0,0 +1,123 @@
|
|||
from http import HTTPMethod
|
||||
from common.utils import http_request
|
||||
|
||||
YT = "https://www.googleapis.com/youtube/v3"
|
||||
|
||||
|
||||
class YouTubeClient:
|
||||
def __init__(self, api_key: str):
|
||||
self.api_key = api_key
|
||||
|
||||
async def _resolve_channel_id(self, handle: str) -> str:
|
||||
h = handle.lstrip("@")
|
||||
if h.startswith("UC") and len(h) == 24:
|
||||
return h
|
||||
for param in ("forHandle", "forUsername"):
|
||||
resp = await http_request(
|
||||
HTTPMethod.GET,
|
||||
url=f"{YT}/channels",
|
||||
params={"part": "id", param: h, "key": self.api_key},
|
||||
label="yt-resolve",
|
||||
)
|
||||
if resp and resp.is_success:
|
||||
items = resp.json().get("items", [])
|
||||
if items:
|
||||
return items[0]["id"]
|
||||
return ""
|
||||
|
||||
async def fetch_channel(self, handle_or_id: str) -> dict | None:
|
||||
channel_id = await self._resolve_channel_id(handle_or_id)
|
||||
if not channel_id:
|
||||
return None
|
||||
|
||||
resp = await http_request(
|
||||
HTTPMethod.GET,
|
||||
url=f"{YT}/channels",
|
||||
params={"part": "snippet,statistics", "id": channel_id, "key": self.api_key},
|
||||
label="yt-channel",
|
||||
)
|
||||
if not resp or not resp.is_success:
|
||||
return None
|
||||
items = resp.json().get("items", [])
|
||||
if not items:
|
||||
return None
|
||||
channel = items[0]
|
||||
|
||||
video_ids: list[str] = []
|
||||
resp = await http_request(
|
||||
HTTPMethod.GET,
|
||||
url=f"{YT}/search",
|
||||
params={
|
||||
"part": "snippet",
|
||||
"channelId": channel_id,
|
||||
"order": "viewCount",
|
||||
"type": "video",
|
||||
"maxResults": 10,
|
||||
"key": self.api_key,
|
||||
},
|
||||
label="yt-search",
|
||||
)
|
||||
if resp and resp.is_success:
|
||||
video_ids = [i["id"]["videoId"] for i in resp.json().get("items", []) if i.get("id", {}).get("videoId")]
|
||||
|
||||
videos: list[dict] = []
|
||||
if video_ids:
|
||||
resp = await http_request(
|
||||
HTTPMethod.GET,
|
||||
url=f"{YT}/videos",
|
||||
params={
|
||||
"part": "snippet,statistics,contentDetails",
|
||||
"id": ",".join(video_ids),
|
||||
"key": self.api_key,
|
||||
},
|
||||
label="yt-videos",
|
||||
)
|
||||
if resp and resp.is_success:
|
||||
videos = resp.json().get("items", [])[:10]
|
||||
|
||||
return {"channelId": channel_id, "channel": channel, "videos": videos}
|
||||
|
||||
async def get_channel(self, handle_or_id: str) -> dict | None:
|
||||
raw = await self.fetch_channel(handle_or_id)
|
||||
if not raw:
|
||||
return None
|
||||
ch = raw["channel"]
|
||||
stats = ch.get("statistics", {})
|
||||
snippet = ch.get("snippet", {})
|
||||
return {
|
||||
"channelId": raw["channelId"],
|
||||
"channelName": snippet.get("title"),
|
||||
"handle": snippet.get("customUrl"),
|
||||
"description": snippet.get("description", ""),
|
||||
"publishedAt": snippet.get("publishedAt"),
|
||||
"subscribers": int(stats.get("subscriberCount", 0)),
|
||||
"totalViews": int(stats.get("viewCount", 0)),
|
||||
"totalVideos": int(stats.get("videoCount", 0)),
|
||||
"videos": [
|
||||
{
|
||||
"title": v.get("snippet", {}).get("title"),
|
||||
"views": int(v.get("statistics", {}).get("viewCount", 0)),
|
||||
"likes": int(v.get("statistics", {}).get("likeCount", 0)),
|
||||
"comments": int(v.get("statistics", {}).get("commentCount", 0)),
|
||||
"date": v.get("snippet", {}).get("publishedAt"),
|
||||
"duration": v.get("contentDetails", {}).get("duration"),
|
||||
"url": f"https://www.youtube.com/watch?v={v['id']}",
|
||||
}
|
||||
for v in raw["videos"]
|
||||
],
|
||||
}
|
||||
|
||||
async def search_channels(self, query: str, max_results: int = 3) -> list[str]:
|
||||
resp = await http_request(
|
||||
HTTPMethod.GET,
|
||||
url=f"{YT}/search",
|
||||
params={"part": "snippet", "type": "channel", "q": query, "maxResults": max_results, "key": self.api_key},
|
||||
label="yt-search-channels",
|
||||
)
|
||||
if not resp or not resp.is_success:
|
||||
return []
|
||||
return [
|
||||
i.get("snippet", {}).get("channelId") or i.get("id", {}).get("channelId")
|
||||
for i in resp.json().get("items", [])
|
||||
if i.get("snippet", {}).get("channelId") or i.get("id", {}).get("channelId")
|
||||
]
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
import asyncio
|
||||
import json
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv("../.env")
|
||||
|
||||
from common.utils import normalize_handle
|
||||
from integrations.youtube import YouTubeClient
|
||||
from integrations.apify import ApifyClient
|
||||
from integrations.naver import NaverClient
|
||||
from integrations.firecrawl import FirecrawlClient
|
||||
|
||||
INPUT = {
|
||||
"youtube": "@banobagips",
|
||||
"instagram": ["@banobagi_ps"],
|
||||
"facebook": "BanobagiPlasticSurgery",
|
||||
"naver_blog": "https://blog.naver.com/banobagiprs",
|
||||
"gangnam_unni": "https://www.gangnamunni.com/hospitals/23",
|
||||
}
|
||||
|
||||
OUT_DIR = "../test_results"
|
||||
|
||||
|
||||
def save(name: str, data) -> None:
|
||||
os.makedirs(OUT_DIR, exist_ok=True)
|
||||
path = os.path.join(OUT_DIR, f"{name}.json")
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
json.dump(data, f, ensure_ascii=False)
|
||||
print(f"saved → {path}")
|
||||
|
||||
|
||||
async def main():
|
||||
yt = YouTubeClient(api_key=os.environ["YOUTUBE_API_KEY"])
|
||||
apify = ApifyClient(token=os.environ["APIFY_API_TOKEN"])
|
||||
naver = NaverClient(client_id=os.environ["NAVER_CLIENT_ID"], client_secret=os.environ["NAVER_CLIENT_SECRET"])
|
||||
firecrawl = FirecrawlClient(api_key=os.environ["FIRECRAWL_API_KEY"])
|
||||
|
||||
yt_handle = normalize_handle("youtube", INPUT["youtube"])
|
||||
ig_handle = normalize_handle("instagram", INPUT["instagram"][0])
|
||||
fb_handle = normalize_handle("facebook", INPUT["facebook"])
|
||||
naver_handle = normalize_handle("naver_blog", INPUT["naver_blog"])
|
||||
|
||||
save("youtube", await yt.fetch_channel(yt_handle))
|
||||
save("instagram_profile", await apify.fetch_instagram_profile(ig_handle))
|
||||
# save("instagram_posts", await apify.fetch_instagram_posts(ig_handle))
|
||||
# save("instagram_reels", await apify.fetch_instagram_reels(ig_handle))
|
||||
save("facebook", await apify.fetch_facebook_page(f"https://www.facebook.com/{fb_handle}"))
|
||||
save("naver_blog", await naver.fetch_blog_rss(naver_handle))
|
||||
save("gangnam_unni", await firecrawl.fetch_gangnam_unni(INPUT["gangnam_unni"]))
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
|
|
@ -4,6 +4,7 @@ pydantic==2.13.2
|
|||
python-dotenv==1.2.2
|
||||
redis==7.4.0
|
||||
httpx==0.28.1
|
||||
openai==2.32.0
|
||||
python-jose[cryptography]==3.5.0
|
||||
passlib[bcrypt]==1.7.4
|
||||
python-multipart==0.0.26
|
||||
|
|
|
|||
Loading…
Reference in New Issue