from http import HTTPMethod

from common.utils import get_env, http_request

FIRECRAWL_BASE = "https://api.firecrawl.dev/v1"


class FirecrawlClient:
    """Async client for the Firecrawl v1 ``scrape``, ``map`` and ``search`` endpoints.

    Every public method is best-effort: when the request fails (no response,
    or a non-success status) it returns ``None`` or an empty list instead of
    raising.
    """

    def __init__(self, api_key: str):
        """Store the API key that is sent as a bearer token on every request."""
        self.api_key = api_key

    def _headers(self) -> dict:
        """Return the JSON content-type and bearer-authorization headers."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

    async def _post(
        self,
        path: str,
        body: dict,
        *,
        label: str,
        timeout: int | None = None,
    ) -> dict | None:
        """POST ``body`` to ``FIRECRAWL_BASE + path`` and return the decoded JSON object.

        Args:
            path: Endpoint path starting with ``/`` (e.g. ``"/scrape"``).
            body: JSON-serialisable request body.
            label: Label forwarded to ``http_request``.
            timeout: Forwarded to ``http_request`` only when given, so calls
                that never specified one keep ``http_request``'s own default.

        Returns:
            The decoded response payload when it is a JSON object, otherwise
            ``None`` (missing response, non-success status, undecodable body,
            or a payload that is not an object).
        """
        extra = {} if timeout is None else {"timeout": timeout}
        resp = await http_request(
            HTTPMethod.POST,
            url=f"{FIRECRAWL_BASE}{path}",
            headers=self._headers(),
            json_body=body,
            label=label,
            **extra,
        )
        if not resp or not resp.is_success:
            return None
        try:
            payload = resp.json()
        except ValueError:
            # NOTE(review): assumes resp.json() raises a ValueError subclass
            # (as json.JSONDecodeError is) on a malformed body -- confirm
            # against the response type returned by http_request.
            return None
        # Callers use .get() on the payload, so anything but an object is
        # treated like a failed request.
        return payload if isinstance(payload, dict) else None

    async def _scrape(
        self,
        url: str,
        formats: list[str],
        json_options: dict,
        *,
        label: str,
        wait_for: int = 5000,
        timeout: int | None = None,
    ) -> dict | None:
        """Call ``/scrape`` and return the whole response payload, or ``None`` on failure."""
        return await self._post(
            "/scrape",
            {
                "url": url,
                "formats": formats,
                "jsonOptions": json_options,
                "waitFor": wait_for,
            },
            label=label,
            timeout=timeout,
        )

    async def scrape(self, url: str, json_options: dict, wait_for: int = 5000) -> dict | None:
        """Scrape ``url`` requesting the ``json`` and ``links`` formats.

        Args:
            url: Page to scrape.
            json_options: Value sent as ``jsonOptions`` (the callers in this
                class pass a ``prompt`` and a ``schema``).
            wait_for: Value sent as ``waitFor``.

        Returns:
            The ``data`` member of the response, or ``None`` when the request
            failed or the member is absent.
        """
        payload = await self._scrape(
            url,
            ["json", "links"],
            json_options,
            label="firecrawl-scrape",
            wait_for=wait_for,
        )
        if payload is None:
            return None
        return payload.get("data")

    async def map(self, url: str, limit: int = 50) -> list[str]:
        """Call ``/map`` for ``url`` and return the response's ``links`` list.

        Returns an empty list when the request fails or ``links`` is missing
        or null.
        """
        payload = await self._post(
            "/map",
            {"url": url, "limit": limit},
            label="firecrawl-map",
        )
        if payload is None:
            return []
        # `or []` also covers an explicit null, which .get(key, []) would pass through.
        return payload.get("links") or []

    async def search(self, query: str, limit: int = 5) -> list[dict]:
        """Call ``/search`` for ``query`` and return the response's ``data`` list.

        Returns an empty list when the request fails or ``data`` is missing
        or null.
        """
        payload = await self._post(
            "/search",
            {"query": query, "limit": limit},
            label="firecrawl-search",
        )
        if payload is None:
            return []
        return payload.get("data") or []

    async def fetch_social_buttons(self, url: str) -> list[dict]:
        """Extract social media links from ``url`` via :meth:`scrape`.

        Returns:
            The extracted ``socialLinks`` list (items are requested with
            ``platform`` and ``url`` keys), or an empty list when nothing
            was extracted.
        """
        data = await self.scrape(
            url,
            {
                "prompt": (
                    "Find ALL social media link URLs on this page — header, footer, "
                    "sidebar, floating buttons. "
                    "Extract actual href URLs for: Instagram, YouTube, Facebook, "
                    "TikTok, Naver Blog, KakaoTalk."
                ),
                "schema": {
                    "type": "object",
                    "properties": {
                        "socialLinks": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "platform": {"type": "string"},
                                    "url": {"type": "string"},
                                },
                            },
                        },
                    },
                },
            },
        )
        if not data:
            return []
        return (data.get("json") or {}).get("socialLinks") or []

    async def fetch_clinic_info(self, url: str) -> dict | None:
        """Extract clinic details and branding from the page at ``url``.

        Returns:
            ``None`` when the request fails. Otherwise a dict with the scalar
            fields ``clinicName``, ``clinicNameEn``, ``address``, ``phone``,
            ``businessHours`` and ``slogan`` (each ``None`` when not
            extracted), the lists ``services``, ``doctors`` and ``siteLinks``,
            the ``branding`` dict, and ``sourceUrl`` set to ``url``.
        """
        # NOTE: socialMedia extraction is disabled in this schema;
        # fetch_social_buttons extracts social links separately.
        json_options = {
            "prompt": (
                "Extract: clinic name (Korean), clinic name (English), address, "
                "phone, business hours, slogan, services offered, "
                "doctors with name/title/specialty, "
                "brand identity (primary/accent/background/text colors in hex, "
                "heading/body fonts, logo URL, favicon URL)"
            ),
            "schema": {
                "type": "object",
                "properties": {
                    "clinicName": {"type": "string"},
                    "clinicNameEn": {"type": "string"},
                    "address": {"type": "string"},
                    "phone": {"type": "string"},
                    "businessHours": {"type": "string"},
                    "slogan": {"type": "string"},
                    "services": {"type": "array", "items": {"type": "string"}},
                    "doctors": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {"type": "string"},
                                "title": {"type": "string"},
                                "specialty": {"type": "string"},
                            },
                        },
                    },
                    "branding": {
                        "type": "object",
                        "properties": {
                            "primaryColor": {"type": "string"},
                            "accentColor": {"type": "string"},
                            "backgroundColor": {"type": "string"},
                            "textColor": {"type": "string"},
                            "headingFont": {"type": "string"},
                            "bodyFont": {"type": "string"},
                            "logoUrl": {"type": "string"},
                            "faviconUrl": {"type": "string"},
                        },
                    },
                },
            },
        }
        payload = await self._scrape(
            url,
            ["json", "links"],
            json_options,
            label="firecrawl-clinic-info",
            timeout=60,
        )
        if payload is None:
            return None

        data = payload.get("data") or {}
        info = data.get("json") or {}
        return {
            "clinicName": info.get("clinicName"),
            "clinicNameEn": info.get("clinicNameEn"),
            "address": info.get("address"),
            "phone": info.get("phone"),
            "businessHours": info.get("businessHours"),
            "slogan": info.get("slogan"),
            # `or` rather than a .get() default: an extracted null must still
            # become an empty container for callers.
            "services": info.get("services") or [],
            "doctors": info.get("doctors") or [],
            "branding": info.get("branding") or {},
            "siteLinks": data.get("links") or [],
            "sourceUrl": url,
        }

    async def fetch_gangnam_unni(self, hospital_url: str) -> dict | None:
        """Extract raw hospital data from the page at ``hospital_url``.

        Returns:
            The extracted JSON object with ``sourceUrl`` set to
            ``hospital_url``, or ``None`` when the request fails or no JSON
            object was extracted.
        """
        json_options = {
            "prompt": (
                "Extract: hospital name, overall rating (out of 10), "
                "total review count, number of major staffs, "
                "all doctor with names/ratings/review counts/specialties"
                "(please check html, there are not only 4 doctors!), "
                "procedures, address, badges."
            ),
            "schema": {
                "type": "object",
                "properties": {
                    "hospitalName": {"type": "string"},
                    "rating": {"type": "number"},
                    "totalReviews": {"type": "number"},
                    "totalMajorStaffs": {"type": "number"},
                    "doctors": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {"type": "string"},
                                "rating": {"type": "number"},
                                "reviews": {"type": "number"},
                                "specialty": {"type": "string"},
                            },
                        },
                    },
                    "procedures": {"type": "array", "items": {"type": "string"}},
                    "address": {"type": "string"},
                    "badges": {"type": "array", "items": {"type": "string"}},
                },
            },
        }
        payload = await self._scrape(
            hospital_url,
            ["json"],
            json_options,
            label="firecrawl-gangnamunni",
            timeout=60,
        )
        if payload is None:
            return None

        raw = (payload.get("data") or {}).get("json")
        if not raw or not isinstance(raw, dict):
            return None
        result = {"sourceUrl": hospital_url, **raw}
        # Re-assert after the merge so an extracted "sourceUrl" key can never
        # replace the URL that was actually requested.
        result["sourceUrl"] = hospital_url
        return result

    async def get_gangnam_unni(self, hospital_url: str) -> dict | None:
        """Return normalised hospital data for ``hospital_url``.

        Returns:
            ``None`` when nothing was fetched or no ``hospitalName`` was
            extracted. Otherwise a dict with ``name``, ``rating``,
            ``ratingScale`` (always ``"/10"``), ``totalReviews``, ``doctors``,
            ``totalMajorStaffs``, ``procedures``, ``address``, ``badges`` and
            ``sourceUrl``. Missing or null counts become ``0``, lists ``[]``
            and the address ``""``; ``rating`` stays ``None`` when absent.
        """
        raw = await self.fetch_gangnam_unni(hospital_url)
        if not raw or not raw.get("hospitalName"):
            return None
        return {
            "name": raw["hospitalName"],
            "rating": raw.get("rating"),
            "ratingScale": "/10",
            "totalReviews": raw.get("totalReviews") or 0,
            "doctors": raw.get("doctors") or [],
            "totalMajorStaffs": raw.get("totalMajorStaffs") or 0,
            "procedures": raw.get("procedures") or [],
            "address": raw.get("address") or "",
            "badges": raw.get("badges") or [],
            "sourceUrl": raw["sourceUrl"],
        }