import asyncio
import json
import re

import aiohttp
import bs4

from app.utils.logger import get_logger
from config import crawler_settings

# Logger setup
logger = get_logger("scraper")


class GraphQLException(Exception):
    """Raised when a GraphQL request fails."""

    pass


class CrawlingTimeoutException(Exception):
    """Raised when a crawling request times out."""

    pass


class NvMapScraper:
    """Naver Map GraphQL API scraper.

    Crawls accommodation/place information from Naver Map.
    """

    GRAPHQL_URL: str = "https://pcmap-api.place.naver.com/graphql"
    REQUEST_TIMEOUT = 120  # seconds
    # Prefix for building a globally unique place ID. This is Naver-specific;
    # other data sources (e.g. Google) should use their own identifier.
    data_source_identifier = "nv"

    OVERVIEW_QUERY: str = """
    query getAccommodation($id: String!, $deviceType: String) {
      business: placeDetail(input: {id: $id, isNx: true, deviceType: $deviceType}) {
        base {
          id
          name
          category
          roadAddress
          address
          phone
          virtualPhone
          microReviews
          conveniences
          visitorReviewsTotal
        }
        images {
          images {
            origin
            url
          }
        }
        cpImages(source: [ugcImage]) {
          images {
            origin
            url
          }
        }
      }
    }"""

    DEFAULT_HEADERS: dict = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Referer": "https://map.naver.com/",
        "Origin": "https://map.naver.com",
        "Content-Type": "application/json",
    }

    def __init__(self, url: str, cookies: str | None = None):
        """Initialize the scraper.

        Args:
            url: Naver Map place URL (full URL or naver.me short URL).
            cookies: Optional cookie header string; falls back to
                crawler_settings.NAVER_COOKIES when omitted.
        """
        self.url = url
        self.cookies = (
            cookies if cookies is not None else crawler_settings.NAVER_COOKIES
        )
        # Results populated by scrap(); all None until a scrape succeeds.
        self.scrap_type: str | None = None
        self.rawdata: dict | None = None
        self.image_link_list: list[str] | None = None
        self.base_info: dict | None = None
        self.facility_info: str | None = None
        # Fix: declared here so reading it before scrap() succeeds yields
        # None instead of raising AttributeError (it was previously created
        # ad hoc inside scrap()).
        self.place_id: str | None = None

    def _get_request_headers(self) -> dict:
        """Return request headers, including the Cookie header when set."""
        headers = self.DEFAULT_HEADERS.copy()
        if self.cookies:
            headers["Cookie"] = self.cookies
        return headers

    async def parse_url(self) -> str:
        """Extract the place ID from the URL.

        Short URLs (naver.me) are first resolved to their full form.

        Returns:
            The numeric place ID as a string.

        Raises:
            GraphQLException: If the URL contains no place ID.
        """
        place_pattern = r"/place/(\d+)"
        # Resolve short URLs when the URL has no "place" segment.
        if "place" not in self.url:
            if "naver.me" in self.url:
                async with aiohttp.ClientSession() as session:
                    async with session.get(self.url) as response:
                        # response.url reflects the final URL after redirects.
                        self.url = str(response.url)
            else:
                raise GraphQLException("This URL does not contain a place ID")
        match = re.search(place_pattern, self.url)
        if not match:
            raise GraphQLException("Failed to parse place ID from URL")
        return match[1]

    async def scrap(self):
        """Scrape place data and populate the instance's result attributes.

        On GraphQL failure, marks scrap_type as "Playwright" for a future
        browser-based fallback (not implemented yet). A timeout
        (CrawlingTimeoutException) still propagates to the caller.
        """
        try:
            place_id = await self.parse_url()
            data = await self._call_get_accommodation(place_id)
            self.rawdata = data
            fac_data = await self._get_facility_string(place_id)
            # Naver-specific prefix; other data sources (e.g. Google) should
            # use their own unique identifier.
            self.place_id = self.data_source_identifier + place_id
            self.rawdata["facilities"] = fac_data
            self.image_link_list = [
                nv_image["origin"]
                for nv_image in data["data"]["business"]["images"]["images"]
            ]
            self.base_info = data["data"]["business"]["base"]
            self.facility_info = fac_data
            self.scrap_type = "GraphQL"
        except GraphQLException:
            logger.debug("GraphQL failed, fallback to Playwright")
            self.scrap_type = "Playwright"
            pass  # TODO: add fallback crawling via Playwright later
        return

    async def _call_get_accommodation(self, place_id: str) -> dict:
        """Call the GraphQL API to fetch accommodation data.

        Args:
            place_id: Naver Map place ID.

        Returns:
            The GraphQL response payload as a dict.

        Raises:
            GraphQLException: On non-200 responses or client errors.
            CrawlingTimeoutException: When the request times out.
        """
        payload = {
            "operationName": "getAccommodation",
            "variables": {"id": place_id, "deviceType": "pc"},
            "query": self.OVERVIEW_QUERY,
        }
        json_payload = json.dumps(payload)
        timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT)
        try:
            logger.info(f"[NvMapScraper] Requesting place_id: {place_id}")
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.post(
                    self.GRAPHQL_URL,
                    data=json_payload,
                    headers=self._get_request_headers(),
                ) as response:
                    if response.status == 200:
                        logger.info(f"[NvMapScraper] SUCCESS - place_id: {place_id}")
                        return await response.json()
                    # Non-success status code.
                    logger.error(
                        f"[NvMapScraper] Failed with status {response.status} - place_id: {place_id}"
                    )
                    raise GraphQLException(
                        f"Request failed with status {response.status}"
                    )
        except (TimeoutError, asyncio.TimeoutError) as e:
            logger.error(f"[NvMapScraper] Timeout - place_id: {place_id}")
            # Fix: chain the cause so the original timeout is preserved.
            raise CrawlingTimeoutException(
                f"Request timed out after {self.REQUEST_TIMEOUT}s"
            ) from e
        except aiohttp.ClientError as e:
            logger.error(f"[NvMapScraper] Client error: {e}")
            # Fix: chain the cause for accurate tracebacks.
            raise GraphQLException(f"Client error: {e}") from e

    async def _get_facility_string(self, place_id: str) -> str | None:
        """Crawl the place page for convenience/facility information.

        Args:
            place_id: Naver Map place ID.

        Returns:
            The facility information string, or None when unavailable.
            Best-effort: any failure is logged and swallowed.
        """
        url = f"https://pcmap.place.naver.com/accommodation/{place_id}/home"
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(
                    url, headers=self._get_request_headers()
                ) as response:
                    soup = bs4.BeautifulSoup(await response.read(), "html.parser")
                    # The facilities section is marked by a screen-reader-only
                    # span whose text is the Korean word for "convenience".
                    c_elem = soup.find("span", "place_blind", string="편의")
                    if c_elem:
                        facilities = c_elem.parent.parent.find("div").string
                        return facilities
                    return None
        except Exception as e:
            # Best-effort: facility info is optional, so log and move on.
            logger.warning(f"[NvMapScraper] Failed to get facility info: {e}")
            return None


# if __name__ == "__main__":
#     import asyncio
#     url = "https://map.naver.com/p/search/%EC%8A%A4%ED%85%8C%EC%9D%B4%EB%A8%B8%EB%AD%84/place/1133638931?c=14.70,0,0,0,dh&placePath=/photo?businessCategory=pension&fromPanelNum=2&locale=ko&searchText=%EC%8A%A4%ED%85%8C%EC%9D%B4%EB%A8%B8%EB%AD%84&svcName=map_pcv5&timestamp=202512191123&fromPanelNum=2&locale=ko&searchText=%EC%8A%A4%ED%85%8C%EC%9D%B4%EB%A8%B8%EB%AD%84&svcName=map_pcv5&timestamp=202512191007&from=map&entry=bmp&filterType=%EC%97%85%EC%B2%B4&businessCategory=pension"
#     scraper = NvMapScraper(url)
#     asyncio.run(scraper.scrap())
#     print(scraper.image_link_list)
#     print(len(scraper.image_link_list) if scraper.image_link_list else 0)
#     print(scraper.base_info)