diff --git a/app/utils/nvMapScraper.py b/app/utils/nvMapScraper.py
index 1edf149..7eec1bf 100644
--- a/app/utils/nvMapScraper.py
+++ b/app/utils/nvMapScraper.py
@@ -1,8 +1,10 @@
+import asyncio
 import json
 import logging
 import re
 
 import aiohttp
+import bs4
 
 from config import crawler_settings
 
@@ -27,7 +29,7 @@ class NvMapScraper:
     """
 
     GRAPHQL_URL: str = "https://pcmap-api.place.naver.com/graphql"
-    REQUEST_TIMEOUT = 30  # seconds
+    REQUEST_TIMEOUT = 120  # seconds
 
     OVERVIEW_QUERY: str = """
 query getAccommodation($id: String!, $deviceType: String) {
@@ -65,6 +67,7 @@ query getAccommodation($id: String!, $deviceType: String) {
         self.rawdata: dict | None = None
         self.image_link_list: list[str] | None = None
         self.base_info: dict | None = None
+        self.facility_info: str | None = None
 
     def _get_request_headers(self) -> dict:
         headers = self.DEFAULT_HEADERS.copy()
@@ -72,8 +75,19 @@ query getAccommodation($id: String!, $deviceType: String) {
             headers["Cookie"] = self.cookies
         return headers
 
-    def parse_url(self) -> str:
+    async def parse_url(self) -> str:
+        """Extract the place ID from the URL. If it is a shortened URL, resolve it to the actual URL first."""
         place_pattern = r"/place/(\d+)"
+
+        # Handle shortened URLs when the URL does not contain "place"
+        if "place" not in self.url:
+            if "naver.me" in self.url:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(self.url) as response:
+                        self.url = str(response.url)
+            else:
+                raise GraphQLException("This URL does not contain a place ID")
+
         match = re.search(place_pattern, self.url)
         if not match:
             raise GraphQLException("Failed to parse place ID from URL")
@@ -81,14 +95,17 @@ query getAccommodation($id: String!, $deviceType: String) {
 
     async def scrap(self):
         try:
-            place_id = self.parse_url()
+            place_id = await self.parse_url()
             data = await self._call_get_accommodation(place_id)
             self.rawdata = data
+            fac_data = await self._get_facility_string(place_id)
+            self.rawdata["facilities"] = fac_data
             self.image_link_list = [
                 nv_image["origin"]
                 for nv_image in data["data"]["business"]["images"]["images"]
             ]
             self.base_info = data["data"]["business"]["base"]
+            self.facility_info = fac_data
             self.scrap_type = "GraphQL"
 
         except GraphQLException:
@@ -141,7 +158,7 @@ query getAccommodation($id: String!, $deviceType: String) {
                         f"Request failed with status {response.status}"
                     )
 
-        except TimeoutError:
+        except (TimeoutError, asyncio.TimeoutError):
             logger.error(f"[NvMapScraper] Timeout - place_id: {place_id}")
             print(f"[NvMapScraper] Timeout - place_id: {place_id}")
             raise CrawlingTimeoutException(f"Request timed out after {self.REQUEST_TIMEOUT}s")
@@ -151,6 +168,29 @@ query getAccommodation($id: String!, $deviceType: String) {
             print(f"[NvMapScraper] Client error: {e}")
             raise GraphQLException(f"Client error: {e}")
 
+    async def _get_facility_string(self, place_id: str) -> str | None:
+        """Crawl the facility (amenities) information from the accommodation page.
+
+        Args:
+            place_id: Naver Map place ID
+
+        Returns:
+            Facility information string, or None
+        """
+        url = f"https://pcmap.place.naver.com/accommodation/{place_id}/home"
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url, headers=self._get_request_headers()) as response:
+                    soup = bs4.BeautifulSoup(await response.read(), "html.parser")
+                    c_elem = soup.find("span", "place_blind", string="편의")
+                    if c_elem:
+                        facilities = c_elem.parent.parent.find("div").string
+                        return facilities
+                    return None
+        except Exception as e:
+            logger.warning(f"[NvMapScraper] Failed to get facility info: {e}")
+            return None
+
 
 # if __name__ == "__main__":
 #     import asyncio
diff --git a/poc/crawling/nvMapScraper-2026-01-12.py b/poc/crawling/nvMapScraper-2026-01-12.py
new file mode 100644
index 0000000..7d155e4
--- /dev/null
+++ b/poc/crawling/nvMapScraper-2026-01-12.py
@@ -0,0 +1,119 @@
+import re
+import aiohttp
+import json
+import asyncio
+import bs4
+
+PLACE_PATTERN = r"/place/(\d+)"
+GRAPHQL_URL = "https://pcmap-api.place.naver.com/graphql"
+NAVER_COOKIES = "NAC=mQ7mBownbQf4A; NNB=TQPII6AKDBFGQ; PLACE_LANGUAGE=ko; NACT=1; nid_inf=1431570813; NID_AUT=k2T7FraXOdIMRCHzEZIFtHQup+I7b87M5fd7+p65AXZTdGB/gelRmW8s/Q4oDxm8; tooltipDisplayed=true; SRT30=1762660151; NID_SES=AAAB1Lpy3y3hGzuPbJpJl8vvFx18C+HXXuZEFou/YPgocHe7k2/5MpFlgE48X1JF7c7IPoU2khZKkkuLx+tsvWAzOf0TnG/G8RrBGeawnSluSJcKcTdKKRJ4cygKc/OabVxoc3TNZJWxer3vFtXBoXkDS5querVNS6wvcMhA/p4vkPKOeepwKLR+1IJERlQJWZw4q29IdAysrbBNn3Akf9mDA5eTYvMDLYyRkToRh10TVMW/yhyNQeMXlIdnR8U1ZCNqe/9ErYdos5gQDstswEJQQA0T2cHFGJOtmlYMPlnhWado5w521iZXGJyKcA9ZawizM/i5nK5xNYtPGS3cvImUYl6B5ulIipUJSqpj8v2XstK0TZlOGxHToXaVDrCNmSfCA9vFYbTb6xJHB2JRAT3Jik/z6QgLjJLBWRnsucMDqldxoiEDAUHEhY3pjgZ89quR3c3hwAuTlI9hBn5I3e5VQR0Y/GxoS9mIkMF8pJmcGneqnE0BNIt91RN6Se5rDM69B+JWppBXtSir1JGuXADaRLLMP8VlxJX949iH0UYTKWKsrD4OgNNK5aUx24nAH494WPknBMlx4fCMIeWzy7K3sEZkNUn/+A+eHraqIFfbGpveSCNM+8EqEjMgA+YRgg3eig==; _naver_usersession_=Kkgzim/64JicPJzgkIIvqQ==; page_uid=jesTPsqVWUZssE4qJeossssssD0-011300; SRT5=1762662010; BUC=z5Fu3sAYtFwpbRDrrDFYdn4AgK5hNkOqX-DdaLU7VJM="
+
+OVERVIEW_QUERY = '''
+query getAccommodation($id: String!, $deviceType: String) {
+  business: placeDetail(input: {id: $id, isNx: true, deviceType: $deviceType}) {
+    base {
+      id
+      name
+      category
+      roadAddress
+      address
+      phone
+      virtualPhone
+      microReviews
+      conveniences
+      visitorReviewsTotal
+    }
+    images { images { origin url } }
+    cpImages(source: [ugcImage]) { images { origin url } }
+  }
+}'''
+
+REQUEST_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
+    "Referer": "https://map.naver.com/",
+    "Origin": "https://map.naver.com",
+    "Content-Type": "application/json",
+    "Cookie": NAVER_COOKIES
+}
+
+class GraphQLException(Exception):
+    pass
+
+class nvMapScraper():
+    url: str = None
+    scrap_type: str = None
+    rawdata: dict = None
+    image_link_list: list[str] = None
+    base_info: dict = None
+
+
+    def __init__(self, url):
+        self.url = url
+
+    async def parse_url(self):
+        if 'place' not in self.url:
+            if 'naver.me' in self.url:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(self.url) as response:
+                        self.url = str(response.url)
+            else:
+                raise GraphQLException("This shortened URL does not contain a place ID")
+        try:
+            place_id = re.search(PLACE_PATTERN, self.url)[1]
+        except Exception:
+            raise GraphQLException("Cannot find place ID")
+
+        return place_id
+
+    async def scrap(self):
+        try:
+            place_id = await self.parse_url()
+            data = await self.call_get_accommodation(place_id)
+            self.rawdata = data
+            fac_data = await self.get_facility_string(place_id)
+            self.rawdata['facilities'] = fac_data
+            self.image_link_list = [nv_image['origin'] for nv_image in data['data']['business']['images']['images']]
+            self.base_info = data['data']['business']['base']
+            self.facility_info = fac_data
+            self.scrap_type = "GraphQL"
+
+        except GraphQLException as G:
+            print(G)
+            print("fallback")
+            self.scrap_type = "Playwright"
+            pass  # TODO: add a fallback that crawls with Playwright later
+
+        return
+
+    async def call_get_accommodation(self, place_id):
+        payload = {
+            "operationName": "getAccommodation",
+            "variables": {"id": place_id, "deviceType": "pc"},
+            "query": OVERVIEW_QUERY,
+        }
+        json_payload = json.dumps(payload)
+
+        async with aiohttp.ClientSession() as session:
+            async with session.post(GRAPHQL_URL, data=json_payload, headers=REQUEST_HEADERS) as response:
+                # aiohttp decodes the JSON body itself, so no manual encoding handling is needed
+                if response.status == 200:  # request succeeded
+                    return await response.json()  # note: must be awaited
+                else:  # request failed
+                    print('Request failed with status code:', response.status)
+                    print(await response.text())
+                    raise GraphQLException(f"GraphQL request failed with status {response.status}")
+
+    async def get_facility_string(self, place_id):
+        url = f"https://pcmap.place.naver.com/accommodation/{place_id}/home"
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url, headers=REQUEST_HEADERS) as response:
+                soup = bs4.BeautifulSoup(await response.read(), 'html.parser')
+                c_elem = soup.find('span', 'place_blind', string='편의')
+                facilities = c_elem.parent.parent.find('div').string
+                return facilities
+
+url = "https://naver.me/IgJGCCic"
+scraper = nvMapScraper(url)
+asyncio.run(scraper.scrap())
+print(scraper.image_link_list)
+print(len(scraper.image_link_list))
\ No newline at end of file
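
Note on the facility lookup used in both files: the sketch below is a minimal, self-contained illustration of how the `soup.find("span", "place_blind", string="편의")` navigation is expected to reach the facilities text. The SAMPLE_HTML is an assumed, simplified stand-in for Naver's accommodation-page markup, not actual Naver output, which is why the production method also guards with `if c_elem`.

# Illustrative sketch only; SAMPLE_HTML is an assumption about the page structure.
import bs4

SAMPLE_HTML = """
<div class="section">
  <h2><span class="place_blind">편의</span></h2>
  <div>무선인터넷, 주차, 조식</div>
</div>
"""

soup = bs4.BeautifulSoup(SAMPLE_HTML, "html.parser")
label = soup.find("span", "place_blind", string="편의")  # hidden section label
if label:
    # span -> h2 -> section container; its first <div> holds the facilities text
    facilities = label.parent.parent.find("div").string
    print(facilities)  # 무선인터넷, 주차, 조식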