From 1516f2807ce32e497784604a3731c408a8a8675e Mon Sep 17 00:00:00 2001 From: jaehwang Date: Fri, 26 Dec 2025 11:10:14 +0900 Subject: [PATCH] =?UTF-8?q?=EC=8A=A4=ED=81=AC=EB=9E=98=ED=8D=BC=20?= =?UTF-8?q?=EC=97=85=EB=8D=B0=EC=9D=B4=ED=8A=B8:=20=EB=8B=A8=EC=B6=95=20ur?= =?UTF-8?q?l=20=EC=B2=98=EB=A6=AC,=20=EC=8B=9C=EC=84=A4=20=EC=A0=95?= =?UTF-8?q?=EB=B3=B4=20=EC=B6=94=EC=B6=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- poc/crawling/nvMapScraper.py | 106 ++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 46 deletions(-) diff --git a/poc/crawling/nvMapScraper.py b/poc/crawling/nvMapScraper.py index 0988192..7d155e4 100644 --- a/poc/crawling/nvMapScraper.py +++ b/poc/crawling/nvMapScraper.py @@ -1,13 +1,14 @@ -import asyncio -import json import re - import aiohttp +import json +import asyncio +import bs4 +PLACE_PATTERN = r"/place/(\d+)" GRAPHQL_URL = "https://pcmap-api.place.naver.com/graphql" -NAVER_COOKIES = "NAC=mQ7mBownbQf4A; NNB=TQPII6AKDBFGQ; PLACE_LANGUAGE=ko; NACT=1; nid_inf=1431570813; NID_AUT=k2T7FraXOdIMRCHzEZIFtHQup+I7b87M5fd7+p65AXZTdGB/gelRmW8s/Q4oDxm8; tooltipDisplayed=true; SRT30=1762660151; NID_SES=AAAB1Lpy3y3hGzuPbJpJl8vvFx18C+HXXuZEFou/YPgocHe7k2/5MpFlgE48X1JF7c7IPoU2khZKkkuLx+tsvWAzOf0TnG/G8RrBGeawnSluSJcKcTdKKRJ4cygKc/OabVxoc3TNZJWxer3vFtXBoXkDS5querVNS6wvcMhA/p4vkPKOeepwKLR+1IJERlQJWZw4q29IdAysrbBNn3Akf9mDA5eTYvMDLYyRkToRh10TVMW/yhyNQeMXlIdnR8U1ZCNqe/9ErYdos5gQDstswEJQQA0T2cHFGJOtmlYMPlnhWado5w521iZXGJyKcA9ZawizM/i5nK5xNYtPGS3cvImUYl6B5ulIipUJSqpj8v2XstK0TZlOGxHToXaVDrCNmSfCA9vFYbTb6xJHB2JRAT3Jik/z6QgLjJLBWRnsucMDqldxoiEDAUHEhY3pjgZ89quR3c3hwAuTlI9hBn5I3e5VQR0Y/GxoS9mIkMF8pJmcGneqnE0BNIt91RN6Se5rDM69B+JWppBXtSir1JGuXADaRLLMP8VlxJX949iH0UYTKWKsrD4OgNNK5aUx24nAH494WPknBMlx4fCMIeWzy7K3sEZkNUn/+A+eHraqIFfbGpveSCNM+8EqEjMgA+YRgg3eig==; _naver_usersession_=Kkgzim/64JicPJzgkIIvqQ==; page_uid=jesTPsqVWUZssE4qJeossssssD0-011300; SRT5=1762662010; BUC=z5Fu3sAYtFwpbRDrrDFYdn4AgK5hNkOqX-DdaLU7VJM=" +NAVER_COOKIES="NAC=mQ7mBownbQf4A; NNB=TQPII6AKDBFGQ; PLACE_LANGUAGE=ko; NACT=1; nid_inf=1431570813; NID_AUT=k2T7FraXOdIMRCHzEZIFtHQup+I7b87M5fd7+p65AXZTdGB/gelRmW8s/Q4oDxm8; tooltipDisplayed=true; SRT30=1762660151; NID_SES=AAAB1Lpy3y3hGzuPbJpJl8vvFx18C+HXXuZEFou/YPgocHe7k2/5MpFlgE48X1JF7c7IPoU2khZKkkuLx+tsvWAzOf0TnG/G8RrBGeawnSluSJcKcTdKKRJ4cygKc/OabVxoc3TNZJWxer3vFtXBoXkDS5querVNS6wvcMhA/p4vkPKOeepwKLR+1IJERlQJWZw4q29IdAysrbBNn3Akf9mDA5eTYvMDLYyRkToRh10TVMW/yhyNQeMXlIdnR8U1ZCNqe/9ErYdos5gQDstswEJQQA0T2cHFGJOtmlYMPlnhWado5w521iZXGJyKcA9ZawizM/i5nK5xNYtPGS3cvImUYl6B5ulIipUJSqpj8v2XstK0TZlOGxHToXaVDrCNmSfCA9vFYbTb6xJHB2JRAT3Jik/z6QgLjJLBWRnsucMDqldxoiEDAUHEhY3pjgZ89quR3c3hwAuTlI9hBn5I3e5VQR0Y/GxoS9mIkMF8pJmcGneqnE0BNIt91RN6Se5rDM69B+JWppBXtSir1JGuXADaRLLMP8VlxJX949iH0UYTKWKsrD4OgNNK5aUx24nAH494WPknBMlx4fCMIeWzy7K3sEZkNUn/+A+eHraqIFfbGpveSCNM+8EqEjMgA+YRgg3eig==; _naver_usersession_=Kkgzim/64JicPJzgkIIvqQ==; page_uid=jesTPsqVWUZssE4qJeossssssD0-011300; SRT5=1762662010; BUC=z5Fu3sAYtFwpbRDrrDFYdn4AgK5hNkOqX-DdaLU7VJM=" -OVERVIEW_QUERY = """ +OVERVIEW_QUERY = ''' query getAccommodation($id: String!, $deviceType: String) { business: placeDetail(input: {id: $id, isNx: true, deviceType: $deviceType}) { base { @@ -25,81 +26,94 @@ query getAccommodation($id: String!, $deviceType: String) { images { images { origin url } } cpImages(source: [ugcImage]) { images { origin url } } } -}""" +}''' + REQUEST_HEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36", "Referer": "https://map.naver.com/", "Origin": "https://map.naver.com", "Content-Type": "application/json", - "Cookie": NAVER_COOKIES, + "Cookie": NAVER_COOKIES } - class GraphQLException(Exception): pass - -class nvMapScraper: - url: str = None - scrap_type: str = None - rawdata: dict = None - image_link_list: list[str] = None - base_info: dict = None +class nvMapScraper(): + url : str = None + scrap_type : str = None + rawdata : dict = None + image_link_list : list[str] = None + base_info : dict = None + def __init__(self, url): self.url = url - def parse_url(self): - place_pattern = r"/place/(\d+)" + async def parse_url(self): + if 'place' not in self.url: + if 'naver.me' in self.url: + async with aiohttp.ClientSession() as session: + async with session.get(self.url) as response: + self.url = str(response.url) + else: + raise GraphQLException("this shorten url not have place id") try: - place_id = re.search(place_pattern, self.url)[1] - except: - raise GraphQLException() - + place_id = re.search(PLACE_PATTERN, self.url)[1] + except Exception as E: + raise GraphQLException("Cannot find place id") + return place_id - + async def scrap(self): try: - place_id = self.parse_url() + place_id = await self.parse_url() data = await self.call_get_accomodation(place_id) self.rawdata = data - self.image_link_list = [ - nv_image["origin"] - for nv_image in data["data"]["business"]["images"]["images"] - ] - self.base_info = data["data"]["business"]["base"] + fac_data = await self.get_facility_string(place_id) + self.rawdata['facilities'] = fac_data + self.image_link_list = [nv_image['origin'] for nv_image in data['data']['business']['images']['images']] + self.base_info = data['data']['business']['base'] + self.facility_info = fac_data self.scrap_type = "GraphQL" - except GraphQLException: + except GraphQLException as G: + print (G) print("fallback") self.scrap_type = "Playwright" - pass # 나중에 pw 이용한 crawling으로 fallback 추가 + pass # 나중에 pw 이용한 crawling으로 fallback 추가 return async def call_get_accomodation(self, place_id): payload = { - "operationName": "getAccommodation", - "variables": {"id": place_id, "deviceType": "pc"}, + "operationName" : "getAccommodation", + "variables": { "id": place_id, "deviceType": "pc" }, "query": OVERVIEW_QUERY, } json_payload = json.dumps(payload) - + async with aiohttp.ClientSession() as session: - async with session.post( - GRAPHQL_URL, data=json_payload, headers=REQUEST_HEADERS - ) as response: - response.encoding = "utf-8" - if response.status == 200: # 요청 성공 - return await response.json() # await 주의 - else: # 요청 실패 - print("실패 상태 코드:", response.status) + async with session.post(GRAPHQL_URL, data=json_payload, headers=REQUEST_HEADERS) as response: + response.encoding = 'utf-8' + if response.status == 200: # 요청 성공 + return await response.json() # await 주의 + else: # 요청 실패 + print('실패 상태 코드:', response.status) + print(response.text) raise Exception() - - -url = "https://map.naver.com/p/search/%EC%8A%A4%ED%85%8C%EC%9D%B4%EB%A8%B8%EB%AD%84/place/1133638931?c=14.70,0,0,0,dh&placePath=/photo?businessCategory=pension&fromPanelNum=2&locale=ko&searchText=%EC%8A%A4%ED%85%8C%EC%9D%B4%EB%A8%B8%EB%AD%84&svcName=map_pcv5×tamp=202512191123&fromPanelNum=2&locale=ko&searchText=%EC%8A%A4%ED%85%8C%EC%9D%B4%EB%A8%B8%EB%AD%84&svcName=map_pcv5×tamp=202512191007&from=map&entry=bmp&filterType=%EC%97%85%EC%B2%B4&businessCategory=pension" + + async def get_facility_string(self, place_id): + url = f"https://pcmap.place.naver.com/accommodation/{place_id}/home" + async with aiohttp.ClientSession() as session: + async with session.get(url, headers=REQUEST_HEADERS) as response: + soup = bs4.BeautifulSoup(await response.read(), 'html.parser') + c_elem = soup.find('span', 'place_blind', string='편의') + facilities = c_elem.parent.parent.find('div').string + return facilities + +url = "https://naver.me/IgJGCCic" scraper = nvMapScraper(url) asyncio.run(scraper.scrap()) print(scraper.image_link_list) -print(len(scraper.image_link_list)) -print(scraper.base_info) +print(len(scraper.image_link_list)) \ No newline at end of file