update crawler for short url
parent
2b777f5314
commit
94aae50564
|
|
@ -1,8 +1,10 @@
|
||||||
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
import bs4
|
||||||
|
|
||||||
from config import crawler_settings
|
from config import crawler_settings
|
||||||
|
|
||||||
|
|
@ -27,7 +29,7 @@ class NvMapScraper:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
GRAPHQL_URL: str = "https://pcmap-api.place.naver.com/graphql"
|
GRAPHQL_URL: str = "https://pcmap-api.place.naver.com/graphql"
|
||||||
REQUEST_TIMEOUT = 30 # 초
|
REQUEST_TIMEOUT = 120 # 초
|
||||||
|
|
||||||
OVERVIEW_QUERY: str = """
|
OVERVIEW_QUERY: str = """
|
||||||
query getAccommodation($id: String!, $deviceType: String) {
|
query getAccommodation($id: String!, $deviceType: String) {
|
||||||
|
|
@ -65,6 +67,7 @@ query getAccommodation($id: String!, $deviceType: String) {
|
||||||
self.rawdata: dict | None = None
|
self.rawdata: dict | None = None
|
||||||
self.image_link_list: list[str] | None = None
|
self.image_link_list: list[str] | None = None
|
||||||
self.base_info: dict | None = None
|
self.base_info: dict | None = None
|
||||||
|
self.facility_info: str | None = None
|
||||||
|
|
||||||
def _get_request_headers(self) -> dict:
|
def _get_request_headers(self) -> dict:
|
||||||
headers = self.DEFAULT_HEADERS.copy()
|
headers = self.DEFAULT_HEADERS.copy()
|
||||||
|
|
@ -72,8 +75,19 @@ query getAccommodation($id: String!, $deviceType: String) {
|
||||||
headers["Cookie"] = self.cookies
|
headers["Cookie"] = self.cookies
|
||||||
return headers
|
return headers
|
||||||
|
|
||||||
def parse_url(self) -> str:
|
async def parse_url(self) -> str:
|
||||||
|
"""URL에서 place ID를 추출합니다. 단축 URL인 경우 실제 URL로 변환합니다."""
|
||||||
place_pattern = r"/place/(\d+)"
|
place_pattern = r"/place/(\d+)"
|
||||||
|
|
||||||
|
# URL에 place가 없는 경우 단축 URL 처리
|
||||||
|
if "place" not in self.url:
|
||||||
|
if "naver.me" in self.url:
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
async with session.get(self.url) as response:
|
||||||
|
self.url = str(response.url)
|
||||||
|
else:
|
||||||
|
raise GraphQLException("This URL does not contain a place ID")
|
||||||
|
|
||||||
match = re.search(place_pattern, self.url)
|
match = re.search(place_pattern, self.url)
|
||||||
if not match:
|
if not match:
|
||||||
raise GraphQLException("Failed to parse place ID from URL")
|
raise GraphQLException("Failed to parse place ID from URL")
|
||||||
|
|
@ -81,14 +95,17 @@ query getAccommodation($id: String!, $deviceType: String) {
|
||||||
|
|
||||||
async def scrap(self):
|
async def scrap(self):
|
||||||
try:
|
try:
|
||||||
place_id = self.parse_url()
|
place_id = await self.parse_url()
|
||||||
data = await self._call_get_accommodation(place_id)
|
data = await self._call_get_accommodation(place_id)
|
||||||
self.rawdata = data
|
self.rawdata = data
|
||||||
|
fac_data = await self._get_facility_string(place_id)
|
||||||
|
self.rawdata["facilities"] = fac_data
|
||||||
self.image_link_list = [
|
self.image_link_list = [
|
||||||
nv_image["origin"]
|
nv_image["origin"]
|
||||||
for nv_image in data["data"]["business"]["images"]["images"]
|
for nv_image in data["data"]["business"]["images"]["images"]
|
||||||
]
|
]
|
||||||
self.base_info = data["data"]["business"]["base"]
|
self.base_info = data["data"]["business"]["base"]
|
||||||
|
self.facility_info = fac_data
|
||||||
self.scrap_type = "GraphQL"
|
self.scrap_type = "GraphQL"
|
||||||
|
|
||||||
except GraphQLException:
|
except GraphQLException:
|
||||||
|
|
@ -141,7 +158,7 @@ query getAccommodation($id: String!, $deviceType: String) {
|
||||||
f"Request failed with status {response.status}"
|
f"Request failed with status {response.status}"
|
||||||
)
|
)
|
||||||
|
|
||||||
except TimeoutError:
|
except (TimeoutError, asyncio.TimeoutError):
|
||||||
logger.error(f"[NvMapScraper] Timeout - place_id: {place_id}")
|
logger.error(f"[NvMapScraper] Timeout - place_id: {place_id}")
|
||||||
print(f"[NvMapScraper] Timeout - place_id: {place_id}")
|
print(f"[NvMapScraper] Timeout - place_id: {place_id}")
|
||||||
raise CrawlingTimeoutException(f"Request timed out after {self.REQUEST_TIMEOUT}s")
|
raise CrawlingTimeoutException(f"Request timed out after {self.REQUEST_TIMEOUT}s")
|
||||||
|
|
@ -151,6 +168,29 @@ query getAccommodation($id: String!, $deviceType: String) {
|
||||||
print(f"[NvMapScraper] Client error: {e}")
|
print(f"[NvMapScraper] Client error: {e}")
|
||||||
raise GraphQLException(f"Client error: {e}")
|
raise GraphQLException(f"Client error: {e}")
|
||||||
|
|
||||||
|
async def _get_facility_string(self, place_id: str) -> str | None:
    """Crawl the amenities text from the accommodation's home page.

    The lookup is best-effort: any failure (network error, timeout,
    non-200 response, unexpected markup) is logged as a warning and
    reported as ``None`` instead of being raised.

    Args:
        place_id: Naver Map place ID.

    Returns:
        The amenities text, or ``None`` when it could not be retrieved.
    """
    url = f"https://pcmap.place.naver.com/accommodation/{place_id}/home"
    # Bound the request with the class-wide REQUEST_TIMEOUT (seconds) so an
    # unresponsive page cannot stall the scrape on aiohttp's default timeout.
    timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url, headers=self._get_request_headers()) as response:
                if response.status != 200:
                    # An error page would not contain the amenities block.
                    logger.warning(
                        f"[NvMapScraper] Failed to get facility info: HTTP {response.status}"
                    )
                    return None
                html = await response.read()

        soup = bs4.BeautifulSoup(html, "html.parser")
        # The amenities block is located through its visually hidden label
        # span (class "place_blind", text "편의").
        label = soup.find("span", "place_blind", string="편의")
        if label is None:
            return None

        # The text sits in the first <div> under the label's grandparent.
        section = label.parent.parent if label.parent is not None else None
        container = section.find("div") if section is not None else None
        if container is None:
            return None
        return container.string
    except Exception as e:
        # Deliberately broad: facility info is optional and must never
        # abort the surrounding scrape.
        logger.warning(f"[NvMapScraper] Failed to get facility info: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
# if __name__ == "__main__":
|
# if __name__ == "__main__":
|
||||||
# import asyncio
|
# import asyncio
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,119 @@
|
||||||
|
import asyncio
import json
import os
import re

import aiohttp
import bs4
|
||||||
|
|
||||||
|
# Regex capturing the numeric place ID from a URL path segment "/place/<id>".
PLACE_PATTERN = r"/place/(\d+)"

# GraphQL endpoint queried for accommodation details.
GRAPHQL_URL = "https://pcmap-api.place.naver.com/graphql"

# SECURITY: this constant used to hold a hard-coded copy of a logged-in Naver
# session (NID_AUT / NID_SES cookies). Credentials must not live in source
# control, so the value is now read from the NAVER_COOKIES environment
# variable; it is an empty string when the variable is unset.
# NOTE(review): the previously committed session cookies should be treated as
# leaked and invalidated.
NAVER_COOKIES = os.environ.get("NAVER_COOKIES", "")

# GraphQL document requesting base info and image lists for one place.
OVERVIEW_QUERY = '''
query getAccommodation($id: String!, $deviceType: String) {
    business: placeDetail(input: {id: $id, isNx: true, deviceType: $deviceType}) {
        base {
            id
            name
            category
            roadAddress
            address
            phone
            virtualPhone
            microReviews
            conveniences
            visitorReviewsTotal
        }
        images { images { origin url } }
        cpImages(source: [ugcImage]) { images { origin url } }
    }
}'''

# Headers sent with every request issued by the scraper.
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
    "Referer": "https://map.naver.com/",
    "Origin": "https://map.naver.com",
    "Content-Type": "application/json",
    "Cookie": NAVER_COOKIES,
}
|
||||||
|
|
||||||
|
class GraphQLException(Exception):
    """Signal that the place data could not be obtained via GraphQL.

    The scraper's ``scrap`` method catches this exception and switches to
    its fallback scrape type.
    """
|
||||||
|
|
||||||
|
class nvMapScraper():
    """Scrape a Naver Map accommodation through the place GraphQL API.

    Call ``await scrap()`` after construction. On success ``scrap_type`` is
    "GraphQL" and the result attributes below are filled in; when a
    ``GraphQLException`` occurs ``scrap_type`` is "Playwright" and the result
    attributes keep whatever value they had.
    """

    # All attributes default to None, meaning "not scraped yet".
    url : str = None                    # page URL; replaced by the resolved URL for short links
    scrap_type : str = None             # "GraphQL" or "Playwright" once scrap() has run
    rawdata : dict = None               # decoded GraphQL response plus a "facilities" entry
    image_link_list : list[str] = None  # "origin" URL of every image in the response
    base_info : dict = None             # the "base" object of the response
    facility_info : str = None          # amenities text scraped from the HTML page

    def __init__(self, url):
        """Store the place URL (full map URL or ``naver.me`` short link)."""
        self.url = url

    async def parse_url(self):
        """Return the place ID contained in ``self.url``.

        A ``naver.me`` short link is requested first and ``self.url`` is
        replaced by the final URL of that request.

        Raises:
            GraphQLException: the URL is neither a place URL nor a
                ``naver.me`` link, or no place ID is found in it.
        """
        if 'place' not in self.url:
            if 'naver.me' not in self.url:
                raise GraphQLException("this shorten url not have place id")
            # Let aiohttp follow the redirect chain and keep the final URL.
            async with aiohttp.ClientSession() as session:
                async with session.get(self.url) as response:
                    self.url = str(response.url)

        match = re.search(PLACE_PATTERN, self.url)
        if match is None:
            raise GraphQLException("Cannot find place id")
        return match[1]

    async def scrap(self):
        """Fetch place data and populate the result attributes.

        A ``GraphQLException`` is reported on stdout and recorded by setting
        ``scrap_type`` to "Playwright"; it is not re-raised.
        """
        try:
            place_id = await self.parse_url()
            data = await self.call_get_accomodation(place_id)
            self.rawdata = data
            fac_data = await self.get_facility_string(place_id)
            self.rawdata['facilities'] = fac_data
            business = data['data']['business']
            self.image_link_list = [
                nv_image['origin'] for nv_image in business['images']['images']
            ]
            self.base_info = business['base']
            self.facility_info = fac_data
            self.scrap_type = "GraphQL"
        except GraphQLException as G:
            print (G)
            print("fallback")
            # TODO: add a Playwright-based crawling fallback later.
            self.scrap_type = "Playwright"

    async def call_get_accomodation(self, place_id):
        """POST the ``getAccommodation`` query and return the decoded JSON.

        Raises:
            GraphQLException: the endpoint answered with a status other than
                200. Raising this type (instead of a bare ``Exception``)
                lets ``scrap`` take its fallback path.
        """
        payload = {
            "operationName" : "getAccommodation",
            "variables": { "id": place_id, "deviceType": "pc" },
            "query": OVERVIEW_QUERY,
        }
        json_payload = json.dumps(payload)

        async with aiohttp.ClientSession() as session:
            async with session.post(GRAPHQL_URL, data=json_payload, headers=REQUEST_HEADERS) as response:
                if response.status == 200:  # request succeeded
                    return await response.json()  # json() is a coroutine: must be awaited

                # Request failed: text() is a coroutine too, so await it to
                # print the body rather than the bound method object.
                error_body = await response.text()
                print('실패 상태 코드:', response.status)
                print(error_body)
                raise GraphQLException(f"Request failed with status {response.status}")

    async def get_facility_string(self, place_id):
        """Return the amenities text from the place's HTML home page.

        Returns:
            The text of the amenities block, or ``None`` when the page has
            no such block.
        """
        url = f"https://pcmap.place.naver.com/accommodation/{place_id}/home"
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=REQUEST_HEADERS) as response:
                html = await response.read()

        soup = bs4.BeautifulSoup(html, 'html.parser')
        # The block is located through its hidden label span ("편의").
        c_elem = soup.find('span', 'place_blind', string='편의')
        if c_elem is None:
            # Pages without an amenities section previously crashed here
            # with AttributeError.
            return None
        section = c_elem.parent.parent if c_elem.parent is not None else None
        container = section.find('div') if section is not None else None
        if container is None:
            return None
        return container.string
|
||||||
|
|
||||||
|
# Manual smoke test: scrape one short-link place and show its image links.
# NOTE(review): these statements run at import time and perform network I/O;
# consider moving them under an `if __name__ == "__main__":` guard.
url = "https://naver.me/IgJGCCic"
scraper = nvMapScraper(url)
asyncio.run(scraper.scrap())
print(scraper.image_link_list)
# image_link_list stays None when scrap() took its fallback path; count it as
# zero links instead of raising TypeError from len(None).
print(len(scraper.image_link_list or []))
|
||||||
Loading…
Reference in New Issue