113 lines
4.0 KiB
Python
113 lines
4.0 KiB
Python
import json
|
|
import re
|
|
|
|
import aiohttp
|
|
|
|
from config import crawler_settings
|
|
|
|
|
|
class GraphQLException(Exception):
    """Raised when the Naver Place GraphQL API cannot be used for scraping
    (unparseable URL, non-200 response, or unusable payload)."""
|
|
|
|
|
|
class NvMapScraper:
|
|
GRAPHQL_URL: str = "https://pcmap-api.place.naver.com/graphql"
|
|
|
|
OVERVIEW_QUERY: str = """
|
|
query getAccommodation($id: String!, $deviceType: String) {
|
|
business: placeDetail(input: {id: $id, isNx: true, deviceType: $deviceType}) {
|
|
base {
|
|
id
|
|
name
|
|
category
|
|
roadAddress
|
|
address
|
|
phone
|
|
virtualPhone
|
|
microReviews
|
|
conveniences
|
|
visitorReviewsTotal
|
|
}
|
|
images { images { origin url } }
|
|
cpImages(source: [ugcImage]) { images { origin url } }
|
|
}
|
|
}"""
|
|
|
|
DEFAULT_HEADERS: dict = {
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
|
|
"Referer": "https://map.naver.com/",
|
|
"Origin": "https://map.naver.com",
|
|
"Content-Type": "application/json",
|
|
}
|
|
|
|
def __init__(self, url: str, cookies: str | None = None):
|
|
self.url = url
|
|
self.cookies = (
|
|
cookies if cookies is not None else crawler_settings.NAVER_COOKIES
|
|
)
|
|
self.scrap_type: str | None = None
|
|
self.rawdata: dict | None = None
|
|
self.image_link_list: list[str] | None = None
|
|
self.base_info: dict | None = None
|
|
|
|
def _get_request_headers(self) -> dict:
|
|
headers = self.DEFAULT_HEADERS.copy()
|
|
if self.cookies:
|
|
headers["Cookie"] = self.cookies
|
|
return headers
|
|
|
|
def parse_url(self) -> str:
|
|
place_pattern = r"/place/(\d+)"
|
|
match = re.search(place_pattern, self.url)
|
|
if not match:
|
|
raise GraphQLException("Failed to parse place ID from URL")
|
|
return match[1]
|
|
|
|
async def scrap(self):
|
|
try:
|
|
place_id = self.parse_url()
|
|
data = await self._call_get_accommodation(place_id)
|
|
self.rawdata = data
|
|
self.image_link_list = [
|
|
nv_image["origin"]
|
|
for nv_image in data["data"]["business"]["images"]["images"]
|
|
]
|
|
self.base_info = data["data"]["business"]["base"]
|
|
self.scrap_type = "GraphQL"
|
|
|
|
except GraphQLException:
|
|
print("fallback")
|
|
self.scrap_type = "Playwright"
|
|
pass # 나중에 pw 이용한 crawling으로 fallback 추가
|
|
|
|
return
|
|
|
|
async def _call_get_accommodation(self, place_id: str) -> dict:
|
|
payload = {
|
|
"operationName": "getAccommodation",
|
|
"variables": {"id": place_id, "deviceType": "pc"},
|
|
"query": self.OVERVIEW_QUERY,
|
|
}
|
|
json_payload = json.dumps(payload)
|
|
|
|
async with aiohttp.ClientSession() as session:
|
|
async with session.post(
|
|
self.GRAPHQL_URL, data=json_payload, headers=self._get_request_headers()
|
|
) as response:
|
|
if response.status == 200:
|
|
return await response.json()
|
|
else:
|
|
print("실패 상태 코드:", response.status)
|
|
raise GraphQLException(
|
|
f"Request failed with status {response.status}"
|
|
)
|
|
|
|
|
|
# if __name__ == "__main__":
|
|
# url = "https://map.naver.com/p/search/%EC%8A%A4%ED%85%8C%EC%9D%B4%EB%A8%B8%EB%AD%84/place/1133638931?c=14.70,0,0,0,dh&placePath=/photo?businessCategory=pension&fromPanelNum=2&locale=ko&searchText=%EC%8A%A4%ED%85%8C%EC%9D%B4%EB%A8%B8%EB%AD%84&svcName=map_pcv5×tamp=202512191123&fromPanelNum=2&locale=ko&searchText=%EC%8A%A4%ED%85%8C%EC%9D%B4%EB%A8%B8%EB%AD%84&svcName=map_pcv5×tamp=202512191007&from=map&entry=bmp&filterType=%EC%97%85%EC%B2%B4&businessCategory=pension"
|
|
# scraper = NvMapScraper(url)
|
|
# asyncio.run(scraper.scrap())
|
|
# print(scraper.image_link_list)
|
|
# print(len(scraper.image_link_list) if scraper.image_link_list else 0)
|
|
# print(scraper.base_info)
|