add facilities from result of crawling
parent
b84c07c325
commit
3f75b6d61d
Binary file not shown.
Binary file not shown.
|
|
@ -193,9 +193,12 @@ async def crawling(request_body: CrawlingRequest):
|
||||||
logger.info(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)")
|
logger.info(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)")
|
||||||
print(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)")
|
print(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)")
|
||||||
|
|
||||||
# Step 3-4: 응답 파싱
|
# Step 3-4: 응답 파싱 (크롤링에서 가져온 facility_info 전달)
|
||||||
step3_4_start = time.perf_counter()
|
step3_4_start = time.perf_counter()
|
||||||
parsed = await chatgpt_service.parse_marketing_analysis(raw_response)
|
print(f"[crawling] Step 3-4: 응답 파싱 시작 - facility_info: {scraper.facility_info}")
|
||||||
|
parsed = await chatgpt_service.parse_marketing_analysis(
|
||||||
|
raw_response, facility_info=scraper.facility_info
|
||||||
|
)
|
||||||
marketing_analysis = MarketingAnalysis(**parsed)
|
marketing_analysis = MarketingAnalysis(**parsed)
|
||||||
step3_4_elapsed = (time.perf_counter() - step3_4_start) * 1000
|
step3_4_elapsed = (time.perf_counter() - step3_4_start) * 1000
|
||||||
print(f"[crawling] Step 3-4: 응답 파싱 완료 ({step3_4_elapsed:.1f}ms)")
|
print(f"[crawling] Step 3-4: 응답 파싱 완료 ({step3_4_elapsed:.1f}ms)")
|
||||||
|
|
|
||||||
Binary file not shown.
Binary file not shown.
|
|
@ -160,18 +160,10 @@ Provide comprehensive marketing analysis including:
|
||||||
- Return as JSON with key "tags"
|
- Return as JSON with key "tags"
|
||||||
- **MUST be written in Korean (한국어)**
|
- **MUST be written in Korean (한국어)**
|
||||||
|
|
||||||
2. Facilities
|
|
||||||
- Based on the business name and region details, identify 5 likely facilities/amenities
|
|
||||||
- Consider typical facilities for accommodations in the given region
|
|
||||||
- Examples: 바베큐장, 수영장, 주차장, 와이파이, 주방, 테라스, 정원, etc.
|
|
||||||
- Return as JSON with key "facilities"
|
|
||||||
- **MUST be written in Korean (한국어)**
|
|
||||||
|
|
||||||
[CRITICAL LANGUAGE REQUIREMENT - ABSOLUTE RULE]
|
[CRITICAL LANGUAGE REQUIREMENT - ABSOLUTE RULE]
|
||||||
ALL OUTPUT MUST BE WRITTEN IN KOREAN (한국어)
|
ALL OUTPUT MUST BE WRITTEN IN KOREAN (한국어)
|
||||||
- Analysis sections: Korean only
|
- Analysis sections: Korean only
|
||||||
- Tags: Korean only
|
- Tags: Korean only
|
||||||
- Facilities: Korean only
|
|
||||||
- This is a NON-NEGOTIABLE requirement
|
- This is a NON-NEGOTIABLE requirement
|
||||||
- Any output in English or other languages is considered a FAILURE
|
- Any output in English or other languages is considered a FAILURE
|
||||||
- Violation of this rule invalidates the entire response
|
- Violation of this rule invalidates the entire response
|
||||||
|
|
@ -203,8 +195,7 @@ ALL OUTPUT MUST BE WRITTEN IN KOREAN (한국어)
|
||||||
## JSON Data
|
## JSON Data
|
||||||
```json
|
```json
|
||||||
{{
|
{{
|
||||||
"tags": ["태그1", "태그2", "태그3", "태그4", "태그5"],
|
"tags": ["태그1", "태그2", "태그3", "태그4", "태그5"]
|
||||||
"facilities": ["부대시설1", "부대시설2", "부대시설3", "부대시설4", "부대시설5"]
|
|
||||||
}}
|
}}
|
||||||
```
|
```
|
||||||
---
|
---
|
||||||
|
|
@ -361,9 +352,15 @@ class ChatgptService:
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
async def parse_marketing_analysis(self, raw_response: str) -> dict:
|
async def parse_marketing_analysis(
|
||||||
|
self, raw_response: str, facility_info: str | None = None
|
||||||
|
) -> dict:
|
||||||
"""ChatGPT 마케팅 분석 응답을 파싱하고 요약하여 딕셔너리로 반환
|
"""ChatGPT 마케팅 분석 응답을 파싱하고 요약하여 딕셔너리로 반환
|
||||||
|
|
||||||
|
Args:
|
||||||
|
raw_response: ChatGPT 마케팅 분석 응답 원문
|
||||||
|
facility_info: 크롤링에서 가져온 편의시설 정보 문자열
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict: {"report": str, "tags": list[str], "facilities": list[str]}
|
dict: {"report": str, "tags": list[str], "facilities": list[str]}
|
||||||
"""
|
"""
|
||||||
|
|
@ -377,7 +374,7 @@ class ChatgptService:
|
||||||
try:
|
try:
|
||||||
json_data = json.loads(json_match.group(1))
|
json_data = json.loads(json_match.group(1))
|
||||||
tags = json_data.get("tags", [])
|
tags = json_data.get("tags", [])
|
||||||
facilities = json_data.get("facilities", [])
|
print(f"[parse_marketing_analysis] GPT 응답에서 tags 파싱 완료: {tags}")
|
||||||
# JSON 블록을 제외한 리포트 부분 추출
|
# JSON 블록을 제외한 리포트 부분 추출
|
||||||
report = raw_response[: json_match.start()].strip()
|
report = raw_response[: json_match.start()].strip()
|
||||||
# --- 구분자 제거
|
# --- 구분자 제거
|
||||||
|
|
@ -386,10 +383,22 @@ class ChatgptService:
|
||||||
if report.endswith("---"):
|
if report.endswith("---"):
|
||||||
report = report[:-3].strip()
|
report = report[:-3].strip()
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
|
print("[parse_marketing_analysis] JSON 파싱 실패")
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# 크롤링에서 가져온 facility_info로 facilities 설정
|
||||||
|
print(f"[parse_marketing_analysis] 크롤링 facility_info 원본: {facility_info}")
|
||||||
|
if facility_info:
|
||||||
|
# 쉼표로 구분된 편의시설 문자열을 리스트로 변환
|
||||||
|
facilities = [f.strip() for f in facility_info.split(",") if f.strip()]
|
||||||
|
print(f"[parse_marketing_analysis] facility_info 파싱 결과: {facilities}")
|
||||||
|
else:
|
||||||
|
facilities = ["등록된 정보 없음"]
|
||||||
|
print("[parse_marketing_analysis] facility_info 없음 - '등록된 정보 없음' 설정")
|
||||||
|
|
||||||
# 리포트 내용을 500자로 요약
|
# 리포트 내용을 500자로 요약
|
||||||
if report:
|
if report:
|
||||||
report = await self.summarize_marketing(report)
|
report = await self.summarize_marketing(report)
|
||||||
|
|
||||||
|
print(f"[parse_marketing_analysis] 최종 facilities: {facilities}")
|
||||||
return {"report": report, "tags": tags, "facilities": facilities}
|
return {"report": report, "tags": tags, "facilities": facilities}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,113 @@
|
||||||
|
import asyncio
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
from urllib import parse
|
||||||
|
|
||||||
|
class nvMapPwScraper():
    """Playwright-driven Naver Map scraper that resolves a search item to a /place/ URL.

    One Playwright instance, browser, and context are shared across all
    instances (class-level state, set up via initiate_scraper()); each
    instance owns a single page managed through the async context-manager
    protocol (``async with nvMapPwScraper() as s: ...``).
    """

    # --- class-level shared state ---
    is_ready = False      # True once initiate_scraper() has completed
    _playwright = None
    _browser = None
    _context = None
    _win_width = 1280
    _win_height = 720
    _max_retry = 30       # place id timeout threshold, seconds

    # --- instance state ---
    page = None           # Playwright Page, created in create_page()

    @classmethod
    def default_context_builder(cls):
        """Return kwargs for browser.new_context() mimicking a Korean desktop Chrome."""
        return {
            'viewport': {'width': cls._win_width, 'height': cls._win_height},
            'screen': {'width': cls._win_width, 'height': cls._win_height},
            'user_agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
            'locale': 'ko-KR',
            'timezone_id': 'Asia/Seoul',
        }

    @classmethod
    async def initiate_scraper(cls):
        """Start Playwright and launch the shared browser/context once; mark ready."""
        if not cls._playwright:
            cls._playwright = await async_playwright().start()
        if not cls._browser:
            cls._browser = await cls._playwright.chromium.launch(headless=True)
        if not cls._context:
            cls._context = await cls._browser.new_context(**cls.default_context_builder())
        cls.is_ready = True

    def __init__(self):
        # Instances are only usable after the shared browser state exists.
        if not self.is_ready:
            raise Exception("nvMapScraper is not initiated")

    async def __aenter__(self):
        await self.create_page()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.page.close()

    async def create_page(self):
        """Open a new page with a navigator.webdriver evasion patch applied."""
        self.page = await self._context.new_page()
        # Replace the webdriver getter with a Proxy that returns False while
        # still looking like a native getter (basic headless-detection evasion).
        await self.page.add_init_script(
            '''const defaultGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
defaultGetter.apply(navigator);
defaultGetter.toString();
Object.defineProperty(Navigator.prototype, "webdriver", {
  set: undefined,
  enumerable: true,
  configurable: true,
  get: new Proxy(defaultGetter, {
    apply: (target, thisArg, args) => {
      Reflect.apply(target, thisArg, args);
      return false;
    },
  }),
});
const patchedGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
patchedGetter.apply(navigator);
patchedGetter.toString();''')

        await self.page.set_extra_http_headers({
            'sec-ch-ua': '\"Not?A_Brand\";v=\"99\", \"Chromium\";v=\"130\"'
        })
        # Warm-up navigation so the fresh context has a plausible history.
        await self.page.goto("http://google.com")

    async def goto_url(self, url, wait_until="domcontentloaded", timeout=20000):
        """Navigate the instance page to *url* (thin wrapper over page.goto)."""
        await self.page.goto(url, wait_until=wait_until, timeout=timeout)

    async def get_place_id_url(self, selected):
        """Resolve a Naver local-search item to a map.naver.com /place/ URL.

        Args:
            selected: search-result dict with 'title' and 'address' /
                'roadAddress' keys; values may contain <b>...</b> markup.

        Returns:
            str: the resolved URL containing "/place/".

        Raises:
            Exception: when navigation never settles within the timeout, or
                when the item stays ambiguous after both attempts.
        """
        title = selected['title'].replace("<b>", "").replace("</b>", "")
        # Use .get twice so a missing 'roadAddress' does not eagerly index
        # (and KeyError on) a missing 'address'.
        address = selected.get('roadAddress', selected.get('address', '')).replace("<b>", "").replace("</b>", "")
        encoded_query = parse.quote(f"{address} {title}")
        url = f"https://map.naver.com/p/search/{encoded_query}"
        timeout_ms = self._max_retry / 2 * 1000

        # First attempt: plain search. When Naver resolves the query to a
        # single place, the final URL contains "/place/".
        try:
            await self.goto_url(url, wait_until="networkidle", timeout=timeout_ms)
        except Exception as err:
            raise Exception("Failed to identify place id. loading timeout") from err
        if "/place/" in self.page.url:
            return self.page.url

        # Second attempt: force the "correct answer" variant of the result page.
        retry_url = self.page.url.replace("?", "?isCorrectAnswer=true&")
        try:
            await self.goto_url(retry_url, wait_until="networkidle", timeout=timeout_ms)
        except Exception as err:
            raise Exception("Failed to identify place id. loading timeout") from err
        if "/place/" in self.page.url:
            return self.page.url

        # BUGFIX: the original compared an undefined name `count` here, so this
        # path always raised NameError instead of the intended diagnostics.
        raise Exception("Failed to identify place id. item is ambiguous")
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,29 @@
|
||||||
|
import asyncio
|
||||||
|
from nvMapScraper import nvMapScraper
|
||||||
|
from nvMapPwScraper import nvMapPwScraper
|
||||||
|
|
||||||
|
async def main_function():
    """Resolve a hard-coded Naver search item to its place URL, then scrape it."""
    await nvMapPwScraper.initiate_scraper()

    selected = {
        'title': '<b>스테이</b>,<b>머뭄</b>',
        'link': 'https://www.instagram.com/staymeomoom',
        'category': '숙박>펜션',
        'description': '',
        'telephone': '',
        'address': '전북특별자치도 군산시 신흥동 63-18',
        'roadAddress': '전북특별자치도 군산시 절골길 18',
        'mapx': '1267061254',
        'mapy': '359864175',
        'lng': 126.7061254,
        'lat': 35.9864175,
    }

    # Resolve the search item to a concrete /place/ URL via Playwright.
    async with nvMapPwScraper() as place_resolver:
        place_url = await place_resolver.get_place_id_url(selected)

    print(place_url)
    # Same scraping flow as before, now fed a direct place URL.
    detail_scraper = nvMapScraper(place_url)
    await detail_scraper.scrap()
    print(detail_scraper.rawdata)
    return


print("running main_funtion..")
asyncio.run(main_function())
|
||||||
|
|
@ -0,0 +1,113 @@
|
||||||
|
import asyncio
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
from urllib import parse
|
||||||
|
|
||||||
|
class nvMapPwScraper():
    """Playwright-driven Naver Map scraper that resolves a search item to a /place/ URL.

    One Playwright instance, browser, and context are shared across all
    instances (class-level state, set up via initiate_scraper()); each
    instance owns a single page managed through the async context-manager
    protocol (``async with nvMapPwScraper() as s: ...``).
    """

    # --- class-level shared state ---
    is_ready = False      # True once initiate_scraper() has completed
    _playwright = None
    _browser = None
    _context = None
    _win_width = 1280
    _win_height = 720
    _max_retry = 30       # place id timeout threshold, seconds

    # --- instance state ---
    page = None           # Playwright Page, created in create_page()

    @classmethod
    def default_context_builder(cls):
        """Return kwargs for browser.new_context() mimicking a Korean desktop Chrome."""
        return {
            'viewport': {'width': cls._win_width, 'height': cls._win_height},
            'screen': {'width': cls._win_width, 'height': cls._win_height},
            'user_agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
            'locale': 'ko-KR',
            'timezone_id': 'Asia/Seoul',
        }

    @classmethod
    async def initiate_scraper(cls):
        """Start Playwright and launch the shared browser/context once; mark ready."""
        if not cls._playwright:
            cls._playwright = await async_playwright().start()
        if not cls._browser:
            cls._browser = await cls._playwright.chromium.launch(headless=True)
        if not cls._context:
            cls._context = await cls._browser.new_context(**cls.default_context_builder())
        cls.is_ready = True

    def __init__(self):
        # Instances are only usable after the shared browser state exists.
        if not self.is_ready:
            raise Exception("nvMapScraper is not initiated")

    async def __aenter__(self):
        await self.create_page()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.page.close()

    async def create_page(self):
        """Open a new page with a navigator.webdriver evasion patch applied."""
        self.page = await self._context.new_page()
        # Replace the webdriver getter with a Proxy that returns False while
        # still looking like a native getter (basic headless-detection evasion).
        await self.page.add_init_script(
            '''const defaultGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
defaultGetter.apply(navigator);
defaultGetter.toString();
Object.defineProperty(Navigator.prototype, "webdriver", {
  set: undefined,
  enumerable: true,
  configurable: true,
  get: new Proxy(defaultGetter, {
    apply: (target, thisArg, args) => {
      Reflect.apply(target, thisArg, args);
      return false;
    },
  }),
});
const patchedGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
patchedGetter.apply(navigator);
patchedGetter.toString();''')

        await self.page.set_extra_http_headers({
            'sec-ch-ua': '\"Not?A_Brand\";v=\"99\", \"Chromium\";v=\"130\"'
        })
        # Warm-up navigation so the fresh context has a plausible history.
        await self.page.goto("http://google.com")

    async def goto_url(self, url, wait_until="domcontentloaded", timeout=20000):
        """Navigate the instance page to *url* (thin wrapper over page.goto)."""
        await self.page.goto(url, wait_until=wait_until, timeout=timeout)

    async def get_place_id_url(self, selected):
        """Resolve a Naver local-search item to a map.naver.com /place/ URL.

        Args:
            selected: search-result dict with 'title' and 'address' /
                'roadAddress' keys; values may contain <b>...</b> markup.

        Returns:
            str: the resolved URL containing "/place/".

        Raises:
            Exception: when navigation never settles within the timeout, or
                when the item stays ambiguous after both attempts.
        """
        title = selected['title'].replace("<b>", "").replace("</b>", "")
        # Use .get twice so a missing 'roadAddress' does not eagerly index
        # (and KeyError on) a missing 'address'.
        address = selected.get('roadAddress', selected.get('address', '')).replace("<b>", "").replace("</b>", "")
        encoded_query = parse.quote(f"{address} {title}")
        url = f"https://map.naver.com/p/search/{encoded_query}"
        timeout_ms = self._max_retry / 2 * 1000

        # First attempt: plain search. When Naver resolves the query to a
        # single place, the final URL contains "/place/".
        try:
            await self.goto_url(url, wait_until="networkidle", timeout=timeout_ms)
        except Exception as err:
            raise Exception("Failed to identify place id. loading timeout") from err
        if "/place/" in self.page.url:
            return self.page.url

        # Second attempt: force the "correct answer" variant of the result page.
        retry_url = self.page.url.replace("?", "?isCorrectAnswer=true&")
        try:
            await self.goto_url(retry_url, wait_until="networkidle", timeout=timeout_ms)
        except Exception as err:
            raise Exception("Failed to identify place id. loading timeout") from err
        if "/place/" in self.page.url:
            return self.page.url

        # BUGFIX: the original compared an undefined name `count` here, so this
        # path always raised NameError instead of the intended diagnostics.
        raise Exception("Failed to identify place id. item is ambiguous")
|
||||||
|
|
@ -112,8 +112,8 @@ class nvMapScraper():
|
||||||
facilities = c_elem.parent.parent.find('div').string
|
facilities = c_elem.parent.parent.find('div').string
|
||||||
return facilities
|
return facilities
|
||||||
|
|
||||||
url = "https://naver.me/IgJGCCic"
|
# url = "https://naver.me/IgJGCCic"
|
||||||
scraper = nvMapScraper(url)
|
# scraper = nvMapScraper(url)
|
||||||
asyncio.run(scraper.scrap())
|
# asyncio.run(scraper.scrap())
|
||||||
print(scraper.image_link_list)
|
# print(scraper.image_link_list)
|
||||||
print(len(scraper.image_link_list))
|
# print(len(scraper.image_link_list))
|
||||||
Loading…
Reference in New Issue