add facilities from result of crawling

insta
Dohyun Lim 2026-01-12 16:50:16 +09:00
parent b84c07c325
commit 3f75b6d61d
16 changed files with 286 additions and 19 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

BIN
app/.DS_Store vendored Normal file

Binary file not shown.

BIN
app/home/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -193,9 +193,12 @@ async def crawling(request_body: CrawlingRequest):
logger.info(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)")
print(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)")
# Step 3-4: 응답 파싱
# Step 3-4: 응답 파싱 (크롤링에서 가져온 facility_info 전달)
step3_4_start = time.perf_counter()
parsed = await chatgpt_service.parse_marketing_analysis(raw_response)
print(f"[crawling] Step 3-4: 응답 파싱 시작 - facility_info: {scraper.facility_info}")
parsed = await chatgpt_service.parse_marketing_analysis(
raw_response, facility_info=scraper.facility_info
)
marketing_analysis = MarketingAnalysis(**parsed)
step3_4_elapsed = (time.perf_counter() - step3_4_start) * 1000
print(f"[crawling] Step 3-4: 응답 파싱 완료 ({step3_4_elapsed:.1f}ms)")

BIN
app/lyric/.DS_Store vendored Normal file

Binary file not shown.

BIN
app/song/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -160,18 +160,10 @@ Provide comprehensive marketing analysis including:
- Return as JSON with key "tags"
- **MUST be written in Korean (한국어)**
2. Facilities
- Based on the business name and region details, identify 5 likely facilities/amenities
- Consider typical facilities for accommodations in the given region
- Examples: 바베큐장, 수영장, 주차장, 와이파이, 주방, 테라스, 정원, etc.
- Return as JSON with key "facilities"
- **MUST be written in Korean (한국어)**
[CRITICAL LANGUAGE REQUIREMENT - ABSOLUTE RULE]
ALL OUTPUT MUST BE WRITTEN IN KOREAN (한국어)
- Analysis sections: Korean only
- Tags: Korean only
- Facilities: Korean only
- This is a NON-NEGOTIABLE requirement
- Any output in English or other languages is considered a FAILURE
- Violation of this rule invalidates the entire response
@ -203,8 +195,7 @@ ALL OUTPUT MUST BE WRITTEN IN KOREAN (한국어)
## JSON Data
```json
{{
"tags": ["태그1", "태그2", "태그3", "태그4", "태그5"],
"facilities": ["부대시설1", "부대시설2", "부대시설3", "부대시설4", "부대시설5"]
"tags": ["태그1", "태그2", "태그3", "태그4", "태그5"]
}}
```
---
@ -361,9 +352,15 @@ class ChatgptService:
return result
async def parse_marketing_analysis(self, raw_response: str) -> dict:
async def parse_marketing_analysis(
self, raw_response: str, facility_info: str | None = None
) -> dict:
"""ChatGPT 마케팅 분석 응답을 파싱하고 요약하여 딕셔너리로 반환
Args:
raw_response: ChatGPT 마케팅 분석 응답 원문
facility_info: 크롤링에서 가져온 편의시설 정보 문자열
Returns:
dict: {"report": str, "tags": list[str], "facilities": list[str]}
"""
@ -377,7 +374,7 @@ class ChatgptService:
try:
json_data = json.loads(json_match.group(1))
tags = json_data.get("tags", [])
facilities = json_data.get("facilities", [])
print(f"[parse_marketing_analysis] GPT 응답에서 tags 파싱 완료: {tags}")
# JSON 블록을 제외한 리포트 부분 추출
report = raw_response[: json_match.start()].strip()
# --- 구분자 제거
@ -386,10 +383,22 @@ class ChatgptService:
if report.endswith("---"):
report = report[:-3].strip()
except json.JSONDecodeError:
print("[parse_marketing_analysis] JSON 파싱 실패")
pass
# 크롤링에서 가져온 facility_info로 facilities 설정
print(f"[parse_marketing_analysis] 크롤링 facility_info 원본: {facility_info}")
if facility_info:
# 쉼표로 구분된 편의시설 문자열을 리스트로 변환
facilities = [f.strip() for f in facility_info.split(",") if f.strip()]
print(f"[parse_marketing_analysis] facility_info 파싱 결과: {facilities}")
else:
facilities = ["등록된 정보 없음"]
print("[parse_marketing_analysis] facility_info 없음 - '등록된 정보 없음' 설정")
# 리포트 내용을 500자로 요약
if report:
report = await self.summarize_marketing(report)
print(f"[parse_marketing_analysis] 최종 facilities: {facilities}")
return {"report": report, "tags": tags, "facilities": facilities}

113
app/utils/nvMapPwScraper.py Normal file
View File

@ -0,0 +1,113 @@
import asyncio
from playwright.async_api import async_playwright
from urllib import parse
class nvMapPwScraper():
    """Playwright-based Naver Map scraper that resolves a local-search
    result into a concrete ``/place/`` map URL.

    The Playwright engine, browser, and context are shared class-wide:
    call ``await nvMapPwScraper.initiate_scraper()`` once before creating
    instances. Each instance owns a single page and is meant to be used
    as an async context manager (``async with nvMapPwScraper() as s:``).
    """

    # class-level (shared) state
    is_ready = False      # True once initiate_scraper() has completed
    _playwright = None    # Playwright engine handle
    _browser = None       # shared headless Chromium instance
    _context = None       # shared browser context (one per process)
    _win_width = 1280
    _win_height = 720
    _max_retry = 30       # place id timeout threshold, in seconds

    # instance-level state
    page = None           # the page owned by this instance

    @classmethod
    def default_context_builder(cls):
        """Return ``new_context`` kwargs mimicking a Korean-locale desktop Chrome."""
        return {
            'viewport': {'width': cls._win_width, 'height': cls._win_height},
            'screen': {'width': cls._win_width, 'height': cls._win_height},
            'user_agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
            'locale': 'ko-KR',
            'timezone_id': 'Asia/Seoul',
        }

    @classmethod
    async def initiate_scraper(cls):
        """Lazily start Playwright and build the shared browser/context.

        Safe to call more than once; each resource is created only if missing.
        """
        if not cls._playwright:
            cls._playwright = await async_playwright().start()
        if not cls._browser:
            cls._browser = await cls._playwright.chromium.launch(headless=True)
        if not cls._context:
            cls._context = await cls._browser.new_context(**cls.default_context_builder())
        cls.is_ready = True

    def __init__(self):
        # Shared browser state must exist before any instance is usable.
        if not self.is_ready:
            # BUG FIX: the original message named the wrong class
            # ("nvMapScraper"), a copy-paste leftover from the sibling scraper.
            raise Exception("nvMapPwScraper is not initiated")

    async def __aenter__(self):
        await self.create_page()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.page.close()

    async def create_page(self):
        """Open a new page with anti-bot countermeasures applied.

        Patches ``navigator.webdriver`` to report False while still looking
        like the native getter, sets a plausible ``sec-ch-ua`` header, and
        performs a warm-up navigation so later redirects look organic.
        """
        self.page = await self._context.new_page()
        await self.page.add_init_script(
            '''const defaultGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
defaultGetter.apply(navigator);
defaultGetter.toString();
Object.defineProperty(Navigator.prototype, "webdriver", {
  set: undefined,
  enumerable: true,
  configurable: true,
  get: new Proxy(defaultGetter, {
    apply: (target, thisArg, args) => {
      Reflect.apply(target, thisArg, args);
      return false;
    },
  }),
});
const patchedGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
patchedGetter.apply(navigator);
patchedGetter.toString();''')
        await self.page.set_extra_http_headers({
            'sec-ch-ua': '\"Not?A_Brand\";v=\"99\", \"Chromium\";v=\"130\"'
        })
        await self.page.goto("http://google.com")

    async def goto_url(self, url, wait_until="domcontentloaded", timeout=20000):
        """Navigate this instance's page to *url* (timeout in milliseconds)."""
        await self.page.goto(url, wait_until=wait_until, timeout=timeout)

    async def get_place_id_url(self, selected):
        """Resolve a Naver local-search item to a map URL containing ``/place/``.

        Args:
            selected: dict from the Naver local-search API with at least
                'title' and 'address' (optionally 'roadAddress'); values may
                contain ``<b>`` highlight tags, which are stripped.

        Returns:
            str: the ``map.naver.com`` URL containing the place id.

        Raises:
            Exception: if neither attempt lands on a ``/place/`` page
                (the search result is ambiguous). A loading timeout is
                raised by Playwright from ``goto_url`` itself.
        """
        title = selected['title'].replace("<b>", "").replace("</b>", "")
        address = selected.get('roadAddress', selected['address']).replace("<b>", "").replace("</b>", "")
        encoded_query = parse.quote(f"{address} {title}")
        url = f"https://map.naver.com/p/search/{encoded_query}"
        await self.goto_url(url, wait_until="networkidle", timeout=self._max_retry / 2 * 1000)
        if "/place/" in self.page.url:
            return self.page.url
        # Second attempt: force the "correct answer" variant of the search.
        url = self.page.url.replace("?", "?isCorrectAnswer=true&")
        await self.goto_url(url, wait_until="networkidle", timeout=self._max_retry / 2 * 1000)
        if "/place/" in self.page.url:
            return self.page.url
        # BUG FIX: the original checked ``count == self._max_retry / 2`` here,
        # but ``count`` was never defined, so this path raised NameError
        # instead of the intended exception. Timeouts are already surfaced by
        # Playwright inside goto_url, so reaching this point means ambiguity.
        raise Exception("Failed to identify place id. item is ambiguous")

BIN
app/video/.DS_Store vendored Normal file

Binary file not shown.

BIN
docs/.DS_Store vendored Normal file

Binary file not shown.

BIN
image/.DS_Store vendored Normal file

Binary file not shown.

BIN
image/2025-12-26/.DS_Store vendored Normal file

Binary file not shown.

BIN
poc/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,29 @@
import asyncio
from nvMapScraper import nvMapScraper
from nvMapPwScraper import nvMapPwScraper
async def main_function():
    """Resolve a sample Naver local-search item to a place URL, then
    run the regular scraping flow against it.

    Proof-of-concept driver: starts the shared Playwright browser, resolves
    the place id URL for a hard-coded sample item, and prints the raw data
    scraped from the resolved page.
    """
    await nvMapPwScraper.initiate_scraper()
    # Sample item in the shape returned by the Naver local-search API
    # (titles carry <b> highlight tags, which the scraper strips).
    selected = {'title': '<b>스테이</b>,<b>머뭄</b>',
                'link': 'https://www.instagram.com/staymeomoom',
                'category': '숙박>펜션',
                'description': '',
                'telephone': '',
                'address': '전북특별자치도 군산시 신흥동 63-18',
                'roadAddress': '전북특별자치도 군산시 절골길 18',
                'mapx': '1267061254',
                'mapy': '359864175',
                'lng': 126.7061254,
                'lat': 35.9864175}
    async with nvMapPwScraper() as pw_scraper:
        new_url = await pw_scraper.get_place_id_url(selected)
        print(new_url)
        # Hand the resolved /place/ URL to the existing scraper flow.
        nv_scraper = nvMapScraper(new_url)
        await nv_scraper.scrap()
        print(nv_scraper.rawdata)


if __name__ == "__main__":
    # BUG FIX: guard the entry point so importing this module does not launch
    # a browser; also fixed the "main_funtion" typo in the log message.
    print("running main_function..")
    asyncio.run(main_function())

View File

@ -0,0 +1,113 @@
import asyncio
from playwright.async_api import async_playwright
from urllib import parse
class nvMapPwScraper():
    """Playwright-based Naver Map scraper that resolves a local-search
    result into a concrete ``/place/`` map URL.

    The Playwright engine, browser, and context are shared class-wide:
    call ``await nvMapPwScraper.initiate_scraper()`` once before creating
    instances. Each instance owns a single page and is meant to be used
    as an async context manager (``async with nvMapPwScraper() as s:``).
    """

    # class-level (shared) state
    is_ready = False      # True once initiate_scraper() has completed
    _playwright = None    # Playwright engine handle
    _browser = None       # shared headless Chromium instance
    _context = None       # shared browser context (one per process)
    _win_width = 1280
    _win_height = 720
    _max_retry = 30       # place id timeout threshold, in seconds

    # instance-level state
    page = None           # the page owned by this instance

    @classmethod
    def default_context_builder(cls):
        """Return ``new_context`` kwargs mimicking a Korean-locale desktop Chrome."""
        return {
            'viewport': {'width': cls._win_width, 'height': cls._win_height},
            'screen': {'width': cls._win_width, 'height': cls._win_height},
            'user_agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
            'locale': 'ko-KR',
            'timezone_id': 'Asia/Seoul',
        }

    @classmethod
    async def initiate_scraper(cls):
        """Lazily start Playwright and build the shared browser/context.

        Safe to call more than once; each resource is created only if missing.
        """
        if not cls._playwright:
            cls._playwright = await async_playwright().start()
        if not cls._browser:
            cls._browser = await cls._playwright.chromium.launch(headless=True)
        if not cls._context:
            cls._context = await cls._browser.new_context(**cls.default_context_builder())
        cls.is_ready = True

    def __init__(self):
        # Shared browser state must exist before any instance is usable.
        if not self.is_ready:
            # BUG FIX: the original message named the wrong class
            # ("nvMapScraper"), a copy-paste leftover from the sibling scraper.
            raise Exception("nvMapPwScraper is not initiated")

    async def __aenter__(self):
        await self.create_page()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.page.close()

    async def create_page(self):
        """Open a new page with anti-bot countermeasures applied.

        Patches ``navigator.webdriver`` to report False while still looking
        like the native getter, sets a plausible ``sec-ch-ua`` header, and
        performs a warm-up navigation so later redirects look organic.
        """
        self.page = await self._context.new_page()
        await self.page.add_init_script(
            '''const defaultGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
defaultGetter.apply(navigator);
defaultGetter.toString();
Object.defineProperty(Navigator.prototype, "webdriver", {
  set: undefined,
  enumerable: true,
  configurable: true,
  get: new Proxy(defaultGetter, {
    apply: (target, thisArg, args) => {
      Reflect.apply(target, thisArg, args);
      return false;
    },
  }),
});
const patchedGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
patchedGetter.apply(navigator);
patchedGetter.toString();''')
        await self.page.set_extra_http_headers({
            'sec-ch-ua': '\"Not?A_Brand\";v=\"99\", \"Chromium\";v=\"130\"'
        })
        await self.page.goto("http://google.com")

    async def goto_url(self, url, wait_until="domcontentloaded", timeout=20000):
        """Navigate this instance's page to *url* (timeout in milliseconds)."""
        await self.page.goto(url, wait_until=wait_until, timeout=timeout)

    async def get_place_id_url(self, selected):
        """Resolve a Naver local-search item to a map URL containing ``/place/``.

        Args:
            selected: dict from the Naver local-search API with at least
                'title' and 'address' (optionally 'roadAddress'); values may
                contain ``<b>`` highlight tags, which are stripped.

        Returns:
            str: the ``map.naver.com`` URL containing the place id.

        Raises:
            Exception: if neither attempt lands on a ``/place/`` page
                (the search result is ambiguous). A loading timeout is
                raised by Playwright from ``goto_url`` itself.
        """
        title = selected['title'].replace("<b>", "").replace("</b>", "")
        address = selected.get('roadAddress', selected['address']).replace("<b>", "").replace("</b>", "")
        encoded_query = parse.quote(f"{address} {title}")
        url = f"https://map.naver.com/p/search/{encoded_query}"
        await self.goto_url(url, wait_until="networkidle", timeout=self._max_retry / 2 * 1000)
        if "/place/" in self.page.url:
            return self.page.url
        # Second attempt: force the "correct answer" variant of the search.
        url = self.page.url.replace("?", "?isCorrectAnswer=true&")
        await self.goto_url(url, wait_until="networkidle", timeout=self._max_retry / 2 * 1000)
        if "/place/" in self.page.url:
            return self.page.url
        # BUG FIX: the original checked ``count == self._max_retry / 2`` here,
        # but ``count`` was never defined, so this path raised NameError
        # instead of the intended exception. Timeouts are already surfaced by
        # Playwright inside goto_url, so reaching this point means ambiguity.
        raise Exception("Failed to identify place id. item is ambiguous")

View File

@ -112,8 +112,8 @@ class nvMapScraper():
facilities = c_elem.parent.parent.find('div').string
return facilities
url = "https://naver.me/IgJGCCic"
scraper = nvMapScraper(url)
asyncio.run(scraper.scrap())
print(scraper.image_link_list)
print(len(scraper.image_link_list))
# url = "https://naver.me/IgJGCCic"
# scraper = nvMapScraper(url)
# asyncio.run(scraper.scrap())
# print(scraper.image_link_list)
# print(len(scraper.image_link_list))