add facilities from result of crawling
parent
b84c07c325
commit
3f75b6d61d
Binary file not shown.
Binary file not shown.
|
|
@ -193,9 +193,12 @@ async def crawling(request_body: CrawlingRequest):
|
||||||
logger.info(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)")
|
logger.info(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)")
|
||||||
print(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)")
|
print(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)")
|
||||||
|
|
||||||
# Step 3-4: 응답 파싱
|
# Step 3-4: 응답 파싱 (크롤링에서 가져온 facility_info 전달)
|
||||||
step3_4_start = time.perf_counter()
|
step3_4_start = time.perf_counter()
|
||||||
parsed = await chatgpt_service.parse_marketing_analysis(raw_response)
|
print(f"[crawling] Step 3-4: 응답 파싱 시작 - facility_info: {scraper.facility_info}")
|
||||||
|
parsed = await chatgpt_service.parse_marketing_analysis(
|
||||||
|
raw_response, facility_info=scraper.facility_info
|
||||||
|
)
|
||||||
marketing_analysis = MarketingAnalysis(**parsed)
|
marketing_analysis = MarketingAnalysis(**parsed)
|
||||||
step3_4_elapsed = (time.perf_counter() - step3_4_start) * 1000
|
step3_4_elapsed = (time.perf_counter() - step3_4_start) * 1000
|
||||||
print(f"[crawling] Step 3-4: 응답 파싱 완료 ({step3_4_elapsed:.1f}ms)")
|
print(f"[crawling] Step 3-4: 응답 파싱 완료 ({step3_4_elapsed:.1f}ms)")
|
||||||
|
|
|
||||||
Binary file not shown.
Binary file not shown.
|
|
@ -160,18 +160,10 @@ Provide comprehensive marketing analysis including:
|
||||||
- Return as JSON with key "tags"
|
- Return as JSON with key "tags"
|
||||||
- **MUST be written in Korean (한국어)**
|
- **MUST be written in Korean (한국어)**
|
||||||
|
|
||||||
2. Facilities
|
|
||||||
- Based on the business name and region details, identify 5 likely facilities/amenities
|
|
||||||
- Consider typical facilities for accommodations in the given region
|
|
||||||
- Examples: 바베큐장, 수영장, 주차장, 와이파이, 주방, 테라스, 정원, etc.
|
|
||||||
- Return as JSON with key "facilities"
|
|
||||||
- **MUST be written in Korean (한국어)**
|
|
||||||
|
|
||||||
[CRITICAL LANGUAGE REQUIREMENT - ABSOLUTE RULE]
|
[CRITICAL LANGUAGE REQUIREMENT - ABSOLUTE RULE]
|
||||||
ALL OUTPUT MUST BE WRITTEN IN KOREAN (한국어)
|
ALL OUTPUT MUST BE WRITTEN IN KOREAN (한국어)
|
||||||
- Analysis sections: Korean only
|
- Analysis sections: Korean only
|
||||||
- Tags: Korean only
|
- Tags: Korean only
|
||||||
- Facilities: Korean only
|
|
||||||
- This is a NON-NEGOTIABLE requirement
|
- This is a NON-NEGOTIABLE requirement
|
||||||
- Any output in English or other languages is considered a FAILURE
|
- Any output in English or other languages is considered a FAILURE
|
||||||
- Violation of this rule invalidates the entire response
|
- Violation of this rule invalidates the entire response
|
||||||
|
|
@ -203,8 +195,7 @@ ALL OUTPUT MUST BE WRITTEN IN KOREAN (한국어)
|
||||||
## JSON Data
|
## JSON Data
|
||||||
```json
|
```json
|
||||||
{{
|
{{
|
||||||
"tags": ["태그1", "태그2", "태그3", "태그4", "태그5"],
|
"tags": ["태그1", "태그2", "태그3", "태그4", "태그5"]
|
||||||
"facilities": ["부대시설1", "부대시설2", "부대시설3", "부대시설4", "부대시설5"]
|
|
||||||
}}
|
}}
|
||||||
```
|
```
|
||||||
---
|
---
|
||||||
|
|
@ -361,9 +352,15 @@ class ChatgptService:
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
async def parse_marketing_analysis(self, raw_response: str) -> dict:
|
async def parse_marketing_analysis(
|
||||||
|
self, raw_response: str, facility_info: str | None = None
|
||||||
|
) -> dict:
|
||||||
"""ChatGPT 마케팅 분석 응답을 파싱하고 요약하여 딕셔너리로 반환
|
"""ChatGPT 마케팅 분석 응답을 파싱하고 요약하여 딕셔너리로 반환
|
||||||
|
|
||||||
|
Args:
|
||||||
|
raw_response: ChatGPT 마케팅 분석 응답 원문
|
||||||
|
facility_info: 크롤링에서 가져온 편의시설 정보 문자열
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict: {"report": str, "tags": list[str], "facilities": list[str]}
|
dict: {"report": str, "tags": list[str], "facilities": list[str]}
|
||||||
"""
|
"""
|
||||||
|
|
@ -377,7 +374,7 @@ class ChatgptService:
|
||||||
try:
|
try:
|
||||||
json_data = json.loads(json_match.group(1))
|
json_data = json.loads(json_match.group(1))
|
||||||
tags = json_data.get("tags", [])
|
tags = json_data.get("tags", [])
|
||||||
facilities = json_data.get("facilities", [])
|
print(f"[parse_marketing_analysis] GPT 응답에서 tags 파싱 완료: {tags}")
|
||||||
# JSON 블록을 제외한 리포트 부분 추출
|
# JSON 블록을 제외한 리포트 부분 추출
|
||||||
report = raw_response[: json_match.start()].strip()
|
report = raw_response[: json_match.start()].strip()
|
||||||
# --- 구분자 제거
|
# --- 구분자 제거
|
||||||
|
|
@ -386,10 +383,22 @@ class ChatgptService:
|
||||||
if report.endswith("---"):
|
if report.endswith("---"):
|
||||||
report = report[:-3].strip()
|
report = report[:-3].strip()
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
|
print("[parse_marketing_analysis] JSON 파싱 실패")
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# 크롤링에서 가져온 facility_info로 facilities 설정
|
||||||
|
print(f"[parse_marketing_analysis] 크롤링 facility_info 원본: {facility_info}")
|
||||||
|
if facility_info:
|
||||||
|
# 쉼표로 구분된 편의시설 문자열을 리스트로 변환
|
||||||
|
facilities = [f.strip() for f in facility_info.split(",") if f.strip()]
|
||||||
|
print(f"[parse_marketing_analysis] facility_info 파싱 결과: {facilities}")
|
||||||
|
else:
|
||||||
|
facilities = ["등록된 정보 없음"]
|
||||||
|
print("[parse_marketing_analysis] facility_info 없음 - '등록된 정보 없음' 설정")
|
||||||
|
|
||||||
# 리포트 내용을 500자로 요약
|
# 리포트 내용을 500자로 요약
|
||||||
if report:
|
if report:
|
||||||
report = await self.summarize_marketing(report)
|
report = await self.summarize_marketing(report)
|
||||||
|
|
||||||
|
print(f"[parse_marketing_analysis] 최종 facilities: {facilities}")
|
||||||
return {"report": report, "tags": tags, "facilities": facilities}
|
return {"report": report, "tags": tags, "facilities": facilities}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,113 @@
|
||||||
|
import asyncio
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
from urllib import parse
|
||||||
|
|
||||||
|
class nvMapPwScraper():
    """Playwright-driven Naver Map scraper that resolves a search item to a /place/ URL.

    One Playwright instance, browser, and context are shared across all
    instances (class-level state, set up via initiate_scraper()); each
    instance owns a single page managed through the async context-manager
    protocol (``async with nvMapPwScraper() as s: ...``).
    """

    # --- class-level shared state ---
    is_ready = False      # True once initiate_scraper() has completed
    _playwright = None
    _browser = None
    _context = None
    _win_width = 1280
    _win_height = 720
    _max_retry = 30       # place id timeout threshold, seconds

    # --- instance state ---
    page = None           # Playwright Page, created in create_page()

    @classmethod
    def default_context_builder(cls):
        """Return kwargs for browser.new_context() mimicking a Korean desktop Chrome."""
        return {
            'viewport': {'width': cls._win_width, 'height': cls._win_height},
            'screen': {'width': cls._win_width, 'height': cls._win_height},
            'user_agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
            'locale': 'ko-KR',
            'timezone_id': 'Asia/Seoul',
        }

    @classmethod
    async def initiate_scraper(cls):
        """Start Playwright and launch the shared browser/context once; mark ready."""
        if not cls._playwright:
            cls._playwright = await async_playwright().start()
        if not cls._browser:
            cls._browser = await cls._playwright.chromium.launch(headless=True)
        if not cls._context:
            cls._context = await cls._browser.new_context(**cls.default_context_builder())
        cls.is_ready = True

    def __init__(self):
        # Instances are only usable after the shared browser state exists.
        if not self.is_ready:
            raise Exception("nvMapScraper is not initiated")

    async def __aenter__(self):
        await self.create_page()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.page.close()

    async def create_page(self):
        """Open a new page with a navigator.webdriver evasion patch applied."""
        self.page = await self._context.new_page()
        # Replace the webdriver getter with a Proxy that returns False while
        # still looking like a native getter (basic headless-detection evasion).
        await self.page.add_init_script(
            '''const defaultGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
defaultGetter.apply(navigator);
defaultGetter.toString();
Object.defineProperty(Navigator.prototype, "webdriver", {
  set: undefined,
  enumerable: true,
  configurable: true,
  get: new Proxy(defaultGetter, {
    apply: (target, thisArg, args) => {
      Reflect.apply(target, thisArg, args);
      return false;
    },
  }),
});
const patchedGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
patchedGetter.apply(navigator);
patchedGetter.toString();''')

        await self.page.set_extra_http_headers({
            'sec-ch-ua': '\"Not?A_Brand\";v=\"99\", \"Chromium\";v=\"130\"'
        })
        # Warm-up navigation so the fresh context has a plausible history.
        await self.page.goto("http://google.com")

    async def goto_url(self, url, wait_until="domcontentloaded", timeout=20000):
        """Navigate the instance page to *url* (thin wrapper over page.goto)."""
        await self.page.goto(url, wait_until=wait_until, timeout=timeout)

    async def get_place_id_url(self, selected):
        """Resolve a Naver local-search item to a map.naver.com /place/ URL.

        Args:
            selected: search-result dict with 'title' and 'address' /
                'roadAddress' keys; values may contain <b>...</b> markup.

        Returns:
            str: the resolved URL containing "/place/".

        Raises:
            Exception: when navigation never settles within the timeout, or
                when the item stays ambiguous after both attempts.
        """
        title = selected['title'].replace("<b>", "").replace("</b>", "")
        # Use .get twice so a missing 'roadAddress' does not eagerly index
        # (and KeyError on) a missing 'address'.
        address = selected.get('roadAddress', selected.get('address', '')).replace("<b>", "").replace("</b>", "")
        encoded_query = parse.quote(f"{address} {title}")
        url = f"https://map.naver.com/p/search/{encoded_query}"
        timeout_ms = self._max_retry / 2 * 1000

        # First attempt: plain search. When Naver resolves the query to a
        # single place, the final URL contains "/place/".
        try:
            await self.goto_url(url, wait_until="networkidle", timeout=timeout_ms)
        except Exception as err:
            raise Exception("Failed to identify place id. loading timeout") from err
        if "/place/" in self.page.url:
            return self.page.url

        # Second attempt: force the "correct answer" variant of the result page.
        retry_url = self.page.url.replace("?", "?isCorrectAnswer=true&")
        try:
            await self.goto_url(retry_url, wait_until="networkidle", timeout=timeout_ms)
        except Exception as err:
            raise Exception("Failed to identify place id. loading timeout") from err
        if "/place/" in self.page.url:
            return self.page.url

        # BUGFIX: the original compared an undefined name `count` here, so this
        # path always raised NameError instead of the intended diagnostics.
        raise Exception("Failed to identify place id. item is ambiguous")
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,29 @@
|
||||||
|
import asyncio
|
||||||
|
from nvMapScraper import nvMapScraper
|
||||||
|
from nvMapPwScraper import nvMapPwScraper
|
||||||
|
|
||||||
|
async def main_function():
    """Resolve a hard-coded Naver search item to its place URL, then scrape it."""
    await nvMapPwScraper.initiate_scraper()

    selected = {
        'title': '<b>스테이</b>,<b>머뭄</b>',
        'link': 'https://www.instagram.com/staymeomoom',
        'category': '숙박>펜션',
        'description': '',
        'telephone': '',
        'address': '전북특별자치도 군산시 신흥동 63-18',
        'roadAddress': '전북특별자치도 군산시 절골길 18',
        'mapx': '1267061254',
        'mapy': '359864175',
        'lng': 126.7061254,
        'lat': 35.9864175,
    }

    # Resolve the search item to a concrete /place/ URL via Playwright.
    async with nvMapPwScraper() as place_resolver:
        place_url = await place_resolver.get_place_id_url(selected)

    print(place_url)
    # Same scraping flow as before, now fed a direct place URL.
    detail_scraper = nvMapScraper(place_url)
    await detail_scraper.scrap()
    print(detail_scraper.rawdata)
    return


print("running main_funtion..")
asyncio.run(main_function())
|
||||||
|
|
@ -0,0 +1,113 @@
|
||||||
|
import asyncio
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
from urllib import parse
|
||||||
|
|
||||||
|
class nvMapPwScraper():
    """Playwright-driven Naver Map scraper that resolves a search item to a /place/ URL.

    One Playwright instance, browser, and context are shared across all
    instances (class-level state, set up via initiate_scraper()); each
    instance owns a single page managed through the async context-manager
    protocol (``async with nvMapPwScraper() as s: ...``).
    """

    # --- class-level shared state ---
    is_ready = False      # True once initiate_scraper() has completed
    _playwright = None
    _browser = None
    _context = None
    _win_width = 1280
    _win_height = 720
    _max_retry = 30       # place id timeout threshold, seconds

    # --- instance state ---
    page = None           # Playwright Page, created in create_page()

    @classmethod
    def default_context_builder(cls):
        """Return kwargs for browser.new_context() mimicking a Korean desktop Chrome."""
        return {
            'viewport': {'width': cls._win_width, 'height': cls._win_height},
            'screen': {'width': cls._win_width, 'height': cls._win_height},
            'user_agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
            'locale': 'ko-KR',
            'timezone_id': 'Asia/Seoul',
        }

    @classmethod
    async def initiate_scraper(cls):
        """Start Playwright and launch the shared browser/context once; mark ready."""
        if not cls._playwright:
            cls._playwright = await async_playwright().start()
        if not cls._browser:
            cls._browser = await cls._playwright.chromium.launch(headless=True)
        if not cls._context:
            cls._context = await cls._browser.new_context(**cls.default_context_builder())
        cls.is_ready = True

    def __init__(self):
        # Instances are only usable after the shared browser state exists.
        if not self.is_ready:
            raise Exception("nvMapScraper is not initiated")

    async def __aenter__(self):
        await self.create_page()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.page.close()

    async def create_page(self):
        """Open a new page with a navigator.webdriver evasion patch applied."""
        self.page = await self._context.new_page()
        # Replace the webdriver getter with a Proxy that returns False while
        # still looking like a native getter (basic headless-detection evasion).
        await self.page.add_init_script(
            '''const defaultGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
defaultGetter.apply(navigator);
defaultGetter.toString();
Object.defineProperty(Navigator.prototype, "webdriver", {
  set: undefined,
  enumerable: true,
  configurable: true,
  get: new Proxy(defaultGetter, {
    apply: (target, thisArg, args) => {
      Reflect.apply(target, thisArg, args);
      return false;
    },
  }),
});
const patchedGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
patchedGetter.apply(navigator);
patchedGetter.toString();''')

        await self.page.set_extra_http_headers({
            'sec-ch-ua': '\"Not?A_Brand\";v=\"99\", \"Chromium\";v=\"130\"'
        })
        # Warm-up navigation so the fresh context has a plausible history.
        await self.page.goto("http://google.com")

    async def goto_url(self, url, wait_until="domcontentloaded", timeout=20000):
        """Navigate the instance page to *url* (thin wrapper over page.goto)."""
        await self.page.goto(url, wait_until=wait_until, timeout=timeout)

    async def get_place_id_url(self, selected):
        """Resolve a Naver local-search item to a map.naver.com /place/ URL.

        Args:
            selected: search-result dict with 'title' and 'address' /
                'roadAddress' keys; values may contain <b>...</b> markup.

        Returns:
            str: the resolved URL containing "/place/".

        Raises:
            Exception: when navigation never settles within the timeout, or
                when the item stays ambiguous after both attempts.
        """
        title = selected['title'].replace("<b>", "").replace("</b>", "")
        # Use .get twice so a missing 'roadAddress' does not eagerly index
        # (and KeyError on) a missing 'address'.
        address = selected.get('roadAddress', selected.get('address', '')).replace("<b>", "").replace("</b>", "")
        encoded_query = parse.quote(f"{address} {title}")
        url = f"https://map.naver.com/p/search/{encoded_query}"
        timeout_ms = self._max_retry / 2 * 1000

        # First attempt: plain search. When Naver resolves the query to a
        # single place, the final URL contains "/place/".
        try:
            await self.goto_url(url, wait_until="networkidle", timeout=timeout_ms)
        except Exception as err:
            raise Exception("Failed to identify place id. loading timeout") from err
        if "/place/" in self.page.url:
            return self.page.url

        # Second attempt: force the "correct answer" variant of the result page.
        retry_url = self.page.url.replace("?", "?isCorrectAnswer=true&")
        try:
            await self.goto_url(retry_url, wait_until="networkidle", timeout=timeout_ms)
        except Exception as err:
            raise Exception("Failed to identify place id. loading timeout") from err
        if "/place/" in self.page.url:
            return self.page.url

        # BUGFIX: the original compared an undefined name `count` here, so this
        # path always raised NameError instead of the intended diagnostics.
        raise Exception("Failed to identify place id. item is ambiguous")
|
||||||
|
|
@ -112,8 +112,8 @@ class nvMapScraper():
|
||||||
facilities = c_elem.parent.parent.find('div').string
|
facilities = c_elem.parent.parent.find('div').string
|
||||||
return facilities
|
return facilities
|
||||||
|
|
||||||
url = "https://naver.me/IgJGCCic"
|
# url = "https://naver.me/IgJGCCic"
|
||||||
scraper = nvMapScraper(url)
|
# scraper = nvMapScraper(url)
|
||||||
asyncio.run(scraper.scrap())
|
# asyncio.run(scraper.scrap())
|
||||||
print(scraper.image_link_list)
|
# print(scraper.image_link_list)
|
||||||
print(len(scraper.image_link_list))
|
# print(len(scraper.image_link_list))
|
||||||
Loading…
Reference in New Issue