add facilities from result of crawling

insta
Dohyun Lim 2026-01-12 16:50:16 +09:00
parent b84c07c325
commit 3f75b6d61d
16 changed files with 286 additions and 19 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

BIN
app/.DS_Store vendored Normal file

Binary file not shown.

BIN
app/home/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -193,9 +193,12 @@ async def crawling(request_body: CrawlingRequest):
logger.info(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)")
print(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)")
# Step 3-4: 응답 파싱
# Step 3-4: 응답 파싱 (크롤링에서 가져온 facility_info 전달)
step3_4_start = time.perf_counter()
parsed = await chatgpt_service.parse_marketing_analysis(raw_response)
print(f"[crawling] Step 3-4: 응답 파싱 시작 - facility_info: {scraper.facility_info}")
parsed = await chatgpt_service.parse_marketing_analysis(
raw_response, facility_info=scraper.facility_info
)
marketing_analysis = MarketingAnalysis(**parsed)
step3_4_elapsed = (time.perf_counter() - step3_4_start) * 1000
print(f"[crawling] Step 3-4: 응답 파싱 완료 ({step3_4_elapsed:.1f}ms)")

BIN
app/lyric/.DS_Store vendored Normal file

Binary file not shown.

BIN
app/song/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -160,18 +160,10 @@ Provide comprehensive marketing analysis including:
- Return as JSON with key "tags"
- **MUST be written in Korean (한국어)**
2. Facilities
- Based on the business name and region details, identify 5 likely facilities/amenities
- Consider typical facilities for accommodations in the given region
- Examples: 바베큐장, 수영장, 주차장, 와이파이, 주방, 테라스, 정원, etc.
- Return as JSON with key "facilities"
- **MUST be written in Korean (한국어)**
[CRITICAL LANGUAGE REQUIREMENT - ABSOLUTE RULE]
ALL OUTPUT MUST BE WRITTEN IN KOREAN (한국어)
- Analysis sections: Korean only
- Tags: Korean only
- Facilities: Korean only
- This is a NON-NEGOTIABLE requirement
- Any output in English or other languages is considered a FAILURE
- Violation of this rule invalidates the entire response
@ -203,8 +195,7 @@ ALL OUTPUT MUST BE WRITTEN IN KOREAN (한국어)
## JSON Data
```json
{{
"tags": ["태그1", "태그2", "태그3", "태그4", "태그5"],
"facilities": ["부대시설1", "부대시설2", "부대시설3", "부대시설4", "부대시설5"]
"tags": ["태그1", "태그2", "태그3", "태그4", "태그5"]
}}
```
---
@ -361,9 +352,15 @@ class ChatgptService:
return result
async def parse_marketing_analysis(self, raw_response: str) -> dict:
async def parse_marketing_analysis(
self, raw_response: str, facility_info: str | None = None
) -> dict:
"""ChatGPT 마케팅 분석 응답을 파싱하고 요약하여 딕셔너리로 반환
Args:
raw_response: ChatGPT 마케팅 분석 응답 원문
facility_info: 크롤링에서 가져온 편의시설 정보 문자열
Returns:
dict: {"report": str, "tags": list[str], "facilities": list[str]}
"""
@ -377,7 +374,7 @@ class ChatgptService:
try:
json_data = json.loads(json_match.group(1))
tags = json_data.get("tags", [])
facilities = json_data.get("facilities", [])
print(f"[parse_marketing_analysis] GPT 응답에서 tags 파싱 완료: {tags}")
# JSON 블록을 제외한 리포트 부분 추출
report = raw_response[: json_match.start()].strip()
# --- 구분자 제거
@ -386,10 +383,22 @@ class ChatgptService:
if report.endswith("---"):
report = report[:-3].strip()
except json.JSONDecodeError:
print("[parse_marketing_analysis] JSON 파싱 실패")
pass
# 크롤링에서 가져온 facility_info로 facilities 설정
print(f"[parse_marketing_analysis] 크롤링 facility_info 원본: {facility_info}")
if facility_info:
# 쉼표로 구분된 편의시설 문자열을 리스트로 변환
facilities = [f.strip() for f in facility_info.split(",") if f.strip()]
print(f"[parse_marketing_analysis] facility_info 파싱 결과: {facilities}")
else:
facilities = ["등록된 정보 없음"]
print("[parse_marketing_analysis] facility_info 없음 - '등록된 정보 없음' 설정")
# 리포트 내용을 500자로 요약
if report:
report = await self.summarize_marketing(report)
print(f"[parse_marketing_analysis] 최종 facilities: {facilities}")
return {"report": report, "tags": tags, "facilities": facilities}

113
app/utils/nvMapPwScraper.py Normal file
View File

@ -0,0 +1,113 @@
import asyncio
from playwright.async_api import async_playwright
from urllib import parse
class nvMapPwScraper():
    """Playwright-based Naver Map scraper that resolves a local-search
    result into a concrete ``/place/`` map URL.

    The Playwright engine, browser, and context are shared class-wide:
    call ``await nvMapPwScraper.initiate_scraper()`` once before creating
    instances. Each instance owns a single page and is meant to be used
    as an async context manager (``async with nvMapPwScraper() as s:``).
    """

    # class-level (shared) state
    is_ready = False      # True once initiate_scraper() has completed
    _playwright = None    # Playwright engine handle
    _browser = None       # shared headless Chromium instance
    _context = None       # shared browser context (one per process)
    _win_width = 1280
    _win_height = 720
    _max_retry = 30       # place id timeout threshold, in seconds

    # instance-level state
    page = None           # the page owned by this instance

    @classmethod
    def default_context_builder(cls):
        """Return ``new_context`` kwargs mimicking a Korean-locale desktop Chrome."""
        return {
            'viewport': {'width': cls._win_width, 'height': cls._win_height},
            'screen': {'width': cls._win_width, 'height': cls._win_height},
            'user_agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
            'locale': 'ko-KR',
            'timezone_id': 'Asia/Seoul',
        }

    @classmethod
    async def initiate_scraper(cls):
        """Lazily start Playwright and build the shared browser/context.

        Safe to call more than once; each resource is created only if missing.
        """
        if not cls._playwright:
            cls._playwright = await async_playwright().start()
        if not cls._browser:
            cls._browser = await cls._playwright.chromium.launch(headless=True)
        if not cls._context:
            cls._context = await cls._browser.new_context(**cls.default_context_builder())
        cls.is_ready = True

    def __init__(self):
        # Shared browser state must exist before any instance is usable.
        if not self.is_ready:
            # BUG FIX: the original message named the wrong class
            # ("nvMapScraper"), a copy-paste leftover from the sibling scraper.
            raise Exception("nvMapPwScraper is not initiated")

    async def __aenter__(self):
        await self.create_page()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.page.close()

    async def create_page(self):
        """Open a new page with anti-bot countermeasures applied.

        Patches ``navigator.webdriver`` to report False while still looking
        like the native getter, sets a plausible ``sec-ch-ua`` header, and
        performs a warm-up navigation so later redirects look organic.
        """
        self.page = await self._context.new_page()
        await self.page.add_init_script(
            '''const defaultGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
defaultGetter.apply(navigator);
defaultGetter.toString();
Object.defineProperty(Navigator.prototype, "webdriver", {
  set: undefined,
  enumerable: true,
  configurable: true,
  get: new Proxy(defaultGetter, {
    apply: (target, thisArg, args) => {
      Reflect.apply(target, thisArg, args);
      return false;
    },
  }),
});
const patchedGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
patchedGetter.apply(navigator);
patchedGetter.toString();''')
        await self.page.set_extra_http_headers({
            'sec-ch-ua': '\"Not?A_Brand\";v=\"99\", \"Chromium\";v=\"130\"'
        })
        await self.page.goto("http://google.com")

    async def goto_url(self, url, wait_until="domcontentloaded", timeout=20000):
        """Navigate this instance's page to *url* (timeout in milliseconds)."""
        await self.page.goto(url, wait_until=wait_until, timeout=timeout)

    async def get_place_id_url(self, selected):
        """Resolve a Naver local-search item to a map URL containing ``/place/``.

        Args:
            selected: dict from the Naver local-search API with at least
                'title' and 'address' (optionally 'roadAddress'); values may
                contain ``<b>`` highlight tags, which are stripped.

        Returns:
            str: the ``map.naver.com`` URL containing the place id.

        Raises:
            Exception: if neither attempt lands on a ``/place/`` page
                (the search result is ambiguous). A loading timeout is
                raised by Playwright from ``goto_url`` itself.
        """
        title = selected['title'].replace("<b>", "").replace("</b>", "")
        address = selected.get('roadAddress', selected['address']).replace("<b>", "").replace("</b>", "")
        encoded_query = parse.quote(f"{address} {title}")
        url = f"https://map.naver.com/p/search/{encoded_query}"
        await self.goto_url(url, wait_until="networkidle", timeout=self._max_retry / 2 * 1000)
        if "/place/" in self.page.url:
            return self.page.url
        # Second attempt: force the "correct answer" variant of the search.
        url = self.page.url.replace("?", "?isCorrectAnswer=true&")
        await self.goto_url(url, wait_until="networkidle", timeout=self._max_retry / 2 * 1000)
        if "/place/" in self.page.url:
            return self.page.url
        # BUG FIX: the original checked ``count == self._max_retry / 2`` here,
        # but ``count`` was never defined, so this path raised NameError
        # instead of the intended exception. Timeouts are already surfaced by
        # Playwright inside goto_url, so reaching this point means ambiguity.
        raise Exception("Failed to identify place id. item is ambiguous")

BIN
app/video/.DS_Store vendored Normal file

Binary file not shown.

BIN
docs/.DS_Store vendored Normal file

Binary file not shown.

BIN
image/.DS_Store vendored Normal file

Binary file not shown.

BIN
image/2025-12-26/.DS_Store vendored Normal file

Binary file not shown.

BIN
poc/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,29 @@
import asyncio
from nvMapScraper import nvMapScraper
from nvMapPwScraper import nvMapPwScraper
async def main_function():
    """Resolve a sample Naver local-search item to a place URL, then
    run the regular scraping flow against it.

    Proof-of-concept driver: starts the shared Playwright browser, resolves
    the place id URL for a hard-coded sample item, and prints the raw data
    scraped from the resolved page.
    """
    await nvMapPwScraper.initiate_scraper()
    # Sample item in the shape returned by the Naver local-search API
    # (titles carry <b> highlight tags, which the scraper strips).
    selected = {'title': '<b>스테이</b>,<b>머뭄</b>',
                'link': 'https://www.instagram.com/staymeomoom',
                'category': '숙박>펜션',
                'description': '',
                'telephone': '',
                'address': '전북특별자치도 군산시 신흥동 63-18',
                'roadAddress': '전북특별자치도 군산시 절골길 18',
                'mapx': '1267061254',
                'mapy': '359864175',
                'lng': 126.7061254,
                'lat': 35.9864175}
    async with nvMapPwScraper() as pw_scraper:
        new_url = await pw_scraper.get_place_id_url(selected)
        print(new_url)
        # Hand the resolved /place/ URL to the existing scraper flow.
        nv_scraper = nvMapScraper(new_url)
        await nv_scraper.scrap()
        print(nv_scraper.rawdata)


if __name__ == "__main__":
    # BUG FIX: guard the entry point so importing this module does not launch
    # a browser; also fixed the "main_funtion" typo in the log message.
    print("running main_function..")
    asyncio.run(main_function())

View File

@ -0,0 +1,113 @@
import asyncio
from playwright.async_api import async_playwright
from urllib import parse
class nvMapPwScraper():
    """Playwright-based Naver Map scraper that resolves a local-search
    result into a concrete ``/place/`` map URL.

    The Playwright engine, browser, and context are shared class-wide:
    call ``await nvMapPwScraper.initiate_scraper()`` once before creating
    instances. Each instance owns a single page and is meant to be used
    as an async context manager (``async with nvMapPwScraper() as s:``).
    """

    # class-level (shared) state
    is_ready = False      # True once initiate_scraper() has completed
    _playwright = None    # Playwright engine handle
    _browser = None       # shared headless Chromium instance
    _context = None       # shared browser context (one per process)
    _win_width = 1280
    _win_height = 720
    _max_retry = 30       # place id timeout threshold, in seconds

    # instance-level state
    page = None           # the page owned by this instance

    @classmethod
    def default_context_builder(cls):
        """Return ``new_context`` kwargs mimicking a Korean-locale desktop Chrome."""
        return {
            'viewport': {'width': cls._win_width, 'height': cls._win_height},
            'screen': {'width': cls._win_width, 'height': cls._win_height},
            'user_agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
            'locale': 'ko-KR',
            'timezone_id': 'Asia/Seoul',
        }

    @classmethod
    async def initiate_scraper(cls):
        """Lazily start Playwright and build the shared browser/context.

        Safe to call more than once; each resource is created only if missing.
        """
        if not cls._playwright:
            cls._playwright = await async_playwright().start()
        if not cls._browser:
            cls._browser = await cls._playwright.chromium.launch(headless=True)
        if not cls._context:
            cls._context = await cls._browser.new_context(**cls.default_context_builder())
        cls.is_ready = True

    def __init__(self):
        # Shared browser state must exist before any instance is usable.
        if not self.is_ready:
            # BUG FIX: the original message named the wrong class
            # ("nvMapScraper"), a copy-paste leftover from the sibling scraper.
            raise Exception("nvMapPwScraper is not initiated")

    async def __aenter__(self):
        await self.create_page()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.page.close()

    async def create_page(self):
        """Open a new page with anti-bot countermeasures applied.

        Patches ``navigator.webdriver`` to report False while still looking
        like the native getter, sets a plausible ``sec-ch-ua`` header, and
        performs a warm-up navigation so later redirects look organic.
        """
        self.page = await self._context.new_page()
        await self.page.add_init_script(
            '''const defaultGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
defaultGetter.apply(navigator);
defaultGetter.toString();
Object.defineProperty(Navigator.prototype, "webdriver", {
  set: undefined,
  enumerable: true,
  configurable: true,
  get: new Proxy(defaultGetter, {
    apply: (target, thisArg, args) => {
      Reflect.apply(target, thisArg, args);
      return false;
    },
  }),
});
const patchedGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
patchedGetter.apply(navigator);
patchedGetter.toString();''')
        await self.page.set_extra_http_headers({
            'sec-ch-ua': '\"Not?A_Brand\";v=\"99\", \"Chromium\";v=\"130\"'
        })
        await self.page.goto("http://google.com")

    async def goto_url(self, url, wait_until="domcontentloaded", timeout=20000):
        """Navigate this instance's page to *url* (timeout in milliseconds)."""
        await self.page.goto(url, wait_until=wait_until, timeout=timeout)

    async def get_place_id_url(self, selected):
        """Resolve a Naver local-search item to a map URL containing ``/place/``.

        Args:
            selected: dict from the Naver local-search API with at least
                'title' and 'address' (optionally 'roadAddress'); values may
                contain ``<b>`` highlight tags, which are stripped.

        Returns:
            str: the ``map.naver.com`` URL containing the place id.

        Raises:
            Exception: if neither attempt lands on a ``/place/`` page
                (the search result is ambiguous). A loading timeout is
                raised by Playwright from ``goto_url`` itself.
        """
        title = selected['title'].replace("<b>", "").replace("</b>", "")
        address = selected.get('roadAddress', selected['address']).replace("<b>", "").replace("</b>", "")
        encoded_query = parse.quote(f"{address} {title}")
        url = f"https://map.naver.com/p/search/{encoded_query}"
        await self.goto_url(url, wait_until="networkidle", timeout=self._max_retry / 2 * 1000)
        if "/place/" in self.page.url:
            return self.page.url
        # Second attempt: force the "correct answer" variant of the search.
        url = self.page.url.replace("?", "?isCorrectAnswer=true&")
        await self.goto_url(url, wait_until="networkidle", timeout=self._max_retry / 2 * 1000)
        if "/place/" in self.page.url:
            return self.page.url
        # BUG FIX: the original checked ``count == self._max_retry / 2`` here,
        # but ``count`` was never defined, so this path raised NameError
        # instead of the intended exception. Timeouts are already surfaced by
        # Playwright inside goto_url, so reaching this point means ambiguity.
        raise Exception("Failed to identify place id. item is ambiguous")

View File

@ -112,8 +112,8 @@ class nvMapScraper():
facilities = c_elem.parent.parent.find('div').string
return facilities
url = "https://naver.me/IgJGCCic"
scraper = nvMapScraper(url)
asyncio.run(scraper.scrap())
print(scraper.image_link_list)
print(len(scraper.image_link_list))
# url = "https://naver.me/IgJGCCic"
# scraper = nvMapScraper(url)
# asyncio.run(scraper.scrap())
# print(scraper.image_link_list)
# print(len(scraper.image_link_list))