diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..8e90c8c Binary files /dev/null and b/.DS_Store differ diff --git a/app/.DS_Store b/app/.DS_Store new file mode 100644 index 0000000..725195a Binary files /dev/null and b/app/.DS_Store differ diff --git a/app/home/.DS_Store b/app/home/.DS_Store new file mode 100644 index 0000000..aa241cc Binary files /dev/null and b/app/home/.DS_Store differ diff --git a/app/home/api/routers/v1/home.py b/app/home/api/routers/v1/home.py index 440cb8b..fad8ca7 100644 --- a/app/home/api/routers/v1/home.py +++ b/app/home/api/routers/v1/home.py @@ -193,9 +193,12 @@ async def crawling(request_body: CrawlingRequest): logger.info(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)") print(f"[crawling] Step 3-3: GPT API 호출 완료 - 응답 {len(raw_response)}자 ({step3_3_elapsed:.1f}ms)") - # Step 3-4: 응답 파싱 + # Step 3-4: 응답 파싱 (크롤링에서 가져온 facility_info 전달) step3_4_start = time.perf_counter() - parsed = await chatgpt_service.parse_marketing_analysis(raw_response) + print(f"[crawling] Step 3-4: 응답 파싱 시작 - facility_info: {scraper.facility_info}") + parsed = await chatgpt_service.parse_marketing_analysis( + raw_response, facility_info=scraper.facility_info + ) marketing_analysis = MarketingAnalysis(**parsed) step3_4_elapsed = (time.perf_counter() - step3_4_start) * 1000 print(f"[crawling] Step 3-4: 응답 파싱 완료 ({step3_4_elapsed:.1f}ms)") diff --git a/app/lyric/.DS_Store b/app/lyric/.DS_Store new file mode 100644 index 0000000..6b464c0 Binary files /dev/null and b/app/lyric/.DS_Store differ diff --git a/app/song/.DS_Store b/app/song/.DS_Store new file mode 100644 index 0000000..4e8bde6 Binary files /dev/null and b/app/song/.DS_Store differ diff --git a/app/utils/chatgpt_prompt.py b/app/utils/chatgpt_prompt.py index f2c2ef9..cca878e 100644 --- a/app/utils/chatgpt_prompt.py +++ b/app/utils/chatgpt_prompt.py @@ -160,18 +160,10 @@ Provide comprehensive marketing analysis including: - Return as JSON with key 
"tags" - **MUST be written in Korean (한국어)** -2. Facilities - - Based on the business name and region details, identify 5 likely facilities/amenities - - Consider typical facilities for accommodations in the given region - - Examples: 바베큐장, 수영장, 주차장, 와이파이, 주방, 테라스, 정원, etc. - - Return as JSON with key "facilities" - - **MUST be written in Korean (한국어)** - [CRITICAL LANGUAGE REQUIREMENT - ABSOLUTE RULE] ALL OUTPUT MUST BE WRITTEN IN KOREAN (한국어) - Analysis sections: Korean only - Tags: Korean only -- Facilities: Korean only - This is a NON-NEGOTIABLE requirement - Any output in English or other languages is considered a FAILURE - Violation of this rule invalidates the entire response @@ -203,8 +195,7 @@ ALL OUTPUT MUST BE WRITTEN IN KOREAN (한국어) ## JSON Data ```json {{ - "tags": ["태그1", "태그2", "태그3", "태그4", "태그5"], - "facilities": ["부대시설1", "부대시설2", "부대시설3", "부대시설4", "부대시설5"] + "tags": ["태그1", "태그2", "태그3", "태그4", "태그5"] }} ``` --- @@ -361,9 +352,15 @@ class ChatgptService: return result - async def parse_marketing_analysis(self, raw_response: str) -> dict: + async def parse_marketing_analysis( + self, raw_response: str, facility_info: str | None = None + ) -> dict: """ChatGPT 마케팅 분석 응답을 파싱하고 요약하여 딕셔너리로 반환 + Args: + raw_response: ChatGPT 마케팅 분석 응답 원문 + facility_info: 크롤링에서 가져온 편의시설 정보 문자열 + Returns: dict: {"report": str, "tags": list[str], "facilities": list[str]} """ @@ -377,7 +374,7 @@ class ChatgptService: try: json_data = json.loads(json_match.group(1)) tags = json_data.get("tags", []) - facilities = json_data.get("facilities", []) + print(f"[parse_marketing_analysis] GPT 응답에서 tags 파싱 완료: {tags}") # JSON 블록을 제외한 리포트 부분 추출 report = raw_response[: json_match.start()].strip() # --- 구분자 제거 @@ -386,10 +383,22 @@ class ChatgptService: if report.endswith("---"): report = report[:-3].strip() except json.JSONDecodeError: + print("[parse_marketing_analysis] JSON 파싱 실패") pass + # 크롤링에서 가져온 facility_info로 facilities 설정 + print(f"[parse_marketing_analysis] 크롤링 facility_info 
원본: {facility_info}") + if facility_info: + # 쉼표로 구분된 편의시설 문자열을 리스트로 변환 + facilities = [f.strip() for f in facility_info.split(",") if f.strip()] + print(f"[parse_marketing_analysis] facility_info 파싱 결과: {facilities}") + else: + facilities = ["등록된 정보 없음"] + print("[parse_marketing_analysis] facility_info 없음 - '등록된 정보 없음' 설정") + # 리포트 내용을 500자로 요약 if report: report = await self.summarize_marketing(report) + print(f"[parse_marketing_analysis] 최종 facilities: {facilities}") return {"report": report, "tags": tags, "facilities": facilities} diff --git a/app/utils/nvMapPwScraper.py b/app/utils/nvMapPwScraper.py new file mode 100644 index 0000000..d724764 --- /dev/null +++ b/app/utils/nvMapPwScraper.py @@ -0,0 +1,113 @@ +import asyncio +from playwright.async_api import async_playwright +from urllib import parse + +class nvMapPwScraper(): + # cls vars + is_ready = False + _playwright = None + _browser = None + _context = None + _win_width = 1280 + _win_height = 720 + _max_retry = 30 # place id timeout threshold seconds + + # instance var + page = None + + @classmethod + def default_context_builder(cls): + context_builder_dict = {} + context_builder_dict['viewport'] = { + 'width' : cls._win_width, + 'height' : cls._win_height + } + context_builder_dict['screen'] = { + 'width' : cls._win_width, + 'height' : cls._win_height + } + context_builder_dict['user_agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36" + context_builder_dict['locale'] = 'ko-KR' + context_builder_dict['timezone_id']='Asia/Seoul' + + return context_builder_dict + + @classmethod + async def initiate_scraper(cls): + if not cls._playwright: + cls._playwright = await async_playwright().start() + if not cls._browser: + cls._browser = await cls._playwright.chromium.launch(headless=True) + if not cls._context: + cls._context = await cls._browser.new_context(**cls.default_context_builder()) + cls.is_ready = True + + def __init__(self): + if not 
self.is_ready: + raise Exception("nvMapScraper is not initiated") + + async def __aenter__(self): + await self.create_page() + return self + + async def __aexit__(self, exc_type, exc, tb): + await self.page.close() + + async def create_page(self): + self.page = await self._context.new_page() + await self.page.add_init_script( +'''const defaultGetter = Object.getOwnPropertyDescriptor( + Navigator.prototype, + "webdriver" +).get; +defaultGetter.apply(navigator); +defaultGetter.toString(); +Object.defineProperty(Navigator.prototype, "webdriver", { + set: undefined, + enumerable: true, + configurable: true, + get: new Proxy(defaultGetter, { + apply: (target, thisArg, args) => { + Reflect.apply(target, thisArg, args); + return false; + }, + }), +}); +const patchedGetter = Object.getOwnPropertyDescriptor( + Navigator.prototype, + "webdriver" +).get; +patchedGetter.apply(navigator); +patchedGetter.toString();''') + + await self.page.set_extra_http_headers({ + 'sec-ch-ua': '\"Not?A_Brand\";v=\"99\", \"Chromium\";v=\"130\"' + }) + await self.page.goto("http://google.com") + + async def goto_url(self, url, wait_until="domcontentloaded", timeout=20000): + page = self.page + await page.goto(url, wait_until=wait_until, timeout=timeout) + + async def get_place_id_url(self, selected): + + title = selected['title'].replace("<b>", "").replace("</b>", "")  # strip search-API highlight tags — TODO confirm tag form + address = selected.get('roadAddress', selected['address']).replace("<b>", "").replace("</b>", "") + encoded_query = parse.quote(f"{address} {title}") + url = f"https://map.naver.com/p/search/{encoded_query}" + + await self.goto_url(url, wait_until="networkidle",timeout = self._max_retry/2*1000) + + if "/place/" in self.page.url: + return self.page.url + + url = self.page.url.replace("?","?isCorrectAnswer=true&") + await self.goto_url(url, wait_until="networkidle",timeout = self._max_retry/2*1000) + + if "/place/" in self.page.url: + return self.page.url + + # NOTE(review): previous code compared an undefined name `count` here (NameError on this path; + # there is no retry loop in this method, so only the ambiguous-result case is reachable). + # Both navigation attempts completed without reaching a /place/ URL. + raise Exception("Failed to identify place id. item is ambiguous")
diff --git a/app/video/.DS_Store b/app/video/.DS_Store new file mode 100644 index 0000000..2803a5f Binary files /dev/null and b/app/video/.DS_Store differ diff --git a/docs/.DS_Store b/docs/.DS_Store new file mode 100644 index 0000000..d146433 Binary files /dev/null and b/docs/.DS_Store differ diff --git a/image/.DS_Store b/image/.DS_Store new file mode 100644 index 0000000..e870d4f Binary files /dev/null and b/image/.DS_Store differ diff --git a/image/2025-12-26/.DS_Store b/image/2025-12-26/.DS_Store new file mode 100644 index 0000000..bf8036b Binary files /dev/null and b/image/2025-12-26/.DS_Store differ diff --git a/poc/.DS_Store b/poc/.DS_Store new file mode 100644 index 0000000..098ffaf Binary files /dev/null and b/poc/.DS_Store differ diff --git a/poc/crawling/2026-01-12/main-PwScraper.py b/poc/crawling/2026-01-12/main-PwScraper.py new file mode 100644 index 0000000..5030706 --- /dev/null +++ b/poc/crawling/2026-01-12/main-PwScraper.py @@ -0,0 +1,29 @@ +import asyncio +from nvMapScraper import nvMapScraper +from nvMapPwScraper import nvMapPwScraper + +async def main_function(): + await nvMapPwScraper.initiate_scraper() + selected = {'title': '스테이,머뭄', + 'link': 'https://www.instagram.com/staymeomoom', + 'category': '숙박>펜션', + 'description': '', + 'telephone': '', + 'address': '전북특별자치도 군산시 신흥동 63-18', + 'roadAddress': '전북특별자치도 군산시 절골길 18', + 'mapx': '1267061254', + 'mapy': '359864175', + 'lng': 126.7061254, + 'lat': 35.9864175} + + async with nvMapPwScraper() as pw_scraper: + new_url = await pw_scraper.get_place_id_url(selected) + + print(new_url) + nv_scraper = nvMapScraper(new_url) # 이후 동일한 플로우 + await nv_scraper.scrap() + print(nv_scraper.rawdata) + return + +print("running main_function..") +asyncio.run(main_function()) \ No newline at end of file diff --git a/poc/crawling/2026-01-12/nvMapPwScraper.py b/poc/crawling/2026-01-12/nvMapPwScraper.py new file mode 100644
index 0000000..d724764 --- /dev/null +++ b/poc/crawling/2026-01-12/nvMapPwScraper.py @@ -0,0 +1,113 @@ +import asyncio +from playwright.async_api import async_playwright +from urllib import parse + +class nvMapPwScraper(): + # cls vars + is_ready = False + _playwright = None + _browser = None + _context = None + _win_width = 1280 + _win_height = 720 + _max_retry = 30 # place id timeout threshold seconds + + # instance var + page = None + + @classmethod + def default_context_builder(cls): + context_builder_dict = {} + context_builder_dict['viewport'] = { + 'width' : cls._win_width, + 'height' : cls._win_height + } + context_builder_dict['screen'] = { + 'width' : cls._win_width, + 'height' : cls._win_height + } + context_builder_dict['user_agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36" + context_builder_dict['locale'] = 'ko-KR' + context_builder_dict['timezone_id']='Asia/Seoul' + + return context_builder_dict + + @classmethod + async def initiate_scraper(cls): + if not cls._playwright: + cls._playwright = await async_playwright().start() + if not cls._browser: + cls._browser = await cls._playwright.chromium.launch(headless=True) + if not cls._context: + cls._context = await cls._browser.new_context(**cls.default_context_builder()) + cls.is_ready = True + + def __init__(self): + if not self.is_ready: + raise Exception("nvMapScraper is not initiated") + + async def __aenter__(self): + await self.create_page() + return self + + async def __aexit__(self, exc_type, exc, tb): + await self.page.close() + + async def create_page(self): + self.page = await self._context.new_page() + await self.page.add_init_script( +'''const defaultGetter = Object.getOwnPropertyDescriptor( + Navigator.prototype, + "webdriver" +).get; +defaultGetter.apply(navigator); +defaultGetter.toString(); +Object.defineProperty(Navigator.prototype, "webdriver", { + set: undefined, + enumerable: true, + configurable: true, + get: 
new Proxy(defaultGetter, { + apply: (target, thisArg, args) => { + Reflect.apply(target, thisArg, args); + return false; + }, + }), +}); +const patchedGetter = Object.getOwnPropertyDescriptor( + Navigator.prototype, + "webdriver" +).get; +patchedGetter.apply(navigator); +patchedGetter.toString();''') + + await self.page.set_extra_http_headers({ + 'sec-ch-ua': '\"Not?A_Brand\";v=\"99\", \"Chromium\";v=\"130\"' + }) + await self.page.goto("http://google.com") + + async def goto_url(self, url, wait_until="domcontentloaded", timeout=20000): + page = self.page + await page.goto(url, wait_until=wait_until, timeout=timeout) + + async def get_place_id_url(self, selected): + + title = selected['title'].replace("<b>", "").replace("</b>", "")  # strip search-API highlight tags — TODO confirm tag form + address = selected.get('roadAddress', selected['address']).replace("<b>", "").replace("</b>", "") + encoded_query = parse.quote(f"{address} {title}") + url = f"https://map.naver.com/p/search/{encoded_query}" + + await self.goto_url(url, wait_until="networkidle",timeout = self._max_retry/2*1000) + + if "/place/" in self.page.url: + return self.page.url + + url = self.page.url.replace("?","?isCorrectAnswer=true&") + await self.goto_url(url, wait_until="networkidle",timeout = self._max_retry/2*1000) + + if "/place/" in self.page.url: + return self.page.url + + # NOTE(review): previous code compared an undefined name `count` here (NameError on this path; + # there is no retry loop in this method, so only the ambiguous-result case is reachable). + # Both navigation attempts completed without reaching a /place/ URL. + raise Exception("Failed to identify place id. item is ambiguous")
diff --git a/poc/crawling/nvMapScraper-2026-01-12.py b/poc/crawling/2026-01-12/nvMapScraper.py similarity index 96% rename from poc/crawling/nvMapScraper-2026-01-12.py rename to poc/crawling/2026-01-12/nvMapScraper.py index 7d155e4..38bc1cd 100644 --- a/poc/crawling/nvMapScraper-2026-01-12.py +++ b/poc/crawling/2026-01-12/nvMapScraper.py @@ -112,8 +112,8 @@ class nvMapScraper(): facilities = c_elem.parent.parent.find('div').string return facilities -url = "https://naver.me/IgJGCCic" -scraper = nvMapScraper(url) -asyncio.run(scraper.scrap()) -print(scraper.image_link_list) -print(len(scraper.image_link_list)) \ No newline at end of file +# url = "https://naver.me/IgJGCCic" +# scraper = nvMapScraper(url) +# asyncio.run(scraper.scrap()) +# print(scraper.image_link_list) +# print(len(scraper.image_link_list)) \ No newline at end of file