diff --git a/app/utils/nvMapPwScraper.py b/app/utils/nvMapPwScraper.py index dde73f5..c93b5ec 100644 --- a/app/utils/nvMapPwScraper.py +++ b/app/utils/nvMapPwScraper.py @@ -105,6 +105,37 @@ patchedGetter.toString();''') def _similarity(a: str, b: str) -> float: return SequenceMatcher(None, a, b).ratio() + @staticmethod + def _refine_address(address: str) -> str: + """한국 주소 패턴에서 첫 번째 유효한 주소만 추출한다.""" + patterns = [ + # 도로명 (정식): 경기도 가평군 운악로 278 + re.compile( + r'[가-힣]+(?:특별시|광역시|특별자치시|도|특별자치도|시)\s+' + r'[가-힣\s]+?(?:로|길|대로)\s+\d+(?:-\d+)?' + ), + # 지번 (정식): 경기도 가평군 조종면 운악리 278 + re.compile( + r'[가-힣]+(?:특별시|광역시|특별자치시|도|특별자치도|시)\s+' + r'[가-힣\s]+?(?:읍|면|동|리|가)\s+\d+(?:-\d+)?' + ), + # 도로명 (축약): 경기 가평 운악로 278 + re.compile( + r'[가-힣]{1,4}\s+[가-힣]{1,6}\s+' + r'[가-힣\s]+?(?:로|길|대로)\s+\d+(?:-\d+)?' + ), + # 지번 (축약): 경기 가평 조종면 운악리 278 + re.compile( + r'[가-힣]{1,4}\s+[가-힣]{1,6}\s+' + r'[가-힣\s]+?(?:읍|면|동|리|가)\s+\d+(?:-\d+)?' + ), + ] + for pattern in patterns: + m = pattern.search(address) + if m: + return m.group().strip() + return address + async def _extract_candidates_from_list_page(self) -> list[dict]: """pcmap.place.naver.com iframe HTML에서 place ID와 업체명을 추출한다.""" pcmap_frame = None @@ -150,52 +181,70 @@ patchedGetter.toString();''') logger.debug(f"[DEBUG] 목록 후보 {len(candidates)}개 추출") return candidates - async def get_place_id_url(self, selected): - count = 0 - title = self._clean_title(selected['title']) - address = self._clean_title(selected.get('roadAddress', selected['address'])) - encoded_query = parse.quote(f"{address} {title}") + async def _try_search(self, address: str, title: str) -> str | None: + """주어진 주소+업체명으로 검색해서 place URL을 반환한다. 실패 시 None.""" + encoded_query = parse.quote(f"{address} {title}".strip()) url = f"https://map.naver.com/p/search/{encoded_query}" - while count <= self._max_retry: - try: - await self.goto_url(url, wait_until="networkidle", timeout=self._timeout * 1000) - except: - if "/place/" in self.page.url: - return self.page.url - logger.error("[ERROR] Can't Finish networkidle") - - logger.debug(f"[DEBUG] Try {count+1} : current url = {self.page.url}") - + try: + await self.goto_url(url, wait_until="networkidle", timeout=self._timeout * 1000) + except: if "/place/" in self.page.url: return self.page.url + logger.error("[ERROR] Can't Finish networkidle") - # 목록 페이지에 머문 경우 — 후보 추출 후 유사도 선택 - candidates = await self._extract_candidates_from_list_page() - if candidates: - best = max( - candidates, - key=lambda c: self._similarity(title, self._clean_title(c['title'])) - ) - best_score = self._similarity(title, self._clean_title(best['title'])) - logger.info( - f"[AUTO-SELECT] '{title}' → '{best['title']}' (score={best_score:.2f}) {best['place_url']}" - ) - return best['place_url'] + if "/place/" in self.page.url: + return self.page.url - # isCorrectAnswer 플래그 재시도 - url = self.page.url.replace("?", "?isCorrectAnswer=true&") - try: - await self.goto_url(url, wait_until="networkidle", timeout=self._timeout * 1000) - except: - if "/place/" in self.page.url: - return self.page.url - logger.error("[ERROR] Can't Finish networkidle") + candidates = await self._extract_candidates_from_list_page() + if candidates: + best = max( + candidates, + key=lambda c: self._similarity(title, self._clean_title(c['title'])) + ) + best_score = self._similarity(title, self._clean_title(best['title'])) + logger.info( + f"[AUTO-SELECT] '{title}' → '{best['title']}' (score={best_score:.2f}) {best['place_url']}" + ) + return best['place_url'] + # isCorrectAnswer=true 로 강제 단일결과 재시도 (원본 로직 유지) + correct_url = self.page.url.replace("?", "?isCorrectAnswer=true&") + try: + await self.goto_url(correct_url, wait_until="networkidle", timeout=self._timeout * 1000) + except: if "/place/" in self.page.url: return self.page.url + logger.error("[ERROR] Can't Finish networkidle (isCorrectAnswer)") - count += 1 + if "/place/" in self.page.url: + return self.page.url + + return None + + async def get_place_id_url(self, selected): + title = self._clean_title(selected['title']) + address = self._clean_title(selected.get('roadAddress', selected['address'])) + + # 1차 시도: 원본 주소 + 업체명 + logger.debug(f"[DEBUG] 1차 시도 - address: {address}") + result = await self._try_search(address, title) + if result: + return result + + # 2차 시도: 정제 주소 + 업체명 + refined = self._refine_address(address) + if refined != address: + logger.info(f"[REFINE] 주소 정제: '{address}' → '{refined}'") + result = await self._try_search(refined, title) + if result: + return result + + # 3차 시도: 업체명만으로 검색 + logger.info(f"[RETRY] 업체명만으로 재시도: '{title}'") + result = await self._try_search("", title) + if result: + return result logger.error(f"[ERROR] Not found url for {selected}") return None