Playwright 모듈 PoC 추가

2026-01-12 11:01:03 +09:00 · 2026-01-12 11:01:03 +09:00 · b7edba8c80
parent 56d4c690bf
commit b7edba8c80
5 changed files with 404 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -27,3 +27,6 @@ build/
 *.mp3
 *.mp4
 media/
+
+
+*.ipynb_checkpoint*
--- a/poc/crawling/nvMapPwScraper.py
+++ b/poc/crawling/nvMapPwScraper.py
@ -0,0 +1,100 @@
+import asyncio
+from playwright.async_api import async_playwright
+from urllib import parse
+
+class nvMapPwScraper():
+    """Naver Map scraper on one shared headless Chromium; await initiate_scraper() first."""
+    # Class-level state shared by every instance.
+    is_ready = False
+    _playwright = None
+    _browser = None
+    _context = None
+    _win_width = 1280
+    _win_height = 720
+
+    @classmethod
+    def default_context_builder(cls):
+        """Return the keyword arguments passed to ``browser.new_context``."""
+        size = {'width': cls._win_width, 'height': cls._win_height}
+        return {
+            'viewport': dict(size),
+            'screen': dict(size),
+            'user_agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
+            'locale': 'ko-KR',
+            'timezone_id': 'Asia/Seoul',
+        }
+
+    @classmethod
+    async def initiate_scraper(cls):
+        """Create the Playwright driver, browser and context that are not set yet."""
+        if not cls._playwright:
+            cls._playwright = await async_playwright().start()
+        if not cls._browser:
+            cls._browser = await cls._playwright.chromium.launch(headless=True)
+        if not cls._context:
+            cls._context = await cls._browser.new_context(**cls.default_context_builder())
+        cls.is_ready = True
+
+    def __init__(self):
+        """Raise ``Exception`` unless ``initiate_scraper`` has already completed."""
+        if not self.is_ready:
+            raise Exception("nvMapScraper is not initiated")
+
+    async def create_page(self):
+        """Open ``self.page`` with the webdriver patch and ``sec-ch-ua`` header applied."""
+        while not self.is_ready:
+            # The sleep must be awaited: a bare call only creates a coroutine
+            # object, so the loop would spin without ever yielding control.
+            await asyncio.sleep(1)
+        self.page = await self._context.new_page()
+        # Init script: patch the ``navigator.webdriver`` getter so it returns false.
+        await self.page.add_init_script(
+'''const defaultGetter = Object.getOwnPropertyDescriptor(
+    Navigator.prototype,
+    "webdriver"
+).get;
+defaultGetter.apply(navigator);
+defaultGetter.toString();
+Object.defineProperty(Navigator.prototype, "webdriver", {
+    set: undefined,
+    enumerable: true,
+    configurable: true,
+    get: new Proxy(defaultGetter, {
+    apply: (target, thisArg, args) => {
+        Reflect.apply(target, thisArg, args);
+        return false;
+    },
+    }),
+});
+const patchedGetter = Object.getOwnPropertyDescriptor(
+    Navigator.prototype,
+    "webdriver"
+).get;
+patchedGetter.apply(navigator);
+patchedGetter.toString();''')
+
+        await self.page.set_extra_http_headers({
+            'sec-ch-ua': '\"Not?A_Brand\";v=\"99\", \"Chromium\";v=\"130\"'
+        })
+        await self.page.goto("http://google.com")
+
+    async def goto_url(self, url):
+        """Navigate ``self.page`` to ``url`` (DOMContentLoaded, ``timeout=20000``)."""
+        await self.page.goto(url, wait_until="domcontentloaded", timeout=20000)
+
+    async def get_place_id_url(self, selected):
+        """Search Naver Map for ``selected`` and return the resolved place URL.
+
+        Raises ``Exception`` if ``isCorrectAnswer=true`` never appears in the URL.
+        """
+        title = selected['title'].replace("<b>", "").replace("</b>", "")
+        # Fall back to 'address' only when 'roadAddress' is missing or empty.
+        address = (selected.get('roadAddress') or selected['address']).replace("<b>", "").replace("</b>", "")
+        encoded_query = parse.quote(f"{address} {title}")
+        url = f"https://map.naver.com/p/search/{encoded_query}"
+        await self.goto_url(url)
+        for _ in range(5):
+            if "isCorrectAnswer=true" in self.page.url:
+                return self.page.url
+            await asyncio.sleep(1)
+        raise Exception("Failed to identify place id. item is ambiguous")
--- a/poc/timestamp_lyric/Untitled.ipynb
+++ b/poc/timestamp_lyric/Untitled.ipynb
@ -0,0 +1,239 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "99398cc7-e36a-494c-88f9-b26874ff0294",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import aiohttp\n",
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "28c3e49b-1133-4a18-ab70-fd321b4d2734",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os; SUNO_API_KEY = os.environ['SUNO_API_KEY']  # never commit real keys; revoke the one previously pasted here\n",
+    "SUNO_BASE_URL=\"https://api.sunoapi.org\"\n",
+    "SUNO_TIMESTAPM_ROUTE = \"/api/v1/generate/get-timestamped-lyrics\"\n",
+    "SUNO_DETAIL_ROUTE = \"/api/v1/generate/record-info\"\n",
+    "suno_task_id = \"46bc90e6a2f9e9af58d7017e23f2115e\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "fe09b1d5-7198-4c40-9667-d7d0885c62a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "headers = {\n",
+    "    \"Authorization\": f\"Bearer {SUNO_API_KEY}\",\n",
+    "    \"Content-Type\": \"application/json\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "81bacedc-e488-4d04-84b1-8e8a06a64565",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "async def get_suno_audio_id_from_task_id(suno_task_id): # expire if db save audio id\n",
+    "    url = f\"{SUNO_BASE_URL}{SUNO_DETAIL_ROUTE}\"\n",
+    "    headers = {\"Authorization\": f\"Bearer {SUNO_API_KEY}\"}\n",
+    "    async with aiohttp.ClientSession() as session:\n",
+    "        async with session.get(url, headers=headers, params={\"taskId\" : suno_task_id}) as response:\n",
+    "            detail = await response.json()\n",
+    "            result = detail['data']['response']['sunoData'][0]['id']\n",
+    "    return result "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "26346a13-0663-489f-98d0-69743dd8553f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "async def get_suno_timestamp(suno_task_id, suno_audio_id): # expire if db save audio id\n",
+    "    url = f\"{SUNO_BASE_URL}{SUNO_TIMESTAPM_ROUTE}\"\n",
+    "    headers = {\"Authorization\": f\"Bearer {SUNO_API_KEY}\"}\n",
+    "    payload = {\n",
+    "        \"task_id\" : suno_task_id,\n",
+    "        \"audio_id\" : suno_audio_id\n",
+    "    }\n",
+    "    async with aiohttp.ClientSession() as session:\n",
+    "        async with session.post(url, headers=headers, data=json.dumps(payload)) as response:\n",
+    "            result = await response.json()\n",
+    "    return result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "78db0f6b-a54c-4415-9e82-972b00fefefb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = await get_suno_timestamp(suno_task_id, await get_suno_audio_id_from_task_id(suno_task_id))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "44d8da8e-5a67-4125-809f-bbdb1efba55f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gt_lyric = \"\"\"\n",
+    "---\n",
+    "스테이,머뭄의 추억을 담아  \n",
+    "군산에서의 여행을 떠나보세  \n",
+    "인스타 감성 가득한 사진같은 하루,  \n",
+    "힐링의 시간, 감성 숙소에서의 휴식\n",
+    "\n",
+    "은파호수공원의 자연 속,  \n",
+    "시간이 멈춘 듯한 절골길을 걸어봐요  \n",
+    "Instagram vibes, 그림 같은 힐링 장소,  \n",
+    "잊지 못할 여행 스토리 만들어지네\n",
+    "\n",
+    "넷이서 웃고 떠들던 그 날의 사진 속,  \n",
+    "그 순간 훌쩍 떠나볼까요, 새로운 길로  \n",
+    "스테이,머뭄이 준비한 특별한 여행지  \n",
+    "몸과 마음이 따뜻해지는 그런 곳이에요  \n",
+    "---\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "e4e9ba7d-964f-4f29-95f3-0f8514fad7ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lyric_line_list = gt_lyric.split(\"\\n\")\n",
+    "lyric_line_list = [lyric_line.strip(',. ') for lyric_line in lyric_line_list if lyric_line and lyric_line != \"---\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "84a64cd5-7374-4c33-8634-6ac6ed0de425",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['스테이,머뭄의 추억을 담아',\n",
+       " '군산에서의 여행을 떠나보세',\n",
+       " '인스타 감성 가득한 사진같은 하루',\n",
+       " '힐링의 시간, 감성 숙소에서의 휴식',\n",
+       " '은파호수공원의 자연 속',\n",
+       " '시간이 멈춘 듯한 절골길을 걸어봐요',\n",
+       " 'Instagram vibes, 그림 같은 힐링 장소',\n",
+       " '잊지 못할 여행 스토리 만들어지네',\n",
+       " '넷이서 웃고 떠들던 그 날의 사진 속',\n",
+       " '그 순간 훌쩍 떠나볼까요, 새로운 길로',\n",
+       " '스테이,머뭄이 준비한 특별한 여행지',\n",
+       " '몸과 마음이 따뜻해지는 그런 곳이에요']"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "lyric_line_list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17ca1a6e-c3a8-4683-958b-14bb3a46e63a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "matching = "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "a8df83b4-99ef-4751-8c98-e5423c5c2494",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "aligned_words = data['data']['alignedWords']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "c1a1b2be-0796-4e40-b8bf-cd7c08e81e3e",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "_IncompleteInputError",
+     "evalue": "incomplete input (2013651467.py, line 9)",
+     "output_type": "error",
+     "traceback": [
+      "  \u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[36]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[31m    \u001b[39m\n    ^\n\u001b[31m_IncompleteInputError\u001b[39m\u001b[31m:\u001b[39m incomplete input\n"
+     ]
+    }
+   ],
+   "source": [
+    "alignment_lyric = {}\n",
+    "lyric_index = 0\n",
+    "for aligned_word in aligned_words:\n",
+    "    if not aligned_word['success']:\n",
+    "        continue\n",
+    "    if aligned_word['word'] in lyric_line_list[lyric_index]:\n",
+    "        if lyric_index in alignment_lyric:\n",
+    "            raise Exception\n",
+    "        else:\n",
+    "            \n",
+    "        \n",
+    "            \n",
+    "            "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c59c4eb1-d916-4d3a-8d02-a212b45f20ba",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/poc/timestamp_lyric/lyric_timestamp_mapper.py
+++ b/poc/timestamp_lyric/lyric_timestamp_mapper.py
@ -0,0 +1,42 @@
+from openai import OpenAI
+from difflib import SequenceMatcher
+from dataclasses import dataclass
+from typing import List, Tuple
+import aiohttp, json
+
+
+@dataclass
+class TimestampedLyric:
+    text: str     # lyric text for this entry
+    start: float  # NOTE(review): presumably seconds; test_mapper.py treats start < 0 as a failed match - confirm
+    end: float    # NOTE(review): presumably seconds - confirm
+
+SUNO_BASE_URL="https://api.sunoapi.org"
+SUNO_TIMESTAMP_ROUTE = "/api/v1/generate/get-timestamped-lyrics"
+SUNO_DETAIL_ROUTE = "/api/v1/generate/record-info"
+
+class LyricTimestampMapper:
+    """Thin async client for the Suno record-info and timestamped-lyrics endpoints."""
+    suno_api_key : str
+
+    def __init__(self, suno_api_key):
+        self.suno_api_key = suno_api_key
+
+    async def get_suno_audio_id_from_task_id(self, suno_task_id): # expire if db save audio id
+        """Return the id of the first ``sunoData`` entry recorded for ``suno_task_id``."""
+        url = f"{SUNO_BASE_URL}{SUNO_DETAIL_ROUTE}"
+        headers = {"Authorization": f"Bearer {self.suno_api_key}"}
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url, headers=headers, params={"taskId" : suno_task_id}) as response:
+                detail = await response.json()
+        return detail['data']['response']['sunoData'][0]['id']
+
+    async def get_suno_timestamp(self, suno_task_id, suno_audio_id): # expire if db save audio id
+        """POST the task/audio ids and return the decoded JSON response."""
+        url = f"{SUNO_BASE_URL}{SUNO_TIMESTAMP_ROUTE}"
+        headers = {"Authorization": f"Bearer {self.suno_api_key}"}
+        payload = {"task_id" : suno_task_id, "audio_id" : suno_audio_id}
+        async with aiohttp.ClientSession() as session:
+            # json= serializes the payload and sets Content-Type: application/json.
+            async with session.post(url, headers=headers, json=payload) as response:
+                return await response.json()
--- a/poc/timestamp_lyric/test_mapper.py
+++ b/poc/timestamp_lyric/test_mapper.py
@ -0,0 +1,19 @@
+import os
+from lyric_timestamp_mapper import LyricTimestampMapper
+API_KEY = os.environ["OPENAI_API_KEY"]  # never commit credentials; the literal key previously here (sk-proj-...) must be revoked
+AUDIO_PATH = "test_audio.mp3"
+
+GROUND_TRUTH_LYRICS = [
+    "첫 번째 가사 라인입니다",
+    "두 번째 가사 라인입니다",
+    "세 번째 가사 라인입니다",
+]
+# NOTE(review): lyric_timestamp_mapper.py in this commit defines __init__(self, suno_api_key) and no map_ground_truth(); reconcile before running.
+mapper = LyricTimestampMapper(api_key=API_KEY)
+result = mapper.map_ground_truth(AUDIO_PATH, GROUND_TRUTH_LYRICS)
+
+for lyric in result:
+    if lyric.start >= 0:
+        print(f"[{lyric.start:.2f} - {lyric.end:.2f}] {lyric.text}")
+    else:
+        print(f"[매칭 실패] {lyric.text}")