diff --git a/.gitignore b/.gitignore
index 9d68df5..2de45be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,4 +26,7 @@ build/
# Media files
*.mp3
*.mp4
-media/
\ No newline at end of file
+media/
+
+
+*.ipynb_checkpoint*
\ No newline at end of file
diff --git a/poc/crawling/nvMapPwScraper.py b/poc/crawling/nvMapPwScraper.py
new file mode 100644
index 0000000..b9d4662
--- /dev/null
+++ b/poc/crawling/nvMapPwScraper.py
@@ -0,0 +1,100 @@
+import asyncio
+from playwright.async_api import async_playwright
+from urllib import parse
+
+class nvMapPwScraper():
+ # cls vars
+ is_ready = False
+ _playwright = None
+ _browser = None
+ _context = None
+ _win_width = 1280
+ _win_height = 720
+
+ @classmethod
+ def default_context_builder(cls):
+ context_builder_dict = {}
+ context_builder_dict['viewport'] = {
+ 'width' : cls._win_width,
+ 'height' : cls._win_height
+ }
+ context_builder_dict['screen'] = {
+ 'width' : cls._win_width,
+ 'height' : cls._win_height
+ }
+ context_builder_dict['user_agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
+ context_builder_dict['locale'] = 'ko-KR'
+ context_builder_dict['timezone_id']='Asia/Seoul'
+
+ return context_builder_dict
+
+ @classmethod
+ async def initiate_scraper(cls):
+ if not cls._playwright:
+ cls._playwright = await async_playwright().start()
+ if not cls._browser:
+ cls._browser = await cls._playwright.chromium.launch(headless=True)
+ if not cls._context:
+ cls._context = await cls._browser.new_context(**cls.default_context_builder())
+ cls.is_ready = True
+
+ def __init__(self):
+ if not self.is_ready:
+ raise Exception("nvMapScraper is not initiated")
+
+
+ async def create_page(self):
+ while(not self.is_ready):
+ asyncio.sleep(1000)
+ self.page = await self._context.new_page()
+ await self.page.add_init_script(
+'''const defaultGetter = Object.getOwnPropertyDescriptor(
+ Navigator.prototype,
+ "webdriver"
+).get;
+defaultGetter.apply(navigator);
+defaultGetter.toString();
+Object.defineProperty(Navigator.prototype, "webdriver", {
+ set: undefined,
+ enumerable: true,
+ configurable: true,
+ get: new Proxy(defaultGetter, {
+ apply: (target, thisArg, args) => {
+ Reflect.apply(target, thisArg, args);
+ return false;
+ },
+ }),
+});
+const patchedGetter = Object.getOwnPropertyDescriptor(
+ Navigator.prototype,
+ "webdriver"
+).get;
+patchedGetter.apply(navigator);
+patchedGetter.toString();''')
+
+ await self.page.set_extra_http_headers({
+ 'sec-ch-ua': '\"Not?A_Brand\";v=\"99\", \"Chromium\";v=\"130\"'
+ })
+ await self.page.goto("http://google.com")
+
+ async def goto_url(self, url):
+ page = self.page
+ await page.goto(url, wait_until="domcontentloaded", timeout=20000)
+
+ async def get_place_id_url(self, selected):
+
+ title = selected['title'].replace("", "").replace("", "")
+ address = selected.get('roadAddress', selected['address']).replace("", "").replace("", "")
+ encoded_query = parse.quote(f"{address} {title}")
+ url = f"https://map.naver.com/p/search/{encoded_query}"
+
+ await self.goto_url(url)
+
+ count = 0
+ while(count < 5):
+ if "isCorrectAnswer=true" in self.page.url:
+ return self.page.url
+ await asyncio.sleep(1)
+ count += 1
+
+ raise Exception("Failed to identify place id. item is ambiguous")
\ No newline at end of file
diff --git a/poc/timestamp_lyric/Untitled.ipynb b/poc/timestamp_lyric/Untitled.ipynb
new file mode 100644
index 0000000..38e1036
--- /dev/null
+++ b/poc/timestamp_lyric/Untitled.ipynb
@@ -0,0 +1,239 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "99398cc7-e36a-494c-88f9-b26874ff0294",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import aiohttp\n",
+ "import json"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "28c3e49b-1133-4a18-ab70-fd321b4d2734",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\nSUNO_API_KEY = os.environ[\"SUNO_API_KEY\"]\n",
+ "SUNO_BASE_URL=\"https://api.sunoapi.org\"\n",
+ "SUNO_TIMESTAPM_ROUTE = \"/api/v1/generate/get-timestamped-lyrics\"\n",
+ "SUNO_DETAIL_ROUTE = \"/api/v1/generate/record-info\"\n",
+ "suno_task_id = \"46bc90e6a2f9e9af58d7017e23f2115e\"\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "fe09b1d5-7198-4c40-9667-d7d0885c62a3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "headers = {\n",
+ " \"Authorization\": f\"Bearer {SUNO_API_KEY}\",\n",
+ " \"Content-Type\": \"application/json\",\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "81bacedc-e488-4d04-84b1-8e8a06a64565",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "async def get_suno_audio_id_from_task_id(suno_task_id): # expire if db save audio id\n",
+ " url = f\"{SUNO_BASE_URL}{SUNO_DETAIL_ROUTE}\"\n",
+ " headers = {\"Authorization\": f\"Bearer {SUNO_API_KEY}\"}\n",
+ " async with aiohttp.ClientSession() as session:\n",
+ " async with session.get(url, headers=headers, params={\"taskId\" : suno_task_id}) as response:\n",
+ " detail = await response.json()\n",
+ " result = detail['data']['response']['sunoData'][0]['id']\n",
+ " return result "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "26346a13-0663-489f-98d0-69743dd8553f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "async def get_suno_timestamp(suno_task_id, suno_audio_id): # expire if db save audio id\n",
+ " url = f\"{SUNO_BASE_URL}{SUNO_TIMESTAPM_ROUTE}\"\n",
+ " headers = {\"Authorization\": f\"Bearer {SUNO_API_KEY}\"}\n",
+ " payload = {\n",
+ " \"task_id\" : suno_task_id,\n",
+ " \"audio_id\" : suno_audio_id\n",
+ " }\n",
+ " async with aiohttp.ClientSession() as session:\n",
+ " async with session.post(url, headers=headers, data=json.dumps(payload)) as response:\n",
+ " result = await response.json()\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "78db0f6b-a54c-4415-9e82-972b00fefefb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = await get_suno_timestamp(suno_task_id, await get_suno_audio_id_from_task_id(suno_task_id))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "44d8da8e-5a67-4125-809f-bbdb1efba55f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gt_lyric = \"\"\"\n",
+ "---\n",
+ "스테이,머뭄의 추억을 담아 \n",
+ "군산에서의 여행을 떠나보세 \n",
+ "인스타 감성 가득한 사진같은 하루, \n",
+ "힐링의 시간, 감성 숙소에서의 휴식\n",
+ "\n",
+ "은파호수공원의 자연 속, \n",
+ "시간이 멈춘 듯한 절골길을 걸어봐요 \n",
+ "Instagram vibes, 그림 같은 힐링 장소, \n",
+ "잊지 못할 여행 스토리 만들어지네\n",
+ "\n",
+ "넷이서 웃고 떠들던 그 날의 사진 속, \n",
+ "그 순간 훌쩍 떠나볼까요, 새로운 길로 \n",
+ "스테이,머뭄이 준비한 특별한 여행지 \n",
+ "몸과 마음이 따뜻해지는 그런 곳이에요 \n",
+ "---\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "e4e9ba7d-964f-4f29-95f3-0f8514fad7ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lyric_line_list = gt_lyric.split(\"\\n\")\n",
+ "lyric_line_list = [lyric_line.strip(',. ') for lyric_line in lyric_line_list if lyric_line and lyric_line != \"---\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "84a64cd5-7374-4c33-8634-6ac6ed0de425",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['스테이,머뭄의 추억을 담아',\n",
+ " '군산에서의 여행을 떠나보세',\n",
+ " '인스타 감성 가득한 사진같은 하루',\n",
+ " '힐링의 시간, 감성 숙소에서의 휴식',\n",
+ " '은파호수공원의 자연 속',\n",
+ " '시간이 멈춘 듯한 절골길을 걸어봐요',\n",
+ " 'Instagram vibes, 그림 같은 힐링 장소',\n",
+ " '잊지 못할 여행 스토리 만들어지네',\n",
+ " '넷이서 웃고 떠들던 그 날의 사진 속',\n",
+ " '그 순간 훌쩍 떠나볼까요, 새로운 길로',\n",
+ " '스테이,머뭄이 준비한 특별한 여행지',\n",
+ " '몸과 마음이 따뜻해지는 그런 곳이에요']"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "lyric_line_list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "17ca1a6e-c3a8-4683-958b-14bb3a46e63a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "matching = "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "a8df83b4-99ef-4751-8c98-e5423c5c2494",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "aligned_words = data['data']['alignedWords']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "c1a1b2be-0796-4e40-b8bf-cd7c08e81e3e",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "_IncompleteInputError",
+ "evalue": "incomplete input (2013651467.py, line 9)",
+ "output_type": "error",
+ "traceback": [
+ " \u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[36]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[31m \u001b[39m\n ^\n\u001b[31m_IncompleteInputError\u001b[39m\u001b[31m:\u001b[39m incomplete input\n"
+ ]
+ }
+ ],
+ "source": [
+ "alignment_lyric = {}\n",
+ "lyric_index = 0 \n",
+ "for aligned_word in aligned_words:\n",
+ " if not aligned_word['succsess']:\n",
+ " continue\n",
+ " if aligned_word['word'] in lyric_line_list[lyric_index]:\n",
+ " if lyric_index in alignment_lyric:\n",
+ " raise Exception\n",
+ " else:\n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c59c4eb1-d916-4d3a-8d02-a212b45f20ba",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/poc/timestamp_lyric/lyric_timestamp_mapper.py b/poc/timestamp_lyric/lyric_timestamp_mapper.py
new file mode 100644
index 0000000..03713ad
--- /dev/null
+++ b/poc/timestamp_lyric/lyric_timestamp_mapper.py
@@ -0,0 +1,42 @@
+from openai import OpenAI
+from difflib import SequenceMatcher
+from dataclasses import dataclass
+from typing import List, Tuple
+import aiohttp, json
+
+
+@dataclass
+class TimestampedLyric:
+ text: str
+ start: float
+ end: float
+
+SUNO_BASE_URL="https://api.sunoapi.org"
+SUNO_TIMESTAMP_ROUTE = "/api/v1/generate/get-timestamped-lyrics"
+SUNO_DETAIL_ROUTE = "/api/v1/generate/record-info"
+
+class LyricTimestampMapper:
+ suno_api_key : str
+ def __init__(self, suno_api_key):
+ self.suno_api_key = suno_api_key
+
+ async def get_suno_audio_id_from_task_id(self, suno_task_id): # expire if db save audio id
+ url = f"{SUNO_BASE_URL}{SUNO_DETAIL_ROUTE}"
+ headers = {"Authorization": f"Bearer {self.suno_api_key}"}
+ async with aiohttp.ClientSession() as session:
+ async with session.get(url, headers=headers, params={"taskId" : suno_task_id}) as response:
+ detail = await response.json()
+ result = detail['data']['response']['sunoData'][0]['id']
+ return result
+
+ async def get_suno_timestamp(self, suno_task_id, suno_audio_id): # expire if db save audio id
+ url = f"{SUNO_BASE_URL}{SUNO_TIMESTAMP_ROUTE}"
+ headers = {"Authorization": f"Bearer {self.suno_api_key}"}
+ payload = {
+ "task_id" : suno_task_id,
+ "audio_id" : suno_audio_id
+ }
+ async with aiohttp.ClientSession() as session:
+ async with session.post(url, headers=headers, data=json.dumps(payload)) as response:
+ result = await response.json()
+ return result
\ No newline at end of file
diff --git a/poc/timestamp_lyric/test_mapper.py b/poc/timestamp_lyric/test_mapper.py
new file mode 100644
index 0000000..2a27f6b
--- /dev/null
+++ b/poc/timestamp_lyric/test_mapper.py
@@ -0,0 +1,19 @@
+from lyric_timestamp_mapper import LyricTimestampMapper
+
+API_KEY = "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"
+AUDIO_PATH = "test_audio.mp3"
+
+GROUND_TRUTH_LYRICS = [
+ "첫 번째 가사 라인입니다",
+ "두 번째 가사 라인입니다",
+ "세 번째 가사 라인입니다",
+]
+
+mapper = LyricTimestampMapper(api_key=API_KEY)
+result = mapper.map_ground_truth(AUDIO_PATH, GROUND_TRUTH_LYRICS)
+
+for lyric in result:
+ if lyric.start >= 0:
+ print(f"[{lyric.start:.2f} - {lyric.end:.2f}] {lyric.text}")
+ else:
+ print(f"[매칭 실패] {lyric.text}")