Playwright 모듈 PoC 추가

insta
jaehwang 2026-01-12 11:01:03 +09:00
parent 56d4c690bf
commit b7edba8c80
5 changed files with 404 additions and 1 deletions

3
.gitignore vendored
View File

@ -27,3 +27,6 @@ build/
*.mp3
*.mp4
media/
*.ipynb_checkpoint*

View File

@ -0,0 +1,100 @@
import asyncio
from playwright.async_api import async_playwright
from urllib import parse
class nvMapPwScraper():
    """Naver Map scraper built on one shared headless Chromium (Playwright).

    The Playwright driver, the browser and the browser context live on the
    class and are shared by every instance; an instance only owns the page
    created by ``create_page``.  ``initiate_scraper`` must be awaited before
    the first instance is constructed, otherwise ``__init__`` raises.
    """

    # cls vars
    is_ready = False      # flipped to True at the end of initiate_scraper()
    _playwright = None    # Playwright driver handle
    _browser = None       # shared Chromium instance
    _context = None       # shared browser context; pages are created from it
    _win_width = 1280
    _win_height = 720

    @classmethod
    def default_context_builder(cls):
        """Return the keyword arguments passed to ``browser.new_context``.

        Viewport and screen are both set to the class window size; the user
        agent, locale and timezone are fixed values.
        """
        window_size = {
            'width': cls._win_width,
            'height': cls._win_height,
        }
        return {
            # Separate copies so mutating one entry never affects the other.
            'viewport': dict(window_size),
            'screen': dict(window_size),
            'user_agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
            'locale': 'ko-KR',
            'timezone_id': 'Asia/Seoul',
        }

    @classmethod
    async def initiate_scraper(cls):
        """Start Playwright, the browser and the context once, then mark ready.

        Each resource is only created when it is still unset, so awaiting this
        again after a successful start creates nothing new.
        """
        if not cls._playwright:
            cls._playwright = await async_playwright().start()
        if not cls._browser:
            cls._browser = await cls._playwright.chromium.launch(headless=True)
        if not cls._context:
            cls._context = await cls._browser.new_context(**cls.default_context_builder())
        cls.is_ready = True

    def __init__(self):
        """Refuse construction until ``initiate_scraper`` has completed.

        Raises:
            Exception: if the class-level ``is_ready`` flag is still False.
        """
        if not self.is_ready:
            raise Exception("nvMapScraper is not initiated")

    @staticmethod
    def _strip_bold(text):
        """Remove the literal ``<b>`` / ``</b>`` markers from *text*."""
        return text.replace("<b>", "").replace("</b>", "")

    async def create_page(self):
        """Open a page in the shared context and prepare it for scraping.

        Installs an init script that makes ``navigator.webdriver`` report
        ``false``, sets a ``sec-ch-ua`` header and navigates to google.com.
        """
        while not self.is_ready:
            # The sleep must be awaited: a bare asyncio.sleep(...) only creates
            # a coroutine object, so the loop would spin without ever yielding
            # to the event loop.  Poll once per second.
            await asyncio.sleep(1)
        self.page = await self._context.new_page()
        # The script wraps the native Navigator.prototype.webdriver getter in a
        # Proxy that still invokes the original getter but always returns false.
        await self.page.add_init_script(
            '''const defaultGetter = Object.getOwnPropertyDescriptor(
Navigator.prototype,
"webdriver"
).get;
defaultGetter.apply(navigator);
defaultGetter.toString();
Object.defineProperty(Navigator.prototype, "webdriver", {
set: undefined,
enumerable: true,
configurable: true,
get: new Proxy(defaultGetter, {
apply: (target, thisArg, args) => {
Reflect.apply(target, thisArg, args);
return false;
},
}),
});
const patchedGetter = Object.getOwnPropertyDescriptor(
Navigator.prototype,
"webdriver"
).get;
patchedGetter.apply(navigator);
patchedGetter.toString();''')
        await self.page.set_extra_http_headers({
            'sec-ch-ua': '\"Not?A_Brand\";v=\"99\", \"Chromium\";v=\"130\"'
        })
        # NOTE(review): presumably a warm-up navigation; confirm it is needed.
        await self.page.goto("http://google.com")

    async def goto_url(self, url):
        """Navigate this instance's page to *url*.

        Waits for ``domcontentloaded`` with a 20000 ms timeout.
        """
        page = self.page
        await page.goto(url, wait_until="domcontentloaded", timeout=20000)

    async def get_place_id_url(self, selected):
        """Search Naver Map for *selected* and return the resolved page URL.

        Args:
            selected: mapping with a ``'title'`` key and a ``'roadAddress'``
                and/or ``'address'`` key (presumably one search-result item;
                confirm against the caller).  ``<b>`` markers are stripped.

        Returns:
            The page URL once it contains ``isCorrectAnswer=true``.

        Raises:
            Exception: if the URL does not contain the marker within five
                one-second polls.
            KeyError: if ``'title'`` is missing, or neither address key exists.
        """
        title = self._strip_bold(selected['title'])
        # Only look up 'address' when 'roadAddress' is absent; passing
        # selected['address'] as the .get() default evaluated it eagerly and
        # raised KeyError for items that carry a road address only.
        if 'roadAddress' in selected:
            raw_address = selected['roadAddress']
        else:
            raw_address = selected['address']
        address = self._strip_bold(raw_address)

        encoded_query = parse.quote(f"{address} {title}")
        url = f"https://map.naver.com/p/search/{encoded_query}"
        await self.goto_url(url)

        # The page URL is polled because it changes after the initial load.
        for _ in range(5):
            if "isCorrectAnswer=true" in self.page.url:
                return self.page.url
            await asyncio.sleep(1)
        raise Exception("Failed to identify place id. item is ambiguous")

View File

@ -0,0 +1,239 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 26,
"id": "99398cc7-e36a-494c-88f9-b26874ff0294",
"metadata": {},
"outputs": [],
"source": [
"import aiohttp\n",
"import json"
]
},
{
 "cell_type": "code",
 "execution_count": 22,
 "id": "28c3e49b-1133-4a18-ab70-fd321b4d2734",
 "metadata": {},
 "outputs": [],
 "source": [
  "import os\n",
  "\n",
  "# The API key is read from the environment so that no credential is stored in the notebook.\n",
  "# A key that was previously committed here must be treated as leaked and rotated.\n",
  "SUNO_API_KEY = os.environ['SUNO_API_KEY']\n",
  "SUNO_BASE_URL=\"https://api.sunoapi.org\"\n",
  "SUNO_TIMESTAPM_ROUTE = \"/api/v1/generate/get-timestamped-lyrics\"\n",
  "SUNO_DETAIL_ROUTE = \"/api/v1/generate/record-info\"\n",
  "suno_task_id = \"46bc90e6a2f9e9af58d7017e23f2115e\"\n"
 ]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "fe09b1d5-7198-4c40-9667-d7d0885c62a3",
"metadata": {},
"outputs": [],
"source": [
"headers = {\n",
" \"Authorization\": f\"Bearer {SUNO_API_KEY}\",\n",
" \"Content-Type\": \"application/json\",\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "81bacedc-e488-4d04-84b1-8e8a06a64565",
"metadata": {},
"outputs": [],
"source": [
"async def get_suno_audio_id_from_task_id(suno_task_id): # expire if db save audio id\n",
" url = f\"{SUNO_BASE_URL}{SUNO_DETAIL_ROUTE}\"\n",
" headers = {\"Authorization\": f\"Bearer {SUNO_API_KEY}\"}\n",
" async with aiohttp.ClientSession() as session:\n",
" async with session.get(url, headers=headers, params={\"taskId\" : suno_task_id}) as response:\n",
" detail = await response.json()\n",
" result = detail['data']['response']['sunoData'][0]['id']\n",
" return result "
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "26346a13-0663-489f-98d0-69743dd8553f",
"metadata": {},
"outputs": [],
"source": [
"async def get_suno_timestamp(suno_task_id, suno_audio_id): # expire if db save audio id\n",
" url = f\"{SUNO_BASE_URL}{SUNO_TIMESTAPM_ROUTE}\"\n",
" headers = {\"Authorization\": f\"Bearer {SUNO_API_KEY}\"}\n",
" payload = {\n",
" \"task_id\" : suno_task_id,\n",
" \"audio_id\" : suno_audio_id\n",
" }\n",
" async with aiohttp.ClientSession() as session:\n",
" async with session.post(url, headers=headers, data=json.dumps(payload)) as response:\n",
" result = await response.json()\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "78db0f6b-a54c-4415-9e82-972b00fefefb",
"metadata": {},
"outputs": [],
"source": [
"data = await get_suno_timestamp(suno_task_id, await get_suno_audio_id_from_task_id(suno_task_id))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "44d8da8e-5a67-4125-809f-bbdb1efba55f",
"metadata": {},
"outputs": [],
"source": [
"gt_lyric = \"\"\"\n",
"---\n",
"스테이,머뭄의 추억을 담아 \n",
"군산에서의 여행을 떠나보세 \n",
"인스타 감성 가득한 사진같은 하루, \n",
"힐링의 시간, 감성 숙소에서의 휴식\n",
"\n",
"은파호수공원의 자연 속, \n",
"시간이 멈춘 듯한 절골길을 걸어봐요 \n",
"Instagram vibes, 그림 같은 힐링 장소, \n",
"잊지 못할 여행 스토리 만들어지네\n",
"\n",
"넷이서 웃고 떠들던 그 날의 사진 속, \n",
"그 순간 훌쩍 떠나볼까요, 새로운 길로 \n",
"스테이,머뭄이 준비한 특별한 여행지 \n",
"몸과 마음이 따뜻해지는 그런 곳이에요 \n",
"---\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "e4e9ba7d-964f-4f29-95f3-0f8514fad7ee",
"metadata": {},
"outputs": [],
"source": [
"lyric_line_list = gt_lyric.split(\"\\n\")\n",
"lyric_line_list = [lyric_line.strip(',. ') for lyric_line in lyric_line_list if lyric_line and lyric_line != \"---\"]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "84a64cd5-7374-4c33-8634-6ac6ed0de425",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['스테이,머뭄의 추억을 담아',\n",
" '군산에서의 여행을 떠나보세',\n",
" '인스타 감성 가득한 사진같은 하루',\n",
" '힐링의 시간, 감성 숙소에서의 휴식',\n",
" '은파호수공원의 자연 속',\n",
" '시간이 멈춘 듯한 절골길을 걸어봐요',\n",
" 'Instagram vibes, 그림 같은 힐링 장소',\n",
" '잊지 못할 여행 스토리 만들어지네',\n",
" '넷이서 웃고 떠들던 그 날의 사진 속',\n",
" '그 순간 훌쩍 떠나볼까요, 새로운 길로',\n",
" '스테이,머뭄이 준비한 특별한 여행지',\n",
" '몸과 마음이 따뜻해지는 그런 곳이에요']"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lyric_line_list"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "17ca1a6e-c3a8-4683-958b-14bb3a46e63a",
"metadata": {},
"outputs": [],
"source": [
"matching = "
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "a8df83b4-99ef-4751-8c98-e5423c5c2494",
"metadata": {},
"outputs": [],
"source": [
"aligned_words = data['data']['alignedWords']"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "c1a1b2be-0796-4e40-b8bf-cd7c08e81e3e",
"metadata": {},
"outputs": [
{
"ename": "_IncompleteInputError",
"evalue": "incomplete input (2013651467.py, line 9)",
"output_type": "error",
"traceback": [
" \u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[36]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[31m \u001b[39m\n ^\n\u001b[31m_IncompleteInputError\u001b[39m\u001b[31m:\u001b[39m incomplete input\n"
]
}
],
"source": [
"alignment_lyric = {}\n",
"lyric_index = 0 \n",
"for aligned_word in aligned_words:\n",
" if not aligned_word['succsess']:\n",
" continue\n",
" if aligned_word['word'] in lyric_line_list[lyric_index]:\n",
" if lyric_index in alignment_lyric:\n",
" raise Exception\n",
" else:\n",
" \n",
" \n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c59c4eb1-d916-4d3a-8d02-a212b45f20ba",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,42 @@
from openai import OpenAI
from difflib import SequenceMatcher
from dataclasses import dataclass
from typing import List, Tuple
import aiohttp, json
@dataclass
class TimestampedLyric:
    """One lyric line together with the time span it was matched to."""

    # The lyric text of this line.
    text: str
    # Start of the matched span (presumably seconds into the audio - confirm).
    start: float
    # End of the matched span, in the same unit as ``start``.
    end: float
# Host of the Suno API; the route constants below are appended to it verbatim.
SUNO_BASE_URL = "https://api.sunoapi.org"
# Route queried (GET) with a task id to fetch the generation record.
SUNO_DETAIL_ROUTE = "/api/v1/generate/record-info"
# Route called (POST) to fetch word-level timestamps for a generated track.
SUNO_TIMESTAMP_ROUTE = "/api/v1/generate/get-timestamped-lyrics"
class LyricTimestampMapper:
    """Async client for the two Suno endpoints used to time-align lyrics.

    Every request opens its own ``aiohttp.ClientSession`` and authenticates
    with the bearer token given to the constructor.
    """

    suno_api_key : str  # bearer token sent in the Authorization header

    def __init__(self, suno_api_key):
        """Store the Suno API key used for every request."""
        self.suno_api_key = suno_api_key

    def _auth_headers(self):
        """Return the Authorization header shared by all requests."""
        return {"Authorization": f"Bearer {self.suno_api_key}"}

    async def get_suno_audio_id_from_task_id(self, suno_task_id):
        """Return the id of the first track recorded for *suno_task_id*.

        Sends a GET to the record-info route with ``taskId`` as a query
        parameter and reads ``data.response.sunoData[0].id`` from the reply.

        NOTE(review): the original comment said "expire if db save audio id";
        presumably this lookup goes away once the audio id is stored in the
        database. Confirm with the author.

        Raises:
            KeyError, IndexError, TypeError: if the reply does not have the
                expected structure.
        """
        url = f"{SUNO_BASE_URL}{SUNO_DETAIL_ROUTE}"
        async with aiohttp.ClientSession() as session:
            async with session.get(
                url,
                headers=self._auth_headers(),
                params={"taskId": suno_task_id},
            ) as response:
                detail = await response.json()
        return detail['data']['response']['sunoData'][0]['id']

    async def get_suno_timestamp(self, suno_task_id, suno_audio_id):
        """Fetch the timestamped lyrics for one generated track.

        Posts ``task_id`` and ``audio_id`` as a JSON body to the
        timestamped-lyrics route and returns the decoded JSON reply as is.

        NOTE(review): the original comment said "expire if db save audio id";
        meaning unclear. Confirm with the author.
        """
        url = f"{SUNO_BASE_URL}{SUNO_TIMESTAMP_ROUTE}"
        payload = {
            "task_id": suno_task_id,
            "audio_id": suno_audio_id,
        }
        async with aiohttp.ClientSession() as session:
            # json= serialises the payload and sets Content-Type to
            # application/json; data=json.dumps(...) sent the same bytes
            # labelled as text/plain.
            async with session.post(
                url,
                headers=self._auth_headers(),
                json=payload,
            ) as response:
                return await response.json()

View File

@ -0,0 +1,19 @@
import os

from lyric_timestamp_mapper import LyricTimestampMapper

# SECURITY: the API key is taken from the environment instead of being
# hard-coded. A key literal that was ever committed here must be treated as
# leaked and revoked.
API_KEY = os.environ.get("LYRIC_MAPPER_API_KEY")
if not API_KEY:
    raise SystemExit("Set the LYRIC_MAPPER_API_KEY environment variable before running this script")

AUDIO_PATH = "test_audio.mp3"
GROUND_TRUTH_LYRICS = [
    "첫 번째 가사 라인입니다",
    "두 번째 가사 라인입니다",
    "세 번째 가사 라인입니다",
]

# NOTE(review): this script expects a LyricTimestampMapper that accepts an
# `api_key` keyword and offers a synchronous `map_ground_truth(audio_path,
# lyrics)` method. Confirm that the mapper module it is run against provides
# that interface.
mapper = LyricTimestampMapper(api_key=API_KEY)
result = mapper.map_ground_truth(AUDIO_PATH, GROUND_TRUTH_LYRICS)

for lyric in result:
    # A negative start is reported as a lyric line that could not be matched.
    if lyric.start >= 0:
        print(f"[{lyric.start:.2f} - {lyric.end:.2f}] {lyric.text}")
    else:
        print(f"[매칭 실패] {lyric.text}")