crawler operating in local

main
jaehwang 2025-09-26 09:42:35 +09:00
parent 56682fa54e
commit 67fa23a53e
7 changed files with 518 additions and 6 deletions

29
database/common.py Normal file
View File

@ -0,0 +1,29 @@
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
from sqlalchemy.orm import declarative_base, sessionmaker
import os

# Runtime environment name; "dev" turns on SQL statement echoing below.
# Overridable through the ENVIRONMENT variable, defaulting to the value that
# used to be hard-coded here.
ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")

# Connection URL, e.g. "postgresql://user:password@host:5432/dbname".
# Read from the DATABASE_URL variable so that the module can be configured
# without editing the source; the fallback is the previous empty placeholder,
# which create_async_engine() cannot parse.
DATABASE_URL = os.getenv("DATABASE_URL", "")

# Async engine. A plain "postgresql://" scheme is rewritten once so that the
# asyncpg driver is selected.
engine = create_async_engine(
    DATABASE_URL.replace("postgresql://", "postgresql+asyncpg://", 1),
    echo=ENVIRONMENT == "dev",
    future=True,
)

# Session factory producing AsyncSession objects bound to the engine.
# expire_on_commit=False keeps loaded attributes readable after a commit.
AsyncSessionLocal = sessionmaker(
    engine,
    class_=AsyncSession,
    expire_on_commit=False,
)

# Declarative base class for ORM models.
Base = declarative_base()
# Database session dependency.
async def get_db():
    """Yield one ``AsyncSession`` and close it once the consumer is done.

    Written as an async generator; presumably consumed as a framework
    dependency (e.g. FastAPI ``Depends``) -- TODO confirm against callers.
    """
    db_session = AsyncSessionLocal()
    async with db_session:
        try:
            yield db_session
        finally:
            # Explicit close kept from the original, in addition to the
            # cleanup performed when the ``async with`` block exits.
            await db_session.close()

396
database/models.py Normal file
View File

@ -0,0 +1,396 @@
from sqlalchemy import Column, Integer, String, DateTime, Text, Boolean, ForeignKey, JSON, Float, BigInteger, Date
from sqlalchemy.orm import relationship
from sqlalchemy.sql import func
from sqlalchemy.dialects.postgresql import UUID
from app.database import Base
import uuid
class Company(Base):
    """Company model: an organisation that users can be attached to."""

    __tablename__ = "company"

    company_id = Column(Integer, primary_key=True, index=True)
    company_name = Column(String, nullable=True)

    # Relationships
    users = relationship("User", back_populates="company")
class User(Base):
    """User model: account credentials, profile fields and company link."""

    __tablename__ = "users"

    id = Column(Integer, primary_key=True, index=True)
    username = Column(String, unique=True, index=True, nullable=True)  # username field added
    email = Column(String, unique=True, index=True, nullable=False)
    hashed_password = Column(String, nullable=False)
    is_active = Column(Boolean, default=True)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), onupdate=func.now())
    name = Column(String, nullable=False)  # name field restored
    refresh_token = Column(String, nullable=True)
    company_id = Column(Integer, ForeignKey("company.company_id"), nullable=True)
    address = Column(String, nullable=True)
    description = Column(String, nullable=True)
    role = Column(String, nullable=True)  # user, marketer, franchise owner, administrator

    # Relationships
    company = relationship("Company", back_populates="users")
    projects = relationship("Project", back_populates="owner")
class AdStore(Base):
    """Advertised store model: a store's profile data used for ads."""

    __tablename__ = "ad_store"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("users.id"), nullable=True)
    name = Column(String, nullable=True)
    category = Column(String, nullable=True)
    site_address = Column(String, nullable=True)
    road_address = Column(String, nullable=True)
    phone_number = Column(String, nullable=True)
    naver_place_url = Column(String, nullable=True)
    hashtags = Column(JSON, nullable=False, default=list)
    description = Column(Text, nullable=True)
    image_urls = Column(JSON, nullable=True)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    found_year = Column(Integer, nullable=True)
    business_hours = Column(String, nullable=True)  # opening/closing times, weekends, days off
    employ_count = Column(Integer, nullable=True)  # number of store employees
    specialties = Column(String, nullable=True)  # business specialties

    # Relationships (each backref adds the reverse attribute on the target)
    user = relationship("User", backref="ad_stores")
    projects = relationship("Project", backref="ad_store")
class Preset(Base):
    """Preset model: a user-owned, reusable bundle of project settings."""

    __tablename__ = "preset"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("users.id"), nullable=False)
    name = Column(String, nullable=True)
    category = Column(String, nullable=True)
    description = Column(String, nullable=True)
    favorite = Column(Boolean, default=False)
    hashtag = Column(JSON, nullable=True)
    image_urls = Column(JSON, nullable=True)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    last_used_at = Column(DateTime(timezone=True), nullable=True)
    title = Column(String, nullable=True)

    # Relationships
    user = relationship("User", backref="presets")
    projects = relationship("Project", back_populates="preset")
class Template(Base):
    """Template model: a template that projects can reference."""

    __tablename__ = "template"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("users.id"), nullable=True)
    name = Column(String, nullable=True)
    description = Column(String, nullable=True)
    thumbnail = Column(String, nullable=True)
    template_url = Column(String, nullable=True)
    category = Column(String, nullable=True)
    rating = Column(Float, nullable=True)  # PostgreSQL "real" maps to Float
    # NOTE: "favortie" is misspelled in the database column itself, so the
    # attribute keeps the same spelling.
    favortie = Column(Boolean, default=False)
    hashtags = Column(JSON, nullable=True)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    usage_count = Column(Integer, nullable=True)

    # Relationships
    user = relationship("User", backref="templates")
    projects = relationship("Project", back_populates="template")
class Project(Base):
    """Project model: one crawl-to-video job and its related records."""

    __tablename__ = "projects"

    id = Column(Integer, primary_key=True, index=True)
    title = Column(String, nullable=False)
    description = Column(Text)
    target_url = Column(String, nullable=False)  # URL to crawl
    status = Column(String, default="created")  # created, crawling, processing, completed, failed
    owner_id = Column(Integer, ForeignKey("users.id"))
    created_at = Column(DateTime(timezone=True), server_default=func.now())

    # Template, store and preset references (from database)
    template_id = Column(Integer, ForeignKey("template.id"), nullable=True)
    store_id = Column(Integer, ForeignKey("ad_store.id"), nullable=True)
    preset_id = Column(Integer, ForeignKey("preset.id"), nullable=True)

    # YouTube-related fields
    youtube_video_id = Column(String(50), nullable=True)
    youtube_url = Column(String(255), nullable=True)
    youtube_upload_status = Column(String(50), nullable=True)
    youtube_privacy_status = Column(String(20), nullable=True)
    updated_at = Column(DateTime(timezone=True), onupdate=func.now())

    # Relationships (uselist=False marks the one-to-one links)
    owner = relationship("User", back_populates="projects")
    template = relationship("Template", back_populates="projects")
    preset = relationship("Preset", back_populates="projects")
    crawl_data = relationship("CrawlData", back_populates="project", uselist=False)
    video = relationship("Video", back_populates="project", uselist=False)
    generated_content = relationship("GeneratedContent", back_populates="project", uselist=False)
    task_statuses = relationship("TaskStatus", back_populates="project")
class CrawlData(Base):
    """Crawl data model: everything collected for one project."""

    __tablename__ = "crawl_data"

    id = Column(Integer, primary_key=True, index=True)
    project_id = Column(Integer, ForeignKey("projects.id"))

    # Naver Map data
    business_name = Column(String)
    address = Column(String)
    phone = Column(String)
    business_type = Column(String)
    coordinates = Column(JSON)  # {"lat": float, "lng": float}

    # Additional place information (crawler4)
    business_hours = Column(String)  # business hours
    homepage = Column(String)  # homepage URL
    place_description = Column(Text)  # place description

    # Blog data
    blog_urls = Column(JSON)  # list of collected blog URLs
    blog_content = Column(Text)  # extracted blog content
    advertising_keywords = Column(JSON)  # list of advertising keywords

    # Detailed blog information (crawler4)
    blog_details = Column(JSON)  # stores the whole detailed_blogs payload
    map_blog_reviews = Column(JSON)  # map blog reviews

    # Image data
    image_urls = Column(JSON)  # list of collected image URLs
    stored_images = Column(JSON)  # list of stored image file paths

    # Crawl metadata
    crawl_metadata = Column(JSON)  # statistics, collection time, etc.
    created_at = Column(DateTime(timezone=True), server_default=func.now())

    # Relationships
    project = relationship("Project", back_populates="crawl_data")
class GeneratedContent(Base):
    """Generated content model: lyrics and BGM produced for a project."""

    __tablename__ = "generated_content"

    id = Column(Integer, primary_key=True, index=True)
    project_id = Column(Integer, ForeignKey("projects.id"))

    # Lyrics data
    lyrics = Column(Text)
    lyrics_prompt = Column(Text)  # prompt that was used
    lyrics_generation_time = Column(Integer, default=0)  # lyrics generation time (seconds)
    lyrics_token_count = Column(Integer, default=0)  # number of tokens used
    lyrics_model_used = Column(String, default="gpt-3.5-turbo")  # model that was used

    # BGM data
    bgm_url = Column(String)
    bgm_file_path = Column(String)  # path of the stored BGM file
    bgm_duration = Column(Float, default=30.0)  # BGM length (seconds)
    bgm_generation_time = Column(Integer, default=0)  # BGM generation time (seconds)
    bgm_model_used = Column(String, default="mureka-v6")  # model that was used
    created_at = Column(DateTime(timezone=True), server_default=func.now())

    # Relationships
    project = relationship("Project", back_populates="generated_content")
class Video(Base):
    """Video model: the rendered video file belonging to a project."""

    __tablename__ = "videos"

    id = Column(Integer, primary_key=True, index=True)
    project_id = Column(Integer, ForeignKey("projects.id"))

    # Video file information
    file_path = Column(String, nullable=False)
    file_size = Column(Integer)  # in bytes
    duration = Column(Integer)  # in seconds
    resolution = Column(String)  # "1920x1080" format

    # Generation settings
    template_type = Column(String, default="default")
    generation_settings = Column(JSON)  # video generation settings

    # Status
    status = Column(String, default="processing")  # processing, completed, failed
    error_message = Column(Text)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    completed_at = Column(DateTime(timezone=True))

    # Relationships
    project = relationship("Project", back_populates="video")
class TaskStatus(Base):
    """Task status tracking model: progress of one background task."""

    __tablename__ = "task_status"

    id = Column(Integer, primary_key=True, index=True)
    task_id = Column(String, unique=True, index=True, nullable=True)  # Celery task ID
    project_id = Column(Integer, ForeignKey("projects.id"), nullable=True)
    task_type = Column(String, nullable=True)  # crawling, content_generation, video_generation
    status = Column(String, nullable=True)  # pending, started, progress, success, failure
    progress = Column(Integer, nullable=True)  # 0-100
    current_step = Column(String, nullable=True)
    result = Column(JSON, nullable=True)
    error_message = Column(Text, nullable=True)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    # No onupdate hook on this column, unlike updated_at on other models here.
    updated_at = Column(DateTime(timezone=True), nullable=True)

    # Relationships
    project = relationship("Project", back_populates="task_statuses")
class UserActivityLog(Base):
    """User activity log model: one row per recorded user action."""

    __tablename__ = "user_activity_logs"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("users.id"))
    action = Column(String, nullable=False)  # login, logout, create_project, etc.
    details = Column(JSON)  # additional information
    ip_address = Column(String)
    user_agent = Column(String)
    created_at = Column(DateTime(timezone=True), server_default=func.now())

    # Relationships (one-directional: no reverse attribute is added to User)
    user = relationship("User")
class SystemStatistics(Base):
    """System statistics model: aggregate counters, unique per ``stat_date``."""

    __tablename__ = "system_statistics"

    id = Column(Integer, primary_key=True, index=True)
    stat_date = Column(Date, unique=True, nullable=False)

    # User statistics
    total_users = Column(Integer, nullable=True)
    active_users = Column(Integer, nullable=True)
    new_users = Column(Integer, nullable=True)

    # Project statistics
    total_projects = Column(Integer, nullable=True)
    completed_projects = Column(Integer, nullable=True)
    failed_projects = Column(Integer, nullable=True)

    # Video statistics
    total_videos = Column(Integer, nullable=True)
    total_video_size = Column(BigInteger, nullable=True)  # bytes
    new_videos = Column(Integer, nullable=True)  # videos newly created this month

    # Template statistics
    total_templates = Column(Integer, nullable=True)  # total number of templates

    # Performance statistics
    avg_crawling_time = Column(Float, nullable=True)  # seconds
    avg_content_generation_time = Column(Float, nullable=True)  # seconds
    avg_video_rendering_time = Column(Float, nullable=True)  # seconds

    # Growth-rate statistics (monthly)
    user_growth_rate = Column(Float, nullable=True)  # user growth, monthly
    template_growth_rate = Column(Float, nullable=True)  # template creation growth rate, monthly
    video_growth_rate = Column(Float, nullable=True)  # video creation growth rate, monthly
    media_growth_rate = Column(Float, nullable=True)  # media asset growth rate, monthly

    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), onupdate=func.now())
class SystemUsage(Base):
    """System usage model: storage consumption, unique per ``stat_date``."""

    __tablename__ = "system_usage"

    id = Column(Integer, primary_key=True, index=True)
    stat_date = Column(Date, unique=True, nullable=False)

    # Storage usage (in gigabytes)
    image_usage = Column(Float, nullable=True)  # image storage usage, GB
    video_usage = Column(Float, nullable=True)  # video storage usage, GB
    music_usage = Column(Float, nullable=True)  # music storage usage, GB
    extra_usage = Column(Float, nullable=True)  # other storage usage, GB

    created_at = Column(DateTime(timezone=True), server_default=func.now())
class SystemAvailability(Base):
    """System availability model: uptime figures, unique per ``stat_date``."""

    __tablename__ = "system_availability"

    id = Column(Integer, primary_key=True, index=True)
    stat_date = Column(Date, unique=True, nullable=False)

    # Availability metrics in percent, 0-100. The original note mentions
    # smallint, while the columns are declared as Integer here.
    ai_availability = Column(Integer, nullable=True)  # AI engine uptime
    dbase_availability = Column(Integer, nullable=True)  # database uptime
    storage_availability = Column(Integer, nullable=True)  # storage server uptime
    renderer_availability = Column(Integer, nullable=True)  # video renderer uptime

    created_at = Column(DateTime(timezone=True), server_default=func.now())
class MediaAsset(Base):
    """Media asset model: a stored media file and its descriptive fields."""

    __tablename__ = "media_assets"
    # Mapper option carried over unchanged from the original definition.
    __mapper_args__ = {"confirm_deleted_rows": False}

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    type = Column(String, nullable=False)

    # Size, dimensions and duration are each kept both as a string column
    # and as numeric columns.
    size = Column(String)
    file_size_bytes = Column(Integer)
    dimensions = Column(String)
    width = Column(Integer)
    height = Column(Integer)
    duration = Column(String)
    duration_seconds = Column(Float)

    thumbnail = Column(String)
    file_path = Column(String)
    url = Column(String)
    mime_type = Column(String)
    tags = Column(JSON)
    usage_count = Column(Integer, default=0)
    owner_id = Column(Integer, ForeignKey("users.id"))
    meta_info = Column(JSON)  # renamed because "metadata" is reserved by SQLAlchemy
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), onupdate=func.now())

    # Relationships
    owner = relationship("User", backref="media_assets")
# Backward-compatible alias for MediaAsset.
#
# This used to be a second, column-for-column identical class mapped to the
# same "media_assets" table. Declaring that table twice on one Base makes
# SQLAlchemy raise InvalidRequestError ("Table 'media_assets' is already
# defined for this MetaData instance") when the module is imported, and the
# repeated backref="media_assets" would also collide on User. Keeping the name
# as an alias lets existing imports of Asset keep working.
Asset = MediaAsset

View File

@ -34,10 +34,7 @@ async def main():
print(f" - 허용 포맷: {', '.join(sorted(image_filter.allowed_formats))}")
print(f" - 크기 조건: {'AND' if image_filter.require_both_dimensions else 'OR'}")
# 프로젝트 디렉토리 설정
if platform.system() == 'Windows':
project_dir = config.get('paths.project_dir_windows', r'C:\CrawlingData')
else:
# 프로젝트 디렉토리 설정)
project_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
# 크롤러 초기화

View File

@ -4,5 +4,8 @@ selenium
requests
pillow
azure-identity
azure-keyvault-secrets
azure-storage-blob
azure-storage-queue
azure-mgmt-resource
azure-mgmt-compute

View File

@ -23,7 +23,7 @@ from .naver_blog_crawler import NaverBlogCrawler, ImageFilterConfig
class NaverIntegratedCrawler:
def __init__(self, project_dir: str = r'/home/jhyeu/workspace/data',
def __init__(self, project_dir:str,
image_filter: ImageFilterConfig = None):
"""
초기화

87
utils/azure.py Normal file
View File

@ -0,0 +1,87 @@
import requests
import json
from azure.storage.blob import BlobServiceClient, ContentSettings
from azure.storage.queue import QueueClient
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
KEY_VAULT_NAME = "ado2-keys"


def get_keyvault_client():
    """Build a Key Vault ``SecretClient`` for the ``KEY_VAULT_NAME`` vault.

    Authentication goes through ``DefaultAzureCredential``; per the original
    note this is expected to pick up a Managed Identity automatically.
    """
    vault_url = f"https://{KEY_VAULT_NAME}.vault.azure.net/"
    return SecretClient(vault_url=vault_url, credential=DefaultAzureCredential())
# Names of the Key Vault secrets (the values are fetched at call time).
# The blob account URL secret holds something like
# https://mystorageaccount.blob.core.windows.net
BLOB_ACCOUNT_URL_KEY = "ado2-blob-account-url"
QUEUE_URL_KEY = "ado2-queue-url"

# Plain-text (non-secret) settings.
BLOB_CONTAINER_NAME = "ado2-media-public-access"
BLOB_MEDIA_FOLDER = "ado2-media-original"
def az_storage_upload_ado2_media(data, remote_file_path: str):
    """Upload ``data`` into the media container and return the blob's URL.

    The storage account URL is read from Key Vault on every call. The blob
    is written below ``BLOB_MEDIA_FOLDER`` with ``overwrite=True``.

    Args:
        data: Payload passed straight to ``upload_blob`` (presumably bytes
            or an open binary stream -- TODO confirm what callers pass).
        remote_file_path: Blob path relative to ``BLOB_MEDIA_FOLDER``.

    Returns:
        The ``url`` attribute of the blob client that was written to.
    """
    account_url = get_keyvault_client().get_secret(BLOB_ACCOUNT_URL_KEY).value

    # Authenticate with DefaultAzureCredential (Managed Identity per the
    # original note).
    service = BlobServiceClient(
        account_url=account_url,
        credential=DefaultAzureCredential(),
    )
    target = service.get_blob_client(
        container=BLOB_CONTAINER_NAME,
        blob=f"{BLOB_MEDIA_FOLDER}/{remote_file_path}",
    )
    target.upload_blob(data, overwrite=True)
    return target.url
def az_storage_read_ado2_media(remote_file_path: str):
    """Start a download of one media blob and return the download object.

    Args:
        remote_file_path: Blob path relative to ``BLOB_MEDIA_FOLDER``.

    Returns:
        Whatever ``BlobClient.download_blob()`` returns; the content is not
        read here, so the caller has to pull the bytes from it.
    """
    account_url = get_keyvault_client().get_secret(BLOB_ACCOUNT_URL_KEY).value

    # Authenticate with DefaultAzureCredential (Managed Identity per the
    # original note).
    service = BlobServiceClient(
        account_url=account_url,
        credential=DefaultAzureCredential(),
    )
    source = service.get_blob_client(
        container=BLOB_CONTAINER_NAME,
        blob=f"{BLOB_MEDIA_FOLDER}/{remote_file_path}",
    )
    return source.download_blob()
def az_storage_get_ado2_media_list(remote_folder_path: str) -> list[str]:
    """List the media blobs whose names start with the given folder path.

    NOTE(review): the annotation promises ``list[str]``, but the value
    returned is the object produced by ``ContainerClient.list_blobs()``
    (an iterable of blob entries, not a list of names). Confirm what callers
    rely on before changing either the annotation or the return value.

    Args:
        remote_folder_path: Folder path relative to ``BLOB_MEDIA_FOLDER``;
            used as a name prefix.

    Returns:
        The result of ``list_blobs(name_starts_with=...)``.
    """
    account_url = get_keyvault_client().get_secret(BLOB_ACCOUNT_URL_KEY).value

    # Authenticate with DefaultAzureCredential (Managed Identity per the
    # original note).
    service = BlobServiceClient(
        account_url=account_url,
        credential=DefaultAzureCredential(),
    )
    name_prefix = f"{BLOB_MEDIA_FOLDER}/{remote_folder_path}"
    container = service.get_container_client(BLOB_CONTAINER_NAME)
    return container.list_blobs(name_starts_with=name_prefix)
def az_add_queue_message(body: dict):
    """Serialize ``body`` as JSON and send it to the storage queue.

    The queue URL is read from Key Vault on every call. Non-ASCII text is
    kept as-is in the JSON (``ensure_ascii=False``).

    Args:
        body: JSON-serializable mapping to enqueue.
    """
    queue_url = get_keyvault_client().get_secret(QUEUE_URL_KEY).value
    queue = QueueClient.from_queue_url(
        queue_url,
        credential=DefaultAzureCredential(),
    )
    queue.send_message(json.dumps(body, ensure_ascii=False))
def az_get_queue_message():
    """Receive queued messages, decode them and delete them from the queue.

    Each message body is parsed as JSON, printed, deleted from the queue and
    then collected.

    NOTE(review): a message is deleted before the list is handed back, so a
    failure in the caller cannot be retried from the queue -- confirm this
    is intended.

    Returns:
        A list with the decoded JSON payload of every message received.
    """
    queue_url = get_keyvault_client().get_secret(QUEUE_URL_KEY).value
    queue = QueueClient.from_queue_url(
        queue_url,
        credential=DefaultAzureCredential(),
    )

    # Read the messages.
    incoming = queue.receive_messages()
    payloads = []
    for msg in incoming:
        # Decode and report the message.
        payload = json.loads(msg.content)
        print(f"받은 메시지: {payload}")
        # Remove it from the queue once handled.
        queue.delete_message(msg.id, msg.pop_receipt)
        payloads.append(payload)
    return payloads