"""
티비몬 (tvmon.site) 스크래퍼

카테고리: 영화, 한국영화, 드라마, 예능, 시사/다큐, 해외드라마, 해외(예능/다큐), [극장판] 애니메이션, 일반 애니메이션

WebView 기반 재생을 위한 스크래퍼 - 복잡한 MP4 파싱 없이 웹페이지 URL만 제공
"""
|
|
|
|
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from urllib.parse import urljoin, quote
from typing import Dict, List, Optional, Tuple
|
BASE_URL = "https://tvmon.site"
# Desktop Chrome user-agent string; sent so the site serves its normal markup.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# Default headers applied to every request made through the shared session.
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "Referer": BASE_URL,
}

# Category mapping: internal key -> display name and URL path on the site.
CATEGORIES = {
    "movie": {"name": "영화", "path": "/movie"},
    "kor_movie": {"name": "한국영화", "path": "/kor_movie"},
    "drama": {"name": "드라마", "path": "/drama"},
    "ent": {"name": "예능프로그램", "path": "/ent"},
    "sisa": {"name": "시사/다큐", "path": "/sisa"},
    "world": {"name": "해외드라마", "path": "/world"},
    "ott_ent": {"name": "해외(예능/다큐)", "path": "/ott_ent"},
    "ani_movie": {"name": "[극장판] 애니메이션", "path": "/ani_movie"},
    "animation": {"name": "일반 애니메이션", "path": "/animation"},
}
|
|
|
|
|
|
class TvmonScraper:
    """Scraper for tvmon.site.

    Provides homepage, category-listing, detail-page, search and popular
    crawling.  Playback is WebView based: no MP4 extraction is attempted,
    only page URLs are returned.
    """

    # Path fragments that identify site navigation links, not content.
    NAV_PATTERNS = ('/login', '/logout', '/register', '/mypage', '/bbs/', '/menu', '/faq', '/privacy')

    # Anchor selector matching every content category used by search/popular.
    _CONTENT_LINK_SELECTOR = ("a[href*='/movie/'], a[href*='/drama/'], a[href*='/ent/'], "
                              "a[href*='/world/'], a[href*='/animation/'], a[href*='/kor_movie/']")

    def __init__(self):
        # One shared session keeps cookies and the browser-like headers.
        self.session = requests.Session()
        self.session.headers.update(HEADERS)

    def _get(self, url: str, timeout: int = 15) -> Optional[requests.Response]:
        """GET *url* with up to 3 attempts; returns None after the final failure."""
        max_retries = 3
        for attempt in range(max_retries):
            try:
                resp = self.session.get(url, timeout=timeout)
                resp.raise_for_status()
                return resp
            except requests.RequestException as e:
                if attempt < max_retries - 1:
                    time.sleep(1)  # brief back-off before retrying
                else:
                    print(f"❌ 요청 실패: {url} - {e}")
                    return None
        return None

    def get_homepage(self) -> Dict:
        """Crawl the homepage: popular ranking plus latest content per section.

        Returns:
            {"success": bool,
             "popular": [{title, url, category}, ...],
             "latest": {section label: [{title, url, thumbnail}, ...]}}
        """
        print("=" * 60)
        print("1. 홈페이지 크롤링")
        print("=" * 60)

        resp = self._get(f"{BASE_URL}/")
        if not resp:
            return {"success": False}

        soup = BeautifulSoup(resp.text, "html.parser")
        result = {
            "success": True,
            "popular": [],
            "latest": {}
        }

        # Popular ranking ("무료 다시보기 순위").  The ':contains' CSS
        # pseudo-class is deprecated/removed in soupsieve, so locate the
        # heading with a plain text search instead of a selector.
        popular_section = soup.find(
            lambda tag: tag.name == "h2" and "무료 다시보기 순위" in tag.get_text()
        )
        if popular_section:
            # NOTE(review): generic category-link scan; adjust the selector
            # once the real ranking markup is confirmed.
            for item in soup.select("a[href*='/drama/'], a[href*='/movie/'], a[href*='/kor_movie/'], a[href*='/world/'], a[href*='/animation/']")[:10]:
                href = item.get("href", "")
                title = item.get_text(strip=True)
                if title and href:
                    result["popular"].append({
                        "title": title,
                        "url": urljoin(BASE_URL, href),
                        "category": self._get_category_from_url(href)
                    })

        # Pre-create every "latest" section so callers always see the keys.
        sections = ["영화", "드라마", "예능", "해외드라마", "애니메이션"]
        for section in sections:
            result["latest"][section] = []

        # Latest movies and latest dramas share identical extraction logic.
        self._collect_latest(soup, "/movie/", result["latest"]["영화"])
        self._collect_latest(soup, "/drama/", result["latest"]["드라마"])

        print(f" 인기 순위: {len(result['popular'])}개")
        for section, items in result["latest"].items():
            print(f" 최신 {section}: {len(items)}개")

        return result

    @staticmethod
    def _collect_latest(soup: BeautifulSoup, path: str, bucket: List[Dict]) -> None:
        """Append up to 6 {title, url, thumbnail} entries whose href contains *path*."""
        for item in soup.select(f"a[href*='{path}']")[:6]:
            href = item.get("href", "")
            title = item.get_text(strip=True)
            img = item.select_one("img")
            # Lazy-loaded images keep the real URL in data-src.
            img_url = (img.get("src") or img.get("data-src", "")) if img else ""

            if title and href and path in href:
                bucket.append({
                    "title": title,
                    "url": urljoin(BASE_URL, href),
                    "thumbnail": img_url
                })

    def get_category(self, category_key: str, page: int = 1) -> Dict:
        """Crawl one category listing page.

        Args:
            category_key: key into CATEGORIES (e.g. "movie", "drama").
            page: 1-based listing page number.

        Returns:
            {"success", "category", "items", "page", "pagination"}.
        """
        if category_key not in CATEGORIES:
            print(f"❌ 알 수 없는 카테고리: {category_key}")
            return {"success": False, "items": []}

        cat_info = CATEGORIES[category_key]
        print(f"\n{'=' * 60}")
        print(f"2. 카테고리 크롤링: {cat_info['name']} (page={page})")
        print("=" * 60)

        url = f"{BASE_URL}{cat_info['path']}" if page == 1 else f"{BASE_URL}{cat_info['path']}?page={page}"
        resp = self._get(url)
        if not resp:
            return {"success": False, "items": [], "category": cat_info['name']}

        soup = BeautifulSoup(resp.text, "html.parser")
        items = []
        seen = set()

        for link in soup.select(f"a[href*='/{category_key}/']"):
            href = link.get("href", "")
            if not href or f"/{category_key}/" not in href:
                continue

            full_url = urljoin(BASE_URL, href)
            if full_url in seen:
                continue
            seen.add(full_url)

            content_id = self._extract_id_from_url(href)

            img_tag = link.select_one("img")
            img_url = ""
            if img_tag:
                # Lazy-loaded images keep the real URL in data-* attributes.
                img_url = img_tag.get("src") or img_tag.get("data-src") or img_tag.get("data-original", "")

            title = link.get_text(strip=True)
            if not title:
                title_tag = link.select_one(".title, .movie-title, .content-title")
                title = title_tag.get_text(strip=True) if title_tag else ""

            if title:
                items.append({
                    "id": content_id,
                    "title": title,
                    "url": full_url,
                    "thumbnail": img_url,
                    "category": category_key
                })

        # Pagination: scan both /page/N and ?page=N style links for the max.
        pagination = {"current": page, "max_page": 1}
        for page_link in soup.select("a[href*='/page/'], a[href*='page=']"):
            href = page_link.get("href", "")
            page_match = re.search(r'[/&]?page[=/](\d+)', href)
            if page_match:
                page_num = int(page_match.group(1))
                if page_num > pagination["max_page"]:
                    pagination["max_page"] = page_num

        print(f" 항목 수: {len(items)}")
        print(f" 페이지 정보: 현재 {page} / 최대 {pagination['max_page']}")

        for item in items[:5]:
            print(f" - [{item['id']}] {item['title'][:50]}")

        return {
            "success": True,
            "category": cat_info['name'],
            "items": items,
            "page": page,
            "pagination": pagination
        }

    def get_detail(self, url_or_id: str, category: Optional[str] = None) -> Dict:
        """Crawl a detail page and collect its episode / play-page links.

        Args:
            url_or_id: full URL, or a bare content id.
            category: category key used to build the URL when url_or_id is a
                bare id; defaults to the /movie/ path.

        Returns:
            {"success", "url", "title", "thumbnail", "info", "episodes",
             "video_links", "play_url"} — only page URLs, for WebView playback.
        """
        print(f"\n{'=' * 60}")
        print(f"3. 상세 페이지 크롤링")
        print("=" * 60)

        if url_or_id.startswith("http"):
            url = url_or_id
        else:
            if category:
                url = f"{BASE_URL}/{category}/{url_or_id}"
            else:
                url = f"{BASE_URL}/movie/{url_or_id}"

        resp = self._get(url)
        if not resp:
            return {"success": False}

        soup = BeautifulSoup(resp.text, "html.parser")
        result = {
            "success": True,
            "url": url,
            "title": "",
            "thumbnail": "",
            "info": {},
            "episodes": [],
            "video_links": [],
            "play_url": ""
        }

        # Title: first matching heading; strip a " - <site name>" suffix.
        title_tag = soup.select_one("h1, h2.title, .content-title, title")
        if title_tag:
            title_text = title_tag.get_text(strip=True)
            if " - " in title_text:
                result["title"] = title_text.split(" - ")[0].strip()
            else:
                result["title"] = title_text

        og_image = soup.select_one('meta[property="og:image"]')
        if og_image:
            result["thumbnail"] = og_image.get("content", "")

        seen_episode_ids = set()

        # Episode links live under the same category path; fall back to
        # /drama/ when no category was supplied.
        episode_links = soup.select(f"a[href*='/{category or 'drama'}/']")

        for link in episode_links:
            href = link.get("href", "")
            if not href:
                continue

            full_url = urljoin(BASE_URL, href)

            # Skip the self-link back to this detail page.
            if full_url == url:
                continue

            # Episode pages use a /<content_id>/<episode_id> path shape.
            episode_id_match = re.search(r'/(\d+)/(\d+)', href)
            if not episode_id_match:
                continue

            episode_id = episode_id_match.group(2)
            if episode_id in seen_episode_ids:
                continue
            seen_episode_ids.add(episode_id)

            link_text = link.get_text(strip=True)

            # Prefer an explicit episode marker (e.g. "3화"); otherwise
            # number sequentially.
            episode_num = re.search(r'(\d+화|\d+회|EP?\d+|제?\d+부?|\d+)', link_text, re.IGNORECASE)
            if episode_num:
                episode_title = f"{episode_num.group(1)}"
            else:
                episode_title = f"Episode {len(result['episodes']) + 1}"

            result["episodes"].append({
                "number": episode_title,
                "title": link_text or episode_title,
                "url": full_url,
                "type": "webview"
            })

            result["video_links"].append({
                "type": "play_page",
                "url": full_url,
                "title": link_text or episode_title
            })

        if result["episodes"]:
            # Default play target: the first discovered episode page.
            result["play_url"] = result["episodes"][0]["url"]

        print(f" 제목: {result['title']}")
        print(f" 썸네일: {result['thumbnail'][:60]}...")
        print(f" 에피소드 수: {len(result['episodes'])}")
        print(f" 비디오 링크 수: {len(result['video_links'])}")

        for ep in result["episodes"][:5]:
            print(f" - [{ep['number']}] {ep['title'][:40]}")

        for vl in result["video_links"][:3]:
            print(f" - [{vl['type']}] {vl['url'][:80]}...")

        return result

    def _parse_content_links(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract unique content entries from category links in *soup*.

        Shared by search() and get_popular(): filters navigation links,
        de-duplicates by absolute URL, and returns dicts with keys
        id/title/url/thumbnail/category.
        """
        entries = []
        seen = set()

        for link in soup.select(self._CONTENT_LINK_SELECTOR):
            href = link.get("href", "")
            if not href:
                continue

            full_url = urljoin(BASE_URL, href)

            if full_url in seen:
                continue
            if any(nav in full_url for nav in self.NAV_PATTERNS):
                continue

            seen.add(full_url)

            content_id = self._extract_id_from_url(href)
            category = self._get_category_from_url(href)

            img_tag = link.select_one("img")
            img_url = ""
            if img_tag:
                img_url = img_tag.get("src") or img_tag.get("data-src", "")

            title = link.get_text(strip=True)
            if not title:
                title_tag = link.select_one(".title, .movie-title")
                title = title_tag.get_text(strip=True) if title_tag else ""

            if title:
                entries.append({
                    "id": content_id,
                    "title": title,
                    "url": full_url,
                    "thumbnail": img_url,
                    "category": category
                })

        return entries

    def search(self, keyword: str, page: int = 1) -> Dict:
        """Search the site for *keyword*.

        Returns {"success", "keyword", "results", "page"}.
        """
        print(f"\n{'=' * 60}")
        print(f"4. 검색: '{keyword}' (page={page})")
        print("=" * 60)

        encoded = quote(keyword)
        url = f"{BASE_URL}/search?stx={encoded}" if page == 1 else f"{BASE_URL}/search?stx={encoded}&page={page}"

        resp = self._get(url)
        if not resp:
            return {"success": False, "keyword": keyword, "results": []}

        soup = BeautifulSoup(resp.text, "html.parser")
        results = self._parse_content_links(soup)

        print(f" 검색 결과: {len(results)}개")
        for r in results[:10]:
            print(f" - [{r['category']}] {r['title'][:50]}")

        return {
            "success": True,
            "keyword": keyword,
            "results": results,
            "page": page
        }

    def get_popular(self) -> Dict:
        """Crawl the /popular page.

        Returns {"success", "items"}.
        """
        print(f"\n{'=' * 60}")
        print(f"5. 인기 영상 크롤링")
        print("=" * 60)

        url = f"{BASE_URL}/popular"
        resp = self._get(url)
        if not resp:
            return {"success": False, "items": []}

        soup = BeautifulSoup(resp.text, "html.parser")
        items = self._parse_content_links(soup)

        print(f" 인기 항목 수: {len(items)}")
        for item in items[:10]:
            print(f" - [{item['category']}] {item['title'][:50]}")

        return {
            "success": True,
            "items": items
        }

    def _get_category_from_url(self, url: str) -> str:
        """Return the CATEGORIES key whose path appears in *url*, else "unknown"."""
        for cat_key in CATEGORIES:
            if f"/{cat_key}/" in url:
                return cat_key
        return "unknown"

    def _extract_id_from_url(self, url: str) -> str:
        """Return the content id from *url* ("" when absent).

        Matches patterns like /category/12345 or /category/12345/67890
        (first numeric path segment wins).
        """
        match = re.search(r'/(\d+)(?:/|$|\?)', url)
        if match:
            return match.group(1)
        return ""

    def _extract_episode_number(self, text: str) -> str:
        """Return the episode marker found in *text* (e.g. "3화"), else *text* unchanged."""
        match = re.search(r'(\d+화|\d+회|제?\d+부?|EP?\d+)', text, re.IGNORECASE)
        if match:
            return match.group(1)
        return text

    def _get_pagination(self, soup: BeautifulSoup) -> Dict:
        """Return {"current", "max_page"} from /page/N links in *soup*."""
        pagination = {"current": 1, "max_page": 1}

        page_links = soup.select("a[href*='/page/']")
        for link in page_links:
            href = link.get("href", "")
            match = re.search(r'/page/(\d+)', href)
            if match:
                page_num = int(match.group(1))
                if page_num > pagination["max_page"]:
                    pagination["max_page"] = page_num

        return pagination
|
|
|
|
|
|
def test_all_categories():
    """Smoke-test the homepage, every category, search, and the popular list."""
    scraper = TvmonScraper()

    print("\n🔍 tvmon.site 카테고리별 크롤링 테스트\n")

    results = {}

    def run(key: str, label: str, call):
        # Record a failure marker instead of aborting the whole suite.
        try:
            results[key] = call()
        except Exception as e:
            print(f" ❌ {label} 실패: {e}")
            results[key] = {"success": False}

    # Homepage, then each category listing at page 1.
    run("homepage", "홈페이지", scraper.get_homepage)
    for cat_key in ("movie", "kor_movie", "drama", "ent", "sisa", "world", "ott_ent", "ani_movie", "animation"):
        run(cat_key, cat_key, lambda key=cat_key: scraper.get_category(key, page=1))

    # Search, then the popular listing.
    run("search", "검색", lambda: scraper.search("사냥개들"))
    run("popular", "인기 영상", scraper.get_popular)

    # Summary table.
    print(f"\n{'=' * 60}")
    print("📊 테스트 결과 요약")
    print("=" * 60)

    for name, outcome in results.items():
        marker = "✅ PASS" if outcome.get("success") else "❌ FAIL"
        print(f" {marker} - {name}")

    return results
|
|
|
|
|
|
def test_detail_page():
    """Fetch a few known detail pages, reporting each failure individually."""
    scraper = TvmonScraper()

    print("\n🔍 상세 페이지 크롤링 테스트\n")

    # Sample URLs (user-provided) spanning three categories.
    sample_urls = (
        "https://tvmon.site/drama/3781",  # 사냥개들 시즌 2
        "https://tvmon.site/kor_movie/30314",  # 휴민트
        "https://tvmon.site/world/19479",  # 월린기기
    )

    for target in sample_urls:
        try:
            scraper.get_detail(target)
            print(f"\n{'=' * 40}")
        except Exception as e:
            print(f" ❌ 상세 페이지 실패 ({target}): {e}")
|
|
|
|
if __name__ == "__main__":
    # Run the full category smoke test first.
    test_all_categories()

    print("\n" + "=" * 60)

    # Then exercise the detail-page scraper against sample URLs.
    test_detail_page()