"""
티비몬 (tvmon.site) 스크래퍼

카테고리: 영화, 한국영화, 드라마, 예능, 시사/다큐, 해외드라마, 해외(예능/다큐), [극장판] 애니메이션, 일반 애니메이션

WebView 기반 재생을 위한 스크래퍼 - 복잡한 MP4 파싱 없이 웹페이지 URL만 제공
"""
|
|
|
|
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from urllib.parse import urljoin, quote
from typing import Dict, List, Optional, Tuple
|
BASE_URL = "https://tvmon.site"
# Desktop Chrome user-agent string; sent so the site serves its normal markup.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# Default headers applied to every request made through the shared session.
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "Referer": BASE_URL,
}

# Category mapping: internal key -> display name and URL path on the site.
CATEGORIES = {
    "movie": {"name": "영화", "path": "/movie"},
    "kor_movie": {"name": "한국영화", "path": "/kor_movie"},
    "drama": {"name": "드라마", "path": "/drama"},
    "ent": {"name": "예능프로그램", "path": "/ent"},
    "sisa": {"name": "시사/다큐", "path": "/sisa"},
    "world": {"name": "해외드라마", "path": "/world"},
    "ott_ent": {"name": "해외(예능/다큐)", "path": "/ott_ent"},
    "ani_movie": {"name": "[극장판] 애니메이션", "path": "/ani_movie"},
    "animation": {"name": "일반 애니메이션", "path": "/animation"},
}
|
|
|
|
|
|
class TvmonScraper:
    """Scraper for tvmon.site.

    Provides homepage, category-listing, detail-page, search and popular
    crawling.  Playback is WebView based: no MP4 extraction is attempted,
    only page URLs are returned.
    """

    # Path fragments that identify site navigation links, not content.
    NAV_PATTERNS = ('/login', '/logout', '/register', '/mypage', '/bbs/', '/menu', '/faq', '/privacy')

    # Anchor selector matching every content category used by search/popular.
    _CONTENT_LINK_SELECTOR = ("a[href*='/movie/'], a[href*='/drama/'], a[href*='/ent/'], "
                              "a[href*='/world/'], a[href*='/animation/'], a[href*='/kor_movie/']")

    def __init__(self):
        # One shared session keeps cookies and the browser-like headers.
        self.session = requests.Session()
        self.session.headers.update(HEADERS)

    def _get(self, url: str, timeout: int = 15) -> Optional[requests.Response]:
        """GET *url* with up to 3 attempts; returns None after the final failure."""
        max_retries = 3
        for attempt in range(max_retries):
            try:
                resp = self.session.get(url, timeout=timeout)
                resp.raise_for_status()
                return resp
            except requests.RequestException as e:
                if attempt < max_retries - 1:
                    time.sleep(1)  # brief back-off before retrying
                else:
                    print(f"❌ 요청 실패: {url} - {e}")
                    return None
        return None

    def get_homepage(self) -> Dict:
        """Crawl the homepage: popular ranking plus latest content per section.

        Returns:
            {"success": bool,
             "popular": [{title, url, category}, ...],
             "latest": {section label: [{title, url, thumbnail}, ...]}}
        """
        print("=" * 60)
        print("1. 홈페이지 크롤링")
        print("=" * 60)

        resp = self._get(f"{BASE_URL}/")
        if not resp:
            return {"success": False}

        soup = BeautifulSoup(resp.text, "html.parser")
        result = {
            "success": True,
            "popular": [],
            "latest": {}
        }

        # Popular ranking ("무료 다시보기 순위").  The ':contains' CSS
        # pseudo-class is deprecated/removed in soupsieve, so locate the
        # heading with a plain text search instead of a selector.
        popular_section = soup.find(
            lambda tag: tag.name == "h2" and "무료 다시보기 순위" in tag.get_text()
        )
        if popular_section:
            # NOTE(review): generic category-link scan; adjust the selector
            # once the real ranking markup is confirmed.
            for item in soup.select("a[href*='/drama/'], a[href*='/movie/'], a[href*='/kor_movie/'], a[href*='/world/'], a[href*='/animation/']")[:10]:
                href = item.get("href", "")
                title = item.get_text(strip=True)
                if title and href:
                    result["popular"].append({
                        "title": title,
                        "url": urljoin(BASE_URL, href),
                        "category": self._get_category_from_url(href)
                    })

        # Pre-create every "latest" section so callers always see the keys.
        sections = ["영화", "드라마", "예능", "해외드라마", "애니메이션"]
        for section in sections:
            result["latest"][section] = []

        # Latest movies and latest dramas share identical extraction logic.
        self._collect_latest(soup, "/movie/", result["latest"]["영화"])
        self._collect_latest(soup, "/drama/", result["latest"]["드라마"])

        print(f" 인기 순위: {len(result['popular'])}개")
        for section, items in result["latest"].items():
            print(f" 최신 {section}: {len(items)}개")

        return result

    @staticmethod
    def _collect_latest(soup: BeautifulSoup, path: str, bucket: List[Dict]) -> None:
        """Append up to 6 {title, url, thumbnail} entries whose href contains *path*."""
        for item in soup.select(f"a[href*='{path}']")[:6]:
            href = item.get("href", "")
            title = item.get_text(strip=True)
            img = item.select_one("img")
            # Lazy-loaded images keep the real URL in data-src.
            img_url = (img.get("src") or img.get("data-src", "")) if img else ""

            if title and href and path in href:
                bucket.append({
                    "title": title,
                    "url": urljoin(BASE_URL, href),
                    "thumbnail": img_url
                })

    def get_category(self, category_key: str, page: int = 1) -> Dict:
        """Crawl one category listing page.

        Args:
            category_key: key into CATEGORIES (e.g. "movie", "drama").
            page: 1-based listing page number.

        Returns:
            {"success", "category", "items", "page", "pagination"}.
        """
        if category_key not in CATEGORIES:
            print(f"❌ 알 수 없는 카테고리: {category_key}")
            return {"success": False, "items": []}

        cat_info = CATEGORIES[category_key]
        print(f"\n{'=' * 60}")
        print(f"2. 카테고리 크롤링: {cat_info['name']} (page={page})")
        print("=" * 60)

        url = f"{BASE_URL}{cat_info['path']}" if page == 1 else f"{BASE_URL}{cat_info['path']}?page={page}"
        resp = self._get(url)
        if not resp:
            return {"success": False, "items": [], "category": cat_info['name']}

        soup = BeautifulSoup(resp.text, "html.parser")
        items = []
        seen = set()

        for link in soup.select(f"a[href*='/{category_key}/']"):
            href = link.get("href", "")
            if not href or f"/{category_key}/" not in href:
                continue

            full_url = urljoin(BASE_URL, href)
            if full_url in seen:
                continue
            seen.add(full_url)

            content_id = self._extract_id_from_url(href)

            img_tag = link.select_one("img")
            img_url = ""
            if img_tag:
                # Lazy-loaded images keep the real URL in data-* attributes.
                img_url = img_tag.get("src") or img_tag.get("data-src") or img_tag.get("data-original", "")

            title = link.get_text(strip=True)
            if not title:
                title_tag = link.select_one(".title, .movie-title, .content-title")
                title = title_tag.get_text(strip=True) if title_tag else ""

            if title:
                items.append({
                    "id": content_id,
                    "title": title,
                    "url": full_url,
                    "thumbnail": img_url,
                    "category": category_key
                })

        # Pagination: scan both /page/N and ?page=N style links for the max.
        pagination = {"current": page, "max_page": 1}
        for page_link in soup.select("a[href*='/page/'], a[href*='page=']"):
            href = page_link.get("href", "")
            page_match = re.search(r'[/&]?page[=/](\d+)', href)
            if page_match:
                page_num = int(page_match.group(1))
                if page_num > pagination["max_page"]:
                    pagination["max_page"] = page_num

        print(f" 항목 수: {len(items)}")
        print(f" 페이지 정보: 현재 {page} / 최대 {pagination['max_page']}")

        for item in items[:5]:
            print(f" - [{item['id']}] {item['title'][:50]}")

        return {
            "success": True,
            "category": cat_info['name'],
            "items": items,
            "page": page,
            "pagination": pagination
        }

    def get_detail(self, url_or_id: str, category: Optional[str] = None) -> Dict:
        """Crawl a detail page and collect its episode / play-page links.

        Args:
            url_or_id: full URL, or a bare content id.
            category: category key used to build the URL when url_or_id is a
                bare id; defaults to the /movie/ path.

        Returns:
            {"success", "url", "title", "thumbnail", "info", "episodes",
             "video_links", "play_url"} — only page URLs, for WebView playback.
        """
        print(f"\n{'=' * 60}")
        print(f"3. 상세 페이지 크롤링")
        print("=" * 60)

        if url_or_id.startswith("http"):
            url = url_or_id
        else:
            if category:
                url = f"{BASE_URL}/{category}/{url_or_id}"
            else:
                url = f"{BASE_URL}/movie/{url_or_id}"

        resp = self._get(url)
        if not resp:
            return {"success": False}

        soup = BeautifulSoup(resp.text, "html.parser")
        result = {
            "success": True,
            "url": url,
            "title": "",
            "thumbnail": "",
            "info": {},
            "episodes": [],
            "video_links": [],
            "play_url": ""
        }

        # Title: first matching heading; strip a " - <site name>" suffix.
        title_tag = soup.select_one("h1, h2.title, .content-title, title")
        if title_tag:
            title_text = title_tag.get_text(strip=True)
            if " - " in title_text:
                result["title"] = title_text.split(" - ")[0].strip()
            else:
                result["title"] = title_text

        og_image = soup.select_one('meta[property="og:image"]')
        if og_image:
            result["thumbnail"] = og_image.get("content", "")

        seen_episode_ids = set()

        # Episode links live under the same category path; fall back to
        # /drama/ when no category was supplied.
        episode_links = soup.select(f"a[href*='/{category or 'drama'}/']")

        for link in episode_links:
            href = link.get("href", "")
            if not href:
                continue

            full_url = urljoin(BASE_URL, href)

            # Skip the self-link back to this detail page.
            if full_url == url:
                continue

            # Episode pages use a /<content_id>/<episode_id> path shape.
            episode_id_match = re.search(r'/(\d+)/(\d+)', href)
            if not episode_id_match:
                continue

            episode_id = episode_id_match.group(2)
            if episode_id in seen_episode_ids:
                continue
            seen_episode_ids.add(episode_id)

            link_text = link.get_text(strip=True)

            # Prefer an explicit episode marker (e.g. "3화"); otherwise
            # number sequentially.
            episode_num = re.search(r'(\d+화|\d+회|EP?\d+|제?\d+부?|\d+)', link_text, re.IGNORECASE)
            if episode_num:
                episode_title = f"{episode_num.group(1)}"
            else:
                episode_title = f"Episode {len(result['episodes']) + 1}"

            result["episodes"].append({
                "number": episode_title,
                "title": link_text or episode_title,
                "url": full_url,
                "type": "webview"
            })

            result["video_links"].append({
                "type": "play_page",
                "url": full_url,
                "title": link_text or episode_title
            })

        if result["episodes"]:
            # Default play target: the first discovered episode page.
            result["play_url"] = result["episodes"][0]["url"]

        print(f" 제목: {result['title']}")
        print(f" 썸네일: {result['thumbnail'][:60]}...")
        print(f" 에피소드 수: {len(result['episodes'])}")
        print(f" 비디오 링크 수: {len(result['video_links'])}")

        for ep in result["episodes"][:5]:
            print(f" - [{ep['number']}] {ep['title'][:40]}")

        for vl in result["video_links"][:3]:
            print(f" - [{vl['type']}] {vl['url'][:80]}...")

        return result

    def _parse_content_links(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract unique content entries from category links in *soup*.

        Shared by search() and get_popular(): filters navigation links,
        de-duplicates by absolute URL, and returns dicts with keys
        id/title/url/thumbnail/category.
        """
        entries = []
        seen = set()

        for link in soup.select(self._CONTENT_LINK_SELECTOR):
            href = link.get("href", "")
            if not href:
                continue

            full_url = urljoin(BASE_URL, href)

            if full_url in seen:
                continue
            if any(nav in full_url for nav in self.NAV_PATTERNS):
                continue

            seen.add(full_url)

            content_id = self._extract_id_from_url(href)
            category = self._get_category_from_url(href)

            img_tag = link.select_one("img")
            img_url = ""
            if img_tag:
                img_url = img_tag.get("src") or img_tag.get("data-src", "")

            title = link.get_text(strip=True)
            if not title:
                title_tag = link.select_one(".title, .movie-title")
                title = title_tag.get_text(strip=True) if title_tag else ""

            if title:
                entries.append({
                    "id": content_id,
                    "title": title,
                    "url": full_url,
                    "thumbnail": img_url,
                    "category": category
                })

        return entries

    def search(self, keyword: str, page: int = 1) -> Dict:
        """Search the site for *keyword*.

        Returns {"success", "keyword", "results", "page"}.
        """
        print(f"\n{'=' * 60}")
        print(f"4. 검색: '{keyword}' (page={page})")
        print("=" * 60)

        encoded = quote(keyword)
        url = f"{BASE_URL}/search?stx={encoded}" if page == 1 else f"{BASE_URL}/search?stx={encoded}&page={page}"

        resp = self._get(url)
        if not resp:
            return {"success": False, "keyword": keyword, "results": []}

        soup = BeautifulSoup(resp.text, "html.parser")
        results = self._parse_content_links(soup)

        print(f" 검색 결과: {len(results)}개")
        for r in results[:10]:
            print(f" - [{r['category']}] {r['title'][:50]}")

        return {
            "success": True,
            "keyword": keyword,
            "results": results,
            "page": page
        }

    def get_popular(self) -> Dict:
        """Crawl the /popular page.

        Returns {"success", "items"}.
        """
        print(f"\n{'=' * 60}")
        print(f"5. 인기 영상 크롤링")
        print("=" * 60)

        url = f"{BASE_URL}/popular"
        resp = self._get(url)
        if not resp:
            return {"success": False, "items": []}

        soup = BeautifulSoup(resp.text, "html.parser")
        items = self._parse_content_links(soup)

        print(f" 인기 항목 수: {len(items)}")
        for item in items[:10]:
            print(f" - [{item['category']}] {item['title'][:50]}")

        return {
            "success": True,
            "items": items
        }

    def _get_category_from_url(self, url: str) -> str:
        """Return the CATEGORIES key whose path appears in *url*, else "unknown"."""
        for cat_key in CATEGORIES:
            if f"/{cat_key}/" in url:
                return cat_key
        return "unknown"

    def _extract_id_from_url(self, url: str) -> str:
        """Return the content id from *url* ("" when absent).

        Matches patterns like /category/12345 or /category/12345/67890
        (first numeric path segment wins).
        """
        match = re.search(r'/(\d+)(?:/|$|\?)', url)
        if match:
            return match.group(1)
        return ""

    def _extract_episode_number(self, text: str) -> str:
        """Return the episode marker found in *text* (e.g. "3화"), else *text* unchanged."""
        match = re.search(r'(\d+화|\d+회|제?\d+부?|EP?\d+)', text, re.IGNORECASE)
        if match:
            return match.group(1)
        return text

    def _get_pagination(self, soup: BeautifulSoup) -> Dict:
        """Return {"current", "max_page"} from /page/N links in *soup*."""
        pagination = {"current": 1, "max_page": 1}

        page_links = soup.select("a[href*='/page/']")
        for link in page_links:
            href = link.get("href", "")
            match = re.search(r'/page/(\d+)', href)
            if match:
                page_num = int(match.group(1))
                if page_num > pagination["max_page"]:
                    pagination["max_page"] = page_num

        return pagination
|
|
|
|
|
|
def test_all_categories():
    """Smoke-test the homepage, every category, search, and the popular list."""
    scraper = TvmonScraper()

    print("\n🔍 tvmon.site 카테고리별 크롤링 테스트\n")

    results = {}

    def run(key: str, label: str, call):
        # Record a failure marker instead of aborting the whole suite.
        try:
            results[key] = call()
        except Exception as e:
            print(f" ❌ {label} 실패: {e}")
            results[key] = {"success": False}

    # Homepage, then each category listing at page 1.
    run("homepage", "홈페이지", scraper.get_homepage)
    for cat_key in ("movie", "kor_movie", "drama", "ent", "sisa", "world", "ott_ent", "ani_movie", "animation"):
        run(cat_key, cat_key, lambda key=cat_key: scraper.get_category(key, page=1))

    # Search, then the popular listing.
    run("search", "검색", lambda: scraper.search("사냥개들"))
    run("popular", "인기 영상", scraper.get_popular)

    # Summary table.
    print(f"\n{'=' * 60}")
    print("📊 테스트 결과 요약")
    print("=" * 60)

    for name, outcome in results.items():
        marker = "✅ PASS" if outcome.get("success") else "❌ FAIL"
        print(f" {marker} - {name}")

    return results
|
|
|
|
|
|
def test_detail_page():
    """Fetch a few known detail pages, reporting each failure individually."""
    scraper = TvmonScraper()

    print("\n🔍 상세 페이지 크롤링 테스트\n")

    # Sample URLs (user-provided) spanning three categories.
    sample_urls = (
        "https://tvmon.site/drama/3781",  # 사냥개들 시즌 2
        "https://tvmon.site/kor_movie/30314",  # 휴민트
        "https://tvmon.site/world/19479",  # 월린기기
    )

    for target in sample_urls:
        try:
            scraper.get_detail(target)
            print(f"\n{'=' * 40}")
        except Exception as e:
            print(f" ❌ 상세 페이지 실패 ({target}): {e}")
|
|
|
|
if __name__ == "__main__":
    # Run the full category smoke test first.
    test_all_categories()

    print("\n" + "=" * 60)

    # Then exercise the detail-page scraper against sample URLs.
    test_detail_page()