#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
使用 Playwright 爬取动态网站 - 优化版
"""

import json
from pathlib import Path
from urllib.parse import urljoin

from playwright.sync_api import sync_playwright

def crawl_chnmuseum():
    """Crawl exhibition links from the National Museum of China website.

    Launches headless Chromium, opens the exhibition-preview listing page,
    waits a fixed delay so client-side scripts can run, and then inspects the
    first 10 anchors whose ``href`` contains ``zl``. An anchor is kept when
    its visible text is longer than 5 characters.

    Progress and failures are printed to stdout. A failure while loading or
    parsing the page is reported and not raised, so the function returns
    whatever was collected up to that point (possibly an empty list).

    Returns:
        list: One dict per kept anchor with the keys ``title``, ``venue``,
        ``area``, ``url`` and ``source``. ``url`` is resolved against the
        loaded page's URL, so relative links come back absolute.
    """
    exhibitions = []

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            args=['--disable-blink-features=AutomationControlled'],
        )
        page = browser.new_page()
        page.set_extra_http_headers({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        })

        try:
            # Wait only for domcontentloaded instead of networkidle.
            page.goto("https://www.chnmuseum.cn/zl/zhanlanyugao/", wait_until="domcontentloaded", timeout=60000)

            # Fixed pause to give the page's JavaScript time to run.
            page.wait_for_timeout(5000)

            # Print the size of the rendered HTML as a quick sanity check.
            content = page.content()
            print(f"页面长度：{len(content)}")

            # Only the first 10 matching anchors are examined, so the result
            # can hold fewer than 10 entries after filtering.
            anchors = page.query_selector_all("a[href*='zl']")
            for title_el in anchors[:10]:
                try:
                    title = title_el.inner_text().strip()
                    # Skip empty or very short link texts.
                    if len(title) <= 5:
                        continue
                    # The selector only matches anchors that carry an href.
                    link = title_el.get_attribute("href")
                    exhibitions.append({
                        "title": title,
                        "venue": "中国国家博物馆",
                        "area": "东城区",
                        # Resolve relative hrefs so the stored URL is usable
                        # outside the page it was scraped from.
                        "url": urljoin(page.url, link),
                        "source": "国家博物馆官网"
                    })
                except Exception as e:
                    # Best effort: one bad element must not abort the crawl,
                    # but report it instead of hiding it. Catching Exception
                    # (not a bare except) lets Ctrl-C still interrupt.
                    print(f"  跳过一个链接：{e}")

            print(f"✓ 国家博物馆：{len(exhibitions)} 个展览")

        except Exception as e:
            print(f"✗ 国家博物馆失败：{e}")
        finally:
            # Close the browser on every exit path, including interrupts.
            browser.close()

    return exhibitions


def crawl_dpm():
    """Crawl exhibition links from the Palace Museum website.

    Launches headless Chromium, opens the shows listing page, waits a fixed
    delay so client-side scripts can run, and then inspects the first 10
    anchors whose ``href`` contains ``shows``. An anchor is kept when its
    visible text is longer than 5 characters.

    Progress and failures are printed to stdout. A failure while loading or
    parsing the page is reported and not raised, so the function returns
    whatever was collected up to that point (possibly an empty list).

    Returns:
        list: One dict per kept anchor with the keys ``title``, ``venue``,
        ``area``, ``url`` and ``source``. ``url`` is resolved against the
        loaded page's URL, so relative links come back absolute.
    """
    exhibitions = []

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            args=['--disable-blink-features=AutomationControlled'],
        )
        page = browser.new_page()
        page.set_extra_http_headers({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })

        try:
            # Wait only for domcontentloaded, then pause a fixed 5 seconds to
            # give the page's JavaScript time to run.
            page.goto("https://www.dpm.org.cn/shows.html", wait_until="domcontentloaded", timeout=60000)
            page.wait_for_timeout(5000)

            # Print the size of the rendered HTML as a quick sanity check.
            content = page.content()
            print(f"页面长度：{len(content)}")

            # Only the first 10 matching anchors are examined, so the result
            # can hold fewer than 10 entries after filtering.
            anchors = page.query_selector_all("a[href*='shows']")
            for item in anchors[:10]:
                try:
                    title = item.inner_text().strip()
                    # Skip empty or very short link texts.
                    if len(title) <= 5:
                        continue
                    # The selector only matches anchors that carry an href.
                    link = item.get_attribute("href")
                    exhibitions.append({
                        "title": title,
                        "venue": "故宫博物院",
                        "area": "东城区",
                        # Resolve relative hrefs so the stored URL is usable
                        # outside the page it was scraped from.
                        "url": urljoin(page.url, link),
                        "source": "故宫博物院官网"
                    })
                except Exception as e:
                    # Best effort: one bad element must not abort the crawl,
                    # but report it instead of hiding it. Catching Exception
                    # (not a bare except) lets Ctrl-C still interrupt.
                    print(f"  跳过一个链接：{e}")

            print(f"✓ 故宫博物院：{len(exhibitions)} 个展览")

        except Exception as e:
            print(f"✗ 故宫博物院失败：{e}")
        finally:
            # Close the browser on every exit path, including interrupts.
            browser.close()

    return exhibitions


# Script entry point: run both crawlers, save the combined result as JSON and
# print the total number of collected exhibitions.
if __name__ == "__main__":
    print("=" * 50)
    print("Playwright 爬取优化版")
    print("=" * 50)

    print("\n【国家博物馆】")
    chnmuseum = crawl_chnmuseum()

    print("\n【故宫博物院】")
    dpm = crawl_dpm()

    result = {
        "chnmuseum": chnmuseum,
        "dpm": dpm
    }

    # Create the output directory first: without it, a missing ``data/``
    # folder raises FileNotFoundError and the finished crawl is lost.
    output_path = Path("data/playwright_result_v2.json")
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # ensure_ascii=False keeps the Chinese text readable in the file.
    with output_path.open("w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n总计：{len(chnmuseum) + len(dpm)} 个展览")