#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
补充爬取 - 豆瓣同城和国家博物馆
"""

import json
from pathlib import Path

import requests
from bs4 import BeautifulSoup

def crawl_douban_webfetch():
    """Crawl Beijing exhibition listings from Douban Events.

    Fetches a plain-text rendering of the Douban Events weekly exhibition
    page through the Jina Reader proxy, then heuristically scans it line by
    line for exhibition titles and the nearby metadata Douban renders under
    each title (district/venue, date range, price, popularity).

    Returns:
        list[dict]: one dict per exhibition with keys "title" and "source",
        plus optional "area", "venue", "date", "price", "hot".
        Returns [] on any failure (best-effort scraper).
    """
    print("爬取豆瓣同城（web_fetch 方式）...")

    # Heuristic filters: boilerplate words that disqualify a title line,
    # and the Beijing districts that mark a location line.
    title_blocklist = ("北京", "同城", "事件", "展开")
    districts = ("朝阳区", "东城区", "海淀区", "西城区")

    try:
        # Jina Reader converts the page to readable plain text, which is
        # easier to scan than Douban's HTML.
        url = "https://r.jina.ai/http://beijing.douban.com/events/week-exhibition"
        response = requests.get(url, timeout=30)
        # Fail fast on HTTP errors instead of parsing an error page as data.
        response.raise_for_status()
        lines = response.text.split("\n")

        exhibitions = []
        for i, raw in enumerate(lines):
            line = raw.strip()

            # Candidate title: mentions "展" (exhibition), plausible length,
            # and none of the navigation/boilerplate keywords.
            # (The original also tested "展览", which "展" already covers.)
            if "展" not in line or not (5 < len(line) < 60):
                continue
            if any(word in line for word in title_blocklist):
                continue

            ex = {"title": line, "source": "豆瓣同城"}

            # Look ahead up to 14 lines for the metadata block.
            for j in range(i + 1, min(i + 15, len(lines))):
                next_line = lines[j].strip()

                if any(d in next_line for d in districts):
                    # Location lines look like "区名 场馆名" (whitespace-split).
                    parts = next_line.split()
                    if len(parts) >= 2:
                        ex["area"] = parts[0]
                        ex["venue"] = " ".join(parts[1:])
                    else:
                        ex["venue"] = next_line

                if "2026" in next_line and ("~" in next_line or "-" in next_line):
                    ex["date"] = next_line

                if "¥" in next_line and "起" in next_line:
                    ex["price"] = next_line

                if "人参加" in next_line:
                    ex["hot"] = next_line
                elif "人感兴趣" in next_line:
                    # Keep both popularity signals when present.
                    if "hot" not in ex:
                        ex["hot"] = next_line
                    else:
                        ex["hot"] = ex["hot"] + " | " + next_line

            # A title with no venue/area nearby was probably a false positive.
            if ex.get("venue") or ex.get("area"):
                exhibitions.append(ex)

        print(f"   ✓ 获取 {len(exhibitions)} 个展览")
        return exhibitions

    except Exception as e:
        # Best-effort: log and return an empty list so callers can continue
        # with other sources.
        print(f"   ✗ 失败：{e}")
        return []


def crawl_namoc_detail():
    """Crawl current exhibitions from the NAMOC (中国美术馆) website.

    Fetches a plain-text rendering of the exhibition-list page through the
    Jina Reader proxy and matches lines against a hard-coded set of known
    exhibition-title keywords, collecting hall, date, and status lines that
    follow each title.

    NOTE(review): the keyword list is tied to the exhibitions on show at
    crawl time and must be refreshed when the museum rotates exhibitions.

    Returns:
        list[dict]: one dict per exhibition with "title", "venue", "area",
        "source", "price", plus optional "hall", "date", "status".
        Returns [] on any failure (best-effort scraper).
    """
    print("爬取中国美术馆详情...")

    # Known title fragments for the current exhibition season.
    title_keywords = (
        "饰文焕彩", "跃马春风", "瑞器呈华", "骏驰云章",
        "奔腾启新", "河北古代", "楹联书法", "工艺美术", "闫平",
    )

    try:
        url = "https://r.jina.ai/http://www.namoc.cn/namoc/zhanlan/zl_list.shtml"
        response = requests.get(url, timeout=30)
        # Fail fast on HTTP errors instead of scanning an error page.
        response.raise_for_status()
        lines = response.text.split("\n")

        exhibitions = []
        current = {}
        for line in lines:
            line = line.strip()

            if any(kw in line for kw in title_keywords) and 5 < len(line) < 80:
                # A new title starts a new record; flush the previous one.
                if current.get("title"):
                    exhibitions.append(current)
                current = {
                    "title": line,
                    "venue": "中国美术馆",
                    "area": "东城区",
                    "source": "中国美术馆官网",
                    "price": "免费",  # NAMOC admission is free
                }

            # Metadata lines are attributed to the most recent title.
            if "号厅" in line and "展厅" in line:
                current["hall"] = line
            if "2026" in line and ("~" in line or "至" in line):
                current["date"] = line
            if "在展" in line:
                current["status"] = "在展"

        # Flush the final record.
        if current.get("title"):
            exhibitions.append(current)

        print(f"   ✓ 获取 {len(exhibitions)} 个展览")
        return exhibitions

    except Exception as e:
        # Best-effort: log and return an empty list so callers can continue.
        print(f"   ✗ 失败：{e}")
        return []


if __name__ == "__main__":
    print("=" * 50)
    print("补充爬取")
    print("=" * 50)
    
    douban = crawl_douban_webfetch()
    namoc = crawl_namoc_detail()
    
    all_ex = douban + namoc
    
    with open("data/supplement_exhibitions.json", "w", encoding="utf-8") as f:
        json.dump({
            "douban": douban,
            "namoc": namoc,
            "total": len(all_ex)
        }, f, ensure_ascii=False, indent=2)
    
    print(f"\n总计：{len(all_ex)} 个展览")
    print("已保存到 data/supplement_exhibitions.json")
