#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
北京展览信息爬虫 - Tavily 版（优先北京市文物局）

数据源优先级：
1. ⭐⭐⭐⭐⭐ 北京市文物局官网（第一数据源）
2. ⭐⭐⭐⭐ 国家博物馆、故宫博物院、首都博物馆、中国美术馆
3. ⭐⭐⭐ 豆瓣同城、本地宝等聚合信息

新增功能：
1. 与前一天数据对比（检测是否更新）
2. 有效数据验证（标题/场馆完整性）
3. 数据量验证（< 5 条告警）
"""

import json
import time
import random
import sqlite3
import os
import requests
import re
from datetime import datetime, timedelta
from pathlib import Path

# Configuration — all paths are resolved relative to this script's location.
BASE_DIR = Path(__file__).parent
DATA_DIR = BASE_DIR / "data"
LOG_DIR = BASE_DIR / "logs"
DB_PATH = DATA_DIR / "exhibitions.db"

# Ensure output directories exist (NOTE: runs at import time as a side effect).
DATA_DIR.mkdir(exist_ok=True)
LOG_DIR.mkdir(exist_ok=True)

# Tavily API configuration — key is read from the environment, never hard-coded.
TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "")
TAVILY_API_URL = "https://api.tavily.com/search"

# Known exhibitions used as a fallback when the crawl yields too few results
# (< 5 items in main()); expired exhibitions have already been filtered out.
KNOWN_EXHIBITIONS = [
    {"title": "叩问永恒——庞贝的探索与发掘", "venue": "国家博物馆", "date": "02-04~10-11", "price": "¥120"},
    {"title": "双星耀世——三星堆—金沙古蜀文明展", "venue": "国家博物馆", "date": "即日起~08-18", "price": "免费"},
    {"title": "遇见考烈王——安徽淮南武王墩一号墓考古成果展", "venue": "国家博物馆", "date": "2025/12/26~2026/4/14", "price": "免费"},
    {"title": "几何·和谐·生活——安德烈亚·帕拉第奥建筑艺术展", "venue": "国家博物馆", "date": "02-04~05-24", "price": "含门票"},
    {"title": "万法归一：萨迦寺历史文化艺术展", "venue": "故宫博物院", "date": "02-10~05-10", "price": "含门票"},
    {"title": "饰文焕彩——河北古代艺术珍品展", "venue": "中国美术馆", "date": "02-13~05-12", "price": "免费"},
]


def log_message(message, level="INFO"):
    """Print a timestamped log line and append it to today's log file.

    Args:
        message: Text to log.
        level: Severity tag embedded in the line (default "INFO").
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{stamp}] [{level}] {message}"
    print(entry)
    # One log file per calendar day, appended to across runs.
    path = LOG_DIR / f"exhibition_{datetime.now().strftime('%Y%m%d')}.log"
    with open(path, "a", encoding="utf-8") as handle:
        handle.write(entry + "\n")


def tavily_search(query, max_results=15):
    """Run a search through the Tavily API.

    Args:
        query: Search query string.
        max_results: Maximum number of results to request.

    Returns:
        List of result dicts from Tavily, or [] on any failure
        (missing API key, non-200 status, network error, bad JSON).
    """
    if not TAVILY_API_KEY:
        log_message("❌ Tavily API Key 未配置", "ERROR")
        return []

    try:
        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {TAVILY_API_KEY}"}
        data = {"query": query, "search_depth": "basic", "max_results": max_results}
        response = requests.post(TAVILY_API_URL, headers=headers, json=data, timeout=30)
    except requests.RequestException as e:
        # Narrowed from a bare `except Exception`: only network/HTTP-layer
        # failures are expected here; programming errors should surface.
        log_message(f"✗ Tavily 搜索异常：{str(e)}", "ERROR")
        return []

    if response.status_code == 200:
        try:
            # .json() raises ValueError on a malformed body; treat that as
            # a search failure rather than crashing the whole crawl.
            result = response.json()
        except ValueError as e:
            log_message(f"✗ Tavily 搜索异常：{str(e)}", "ERROR")
            return []
        return result.get("results", [])

    log_message(f"✗ Tavily 搜索失败：HTTP {response.status_code}", "ERROR")
    return []


def extract_exhibition_title(text, url=""):
    """Try to pull an exhibition title out of a text snippet.

    Strategies, in order: reject official-notice/aggregator boilerplate,
    then match "name——subtitle" (em-dash) titles, then 《…》 quoted titles,
    then short phrases ending in 展, and finally fall back to a generic
    title inferred from a known museum URL host.

    Args:
        text: Snippet to scan (e.g. a search-result content field).
        url: Source URL, used only for the host-based fallback.

    Returns:
        The extracted title string, or None when nothing usable is found.
    """
    # Official documents, notices, and aggregator headings — not exhibitions.
    skip_keywords = ["京文物〔", "京文物许可〔", "通知", "各博物馆：", "我局", "请示", "批复", 
                     "条例", "组织开展", "发布 2026", "展览计划", "本地宝 >", "资讯快递",
                     "近日抵", "教育厅", "系统呈现", "辉煌图景"]
    for marker in skip_keywords:
        if marker in text:
            return None

    # Strategy 1: "name——subtitle" with a sane overall length (8–35 chars).
    dash_hit = re.search(r'([A-Za-z0-9\u4e00-\u9fa5]{4,20}——[A-Za-z0-9\u4e00-\u9fa5]{2,20})', text)
    if dash_hit:
        candidate = dash_hit.group(1).strip()
        if 8 <= len(candidate) <= 35:
            return candidate

    # Strategy 2: anything wrapped in 《…》 book-title quotes.
    quoted = re.search(r'《([^》]{4,30})》', text)
    if quoted:
        return quoted.group(1).strip()

    # Strategy 3: a short phrase ending in 展, excluding generic headings.
    suffix_hit = re.search(r'([A-Za-z0-9\u4e00-\u9fa5]{6,18}展)', text)
    if suffix_hit:
        candidate = suffix_hit.group(1).strip()
        generic = ["展览预告", "正在展出", "新展", "发布 2026", "展览计划"]
        if all(g not in candidate for g in generic):
            return candidate

    # Strategy 4: infer a generic title from a known museum official site.
    for host, fallback in (("chnmuseum.cn", "国家博物馆展览"),
                           ("dpm.org.cn", "故宫博物院展览"),
                           ("namoc.org", "中国美术馆展览")):
        if host in url:
            return fallback

    return None


def load_yesterday_data():
    """Load yesterday's report and return the exhibition titles listed in it.

    Parses the numbered recommendation lines ("N. **Title** - venue (...)")
    out of the previous day's markdown report.

    Returns:
        List of title strings, or [] when the file is missing or unreadable.
    """
    yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
    yesterday_file = DATA_DIR / f"beijing_exhibitions_{yesterday}.md"

    if not yesterday_file.exists():
        return []

    try:
        with open(yesterday_file, "r", encoding="utf-8") as f:
            content = f.read()

        exhibitions = []
        for line in content.split("\n"):
            # BUG FIX: the report lists up to 10 items, but the previous
            # startswith(("1.", ..., "5.")) check silently dropped items
            # 6-10, making the day-over-day comparison report phantom
            # "removed" entries. Match any numbered line instead.
            if re.match(r'\d+\.', line.strip()):
                match = re.search(r'\*\*(.+?)\*\*', line)
                if match:
                    exhibitions.append(match.group(1).split(" - ")[0].strip())
        return exhibitions
    except OSError:
        # Narrowed from a bare `except:`; only I/O failures are expected.
        return []


def compare_data(today, yesterday):
    """Diff today's exhibition titles against yesterday's.

    Args:
        today: List of today's exhibition titles.
        yesterday: List of yesterday's titles (may be empty).

    Returns:
        Dict with an is_same flag, change counts, and a human-readable
        summary message.
    """
    if not yesterday:
        return {"is_same": False, "new_count": len(today), "message": "无昨日数据"}

    # Truncate to 40 chars so minor suffix differences don't count as changes.
    current = {title[:40] for title in today}
    previous = set(yesterday)

    added = current - previous
    dropped = previous - current

    return {
        "is_same": not added and not dropped,
        "new_count": len(added),
        "removed_count": len(dropped),
        "message": f"新增{len(added)}个，移除{len(dropped)}个"
    }


def init_database():
    """Create the SQLite tables (exhibitions, crawl_history) if absent.

    Both statements use CREATE TABLE IF NOT EXISTS, so repeated calls are
    idempotent. Logs a completion message on success.
    """
    connection = sqlite3.connect(DB_PATH)
    cur = connection.cursor()

    # Exhibition records, deduplicated on (title, venue).
    cur.execute("""
        CREATE TABLE IF NOT EXISTS exhibitions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT NOT NULL,
            venue TEXT,
            start_date TEXT,
            end_date TEXT,
            price TEXT,
            source TEXT,
            url TEXT,
            is_priority INTEGER DEFAULT 0,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(title, venue)
        )
    """)

    # One row per crawl run, including the day-over-day comparison result.
    cur.execute("""
        CREATE TABLE IF NOT EXISTS crawl_history (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source TEXT,
            crawl_time TEXT,
            items_count INTEGER,
            status TEXT,
            comparison_result TEXT
        )
    """)

    connection.commit()
    connection.close()
    log_message("数据库初始化完成")


def generate_report(exhibitions, comparison):
    """Write today's markdown report and return summary statistics.

    Args:
        exhibitions: List of exhibition dicts (title/venue/date/...).
        comparison: Diff summary dict produced by compare_data().

    Returns:
        Dict with the total count, the filtered (interest-matched) count,
        and the report file path.
    """
    date_str = datetime.now().strftime("%Y-%m-%d")
    out_path = DATA_DIR / f"beijing_exhibitions_{date_str}.md"

    # Keep only exhibitions whose title matches an interest keyword, max 10.
    interests = ["历史", "人文", "古代", "考古", "文物", "博物馆", "故宫", "书法", "藏传", "佛教", "古蜀", "青铜", "文明", "庞贝", "三星堆", "艺术"]
    matched = [item for item in exhibitions if any(kw in item["title"] for kw in interests)][:10]

    if comparison.get("is_same"):
        status_note = "⚠️ 数据与昨日相同"
    else:
        status_note = f"✅ {comparison.get('message', '')}"

    body = f"""# 北京展览推荐 - {date_str}

## 📅 今日概览
- **在展数量**：{len(exhibitions)} 个
- **符合兴趣**：{len(matched)} 个
- **数据更新时间**：{date_str} {datetime.now().strftime("%H:%M")}
- **数据对比**：{status_note}

---

## 🌟 重点推荐展览

"""
    for rank, item in enumerate(matched, 1):
        body += f"{rank}. **{item['title']}** - {item.get('venue', '待查询')}（{item.get('date', '展期详情')}）\n"

    body += f"""
---

## 📊 数据来源
- **优先**：北京市文物局、国家博物馆、故宫博物院、首都博物馆、中国美术馆
- **补充**：豆瓣同城、本地宝等

---

*由 Travel Agent 自动生成 | {date_str} {datetime.now().strftime('%H:%M')}*
"""

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(body)

    return {"total": len(exhibitions), "filtered": len(matched), "path": str(out_path)}


def main():
    """Entry point: crawl sources by priority, dedupe, diff against yesterday,
    write the markdown report, and record the run in crawl_history.

    Returns 0 on success, 1 when the data is scarce (< 5 items) or identical
    to yesterday's.
    """
    log_message("=" * 60)
    log_message("北京展览爬取（优先北京市文物局）")
    log_message("=" * 60)
    
    init_database()
    
    all_exhibitions = []
    
    # Step 1: Beijing Municipal Cultural Heritage Bureau (primary source)
    log_message("\n【1】北京市文物局 ⭐⭐⭐⭐⭐")
    results = tavily_search("北京市文物局 博物馆展览 2026 site:wwj.beijing.gov.cn", max_results=10)
    for r in results:
        title = extract_exhibition_title(r.get("content", ""))
        if title:
            all_exhibitions.append({"title": title, "venue": "北京市文物局汇总", "source": "官方"})
    log_message(f"获取 {len(all_exhibitions)} 个")
    
    # Step 2: major museums' official sites (site-restricted queries)
    log_message("\n【2】各大博物馆 ⭐⭐⭐⭐")
    for query in ["国家博物馆 展览 site:chnmuseum.cn", "故宫 展览 site:dpm.org.cn", "中国美术馆 展览 site:namoc.org"]:
        results = tavily_search(query, max_results=5)
        for r in results:
            title = extract_exhibition_title(r.get("content", ""))
            if title and not any(ex["title"] == title for ex in all_exhibitions):
                # The venue is inferred from which site-restricted query matched.
                venue = "国家博物馆" if "chnmuseum" in query else "故宫博物院" if "dpm" in query else "中国美术馆"
                all_exhibitions.append({"title": title, "venue": venue, "source": "官方"})
        time.sleep(1)  # small pause between API queries
    
    # Step 3: aggregator sources (lowest priority)
    log_message("\n【3】聚合信息 ⭐⭐⭐")
    results = tavily_search("北京展览 2026 年 3 月 博物馆", max_results=10)
    for r in results:
        title = extract_exhibition_title(r.get("content", ""))
        if title and not any(ex["title"] == title for ex in all_exhibitions):
            all_exhibitions.append({"title": title, "venue": "待查询", "source": "聚合"})
    
    # Fall back to the known-exhibition list when the crawl yields too little.
    if len(all_exhibitions) < 5:
        log_message(f"⚠️ 爬取数据过少 ({len(all_exhibitions)} 条)，使用已知展览兜底", "WARNING")
        for ex in KNOWN_EXHIBITIONS:
            if not any(e["title"] == ex["title"] for e in all_exhibitions):
                all_exhibitions.append({
                    "title": ex["title"],
                    "venue": ex["venue"],
                    "date": ex["date"],
                    "price": ex["price"],
                    "source": "已知数据"
                })
    
    # Dedupe on the first 30 chars of the title (tolerates suffix variants).
    seen = set()
    unique = []
    for ex in all_exhibitions:
        key = ex["title"][:30]
        if key not in seen:
            seen.add(key)
            unique.append(ex)
    
    log_message(f"去重后：{len(unique)} 个")
    
    # Diff against yesterday's report.
    yesterday = load_yesterday_data()
    comparison = compare_data([ex["title"] for ex in unique], yesterday)
    log_message(f"对比：{comparison['message']}")
    
    if comparison.get("is_same"):
        log_message("⚠️ 警告：数据与昨日完全相同", "WARNING")
    
    # Write today's markdown report.
    report = generate_report(unique, comparison)
    
    # Validate: success requires >= 5 items AND a change from yesterday.
    status = "success" if len(unique) >= 5 and not comparison.get("is_same") else "warning"
    if len(unique) < 5:
        log_message(f"⚠️ 警告：数据过少 ({len(unique)} 条)", "WARNING")
    
    # Record this run in crawl_history for auditing.
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("INSERT INTO crawl_history (source, crawl_time, items_count, status, comparison_result) VALUES (?, ?, ?, ?, ?)",
                   ("Tavily", datetime.now().strftime("%Y-%m-%d %H:%M:%S"), len(unique), status, json.dumps(comparison)))
    conn.commit()
    conn.close()
    
    # Machine-readable summary on stdout (for whatever scheduler runs this).
    result = {"status": status, "total": len(unique), "comparison": comparison, "report": report["path"]}
    print(json.dumps(result, ensure_ascii=False))
    
    log_message("=" * 60)
    log_message(f"完成！{len(unique)} 个展览，{comparison['message']}")
    
    return 0 if status == "success" else 1


if __name__ == "__main__":
    # raise SystemExit instead of calling exit(): exit() is the interactive
    # helper injected by the `site` module and is absent under `python -S`.
    raise SystemExit(main())
