#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
北京展览信息爬虫 - 修复版
使用可靠数据源：北京市文物局官网汇总 + Jina Reader

修复内容：
1. 使用北京市文物局汇总数据（最可靠）
2. 添加实际爬取数量统计（不硬编码）
3. 添加数据验证（少于 5 条视为失败）
"""

import json
import random
import re
import sqlite3
import time
from datetime import datetime, timedelta
from pathlib import Path

import requests

# 配置
BASE_DIR = Path(__file__).parent
DATA_DIR = BASE_DIR / "data"
LOG_DIR = BASE_DIR / "logs"
DB_PATH = DATA_DIR / "exhibitions.db"

# 确保目录存在
DATA_DIR.mkdir(exist_ok=True)
LOG_DIR.mkdir(exist_ok=True)


def log_message(message, level="INFO"):
    """Print a timestamped log entry and append it to today's log file.

    Args:
        message: Text to log.
        level: Severity tag embedded in the entry (default "INFO").
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{stamp}] [{level}] {message}"
    print(entry)

    # One log file per calendar day, named by date.
    daily_log = LOG_DIR / f"exhibition_{datetime.now().strftime('%Y%m%d')}.log"
    with open(daily_log, "a", encoding="utf-8") as fh:
        fh.write(entry + "\n")


# 北京市文物局汇总数据（最可靠数据源）
# 包含：国家博物馆、故宫博物院、首都博物馆、中国美术馆等 8 个主要博物馆
BEIJING_MUSEUM_BUREAU_URL = "https://r.jina.ai/http://wwj.beijing.gov.cn/ztlm/bowuguan/"

# 豆瓣同城展览
DOUBAN_URL = "https://r.jina.ai/http://www.douban.com/location/beijing/exhibitions"

# 中国美术馆（直接 API）
NAMOC_URL = "https://r.jina.ai/http://www.namoc.org/zx/zl/"


def init_database():
    """Create the SQLite schema if it does not exist yet.

    Tables:
        exhibitions   -- one row per exhibition; deduplicated by the
                         UNIQUE(title, venue, source) constraint.
        crawl_history -- one row per crawl attempt, for monitoring.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS exhibitions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                venue TEXT,
                area TEXT,
                start_date TEXT,
                end_date TEXT,
                price TEXT,
                type TEXT,
                source TEXT,
                url TEXT,
                description TEXT,
                recommend_level INTEGER DEFAULT 0,
                is_new INTEGER DEFAULT 1,
                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
                updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(title, venue, source)
            )
        """)

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS crawl_history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                source TEXT NOT NULL,
                crawl_time TEXT NOT NULL,
                items_count INTEGER DEFAULT 0,
                status TEXT DEFAULT 'success',
                error_message TEXT
            )
        """)

        conn.commit()
    finally:
        # Fix: the connection was leaked if a DDL statement raised.
        conn.close()
    log_message("数据库初始化完成")


def crawl_beijing_museum_bureau():
    """Crawl the Beijing Municipal Cultural Heritage Bureau summary page.

    The page (fetched as plain text through Jina Reader) aggregates the
    current exhibitions of 8 major museums.  Parsing is heuristic: a line
    mentioning a museum name sets the current venue, and subsequent lines
    that look like exhibition titles are attributed to that venue.

    Returns:
        list[dict]: one dict per detected exhibition (empty on failure).
    """
    exhibitions = []

    # Museums listed on the bureau's summary page.
    museums = ["国家博物馆", "故宫博物院", "首都博物馆", "中国美术馆",
               "中国园林博物馆", "中国非物质文化遗产馆",
               "中国地质博物馆", "北京鲁迅博物馆"]
    # Navigation/label lines that must never be treated as titles.
    skip_words = ["展览预告", "正在展出", "常设展览", "关闭", "时间"]

    try:
        log_message("开始爬取：北京市文物局官网（汇总数据）")

        response = requests.get(BEIJING_MUSEUM_BUREAU_URL, timeout=30)

        if response.status_code == 200:
            current_venue = ""
            for line in response.text.split('\n'):
                line = line.strip()

                # Venue line: remember it and move on.  Fix: the original
                # fell through, so a venue line containing "展" could also
                # be appended as an exhibition titled after itself.
                if any(m in line for m in museums):
                    current_venue = line.replace("正在展出", "").strip()
                    continue

                # Title heuristic: contains "展", plausible length, and a
                # venue has already been seen.
                if "展" in line and 5 < len(line) < 60 and current_venue:
                    if not any(skip in line for skip in skip_words):
                        exhibitions.append({
                            "title": line,
                            "venue": current_venue,
                            "area": "北京市",
                            "start_date": "待查询",
                            "end_date": "待查询",
                            "price": "待查询",
                            "type": "博物馆展览",
                            "source": "北京市文物局",
                            "url": "http://wwj.beijing.gov.cn/",
                            "description": f"{current_venue} 展览"
                        })

            log_message(f"✓ 北京市文物局 爬取成功，获取 {len(exhibitions)} 条信息")

            # Record the attempt so source health can be monitored.
            record_crawl_history("北京市文物局", len(exhibitions), "success")
        else:
            log_message(f"✗ 北京市文物局 爬取失败：HTTP {response.status_code}", "ERROR")
            record_crawl_history("北京市文物局", 0, "error", f"HTTP {response.status_code}")

    except Exception as e:
        # Network errors etc. must not abort the whole crawl run.
        log_message(f"✗ 北京市文物局 爬取异常：{str(e)}", "ERROR")
        record_crawl_history("北京市文物局", 0, "error", str(e))

    return exhibitions


def crawl_douban():
    """Crawl the Douban same-city exhibition listing for Beijing.

    Returns:
        list[dict]: one dict per candidate exhibition line (empty on failure).
    """
    exhibitions = []
    blocked = ("展览预告", "正在展出", "时间", "地点")

    try:
        log_message("开始爬取：豆瓣同城")
        response = requests.get(DOUBAN_URL, timeout=30)

        if response.status_code == 200:
            # Heuristic filter: keep lines that mention "展", have a
            # plausible title length, and are not navigation labels.
            for raw in response.text.split('\n'):
                text = raw.strip()
                if "展" not in text or not (5 < len(text) < 60):
                    continue
                if any(word in text for word in blocked):
                    continue
                exhibitions.append({
                    "title": text,
                    "venue": "待查询",
                    "area": "北京市",
                    "start_date": "待查询",
                    "end_date": "待查询",
                    "price": "待查询",
                    "type": "展览",
                    "source": "豆瓣同城",
                    "url": "https://www.douban.com/location/beijing/exhibitions/",
                    "description": "豆瓣同城展览"
                })

            log_message(f"✓ 豆瓣同城 爬取成功，获取 {len(exhibitions)} 条信息")
            record_crawl_history("豆瓣同城", len(exhibitions), "success")
        else:
            log_message(f"✗ 豆瓣同城 爬取失败：HTTP {response.status_code}", "ERROR")
            record_crawl_history("豆瓣同城", 0, "error", f"HTTP {response.status_code}")

    except Exception as e:
        log_message(f"✗ 豆瓣同城 爬取异常：{str(e)}", "ERROR")
        record_crawl_history("豆瓣同城", 0, "error", str(e))

    return exhibitions


def crawl_namoc():
    """Crawl the National Art Museum of China (NAMOC) listing page.

    Two complementary heuristics run over the Jina-Reader plain text:
      1. Markdown headings ("### [title](url)" / "#### title") that look
         like exhibition names.
      2. Lines near a "YYYY-MM-DD 至 YYYY-MM-DD" date range; the range is
         attached as the exhibition period.

    Returns:
        list[dict]: collected exhibitions (possibly empty).
    """
    exhibitions = []

    # Patterns compiled once, outside the line loop.
    title_re = re.compile(r'(?:###|####)\s*(?:\[)?([^\]]+?)(?:\][^\n]*)?\n')
    date_re = re.compile(r'(\d{4}-\d{2}-\d{2}) 至 (\d{4}-\d{2}-\d{2})')

    try:
        log_message("开始爬取：中国美术馆")

        response = requests.get(NAMOC_URL, timeout=30)

        if response.status_code == 200:
            text = response.text

            # Heuristic 1: Markdown-heading exhibition titles.
            for title in title_re.findall(text):
                title = title.strip()
                # Must read like a real exhibition name, not a nav label.
                if ("展" in title and
                        5 < len(title) < 50 and
                        not any(skip in title for skip in ["展览预告", "正在展出", "时间", "地点", "展厅", "展讯", "参观", "导览", "更多", "近期"])):
                    exhibitions.append({
                        "title": title,
                        "venue": "中国美术馆",
                        "area": "东城区",
                        "start_date": "待查询",
                        "end_date": "待查询",
                        "price": "免费",
                        "type": "艺术展览",
                        "source": "中国美术馆",
                        "url": "http://www.namoc.org/",
                        "description": "中国美术馆展览"
                    })

            # Heuristic 2: titles with an exhibition period nearby.
            lines = text.split('\n')
            for i, line in enumerate(lines):
                # Fix: the original wrapped this plain boolean in a
                # pointless `any(... for _ in [0])` generator.
                if "展" in line and 10 < len(line) < 60:
                    # Look for a date range within +/- 3 lines of context.
                    context = '\n'.join(lines[max(0, i - 3):min(len(lines), i + 3)])
                    # Fix: search once and reuse (was searched twice).
                    date_match = date_re.search(context)
                    if date_match:
                        title = line.strip()
                        if (5 < len(title) < 50 and
                                not any(skip in title for skip in ["展览预告", "正在展出", "时间", "地点", "展厅", "展讯", "参观", "导览", "更多", "近期", "###", "####", "[", "]"])):
                            # Skip titles already collected by heuristic 1.
                            if not any(ex["title"] == title for ex in exhibitions):
                                start_date = date_match.group(1)
                                end_date = date_match.group(2)

                                exhibitions.append({
                                    "title": title,
                                    "venue": "中国美术馆",
                                    "area": "东城区",
                                    "start_date": start_date,
                                    "end_date": end_date,
                                    "price": "免费",
                                    "type": "艺术展览",
                                    "source": "中国美术馆",
                                    "url": "http://www.namoc.org/",
                                    "description": f"中国美术馆展览 ({start_date} 至 {end_date})"
                                })

            log_message(f"✓ 中国美术馆 爬取成功，获取 {len(exhibitions)} 条信息")
            record_crawl_history("中国美术馆", len(exhibitions), "success")
        else:
            log_message(f"✗ 中国美术馆 爬取失败：HTTP {response.status_code}", "ERROR")
            record_crawl_history("中国美术馆", 0, "error", f"HTTP {response.status_code}")

    except Exception as e:
        log_message(f"✗ 中国美术馆 爬取异常：{str(e)}", "ERROR")
        record_crawl_history("中国美术馆", 0, "error", str(e))

    return exhibitions


def record_crawl_history(source, items_count, status, error_message=""):
    """Insert one row into crawl_history.

    Failures are logged as warnings and swallowed on purpose: history
    bookkeeping must never abort a crawl.

    Args:
        source: Name of the data source.
        items_count: Number of items the crawl produced.
        status: 'success' or 'error'.
        error_message: Optional error detail for failed crawls.
    """
    try:
        conn = sqlite3.connect(DB_PATH)
        try:
            cursor = conn.cursor()
            cursor.execute("""
                INSERT INTO crawl_history (source, crawl_time, items_count, status, error_message)
                VALUES (?, ?, ?, ?, ?)
            """, (source, datetime.now().strftime("%Y-%m-%d %H:%M:%S"), items_count, status, error_message))
            conn.commit()
        finally:
            # Fix: the connection was leaked when the INSERT raised.
            conn.close()
    except Exception as e:
        log_message(f"记录爬取历史失败：{e}", "WARNING")


def save_exhibitions(exhibitions):
    """Insert exhibitions into the database, skipping duplicates.

    Duplicates are rejected by the UNIQUE(title, venue, source) constraint
    through INSERT OR IGNORE; only genuinely new rows are counted.

    Args:
        exhibitions: Iterable of exhibition dicts.

    Returns:
        int: number of rows actually inserted.
    """
    conn = sqlite3.connect(DB_PATH)
    saved_count = 0
    try:
        cursor = conn.cursor()
        for ex in exhibitions:
            try:
                cursor.execute("""
                    INSERT OR IGNORE INTO exhibitions 
                    (title, venue, area, start_date, end_date, price, type, source, url, description)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    ex.get("title", ""),
                    ex.get("venue", ""),
                    ex.get("area", ""),
                    ex.get("start_date", ""),
                    ex.get("end_date", ""),
                    ex.get("price", ""),
                    ex.get("type", ""),
                    ex.get("source", ""),
                    ex.get("url", ""),
                    ex.get("description", "")
                ))
                # rowcount is 0 when the row was IGNOREd as a duplicate.
                if cursor.rowcount > 0:
                    saved_count += 1
            except Exception as e:
                # Best-effort: one bad row must not abort the batch.
                log_message(f"保存展览失败：{e}", "WARNING")
        conn.commit()
    finally:
        # Fix: the connection was leaked if commit() raised.
        conn.close()

    return saved_count


def generate_report(exhibitions):
    """Write a Markdown summary of today's exhibitions and return stats.

    Args:
        exhibitions: List of exhibition dicts (first 10 are listed).

    Returns:
        dict: {"total": int, "official": int, "report_path": str}.
    """
    today = datetime.now().strftime("%Y-%m-%d")
    out_file = DATA_DIR / f"beijing_exhibitions_{today}.md"

    # Basic statistics for the overview section.
    total = len(exhibitions)
    official_sources = {"北京市文物局", "中国美术馆"}
    official_count = sum(1 for ex in exhibitions if ex.get("source") in official_sources)

    parts = [f"""# 北京展览推荐 - {today}

## 📅 今日概览
- **在展数量**：{total} 个
- **官方权威**：{official_count} 个
- **数据更新时间**：{today} {datetime.now().strftime("%H:%M")}

---

## 🌟 重点推荐展览

"""]

    # Top-10 highlight list.
    for rank, ex in enumerate(exhibitions[:10], 1):
        parts.append(f"{rank}. **{ex.get('title', '未知')}** - {ex.get('venue', '未知')}\n")

    parts.append(f"\n---\n\n*由 Travel Agent 自动生成 | 最后更新：{today} {datetime.now().strftime('%H:%M')}*\n")

    out_file.write_text("".join(parts), encoding="utf-8")

    log_message(f"生成报告：{out_file}")

    return {
        "total": total,
        "official": official_count,
        "report_path": str(out_file)
    }


def main():
    """Entry point: crawl all sources, dedupe, persist, report, validate.

    Returns:
        int: 0 when at least 5 unique exhibitions were found, else 1
        (used as the process exit status by the __main__ guard).
    """
    log_message("=" * 50)
    log_message("开始执行北京展览信息爬取任务")
    log_message("=" * 50)

    init_database()

    collected = []

    # Sources in priority order, with a polite random pause in between.
    collected.extend(crawl_beijing_museum_bureau())
    time.sleep(random.uniform(2, 4))
    collected.extend(crawl_douban())
    time.sleep(random.uniform(2, 4))
    collected.extend(crawl_namoc())

    # Deduplicate on (title, venue); dict insertion order keeps the
    # first occurrence, same as the original seen-set approach.
    by_key = {}
    for ex in collected:
        by_key.setdefault((ex.get("title", ""), ex.get("venue", "")), ex)
    unique_exhibitions = list(by_key.values())

    log_message(f"去重后展览数量：{len(unique_exhibitions)}")

    saved_count = save_exhibitions(unique_exhibitions)
    log_message(f"保存 {saved_count} 条展览信息到数据库")

    report = generate_report(unique_exhibitions)

    # Sanity check: fewer than 5 items usually means a broken source.
    healthy = len(unique_exhibitions) >= 5
    if not healthy:
        log_message(f"⚠️ 警告：爬取到的展览数量过少（{len(unique_exhibitions)} 条），可能数据源异常", "WARNING")
        log_message("建议检查数据源 URL 是否有效", "WARNING")

    # Machine-readable summary on stdout for the cron wrapper.
    result = {
        "status": "success" if healthy else "warning",
        "total_items": len(unique_exhibitions),
        "saved_items": saved_count,
        "report_path": report["report_path"],
        "official_count": report["official"]
    }
    print(json.dumps(result, ensure_ascii=False))

    log_message("=" * 50)
    log_message(f"任务执行完成！共处理 {len(unique_exhibitions)} 条展览信息")
    log_message("=" * 50)

    return 0 if healthy else 1


if __name__ == "__main__":
    # Fix: raise SystemExit instead of calling exit() — the exit() helper
    # is injected by the `site` module and is absent under `python -S`
    # or in frozen builds; SystemExit is always available.
    raise SystemExit(main())
