#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Beijing exhibition information crawler.

Intended to run automatically every day at 9:00; scrapes the major
exhibition information sources (official museums and ticketing platforms),
stores results in SQLite, and renders a Markdown recommendation report.
"""

import json
import time
import random
import sqlite3
from datetime import datetime, timedelta
from pathlib import Path
import requests
from bs4 import BeautifulSoup
import re

# Configuration: all paths are resolved relative to this file's directory.
BASE_DIR = Path(__file__).parent
DATA_DIR = BASE_DIR / "data"   # reports and the SQLite database
LOG_DIR = BASE_DIR / "logs"    # one log file per day
DB_PATH = DATA_DIR / "exhibitions.db"

# Create the directories at import time so later writes cannot fail
# on a missing directory.
DATA_DIR.mkdir(exist_ok=True)
LOG_DIR.mkdir(exist_ok=True)


def log_message(message, level="INFO"):
    """Print a timestamped log line and append it to today's log file.

    Args:
        message: text to log.
        level: severity tag embedded in the line (default "INFO").
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{stamp}] [{level}] {message}"
    print(entry)

    # Mirror the same line into a per-day file under LOG_DIR.
    day_file = LOG_DIR / f"exhibition_{datetime.now().strftime('%Y%m%d')}.log"
    with open(day_file, "a", encoding="utf-8") as handle:
        handle.write(entry + "\n")


# User-preference configuration; consumed by calculate_recommend_level()
# to score exhibitions. The keyword lists are matched as substrings
# against each exhibition's title and type fields.
USER_PREFERENCES = {
    "prefer_types": ["历史", "人文", "艺术", "博物馆", "古代文明", "考古", "书画", "陶瓷", "青铜器", "传统工艺", "世界遗产", "非遗"],
    "avoid_types": ["网红展", "沉浸式", "打卡展", "娱乐", "商业"],
    "max_price": 200,
    "prefer_areas": ["东城区", "西城区", "海淀区", "朝阳区"],
    "party_size": 3  # traveling party of 3-4 people
}

# Import the exhibition filter (applies user preferences automatically).
# The module is optional: if it is missing or fails to load, the crawler
# degrades gracefully and exhibition_filter stays None.
exhibition_filter = None
try:
    from exhibition_filter import ExhibitionFilter
    exhibition_filter = ExhibitionFilter()
    log_message("✓ 展览过滤器已加载，自动应用用户偏好")
except Exception as e:
    log_message(f"⚠ 过滤器加载失败：{e}", "WARNING")


def init_database():
    """Create the SQLite schema (exhibitions + crawl_history) if absent.

    Safe to call on every run: both statements use IF NOT EXISTS.
    """
    exhibitions_ddl = """
        CREATE TABLE IF NOT EXISTS exhibitions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT NOT NULL,
            venue TEXT,
            area TEXT,
            start_date TEXT,
            end_date TEXT,
            price TEXT,
            type TEXT,
            source TEXT,
            url TEXT,
            description TEXT,
            recommend_level INTEGER DEFAULT 0,
            is_new INTEGER DEFAULT 1,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP,
            updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(title, venue, source)
        )
    """
    history_ddl = """
        CREATE TABLE IF NOT EXISTS crawl_history (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source TEXT NOT NULL,
            crawl_time TEXT NOT NULL,
            items_count INTEGER DEFAULT 0,
            status TEXT DEFAULT 'success',
            error_message TEXT
        )
    """

    conn = sqlite3.connect(DB_PATH)
    for ddl in (exhibitions_ddl, history_ddl):
        conn.execute(ddl)
    conn.commit()
    conn.close()
    log_message("数据库初始化完成")


def calculate_recommend_level(exhibition):
    """Score an exhibition from 1 to 5 stars based on USER_PREFERENCES.

    Scoring starts at a baseline of 3 and adjusts:
      +1  title/type contains any preferred keyword (at most once)
      -2  title/type contains any avoided keyword (at most once)
      -1  ticket price exceeds USER_PREFERENCES["max_price"]
      +1  free exhibition (price 0, or no digits in the price string)
      +1  located in a preferred district

    Args:
        exhibition (dict): exhibition record; reads the optional keys
            "title", "type", "price" and "area".

    Returns:
        int: recommendation level clamped to the range [1, 5].
    """
    score = 3  # baseline

    title = exhibition.get("title", "")
    ex_type = exhibition.get("type", "")
    price_str = exhibition.get("price", "0")
    area = exhibition.get("area", "")

    # Bonus for matching a preferred keyword (equivalent to the old
    # loop-with-break: applied at most once).
    if any(kw in title or kw in ex_type for kw in USER_PREFERENCES["prefer_types"]):
        score += 1

    # Penalty for matching an avoided keyword (also at most once).
    if any(kw in title or kw in ex_type for kw in USER_PREFERENCES["avoid_types"]):
        score -= 2

    # Price adjustments. Fixes two issues in the original: the regex was
    # evaluated twice on the same string, and a bare `except:` silently
    # swallowed every error. Only TypeError can realistically occur here
    # (re.search on a non-string price such as None); in that case the
    # score is left unchanged, matching the old behavior.
    try:
        match = re.search(r"\d+", price_str)
        price = float(match.group()) if match else 0
        if price > USER_PREFERENCES["max_price"]:
            score -= 1
        if price == 0:  # free exhibitions get a bonus
            score += 1
    except TypeError:
        pass

    # District preference bonus.
    if area in USER_PREFERENCES["prefer_areas"]:
        score += 1

    # Clamp to the 1-5 star range.
    return max(1, min(5, score))


def crawl_official_museums():
    """Fetch exhibition listings from official Beijing museum websites.

    Returns:
        list[dict]: placeholder exhibition records (per-site HTML parsing
        is not implemented yet), one per museum that responded with 200.
    """
    results = []
    targets = [
        ("国家博物馆", "http://www.chnmuseum.cn/fwzy/zl/zbzl/", "东城区"),
        ("故宫博物院", "https://www.dpm.org.cn/Exhibition.html", "东城区"),
        ("首都博物馆", "http://www.capitalmuseum.org.cn/", "西城区"),
        ("中国美术馆", "http://www.namoc.org/", "东城区"),
    ]
    ua_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    for name, url, area in targets:
        try:
            log_message(f"开始爬取：{name}")
            # Throttle to 3-5 s between requests to stay polite.
            time.sleep(random.uniform(3, 5))

            resp = requests.get(url, headers=ua_headers, timeout=10)

            if resp.status_code != 200:
                log_message(f"✗ {name} 爬取失败：HTTP {resp.status_code}", "ERROR")
                continue

            # Placeholder record; real per-site parsing logic is still TODO.
            results.append({
                "title": f"{name} - 待详细解析",
                "venue": name,
                "area": area,
                "start_date": "待查询",
                "end_date": "待查询",
                "price": "免费/待查询",
                "type": "博物馆常设展",
                "source": name,
                "url": url,
                "description": f"{name}官方展览",
            })
            log_message(f"✓ {name} 爬取成功")

        except Exception as e:
            log_message(f"✗ {name} 爬取异常：{str(e)}", "ERROR")

    return results


def crawl_ticketing_platforms():
    """Fetch exhibition listings from ticketing platforms (Damai, Maoyan).

    Returns:
        list[dict]: placeholder exhibition records, one per platform that
        responded with 200 (detailed parsing not implemented yet).
    """
    collected = []

    platforms = [
        {"name": "大麦网 - 北京展览",
         "url": "https://www.damai.cn/beijing-exhibition",
         "area": "全市"},
        {"name": "猫眼 - 北京展览",
         "url": "https://www.maoyan.com/beijing/exhibition",
         "area": "全市"},
    ]

    for site in platforms:
        site_name = site["name"]
        try:
            log_message(f"开始爬取：{site_name}")
            time.sleep(random.uniform(3, 5))  # polite rate limiting

            resp = requests.get(
                site["url"],
                headers={
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                },
                timeout=10,
            )

            if resp.status_code == 200:
                # Placeholder record until per-platform parsers exist.
                collected.append({
                    "title": f"{site_name} - 待详细解析",
                    "venue": "待查询",
                    "area": site["area"],
                    "start_date": "待查询",
                    "end_date": "待查询",
                    "price": "待查询",
                    "type": "商业展览",
                    "source": site_name,
                    "url": site["url"],
                    "description": "票务平台展览信息",
                })
                log_message(f"✓ {site_name} 爬取成功")
            else:
                log_message(f"✗ {site_name} 爬取失败：HTTP {resp.status_code}", "ERROR")

        except Exception as exc:
            log_message(f"✗ {site_name} 爬取异常：{str(exc)}", "ERROR")

    return collected


def save_to_database(exhibitions):
    """Upsert exhibition records into the exhibitions table.

    Uses INSERT ... ON CONFLICT DO UPDATE (SQLite upsert, 3.24+) instead of
    the original INSERT OR REPLACE. REPLACE deletes the conflicting row
    before inserting, which reset `id`, `created_at`, and `is_new` on every
    re-crawl — defeating the new-item bookkeeping the report query relies
    on. The upsert preserves those columns and only refreshes the mutable
    fields. The conflict target matches the table's UNIQUE(title, venue,
    source) constraint.

    Args:
        exhibitions: list of dicts as produced by the crawl_* functions.

    Returns:
        int: number of records successfully written.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        saved_count = 0
        for ex in exhibitions:
            try:
                recommend_level = calculate_recommend_level(ex)

                cursor.execute("""
                    INSERT INTO exhibitions
                    (title, venue, area, start_date, end_date, price, type, source, url, description, recommend_level, updated_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
                    ON CONFLICT(title, venue, source) DO UPDATE SET
                        area = excluded.area,
                        start_date = excluded.start_date,
                        end_date = excluded.end_date,
                        price = excluded.price,
                        type = excluded.type,
                        url = excluded.url,
                        description = excluded.description,
                        recommend_level = excluded.recommend_level,
                        updated_at = CURRENT_TIMESTAMP
                """, (
                    ex["title"], ex["venue"], ex["area"],
                    ex["start_date"], ex["end_date"], ex["price"],
                    ex["type"], ex["source"], ex["url"],
                    ex["description"], recommend_level
                ))

                saved_count += 1

            except Exception as e:
                # Best-effort: log the failed record and keep going.
                log_message(f"保存展览失败 {ex.get('title', 'Unknown')}: {str(e)}", "ERROR")

        conn.commit()
    finally:
        # Ensure the connection is released even if commit fails.
        conn.close()

    log_message(f"保存 {saved_count} 条展览信息到数据库")
    return saved_count


def record_crawl_history(source, count, status="success", error=""):
    """Append one row to crawl_history describing a crawl run.

    Args:
        source: identifier of the crawl source.
        count: number of items fetched.
        status: outcome label, default "success".
        error: optional error message text.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    db = sqlite3.connect(DB_PATH)
    db.execute("""
        INSERT INTO crawl_history (source, crawl_time, items_count, status, error_message)
        VALUES (?, ?, ?, ?, ?)
    """, (source, timestamp, count, status, error))
    db.commit()
    db.close()


def generate_markdown_report():
    """Render today's exhibition recommendations as a Markdown report.

    Pulls up to 20 new/updated rows from the database, optionally narrows
    them through the loaded exhibition_filter, then writes a report with a
    TOP-5 section, a full table, and visiting tips to
    DATA_DIR/beijing_exhibitions_<date>.md.

    Returns:
        tuple: (report_text, report_path)
    """
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    today = datetime.now().strftime("%Y-%m-%d")

    # Today's candidates, best recommendation first.
    cursor.execute("""
        SELECT title, venue, area, start_date, end_date, price, type, recommend_level, url
        FROM exhibitions
        WHERE is_new = 1 OR date(updated_at) = date('now')
        ORDER BY recommend_level DESC, created_at DESC
        LIMIT 20
    """)

    exhibitions = cursor.fetchall()
    # BUG FIX: the cursor is exhausted after fetchall(), so the original
    # code's second cursor.fetchall() in the log line below always returned
    # [] and logged "n/0". Capture the pre-filter count here instead.
    total_candidates = len(exhibitions)

    # Apply the exhibition filter when the optional module loaded.
    if exhibition_filter:
        ex_list = [
            {
                "title": ex[0], "venue": ex[1], "area": ex[2],
                "start_date": ex[3], "end_date": ex[4],
                "price": ex[5], "type": ex[6],
                "recommend_level": ex[7], "url": ex[8]
            }
            for ex in exhibitions
        ]
        filtered_list = exhibition_filter.filter_exhibitions(ex_list)
        exhibitions = [
            (
                ex["title"], ex["venue"], ex["area"],
                ex["start_date"], ex["end_date"], ex["price"],
                ex["type"], ex["recommend_level"], ex["url"]
            )
            for ex in filtered_list
        ]
        log_message(f"✓ 应用过滤器：{len(exhibitions)}/{total_candidates} 个展览符合兴趣")

    # Report header and overview.
    report = f"""# 北京展览推荐 - {today}

## 📅 今日概览
- 新增展览：{len(exhibitions)} 个（已自动过滤不符合兴趣的展览）
- 数据更新时间：{datetime.now().strftime("%Y-%m-%d %H:%M")}

## 🌟 重点推荐（TOP 5）

"""

    # Detailed cards for the top five entries.
    for i, ex in enumerate(exhibitions[:5], 1):
        stars = "⭐" * ex[7]
        report += f"""{i}. **{ex[0]}**
   - 📍 地点：{ex[1]} ({ex[2]})
   - 📅 展期：{ex[3]} 至 {ex[4]}
   - 🎫 票价：{ex[5]}
   - 🏷️ 类型：{ex[6]}
   - 推荐度：{stars}
   - 🔗 [详情]({ex[8]})

"""

    # Full table of every surviving exhibition.
    report += """## 📋 完整清单

| 展览名称 | 地点 | 区域 | 展期 | 票价 | 类型 | 推荐度 |
|---------|------|------|------|------|------|--------|
"""

    for ex in exhibitions:
        stars = "⭐" * ex[7]
        report += f"| {ex[0]} | {ex[1]} | {ex[2]} | {ex[3]}~{ex[4]} | {ex[5]} | {ex[6]} | {stars} |\n"

    # Static visiting tips and footer.
    report += f"""
## ℹ️ 观展提示

- **预约方式**：大部分博物馆需提前在官网/公众号预约
- **开放时间**：通常 9:00-17:00（周一闭馆，节假日除外）
- **交通建议**：优先选择地铁出行，避免停车困难
- **携带证件**：部分场馆需身份证入馆

---

*由 Travel Agent 自动生成 | 最后更新：{datetime.now().strftime("%Y-%m-%d %H:%M")}*
"""

    conn.close()

    # Persist the report next to the database.
    report_path = DATA_DIR / f"beijing_exhibitions_{today}.md"
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)

    log_message(f"生成报告：{report_path}")
    return report, report_path


def main():
    """Run the full pipeline: init DB, crawl all sources, persist, report.

    Returns:
        dict: {"status": "success", ...counts and report path} on success,
        or {"status": "error", "error": message} on failure.
    """
    log_message("=" * 50)
    log_message("开始执行北京展览信息爬取任务")

    try:
        init_database()

        # Crawl each source group and record its run in the history table.
        museum_items = crawl_official_museums()
        record_crawl_history("official_museums", len(museum_items))

        platform_items = crawl_ticketing_platforms()
        record_crawl_history("ticketing_platforms", len(platform_items))

        # Persist the combined results, then render the daily report.
        combined = museum_items + platform_items
        stored = save_to_database(combined)
        report, report_path = generate_markdown_report()

        log_message("=" * 50)
        log_message(f"任务执行完成！共处理 {len(combined)} 条展览信息，保存 {stored} 条")
        log_message(f"报告路径：{report_path}")

        return {
            "status": "success",
            "total_items": len(combined),
            "saved_items": stored,
            "report_path": str(report_path),
        }

    except Exception as exc:
        log_message(f"任务执行失败：{str(exc)}", "ERROR")
        return {
            "status": "error",
            "error": str(exc),
        }


# Script entry point: run the pipeline and print the result summary as JSON.
if __name__ == "__main__":
    result = main()
    print(json.dumps(result, ensure_ascii=False, indent=2))
