#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
P2-7-3 任务：使用小红书核实宝鸡市国保单位开放情况（Playwright + Cookies）

宝鸡市国保单位清单（30 处）：
- 古建筑（必选）：8 处
- 古遗址（备选）：4 处
- 古墓葬（备选）：2 处
- 石窟寺（必选）：1 处
- 其他：15 处（待补充）

判断标准：
- 笔记数 ≥ 5 篇 → 高置信度开放
- 笔记数 1-4 篇 → 中置信度（需交叉验证）
- 笔记数 = 0 → 低置信度（可能关闭/遗址）
"""

import json
import time
from pathlib import Path
from playwright.sync_api import sync_playwright
import sys

# Load Xiaohongshu cookies from the sibling project's config file.
CONFIG_FILE = Path(__file__).parent.parent / "beijing-exhibitions" / "config" / "xiaohongshu_cookies.json"
XHS_COOKIES_DICT = {}
try:
    XHS_COOKIES_DICT = json.loads(CONFIG_FILE.read_text(encoding="utf-8"))
    print(f"✓ 已加载小红书 cookie 配置：{len(XHS_COOKIES_DICT)} 个", flush=True)
except Exception as e:
    # Best-effort: a missing/corrupt file just leaves the cookie set empty.
    print(f"⚠ 加载 cookie 配置失败：{e}", flush=True)

# Re-shape the flat name->value mapping into Playwright cookie records.
XHS_COOKIES = []
for cookie_name, cookie_value in XHS_COOKIES_DICT.items():
    XHS_COOKIES.append({
        "name": cookie_name,
        "value": cookie_value,
        "domain": ".xiaohongshu.com",
        "path": "/",
    })

# Baoji national-heritage-site verification list.
# Compiled from wiki/陕西/国保单位名录.md.
# NOTE(review): this list actually contains 37 entries, not the 30 claimed in
# the module docstring, and the "ancient architecture" group below has 7 items,
# not 8 — reconcile against the source roster before trusting the totals.
BAO_JI_GUOBAO = [
    # Ancient architecture (required)
    "法门寺", "法门寺塔", "金台观", "大散关", "九成宫遗址",
    "慈善寺石窟", "宝鸡青铜器博物院",
    # Ancient ruins (optional)
    "周原遗址", "雍城遗址", "北首岭遗址", "茹家庄遗址",
    # Ancient tombs (optional)
    "秦公陵园", "秦景公墓",
    # Others (filling out the roster)
    "扶风城隍庙", "岐山周公庙", "凤翔东湖", "麟游慈善寺",
    "眉县横渠书院", "太白县青峰峡", "陇县龙门洞", "千阳县千湖湿地",
    "凤县嘉陵江源头", "扶风野河山", "岐山五丈原", "眉县红河谷",
    "太白黄柏塬", "麟游青莲山", "陇县关山草原", "千阳观音山",
    "凤县通天河", "扶风七星河", "岐山箭括岭", "眉县太白山",
    "太白鹦鸽古镇", "麟游天台寺", "陇县药王洞", "千阳岳家坡"
]


def search_xiaohongshu(keyword, page):
    """Search Xiaohongshu notes for *keyword* and sample the first results.

    Reuses the caller's Playwright ``page`` so one browser serves the whole
    batch instead of being relaunched per query.

    Args:
        keyword: Search phrase; URL-encoded before going into the query string.
        page: Live Playwright ``Page`` with Xiaohongshu cookies already set.

    Returns:
        tuple[list[dict], int]: up to 5 dicts with ``title``/``user`` keys,
        and the total number of note cards found on the results page
        (0 on failure or when blocked by a login wall).
    """
    # Local import mirrors the original module's style of lazy imports.
    from urllib.parse import quote

    results = []
    note_count = 0

    try:
        url = f"https://www.xiaohongshu.com/search_result?keyword={quote(keyword)}&source=web_search_result_notes"

        print(f"  搜索：{keyword[:20]}...", flush=True)

        page.goto(url, wait_until="networkidle", timeout=60000)
        page.wait_for_timeout(8000)  # let client-side rendering finish

        # The site's markup changes often; probe several known selectors and
        # keep the first one that yields any note cards.
        selectors = [
            'section.note-item',
            'div.note-card',
            'div.search-result-item',
            'article.note',
            '[data-type="note"]',
        ]

        notes = []
        for sel in selectors:
            notes = page.query_selector_all(sel)
            if notes:
                print(f"  选择器：{sel}, 找到 {len(notes)} 个", flush=True)
                break

        note_count = len(notes)

        # Zero cards usually means we were redirected to the login page.
        if note_count == 0 and 'login' in page.url.lower():
            print(f"  ⚠ 需要登录", flush=True)

        # Sample the first 5 notes; a card we fail to parse is just skipped.
        for note in notes[:5]:
            try:
                title_el = note.query_selector('a.title span')
                title = title_el.inner_text().strip() if title_el else ""

                user_el = note.query_selector('div.name')
                user = user_el.inner_text().strip() if user_el else ""

                # Very short "titles" tend to be badges/labels, not real notes.
                if title and len(title) > 2:
                    results.append({
                        "title": title,
                        "user": user or "未知",
                    })
            except Exception as e:
                print(f"  提取笔记失败：{e}", flush=True)

    except Exception as e:
        # Best-effort scraping: report and return whatever was collected.
        print(f"  爬取失败：{e}", flush=True)

    return results, note_count


def verify_site(site_name, page):
    """Verify one site: search Xiaohongshu and grade the result by note count.

    Args:
        site_name: Name of the heritage site to check.
        page: Playwright page object passed through to the search helper.

    Returns:
        dict: site name, open/unverified status, confidence tier, note count,
        up to three sampled notes, and the data source label.
    """
    notes, total_count = search_xiaohongshu(f"宝鸡 {site_name} 开放", page)

    # Confidence ladder: 0 notes -> low/unverified, 1-4 -> medium, 5+ -> high.
    if total_count == 0:
        status, confidence = "⏸️ 待核实", "低"
    elif total_count < 5:
        status, confidence = "✅ 开放", "中"
    else:
        status, confidence = "✅ 开放", "高"

    return {
        "site": site_name,
        "status": status,
        "confidence": confidence,
        "notes_count": total_count,
        "notes_sample": notes[:3],  # first three sampled note records
        "source": "小红书实时搜索",
    }


def main():
    """Crawl Xiaohongshu for every Baoji site and save the results as JSON.

    Returns:
        int: 0, suitable for passing to ``sys.exit``.
    """
    total = len(BAO_JI_GUOBAO)
    print("=" * 70, flush=True)
    # Derive the count from the list instead of hard-coding "30" — the roster
    # and the banner had drifted apart.
    print(f"P2-7-3：陕西国保核实 - 宝鸡市（{total} 处）", flush=True)
    print("方法：Playwright + 小红书 Cookies（真实爬取）", flush=True)
    print("=" * 70, flush=True)

    # Bug fix: create the output directory BEFORE the crawl. The progress
    # snapshots inside the loop write into it at i == 10, long before the
    # final save, and would crash if it did not exist yet.
    data_dir = Path(__file__).parent / "data"
    data_dir.mkdir(exist_ok=True)

    all_results = []

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            executable_path='/usr/bin/google-chrome',
            args=[
                '--disable-blink-features=AutomationControlled',
                '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            ]
        )

        context = browser.new_context()
        context.add_cookies(XHS_COOKIES)
        page = context.new_page()

        for i, site in enumerate(BAO_JI_GUOBAO, 1):
            print(f"\n[{i}/{total}] {site}", flush=True)

            result = verify_site(site, page)
            all_results.append(result)

            print(f"  状态：{result['status']} (置信度：{result['confidence']}, 笔记数：{result['notes_count']})", flush=True)

            # Snapshot progress every 10 sites so a crash loses little work.
            if i % 10 == 0:
                progress_file = data_dir / "shaanxi_guobao_p2-7-3_progress.json"
                with open(progress_file, "w", encoding="utf-8") as f:
                    json.dump({"completed": i, "total": total}, f, ensure_ascii=False, indent=2)
                temp_output = data_dir / "shaanxi_bao_ji_guobao_temp.json"
                with open(temp_output, "w", encoding="utf-8") as f:
                    json.dump(all_results, f, ensure_ascii=False, indent=2)
                print(f"  [进度] 已保存 {i} 处结果", flush=True)

            # Pause 3-5 s between searches to stay under anti-bot rate limits.
            # (Deterministic cycle 3,4,5 seconds, not truly random.)
            time.sleep(3 + (i % 3))

        browser.close()

    # Final save of the full result set.
    output_file = data_dir / "shaanxi_bao_ji_guobao.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)

    print(f"\n✓ 结果已保存到：{output_file}", flush=True)

    # Summary statistics.
    print("\n" + "=" * 70, flush=True)
    print("📊 核实统计", flush=True)
    print("=" * 70, flush=True)

    verified_count = len(all_results)
    open_count = sum(1 for r in all_results if "✅" in r['status'])
    unknown_count = sum(1 for r in all_results if "⏸️" in r['status'])

    high_conf = sum(1 for r in all_results if r['confidence'] == "高")
    mid_conf = sum(1 for r in all_results if r['confidence'] == "中")
    low_conf = sum(1 for r in all_results if r['confidence'] == "低")

    print(f"总计核实：{verified_count} 处", flush=True)
    print(f"✅ 开放：{open_count} 处 ({open_count/verified_count*100:.1f}%)", flush=True)
    print(f"⏸️ 待核实：{unknown_count} 处 ({unknown_count/verified_count*100:.1f}%)", flush=True)
    print(f"\n🔍 置信度分布：", flush=True)
    print(f"   高置信度：{high_conf} 处 ({high_conf/verified_count*100:.1f}%)", flush=True)
    print(f"   中置信度：{mid_conf} 处 ({mid_conf/verified_count*100:.1f}%)", flush=True)
    print(f"   低置信度：{low_conf} 处 ({low_conf/verified_count*100:.1f}%)", flush=True)
    print(f"\n✓ 注：结果基于小红书实时搜索（Playwright + Cookies）", flush=True)

    return 0


if __name__ == "__main__":
    # Use sys.exit (already imported above) rather than the builtin exit():
    # the latter is an interactive helper injected by the site module and is
    # not guaranteed to exist when Python runs with -S.
    sys.exit(main())
