#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
P2-7 任务：使用小红书核实陕西国保单位开放情况（Playwright + Cookies）

基于山东成功经验复用，真实爬取小红书数据

判断标准：
- 笔记数 ≥ 5 篇 → 高置信度开放
- 笔记数 1-4 篇 → 中置信度（需交叉验证）
- 笔记数 = 0 → 低置信度（可能关闭/遗址）
"""

import json
import time
from pathlib import Path
from playwright.sync_api import sync_playwright
import sys

# Load Xiaohongshu cookies from the shared beijing-exhibitions config file.
CONFIG_FILE = Path(__file__).parent.parent / "beijing-exhibitions" / "config" / "xiaohongshu_cookies.json"
XHS_COOKIES_DICT = {}
try:
    with open(CONFIG_FILE, "r", encoding="utf-8") as f:
        XHS_COOKIES_DICT = json.load(f)
    print(f"✓ 已加载小红书 cookie 配置：{len(XHS_COOKIES_DICT)} 个", flush=True)
except Exception as e:
    # Best-effort: a missing/corrupt cookie file must not abort the run;
    # scraping proceeds unauthenticated (searches may redirect to login).
    print(f"⚠ 加载 cookie 配置失败：{e}", flush=True)

# Convert the flat {name: value} dict into Playwright cookie records.
XHS_COOKIES = [
    {"name": name, "value": value, "domain": ".xiaohongshu.com", "path": "/"}
    for name, value in XHS_COOKIES_DICT.items()
]

# Xi'an national-level protected sites awaiting verification (53 entries).
# Compiled from wiki/陕西/国保单位名录.md
XI_AN_GUOBAO = [
    # Ancient architecture (well-known)
    "西安城墙", "大雁塔", "小雁塔", "钟楼", "鼓楼", "西安碑林",
    "西安清真寺", "化觉巷清真大寺", "大兴善寺", "青龙寺", "八仙宫", "水陆庵",
    "兴教寺塔", "香积寺塔", "华严寺塔", "牛头寺", "卧龙寺", "广仁寺",
    "宝庆寺塔", "荐福寺", "慈恩寺", "都城隍庙", "高家大院",
    "化觉巷清真寺", "大学习巷清真寺", "小皮院清真寺", "大皮院清真寺",
    "营里寺", "五星街教堂",
    # Ancient ruins (well-known)
    "秦始皇陵", "兵马俑", "大明宫遗址", "半坡遗址",
    "汉长安城遗址", "唐长安城遗址", "阿房宫遗址", "丰镐遗址",
    "姜寨遗址", "老牛坡遗址", "杨官寨遗址",
    # Ancient tombs (imperial mausoleums)
    "汉武帝茂陵", "唐太宗昭陵", "唐高宗乾陵",
    "汉昭帝平陵", "杜陵", "霸陵", "阳陵",
    # Modern-era sites
    "西安事变旧址", "八路军西安办事处旧址",
    "西安交通大学早期建筑", "西北大学早期建筑",
    "易俗社剧场", "人民剧院",
]


def search_xiaohongshu(keyword, page):
    """Search Xiaohongshu (RED) notes for *keyword*.

    Reuses the caller's Playwright ``page`` so the browser is launched
    only once for the whole batch.

    Args:
        keyword: Search phrase; URL-encoded before use.
        page: A Playwright page object (ideally with login cookies set).

    Returns:
        tuple: ``(results, note_count)`` where ``results`` is a list of up
        to 5 dicts with ``title``/``user`` keys and ``note_count`` is the
        number of note cards found on the result page (0 on any failure).
    """
    results = []
    note_count = 0

    try:
        # URL-encode the (typically CJK) keyword.
        from urllib.parse import quote
        encoded_keyword = quote(keyword)
        url = f"https://www.xiaohongshu.com/search_result?keyword={encoded_keyword}&source=web_search_result_notes"

        print(f"  搜索 URL: {url[:80]}...", flush=True)

        page.goto(url, wait_until="networkidle", timeout=60000)
        page.wait_for_timeout(8000)  # allow client-side rendering to finish

        # The note-card markup changes between site versions; probe several
        # known selectors and keep the first that matches anything.
        selectors = [
            'section.note-item',
            'div.note-card',
            'div.search-result-item',
            'article.note',
            '[data-type="note"]',
        ]

        notes = []
        for sel in selectors:
            notes = page.query_selector_all(sel)
            if notes:
                print(f"  使用选择器：{sel}, 找到 {len(notes)} 个", flush=True)
                break

        note_count = len(notes)

        # No cards found: diagnose whether we were bounced to the login
        # page, otherwise dump any visible result-count elements for debugging.
        if note_count == 0:
            current_url = page.url
            if 'login' in current_url.lower():
                print(f"  ⚠ 需要登录", flush=True)
            else:
                count_patterns = page.query_selector_all('.result-count, .search-count, span.count')
                for p in count_patterns[:3]:
                    text = p.inner_text()
                    print(f"  计数元素：{text}", flush=True)

        # Extract title/author from the first 5 note cards.
        for note in notes[:5]:
            try:
                # Title markup: <a class="title"><span>title text</span></a>
                title_el = note.query_selector('a.title span')
                title = title_el.inner_text().strip() if title_el else ""

                # Author markup: <div class="name">username</div>
                user_el = note.query_selector('div.name')
                user = user_el.inner_text().strip() if user_el else ""

                # Skip empty or very short titles (likely decoration nodes).
                if title and len(title) > 2:
                    results.append({
                        "title": title,
                        "user": user or "未知",
                    })
            except Exception as e:
                # One malformed card must not abort the remaining extractions.
                print(f"  提取笔记失败：{e}", flush=True)

    except Exception as e:
        # Navigation/timeout failures degrade to "0 notes found".
        print(f"  爬取失败：{e}", flush=True)

    return results, note_count


def verify_site(site_name, page):
    """Verify one site's open status via a live Xiaohongshu search.

    Confidence mapping: >= 5 notes -> high, 1-4 -> medium, 0 -> low.

    Args:
        site_name: Site name to search for.
        page: Playwright page object passed through to the search helper.

    Returns:
        dict: Verification record with status, confidence, note count and
        up to 3 sample note titles.
    """
    query = f"西安 {site_name} 开放"
    sample_notes, hit_count = search_xiaohongshu(query, page)

    # Map note volume to (status, confidence), lowest tier first.
    if hit_count == 0:
        status, confidence = "⏸️ 待核实", "低"
    elif hit_count < 5:
        status, confidence = "✅ 开放", "中"
    else:
        status, confidence = "✅ 开放", "高"

    return {
        "site": site_name,
        "status": status,
        "confidence": confidence,
        "notes_count": hit_count,
        "notes_sample": sample_notes[:3],  # keep only the first 3 note titles
        "source": "小红书实时搜索",
    }


def main():
    """Crawl Xiaohongshu for every Xi'an site and write JSON results.

    Launches a single Chromium instance, verifies each site in sequence,
    checkpoints progress every 20 sites, then writes the final JSON file
    and prints summary statistics.

    Returns:
        int: 0 on completion (used as the process exit code).
    """
    print("=" * 70, flush=True)
    print("P2-7-1：陕西国保核实 - 西安市（53 处）", flush=True)
    print("方法：Playwright + 小红书 Cookies（真实爬取）", flush=True)
    print("=" * 70, flush=True)

    # Create the output directory up front: progress/temp checkpoints are
    # written inside the loop. (Previously mkdir only ran after the loop,
    # so the first checkpoint crashed on a fresh checkout.)
    data_dir = Path(__file__).parent / "data"
    data_dir.mkdir(exist_ok=True)

    all_results = []

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            executable_path='/usr/bin/google-chrome',
            args=[
                # Hide the automation fingerprint and present a desktop UA
                # to reduce the chance of anti-bot interception.
                '--disable-blink-features=AutomationControlled',
                '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            ]
        )

        context = browser.new_context()
        context.add_cookies(XHS_COOKIES)
        page = context.new_page()

        for i, site in enumerate(XI_AN_GUOBAO, 1):
            print(f"\n[{i}/{len(XI_AN_GUOBAO)}] {site}", flush=True)

            result = verify_site(site, page)
            all_results.append(result)

            print(f"  状态：{result['status']} (置信度：{result['confidence']}, 笔记数：{result['notes_count']})", flush=True)

            # Checkpoint every 20 sites so a crash loses little work.
            if i % 20 == 0:
                progress_file = data_dir / "shaanxi_guobao_p2-7-1_progress.json"
                with open(progress_file, "w", encoding="utf-8") as f:
                    json.dump({"completed": i, "total": len(XI_AN_GUOBAO)}, f, ensure_ascii=False, indent=2)
                temp_output = data_dir / "shaanxi_xi_an_guobao_temp.json"
                with open(temp_output, "w", encoding="utf-8") as f:
                    json.dump(all_results, f, ensure_ascii=False, indent=2)
                print(f"  [进度] 已保存 {i} 处结果", flush=True)

            # Throttle requests to avoid rate limiting. Note: this is a
            # deterministic 3/4/5-second cycle, not actually random.
            sleep_time = 3 + (i % 3)
            time.sleep(sleep_time)

        browser.close()

    # Final save.
    output_file = data_dir / "shaanxi_xi_an_guobao.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)

    print(f"\n✓ 结果已保存到：{output_file}", flush=True)

    # Summary statistics (verified_count is always 53, so division is safe).
    print("\n" + "=" * 70, flush=True)
    print("📊 核实统计", flush=True)
    print("=" * 70, flush=True)

    verified_count = len(all_results)
    open_count = sum(1 for r in all_results if "✅" in r['status'])
    unknown_count = sum(1 for r in all_results if "⏸️" in r['status'])

    high_conf = sum(1 for r in all_results if r['confidence'] == "高")
    mid_conf = sum(1 for r in all_results if r['confidence'] == "中")
    low_conf = sum(1 for r in all_results if r['confidence'] == "低")

    print(f"总计核实：{verified_count} 处", flush=True)
    print(f"✅ 开放：{open_count} 处 ({open_count/verified_count*100:.1f}%)", flush=True)
    print(f"⏸️ 待核实：{unknown_count} 处 ({unknown_count/verified_count*100:.1f}%)", flush=True)
    print(f"\n🔍 置信度分布：", flush=True)
    print(f"   高置信度：{high_conf} 处 ({high_conf/verified_count*100:.1f}%)", flush=True)
    print(f"   中置信度：{mid_conf} 处 ({mid_conf/verified_count*100:.1f}%)", flush=True)
    print(f"   低置信度：{low_conf} 处 ({low_conf/verified_count*100:.1f}%)", flush=True)
    print(f"\n✓ 注：结果基于小红书实时搜索（Playwright + Cookies）", flush=True)

    return 0


if __name__ == "__main__":
    exit(main())
