#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
使用 Playwright + OCR 核实山东国保单位开放情况
流程：搜索小红书 → 下载图片 → OCR 识别 → 判断开放状态
"""

import json
import time
import os
from pathlib import Path
from playwright.sync_api import sync_playwright

# Try to import the optional OCR dependencies (pytesseract + Pillow).
# HAS_TESSERACT records whether both imports succeeded; ocr_image(),
# search_and_verify() and main() read it to decide whether OCR is attempted.
# NOTE(review): a successful import presumably only proves the Python
# packages are present, not that the tesseract binary itself is installed --
# confirm against the deployment environment.
try:
    import pytesseract
    from PIL import Image
    HAS_TESSERACT = True
    print("✓ 已加载 Tesseract OCR")
except ImportError:
    # Either package missing: OCR is disabled rather than aborting the script.
    HAS_TESSERACT = False
    print("⚠ Tesseract 未安装，将跳过 OCR 识别")

# Load the Xiaohongshu cookies from the JSON config of the sibling
# "beijing-exhibitions" project. The file is expected to hold a single JSON
# object mapping cookie name -> cookie value.
CONFIG_FILE = Path(__file__).parent.parent / "beijing-exhibitions" / "config" / "xiaohongshu_cookies.json"
try:
    with open(CONFIG_FILE, "r", encoding="utf-8") as f:
        _loaded_cookies = json.load(f)
    # A list/str/number payload would otherwise crash on `.items()` below,
    # outside of any error handling, and abort the import of this module.
    if not isinstance(_loaded_cookies, dict):
        raise TypeError(
            f"cookie 配置应为 JSON 对象，实际为 {type(_loaded_cookies).__name__}"
        )
    XHS_COOKIES_DICT = _loaded_cookies
    print(f"✓ 已加载小红书 cookie 配置：{len(XHS_COOKIES_DICT)} 个")
except (OSError, ValueError, TypeError) as e:
    # OSError: file missing/unreadable; ValueError: invalid JSON or encoding;
    # TypeError: valid JSON of the wrong shape. Fall back to "no cookies" so
    # the script can still run (main() warns about the empty configuration).
    print(f"⚠ 加载 cookie 配置失败：{e}")
    XHS_COOKIES_DICT = {}

# Cookie records in the list-of-dicts form handed to
# `context.add_cookies(...)`. Values are coerced to str because JSON may
# carry numbers/booleans while cookie values are expected to be strings.
XHS_COOKIES = [
    {"name": name, "value": str(value), "domain": ".xiaohongshu.com", "path": "/"}
    for name, value in XHS_COOKIES_DICT.items()
]

# Shandong national-level protected heritage sites ("国保单位") still to be
# verified, grouped by city: city name -> list of site names.
# main() iterates this mapping and verifies every site in order.
SHANDONG_GUOBAO = {
    "济南": ["洪家楼天主教堂", "千佛山", "四门塔", "灵岩寺", "府学文庙", "万竹园", "城子崖遗址"],
    "青岛": ["栈桥", "天后宫", "青岛天主教堂", "琅琊台", "康有为故居", "老舍故居"],
    "烟台": ["蓬莱水城", "烟台山近代建筑", "牟氏庄园", "长岛庙岛", "莱州云峰山刻石"],
    "威海": ["刘公岛", "成山头", "圣经山摩崖", "威海英式建筑"],
    "潍坊": ["十笏园", "沂山", "云门山", "青州古城", "诸城恐龙化石"],
    "淄博": ["齐国故城", "临淄墓群", "周村古商城", "蒲松龄故居", "博山古窑址"],
    "泰安": ["岱庙", "经石峪", "泰山石刻", "蒿里山遗址"],
    "济宁": ["孔庙", "孔府", "孔林", "颜庙", "周公庙", "孟庙", "孟府", "铁山摩崖", "武氏墓群石刻"]
}


def ocr_image(image_path):
    """Extract text from an image file with Tesseract OCR.

    Args:
        image_path: Path of the image to read; passed to ``Image.open``.

    Returns:
        The recognised text (Simplified Chinese + English models) with
        surrounding whitespace stripped. Returns ``""`` when OCR support is
        unavailable (``HAS_TESSERACT`` is false) or when opening/recognising
        the image fails; failures are reported on stdout, never raised.
    """
    if not HAS_TESSERACT:
        return ""

    try:
        # The context manager releases the image's file handle as soon as
        # OCR is done instead of leaving it to the garbage collector.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, lang='chi_sim+eng')
        return text.strip()
    except Exception as e:
        # Deliberately broad: OCR is best-effort and a bad image or a broken
        # Tesseract setup must not abort the surrounding scrape.
        print(f"    OCR 失败：{e}")
        return ""


# Keyword heuristics used to classify a note as hinting "open" or "closed".
_OPEN_KEYWORDS = ("开放", "开门", "营业", "游玩", "打卡", "参观", "值得", "推荐")
_CLOSE_KEYWORDS = ("关闭", "没开", "维修", "改造", "不开放", "别去", "踩雷")
# Keywords looked for in the OCR text of a note's first image.
_OCR_OPEN_KEYWORDS = ("开放", "营业", "门票", "时间")


def _strip_close_phrases(text):
    """Return *text* with every close keyword blanked out.

    Several close phrases contain an open keyword ("不开放" contains "开放",
    "没开门" contains "开门"). Blanking them first keeps a purely negative
    statement from also being counted as an "open" mention. A space is used
    as the replacement so the neighbouring characters cannot fuse into a new
    keyword.
    """
    for phrase in _CLOSE_KEYWORDS:
        text = text.replace(phrase, " ")
    return text


def _process_note(page, note, index, keyword, results):
    """Update *results* in place from one search-result note element.

    Counts open/close mentions in the note's visible text, then (when OCR is
    available) downloads the note's first image, runs OCR on it and counts
    open mentions in the recognised text.
    """
    # Only the first 200 characters of the flattened card text are inspected.
    text = note.inner_text().strip().replace('\n', ' ')[:200]

    if any(kw in _strip_close_phrases(text) for kw in _OPEN_KEYWORDS):
        results["open_mentions"] += 1
    if any(kw in text for kw in _CLOSE_KEYWORDS):
        results["close_mentions"] += 1

    if not HAS_TESSERACT:
        return
    img_el = note.query_selector("img")
    if not img_el:
        return
    img_src = img_el.get_attribute("src")
    if not img_src or not img_src.startswith("http"):
        return

    img_path = Path(__file__).parent / "data" / "ocr_images" / f"{keyword.replace(' ', '_')}_{index}.jpg"
    # parents=True: "data/" may not exist yet on a fresh checkout (main()
    # only creates it when saving the final report).
    img_path.parent.mkdir(parents=True, exist_ok=True)

    response = page.request.get(img_src)
    if not response.ok:
        return
    with open(img_path, "wb") as f:
        f.write(response.body())

    ocr_text = ocr_image(img_path)
    if not ocr_text:
        return
    results["images_ocrd"] += 1
    results["ocr_texts"].append(ocr_text[:100])

    if any(kw in _strip_close_phrases(ocr_text) for kw in _OCR_OPEN_KEYWORDS):
        results["open_mentions"] += 1


def search_and_verify(keyword):
    """Search Xiaohongshu for *keyword* and estimate whether the site is open.

    Opens the search-result page in headless Chromium (with the configured
    cookies), inspects the first three notes and derives a verdict from
    keyword counts in the note text and in OCR text of each note's first
    image.

    Args:
        keyword: Free-text search query; it is URL-encoded before use.

    Returns:
        A dict with the keys ``keyword``, ``notes_found``, ``images_ocrd``,
        ``open_mentions``, ``close_mentions``, ``status``, ``confidence`` and
        ``ocr_texts``. When the page cannot be scraped, or no keyword
        matches, ``status`` stays "⏸️ 待核实" and ``confidence`` stays "-".

    Raises:
        Errors from launching the browser or creating the context/page
        propagate to the caller; errors while loading or parsing the page
        are printed and swallowed.
    """
    from urllib.parse import quote

    results = {
        "keyword": keyword,
        "notes_found": 0,
        "images_ocrd": 0,
        "open_mentions": 0,
        "close_mentions": 0,
        "status": "⏸️ 待核实",
        "confidence": "-",
        "ocr_texts": []
    }

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True, args=[
            '--disable-blink-features=AutomationControlled',
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        ])

        # try/finally guarantees the browser is closed on every exit path.
        try:
            context = browser.new_context()
            context.add_cookies(XHS_COOKIES)
            page = context.new_page()

            try:
                # Percent-encode the query so spaces, '&', '#' or non-ASCII
                # characters in the keyword cannot corrupt the URL.
                url = (
                    "https://www.xiaohongshu.com/search_result"
                    f"?keyword={quote(keyword, safe='')}&source=web_search_result_notes"
                )
                page.goto(url, wait_until="domcontentloaded", timeout=60000)
                # Fixed pause to let the client-side rendered results appear.
                page.wait_for_timeout(5000)

                # Note cards; fall back to the alternative selector.
                notes = page.query_selector_all("section.note-item")
                if not notes:
                    notes = page.query_selector_all("div.note-card")

                results["notes_found"] = len(notes)

                # Only the first three notes are examined.
                for i, note in enumerate(notes[:3]):
                    try:
                        _process_note(page, note, i, keyword, results)
                    except Exception as e:
                        # Best-effort per note, but report instead of hiding
                        # the failure so broken runs are diagnosable.
                        print(f"    笔记 {i + 1} 处理失败：{e}")

                # Overall verdict from the mention counts.
                if results["open_mentions"] > results["close_mentions"]:
                    results["status"] = "✅ 开放"
                    results["confidence"] = "高" if results["open_mentions"] >= 3 else "中"
                elif results["close_mentions"] > results["open_mentions"]:
                    results["status"] = "❌ 关闭"
                    results["confidence"] = "高" if results["close_mentions"] >= 3 else "低"
                elif results["open_mentions"] > 0:
                    # Tie with at least one mention each: lean "open", low confidence.
                    results["status"] = "✅ 开放"
                    results["confidence"] = "低"

            except Exception as e:
                print(f"  爬取失败：{e}")
        finally:
            browser.close()

    return results


def verify_site(city, site):
    """Verify a single site and return a compact summary of the outcome.

    Builds the query "<city> <site> 开放 门票", runs ``search_and_verify`` on
    it, prints a one-line progress report and then sleeps for three seconds
    before returning.

    Args:
        city: City name used as the first search term.
        site: Site name; also stored under ``"site"`` in the result.

    Returns:
        A dict with ``site``, ``status``, ``confidence``, ``notes_found``,
        ``images_ocrd`` and at most the first two entries of ``ocr_texts``.
    """
    query = f"{city} {site} 开放 门票"
    print(f"  搜索：{query}...", end=" ", flush=True)

    outcome = search_and_verify(query)

    # The OCR suffix is shown only when at least one image was recognised.
    ocr_count = outcome["images_ocrd"]
    ocr_suffix = f" [OCR {ocr_count}张]" if ocr_count > 0 else ""

    print(f"{outcome['status']} ({outcome['confidence']}置信度，{outcome['notes_found']}篇笔记{ocr_suffix})")

    # Pause between searches to avoid tripping the site's rate limiting.
    time.sleep(3)

    summary = {"site": site}
    for field in ("status", "confidence", "notes_found", "images_ocrd"):
        summary[field] = outcome[field]
    # Keep only the first two OCR snippets.
    summary["ocr_texts"] = outcome["ocr_texts"][:2]
    return summary


def main():
    """Verify every site in ``SHANDONG_GUOBAO`` and write a JSON report.

    Prints progress per city and site, saves the collected results to
    ``data/shandong_guobao_playwright_ocr.json`` next to this script, and
    finishes with summary statistics.

    Returns:
        ``0`` as the process exit code.
    """
    print("=" * 70)
    print("山东国保单位开放情况核实 - Playwright + OCR")
    print("=" * 70)

    if not XHS_COOKIES_DICT:
        print("⚠ 警告：小红书 cookie 未加载，可能无法搜索")

    if not HAS_TESSERACT:
        print("⚠ 提示：Tesseract 未安装，将跳过图片 OCR 识别")
        print("   安装：apt install tesseract-ocr && pip install pytesseract pillow")

    all_results = {}
    verified_count = 0

    for city, sites in SHANDONG_GUOBAO.items():
        print(f"\n【{city}】共 {len(sites)} 处")
        city_results = []

        for i, site in enumerate(sites):
            print(f"  [{i+1}/{len(sites)}] ", end="")

            result = verify_site(city, site)
            city_results.append(result)
            verified_count += 1

        all_results[city] = city_results
        print(f"  ✓ {city} 完成")

    # Persist the results.
    output_file = Path(__file__).parent / "data" / "shandong_guobao_playwright_ocr.json"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)

    print(f"\n✓ 结果已保存到：{output_file}")

    # Summary statistics.
    print("\n" + "=" * 70)
    print("📊 核实统计")
    print("=" * 70)

    # Flatten once instead of re-walking the nested mapping for every counter.
    flat_results = [r for city_results in all_results.values() for r in city_results]
    open_count = sum(1 for r in flat_results if "✅" in r['status'])
    close_count = sum(1 for r in flat_results if "❌" in r['status'])
    unknown_count = sum(1 for r in flat_results if "⏸️" in r['status'])
    total_ocr = sum(r['images_ocrd'] for r in flat_results)

    def percent(count):
        # Guard against ZeroDivisionError when no site was verified at all.
        return count / verified_count * 100 if verified_count else 0.0

    print(f"总计核实：{verified_count} 处")
    print(f"✅ 开放：{open_count} 处 ({percent(open_count):.1f}%)")
    print(f"❌ 关闭：{close_count} 处 ({percent(close_count):.1f}%)")
    print(f"⏸️ 待核实：{unknown_count} 处 ({percent(unknown_count):.1f}%)")
    print(f"📸 OCR 识别：{total_ocr} 张图片")

    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    # SystemExit is raised directly because the bare `exit` helper is
    # installed by the `site` module for interactive sessions and is not
    # guaranteed to exist (e.g. under `python -S` or in frozen builds).
    raise SystemExit(main())
