#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
诊断小红书搜索页面结构
"""

import json
import time
from pathlib import Path
from playwright.sync_api import sync_playwright

# Load the saved login cookies from the shared config directory.
CONFIG_FILE = Path(__file__).parent.parent / "beijing-exhibitions" / "config" / "xiaohongshu_cookies.json"

COOKIES_DICT = json.loads(CONFIG_FILE.read_text(encoding="utf-8"))

# Convert the plain name -> value mapping into Playwright's cookie records.
COOKIES = []
for cookie_name, cookie_value in COOKIES_DICT.items():
    COOKIES.append({
        "name": cookie_name,
        "value": cookie_value,
        "domain": ".xiaohongshu.com",
        "path": "/",
    })

# Clash proxy endpoint.
# NOTE(review): defined but never passed to the browser launch below —
# confirm whether the proxy is intentionally unused.
PROXY_SERVER = "http://127.0.0.1:7890"

print("=" * 70)
print("诊断小红书搜索页面结构", flush=True)
print("=" * 70)

with sync_playwright() as p:
    # Launch a real Chrome binary with the automation flag disabled so the
    # site is less likely to serve a bot-detection page.
    browser = p.chromium.launch(
        headless=True,
        executable_path='/usr/bin/google-chrome',
        args=['--disable-blink-features=AutomationControlled']
    )

    context = browser.new_context(
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        viewport={"width": 1920, "height": 1080}
    )
    context.add_cookies(COOKIES)

    page = context.new_page()

    # Ensure the output directory exists before any file write or screenshot
    # (fixes FileNotFoundError when data/ has not been created yet).
    data_dir = Path(__file__).parent / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    # Visit the search results page and wait for network activity to settle.
    print("\n[1/3] 访问搜索页面...", flush=True)
    search_url = 'https://www.xiaohongshu.com/search_result?keyword=%E6%B3%95%E9%97%A8%E5%AF%BA&source=web_search_result_notes'
    page.goto(search_url, wait_until='networkidle', timeout=90000)
    time.sleep(10)  # extra settle time for lazily rendered note cards

    # Dump the fully rendered HTML for offline inspection.
    print("[2/3] 保存页面 HTML...", flush=True)
    html_content = page.content()
    html_path = data_dir / "xhs_search_page.html"
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html_content)
    print(f"HTML 已保存: {html_path}", flush=True)
    print(f"HTML 大小: {len(html_content)} bytes", flush=True)

    # Probe candidate CSS selectors and count how many elements each matches.
    print("[3/3] 分析页面结构...", flush=True)

    # Selector patterns commonly seen on Xiaohongshu note/feed cards.
    selectors_to_test = [
        'section.note-item',
        'div.note-card',
        'article.note',
        'div[data-v-note-item]',
        'a[href*="/explore/"]',
        'div.feed-card',
        'div.note-content',
        'div[class*="note"]',
        'div[class*="card"]',
        'section[class*="note"]'
    ]

    # NOTE: `results` is read again after the `with` block for the final
    # summary report, so it must stay bound at module level.
    results = {}
    for selector in selectors_to_test:
        try:
            count = len(page.query_selector_all(selector))
        except Exception:
            # Best-effort probe: an invalid selector must not abort the scan.
            continue
        if count > 0:
            results[selector] = count
            print(f"  ✅ {selector}: {count} 个元素", flush=True)

    # If anything matched, save the inner HTML of the first element matched
    # by the selector with the highest hit count.
    if results:
        best_selector = max(results.items(), key=lambda x: x[1])[0]
        print(f"\n最可能的选择器: {best_selector}", flush=True)

        try:
            first_element = page.query_selector(best_selector)
            if first_element:
                element_html = first_element.inner_html()
                element_path = data_dir / "xhs_first_note.html"
                with open(element_path, 'w', encoding='utf-8') as f:
                    f.write(element_html)
                print(f"第一个笔记元素已保存: {element_path}", flush=True)
        except Exception as e:
            print(f"提取元素失败: {e}", flush=True)

    # Full-page screenshot for a quick visual check.
    screenshot_path = data_dir / "xhs_search_diagnosis.png"
    page.screenshot(path=str(screenshot_path))
    print(f"\n截图已保存: {screenshot_path}", flush=True)

    browser.close()

print("\n" + "=" * 70)
print("✅ 诊断完成", flush=True)
print("=" * 70)
print("\n找到的选择器:", flush=True)
# Report every matching selector, most hits first.
ranked = sorted(results.items(), key=lambda pair: -pair[1])
for sel, hits in ranked:
    print(f"  {sel}: {hits} 个", flush=True)
