#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
使用 Playwright + Cookies 爬取小红书 - 获取文本
"""

import json
import os
from urllib.parse import quote

from playwright.sync_api import sync_playwright

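# Session cookies captured from a logged-in browser session. These values are
# account- and session-specific and expire, so refresh them (e.g. from the
# browser devtools cookie panel) before running.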
XHS_COOKIES = [
    {"name": "acw_tc", "value": "0a0bb41a17726804792802066ef22266fb6b0216da2a3e9f089735f8353b77", "domain": ".xiaohongshu.com", "path": "/"},
    {"name": "abRequestId", "value": "b5346cbb-6db3-5645-accc-df5d27fd9362", "domain": ".xiaohongshu.com", "path": "/"},
    {"name": "webBuild", "value": "5.13.1", "domain": ".xiaohongshu.com", "path": "/"},
    {"name": "xsecappid", "value": "xhs-pc-web", "domain": ".xiaohongshu.com", "path": "/"},
    {"name": "a1", "value": "19cbbfd8f46tok3grdu3mmi72tpiihd7co02rf9oa30000122754", "domain": ".xiaohongshu.com", "path": "/"},
    {"name": "webId", "value": "68ac71cf3f14eb4a280b442b71aad7e5", "domain": ".xiaohongshu.com", "path": "/"},
    {"name": "gid", "value": "yjSDDifj0fSfyjSDDifYilij4K9lTqkFf7q68l063WJ9UJq833xfWF888yJJW248dDqfjJ0Y", "domain": ".xiaohongshu.com", "path": "/"},
    {"name": "web_session", "value": "040069b8dcb7aa9bcf6957bd9d3b4b7b5c866c", "domain": ".xiaohongshu.com", "path": "/"},
    {"name": "id_token", "value": "VjEAAPwuVKkxM5M3tgTrefheWsAsAIisJtFuRYQM3EFnhkneE3Zag62PzVeUMmRAgCOznnJXrYICwToncTBIL4u7bKSd7M8QMVA0TgyLKo+Oknjg00IY1MQziJXeiutd2NTcUd+B", "domain": ".xiaohongshu.com", "path": "/"},
    {"name": "loadts", "value": "1772680707856", "domain": ".xiaohongshu.com", "path": "/"},
]
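
# Optional helper: load cookies from a JSON export instead of hardcoding them.
# Minimal sketch; the path "data/xhs_cookies.json" is an assumed location, and
# the file is expected to contain a list of Playwright-style cookie dicts
# ({"name", "value", "domain", "path"}). Falls back to XHS_COOKIES above.
def load_cookies(path="data/xhs_cookies.json"):
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return XHS_COOKIES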

def search_xiaohongshu(keyword):
    """搜索小红书笔记"""
    exhibitions = []
    
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context()
        context.add_cookies(XHS_COOKIES)
        page = context.new_page()
        
        try:
            # URL-encode the keyword so non-ASCII queries are sent correctly
            url = f"https://www.xiaohongshu.com/search_result?keyword={quote(keyword)}&source=web_search_result_notes"
            print(f"Visiting: {url}")
            
            page.goto(url, wait_until="domcontentloaded", timeout=60000)
            page.wait_for_timeout(8000)  # give client-side JS time to render the results
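            # A fixed delay is a blunt heuristic; waiting for the results
            # selector is usually more robust, e.g.:
            #   page.wait_for_selector("section.note-item", timeout=15000)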
            
            # Grab all visible text on the page
            text = page.inner_text("body")
            print(f"Page text length: {len(text)}")
            
            # Save the raw text for offline inspection
            os.makedirs("data", exist_ok=True)
            with open("data/xiaohongshu_text.txt", "w", encoding="utf-8") as f:
                f.write(text)
            print("Saved page text to data/xiaohongshu_text.txt")
            
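            # NOTE: "section.note-item" matches Xiaohongshu's current
            # search-results markup and may break when the site changes.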
            # Locate note cards in the search results
            notes = page.query_selector_all("section.note-item")
            print(f"Found {len(notes)} notes")
            
            for i, note in enumerate(notes[:10]):
                try:
                    note_text = note.inner_text().strip()
                    if note_text and len(note_text) > 10:
                        # First line is the title, second line the author
                        lines = note_text.split('\n')
                        title = lines[0].strip() if lines else ""
                        user = lines[1].strip() if len(lines) > 1 else ""
                        
                        # Keep only exhibition-related notes (the parentheses
                        # are required: `and` binds tighter than `or`)
                        if title and ("展览" in title or "看展" in title or "北京" in title):
                            exhibitions.append({
                                "title": title,
                                "user": user,
                                "source": "小红书"
                            })
                            print(f"  [{i+1}] {title[:50]}...")
                except Exception as e:
                    print(f"Failed to parse note: {e}")
            
        except Exception as e:
            print(f"Scraping failed: {e}")
        
        browser.close()
    
    return exhibitions


if __name__ == "__main__":
    print("=" * 50)
    print("小红书爬虫 - 北京看展 2026")
    print("=" * 50)
    
    exhibitions = search_xiaohongshu("北京看展 2026")
    
    print(f"\n共找到 {len(exhibitions)} 篇相关笔记")
    
    # 保存结果
    with open("data/xiaohongshu_exhibitions.json", "w", encoding="utf-8") as f:
        json.dump(exhibitions, f, ensure_ascii=False, indent=2)
    
    print(f"结果已保存到 data/xiaohongshu_exhibitions.json")
