#!/usr/bin/env python3
"""
easyclaw 爬虫监控脚本
=====================
监控 easyclaw 的运行状态、成功率、日志和数据量。

支持的日志格式（每行一条记录）：
  2026-03-20 10:30:15 [INFO] Task started: fetch https://example.com/page1
  2026-03-20 10:30:16 [SUCCESS] Fetched 1024 bytes from https://example.com/page1
  2026-03-20 10:30:17 [ERROR] Timeout fetching https://example.com/page2
  2026-03-20 10:30:18 [WARNING] Retry 1/3 for https://example.com/page2

用法：
  python easyclaw_monitor.py <日志文件路径> [选项]

选项：
  --tail N         只看最近 N 条日志（默认 20）
  --watch          持续监控模式，每隔几秒刷新
  --interval N     watch 模式刷新间隔秒数（默认 5）
  --level LEVEL    过滤日志级别：INFO/SUCCESS/ERROR/WARNING
  --since TIME     只看该时间之后的日志，格式：YYYY-MM-DD 或 YYYY-MM-DD HH:MM:SS
"""

import argparse
import os
import re
import sys
import time
import unicodedata
from collections import Counter, defaultdict
from datetime import datetime, timedelta

# --- 日志解析 ---

# Matches one log record of the form "<YYYY-MM-DD HH:MM:SS> [LEVEL] <message>".
# Groups: (1) timestamp text, (2) level name without the brackets,
# (3) everything after the level up to the end of the line.
LOG_PATTERN = re.compile(
    r"^(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})\s+"
    r"\[(\w+)\]\s+"
    r"(.*)$"
)

# Captures the integer that precedes the word "bytes" (e.g. "Fetched 1024 bytes").
BYTES_PATTERN = re.compile(r"(\d+)\s*bytes")
# Matches an http/https URL, running up to the next whitespace character.
URL_PATTERN = re.compile(r"https?://\S+")


def parse_line(line: str) -> dict | None:
    """Turn one raw log line into a structured record.

    Args:
        line: A single line of log text; surrounding whitespace is ignored.

    Returns:
        A dict with the keys "timestamp" (datetime), "level" (upper-cased
        level name), "message" (text after the level) and "raw" (the stripped
        line), or None when the line is blank, does not match LOG_PATTERN, or
        carries a timestamp that is not a valid date/time.
    """
    stripped = line.strip()
    matched = LOG_PATTERN.match(stripped) if stripped else None
    if matched is None:
        return None

    try:
        when = datetime.strptime(matched.group(1), "%Y-%m-%d %H:%M:%S")
    except ValueError:
        # Digits had the right shape but are not a real date (e.g. month 13).
        return None

    return {
        "timestamp": when,
        "level": matched.group(2).upper(),
        "message": matched.group(3),
        "raw": stripped,
    }


def parse_log_file(path: str, since: datetime | None = None) -> list[dict]:
    """Read a log file and return its parseable records in file order.

    Args:
        path: Location of the log file. It is decoded as UTF-8, with
            undecodable bytes replaced rather than raising.
        since: Optional lower bound; records with an earlier timestamp are
            dropped.

    Returns:
        The records produced by ``parse_line``; lines it rejects are skipped.
    """
    with open(path, "r", encoding="utf-8", errors="replace") as handle:
        records = [rec for rec in map(parse_line, handle) if rec is not None]

    if not since:
        return records
    return [rec for rec in records if rec["timestamp"] >= since]


# --- 统计分析 ---

def compute_stats(entries: list[dict]) -> dict:
    """Aggregate parsed log records into the metrics shown by the report.

    Args:
        entries: Records shaped like the output of ``parse_line`` (each has
            "timestamp", "level" and "message" keys). May be empty.

    Returns:
        A dict with these keys:
          - "total": number of records.
          - "first_ts" / "last_ts": earliest and latest timestamp, or None
            when there are no records.
          - "level_counts": Counter of records per level.
          - "success_rate": SUCCESS / (SUCCESS + ERROR) as a percentage,
            0.0 when there are no such records.
          - "total_bytes": sum of the first "<N> bytes" figure in each message.
          - "url_counts": Counter of the first URL found in each message.
          - "hourly": plain dict of record counts per "YYYY-MM-DD HH:00"
            bucket, sorted by bucket.
          - "errors": the ERROR records, in input order.
    """
    level_counts = Counter(e["level"] for e in entries)
    succeeded = level_counts.get("SUCCESS", 0)
    total_tasks = succeeded + level_counts.get("ERROR", 0)
    success_rate = (succeeded / total_tasks * 100) if total_tasks > 0 else 0.0

    total_bytes = 0
    url_counts = Counter()
    hourly = defaultdict(int)
    errors = []

    for e in entries:
        # Data volume
        bm = BYTES_PATTERN.search(e["message"])
        if bm:
            total_bytes += int(bm.group(1))
        # Per-URL counts
        um = URL_PATTERN.search(e["message"])
        if um:
            url_counts[um.group()] += 1
        # Hourly distribution
        hourly[e["timestamp"].strftime("%Y-%m-%d %H:00")] += 1
        # Collect errors
        if e["level"] == "ERROR":
            errors.append(e)

    # Use min/max rather than the first/last list positions: lines written by
    # concurrent workers are not guaranteed to be in chronological order, and
    # a misordered pair would yield a negative duration downstream.
    timestamps = [e["timestamp"] for e in entries]

    return {
        "total": len(entries),
        "first_ts": min(timestamps, default=None),
        "last_ts": max(timestamps, default=None),
        "level_counts": level_counts,
        "success_rate": success_rate,
        "total_bytes": total_bytes,
        "url_counts": url_counts,
        # Always a plain dict, including for empty input.
        "hourly": dict(sorted(hourly.items())),
        "errors": errors,
    }


# --- 展示 ---

# ANSI escape sequences used to style terminal output, keyed by a short name.
# "RESET" ends whatever styling a previous sequence started.
COLORS = {
    "RESET": "\033[0m",
    "BOLD": "\033[1m",
    "RED": "\033[91m",
    "GREEN": "\033[92m",
    "YELLOW": "\033[93m",
    "BLUE": "\033[94m",
    "CYAN": "\033[96m",
    "DIM": "\033[2m",
}

# Escape sequence applied to each known log level when it is rendered;
# levels missing from this table are printed without a color prefix.
LEVEL_COLORS = {
    "INFO": COLORS["BLUE"],
    "SUCCESS": COLORS["GREEN"],
    "ERROR": COLORS["RED"],
    "WARNING": COLORS["YELLOW"],
}


def c(text, color_key):
    """Wrap *text* in the escape sequence named by *color_key*.

    An unknown *color_key* adds no prefix; the reset sequence is always
    appended.
    """
    prefix = COLORS.get(color_key, "")
    suffix = COLORS["RESET"]
    return prefix + f"{text}" + suffix


def fmt_bytes(n: int) -> str:
    """Format a byte count with one decimal place and a binary unit.

    The value is divided by 1024 until it drops below 1024 or the unit
    ladder B/KB/MB/GB is exhausted, in which case it is reported in TB.
    """
    units = ("B", "KB", "MB", "GB")
    value = n
    idx = 0
    while idx < len(units):
        if value < 1024:
            return f"{value:.1f} {units[idx]}"
        value = value / 1024
        idx += 1
    return f"{value:.1f} TB"


def _display_width(text: str) -> int:
    """Return how many terminal columns *text* occupies.

    Wide and fullwidth East Asian characters take two columns; everything
    else is counted as one.
    """
    return sum(
        2 if unicodedata.east_asian_width(ch) in ("W", "F") else 1
        for ch in text
    )


def print_separator(title: str = ""):
    """Print a dimmed 60-column horizontal rule, optionally with a title.

    Args:
        title: Text centered inside the rule. When empty, a plain rule is
            printed.
    """
    width = 60
    if not title:
        print(c("─" * width, "DIM"))
        return

    # Measure display columns rather than len(): the titles used by this
    # script are CJK text, which takes two columns per character, so len()
    # would make titled rules wider than the plain 60-column rule.
    # The 4 accounts for the "┤ " and " ├" decorations around the title.
    pad = max(width - _display_width(title) - 4, 0)
    left = pad // 2
    right = pad - left
    print(c(f"{'─' * left}┤ {title} ├{'─' * right}", "DIM"))


def display_status(stats: dict, log_path: str):
    """Print the run-status overview section.

    Shows the log path and record count and, when timestamps are available,
    the covered time range, its duration, and a liveness label derived from
    how long ago the last record was written relative to the local clock.

    Args:
        stats: Result of ``compute_stats``.
        log_path: Path of the log file, shown verbatim.
    """
    print()
    print_separator("easyclaw 运行状态")
    print()

    # Basic facts
    print(f"  日志文件: {c(log_path, 'CYAN')}")
    print(f"  日志总量: {c(str(stats['total']), 'BOLD')} 条")

    if not stats["first_ts"]:
        print()
        return

    first_seen = stats["first_ts"]
    last_seen = stats["last_ts"]
    print(f"  时间范围: {first_seen} ~ {last_seen}")
    print(f"  持续时间: {last_seen - first_seen}")

    # Liveness: under 5 minutes is running, under 1 hour idle, else offline.
    idle = datetime.now() - last_seen
    if idle < timedelta(minutes=5):
        label, tone = "● 运行中", "GREEN"
    elif idle < timedelta(hours=1):
        label, tone = "● 空闲 (最近有活动)", "YELLOW"
    else:
        label, tone = "● 离线 (长时间无活动)", "RED"
    print(f"  当前状态: {c(label, tone)}")
    print(f"  最后活动: {last_seen} ({_human_delta(idle)}前)")

    print()


def display_success_rate(stats: dict):
    """Print the per-level bar chart and the overall task success rate.

    Each bar is 40 cells wide and filled in proportion to that level's share
    of all records. The success rate line is colored green at 90% or above,
    yellow at 70% or above, and red otherwise.

    Args:
        stats: Result of ``compute_stats``.
    """
    print_separator("成功率统计")
    print()

    counts = stats["level_counts"]
    bar_width = 40
    denominator = max(stats["total"], 1)  # avoid dividing by zero
    for level in ("SUCCESS", "ERROR", "WARNING", "INFO"):
        n = counts.get(level, 0)
        filled = min(n * bar_width // denominator, bar_width)
        bar = "█" * filled + "░" * (bar_width - filled)
        print(f"  {LEVEL_COLORS.get(level, '')}{level:>8}{COLORS['RESET']} │{bar}│ {n}")
    print()

    ok = counts.get("SUCCESS", 0)
    failed = counts.get("ERROR", 0)
    attempts = ok + failed
    if attempts <= 0:
        print(f"  任务成功率: {c('N/A (无成功/失败记录)', 'DIM')}")
    else:
        pct = stats["success_rate"]
        if pct >= 90:
            tone = "GREEN"
        elif pct >= 70:
            tone = "YELLOW"
        else:
            tone = "RED"
        print(f"  任务成功率: {c(f'{pct:.1f}%', tone)}  ({ok}/{attempts})")
    print()


def display_data_volume(stats: dict):
    """Print the data-volume section.

    Shows the total fetched bytes, the number of distinct URLs, and a bar
    chart of record counts for the last 12 hourly buckets, scaled so the
    busiest bucket overall spans 30 cells.

    Args:
        stats: Result of ``compute_stats``.
    """
    print_separator("数据量统计")
    print()
    print(f"  总抓取数据: {c(fmt_bytes(stats['total_bytes']), 'BOLD')}")
    print(f"  涉及 URL 数: {c(str(len(stats['url_counts'])), 'BOLD')}")
    print()

    hourly = stats["hourly"]
    if not hourly:
        return

    print("  每小时请求分布:")
    scale = max(max(hourly.values()), 1)  # never divide by zero
    recent = list(hourly.items())[-12:]
    for hour, count in recent:
        print(f"    {hour} │ {'▓' * min(count * 30 // scale, 30)} {count}")
    print()


def display_errors(stats: dict, limit: int = 10):
    """Print the most recent ERROR records.

    Args:
        stats: Result of ``compute_stats``; only its "errors" list is read.
        limit: Maximum number of records to list. Zero or a negative value
            lists none and reports every error as omitted.
    """
    errors = stats["errors"]
    if not errors:
        print_separator("错误记录")
        print(f"\n  {c('无错误记录 ✓', 'GREEN')}\n")
        return

    # errors[-0:] is the whole list, so a non-positive limit must be handled
    # explicitly instead of being passed to the slice.
    recent = errors[-limit:] if limit > 0 else []
    hidden = len(errors) - len(recent)

    print_separator(f"最近错误 (共 {len(errors)} 条)")
    print()
    for e in recent:
        print(f"  {c(str(e['timestamp']), 'DIM')} {c(e['message'], 'RED')}")
    if hidden > 0:
        print(f"  {c(f'... 还有 {hidden} 条错误', 'DIM')}")
    print()


def display_logs(entries: list[dict], tail: int = 20, level_filter: str | None = None):
    """Print the newest log records, optionally restricted to one level.

    Args:
        entries: Parsed records in display order.
        tail: How many of the newest matching records to show. Zero or a
            negative value shows none.
        level_filter: When given, only records whose level equals this name
            (compared upper-cased) are shown.
    """
    filtered = entries
    if level_filter:
        wanted = level_filter.upper()
        filtered = [e for e in entries if e["level"] == wanted]

    # filtered[-0:] would be the entire list and a negative tail would drop
    # the oldest records instead, so only slice for a positive tail.
    shown = filtered[-tail:] if tail > 0 else []
    print_separator(f"日志 (最近 {len(shown)} 条)")
    print()
    for e in shown:
        color = LEVEL_COLORS.get(e["level"], "")
        ts = c(str(e["timestamp"]), "DIM")
        lvl = f"{color}[{e['level']:>7}]{COLORS['RESET']}"
        print(f"  {ts} {lvl} {e['message']}")
    print()


def _human_delta(td: timedelta) -> str:
    seconds = int(td.total_seconds())
    if seconds < 60:
        return f"{seconds} 秒"
    elif seconds < 3600:
        return f"{seconds // 60} 分钟"
    elif seconds < 86400:
        return f"{seconds // 3600} 小时"
    else:
        return f"{seconds // 86400} 天"


# --- 主流程 ---

def run_report(log_path: str, tail: int, level: str | None, since: datetime | None):
    """Produce one complete report for the given log file.

    Exits the process with status 1 when *log_path* is not an existing file.
    When no record survives parsing and the *since* filter, a notice is
    printed and nothing else is shown.

    Args:
        log_path: Path of the log file to analyse.
        tail: Number of newest records to list in the log section.
        level: Optional level name restricting the log section only; the
            statistics always cover every parsed record.
        since: Optional lower bound on record timestamps.
    """
    if not os.path.isfile(log_path):
        print(c(f"错误: 找不到日志文件 '{log_path}'", "RED"))
        sys.exit(1)

    entries = parse_log_file(log_path, since=since)
    if not entries:
        print(c("日志为空或无匹配记录。", "YELLOW"))
        return

    stats = compute_stats(entries)
    display_status(stats, log_path)
    for section in (display_success_rate, display_data_volume, display_errors):
        section(stats)
    display_logs(entries, tail=tail, level_filter=level)
    print_separator()


def run_watch(log_path: str, interval: int, tail: int, level: str | None, since: datetime | None):
    """Redraw the report repeatedly until interrupted with Ctrl+C.

    Each cycle clears the terminal with the platform's shell command
    (``cls`` on Windows, ``clear`` elsewhere), prints a fresh report, and
    sleeps for *interval* seconds.

    Args:
        log_path: Path of the log file to analyse.
        interval: Seconds to wait between refreshes.
        tail: Number of newest records to list in the log section.
        level: Optional level name restricting the log section.
        since: Optional lower bound on record timestamps.
    """
    print(c(f"持续监控模式 (每 {interval} 秒刷新, Ctrl+C 退出)", "CYAN"))
    try:
        clear_cmd = "cls" if os.name == "nt" else "clear"
        while True:
            os.system(clear_cmd)
            run_report(log_path, tail, level, since)
            print(c(f"下次刷新: {interval} 秒后 | Ctrl+C 退出", "DIM"))
            time.sleep(interval)
    except KeyboardInterrupt:
        print(c("\n已停止监控。", "YELLOW"))


def main():
    """Parse the command line and run a one-shot report or watch mode.

    Exits with status 2 (argparse usage error) when watch mode is requested
    with a non-positive --interval, and with status 1 when --since matches
    neither accepted format.
    """
    parser = argparse.ArgumentParser(
        description="easyclaw 爬虫监控工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("logfile", help="easyclaw 日志文件路径")
    parser.add_argument("--tail", type=int, default=20, help="显示最近 N 条日志 (默认 20)")
    parser.add_argument("--watch", action="store_true", help="持续监控模式")
    parser.add_argument("--interval", type=int, default=5, help="watch 模式刷新间隔 (默认 5 秒)")
    parser.add_argument("--level", choices=["INFO", "SUCCESS", "ERROR", "WARNING"], help="过滤日志级别")
    parser.add_argument("--since", help="只看此时间之后的日志 (格式: YYYY-MM-DD 或 YYYY-MM-DD HH:MM:SS)")

    args = parser.parse_args()

    # A zero interval would redraw in a busy loop and time.sleep() raises
    # ValueError for negative values, so reject both before watching starts.
    # The check is limited to watch mode because the option is unused otherwise.
    if args.watch and args.interval <= 0:
        parser.error("--interval 必须为正整数")

    since = None
    if args.since:
        # Try the more specific format first, then fall back to date-only.
        for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
            try:
                since = datetime.strptime(args.since, fmt)
                break
            except ValueError:
                continue
        if since is None:
            print(c("错误: --since 格式不正确，应为 YYYY-MM-DD 或 YYYY-MM-DD HH:MM:SS", "RED"))
            sys.exit(1)

    if args.watch:
        run_watch(args.logfile, args.interval, args.tail, args.level, since)
    else:
        run_report(args.logfile, args.tail, args.level, since)


if __name__ == "__main__":
    main()