# Source file: testdata-cli/http_log_explorer/analyzers/stats_generator.py
# Listing metadata: 278 lines, 8.9 KiB, Python.

"""Statistics generator for HTTP traffic analytics."""
import re
from collections import Counter, defaultdict
from dataclasses import dataclass
from typing import Any
from rich.table import Table
from http_log_explorer.models import HTTPEntry
@dataclass
class TrafficStats:
    """Container for traffic statistics.

    Plain data holder for one snapshot of aggregate metrics, as assembled by
    ``StatsGenerator.generate``.

    Attributes:
        total_requests: Number of entries the statistics were computed from.
        endpoint_count: Request count per normalized endpoint.
        method_distribution: Request count per HTTP method.
        status_breakdown: Request count per HTTP status code.
        content_type_distribution: Request count per content type.
        response_time_stats: Response-time summary in milliseconds, keyed by
            ``min``, ``max``, ``avg``, ``median``, ``p95`` and ``p99``.
        hosts: Request count per host.
    """

    total_requests: int
    endpoint_count: dict[str, int]
    method_distribution: dict[str, int]
    status_breakdown: dict[int, int]
    content_type_distribution: dict[str, int]
    response_time_stats: dict[str, float]
    hosts: dict[str, int]
class StatsGenerator:
    """Generate statistics from HTTP entries.

    Every metric is computed on demand from ``self.entries``; nothing is
    cached between calls.
    """

    # Path-segment matchers used by ``_normalize_endpoint``. Each is anchored
    # to the end of its segment ("/", "?", "#" or end of string) so only whole
    # segments are rewritten, never the prefix of a longer one (e.g. "/2fa",
    # or a UUID that happens to start with digits).
    _UUID_SEGMENT_RE = re.compile(
        r"/[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}(?=[/?#]|$)",
        re.IGNORECASE,
    )
    _ID_SEGMENT_RE = re.compile(r"/\d+(?=[/?#]|$)")
    _VERSION_SEGMENT_RE = re.compile(r"/v\d+(?:\.\d+)?(?=[/?#]|$)")

    # Whole-string matchers used by ``_is_uuid`` / ``_is_hash``; built once
    # here instead of on every call.
    _UUID_RE = re.compile(
        r"[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}",
        re.IGNORECASE,
    )
    _HASH_RE = re.compile(r"[a-f0-9]{32,}", re.IGNORECASE)

    # Labels reported by ``status_code_categories``, keyed by ``status // 100``.
    _STATUS_CATEGORY_LABELS: dict[int, str] = {
        1: "1xx informational",
        2: "2xx success",
        3: "3xx redirection",
        4: "4xx client error",
        5: "5xx server error",
    }

    def __init__(self, entries: "list[HTTPEntry]") -> None:
        """Initialize with HTTP entries.

        Args:
            entries: List of HTTPEntry objects. The list is stored by
                reference, not copied.
        """
        self.entries = entries

    def generate(self) -> "TrafficStats":
        """Generate all statistics.

        Returns:
            TrafficStats object with all computed statistics
        """
        return TrafficStats(
            total_requests=len(self.entries),
            endpoint_count=self.endpoint_count(),
            method_distribution=self.method_distribution(),
            status_breakdown=self.status_breakdown(),
            content_type_distribution=self.content_type_distribution(),
            response_time_stats=self.response_time_stats(),
            hosts=self.hosts(),
        )

    def endpoint_count(self) -> dict[str, int]:
        """Count requests per normalized endpoint.

        Returns:
            Dictionary mapping normalized endpoints to counts, most frequent
            first
        """
        counter: Counter[str] = Counter(
            self._normalize_endpoint(entry.endpoint) for entry in self.entries
        )
        return dict(counter.most_common())

    def method_distribution(self) -> dict[str, int]:
        """Get distribution of HTTP methods.

        Returns:
            Dictionary mapping methods to counts, in first-seen order
        """
        counter = Counter(e.request.method for e in self.entries)
        return dict(counter)

    def status_breakdown(self) -> dict[int, int]:
        """Get breakdown of status codes.

        Returns:
            Dictionary mapping status codes to counts, ordered by status code
        """
        counter = Counter(e.response.status for e in self.entries)
        return dict(sorted(counter.items()))

    def content_type_distribution(self) -> dict[str, int]:
        """Get distribution of content types.

        Parameters after the first ``;`` (such as a charset) are dropped, and
        entries with no content type are counted under ``"unknown"``.

        Returns:
            Dictionary mapping content types to counts, most frequent first
        """
        counter: Counter[str] = Counter()
        for entry in self.entries:
            ct = entry.content_type or "unknown"
            main_type = ct.split(";")[0].strip()
            counter[main_type] += 1
        return dict(counter.most_common())

    def response_time_stats(self) -> dict[str, float]:
        """Calculate response time statistics.

        Entries whose ``duration_ms`` is ``None`` are ignored. When no entry
        has a duration, every value is ``0.0``.

        Returns:
            Dictionary with min, max, avg, median, p95 and p99 response times
            in ms
        """
        times = [e.duration_ms for e in self.entries if e.duration_ms is not None]
        if not times:
            return {"min": 0.0, "max": 0.0, "avg": 0.0, "median": 0.0, "p95": 0.0, "p99": 0.0}

        sorted_times = sorted(times)
        n = len(sorted_times)

        # For an even number of samples the median is the mean of the two
        # middle values, not the upper one.
        mid = n // 2
        if n % 2:
            median = sorted_times[mid]
        else:
            median = (sorted_times[mid - 1] + sorted_times[mid]) / 2

        stats = {
            "min": float(sorted_times[0]),
            "max": float(sorted_times[-1]),
            "avg": float(sum(times) / n),
            "median": float(median),
        }

        # Percentiles take the sample at index int(n * p), clamped to the
        # last element so small samples never index out of range.
        p95_idx = int(n * 0.95)
        p99_idx = int(n * 0.99)
        stats["p95"] = float(sorted_times[min(p95_idx, n - 1)])
        stats["p99"] = float(sorted_times[min(p99_idx, n - 1)])
        return stats

    def hosts(self) -> dict[str, int]:
        """Get request count per host.

        Returns:
            Dictionary mapping hosts to counts, most frequent first
        """
        counter = Counter(e.host for e in self.entries)
        return dict(counter.most_common())

    def status_code_categories(self) -> dict[str, int]:
        """Get counts by status code category.

        Statuses outside the 100-599 range are not counted in any category.

        Returns:
            Dictionary with 1xx, 2xx, 3xx, 4xx, 5xx counts
        """
        labels = self._STATUS_CATEGORY_LABELS
        categories: dict[str, int] = dict.fromkeys(labels.values(), 0)
        for entry in self.entries:
            label = labels.get(entry.response.status // 100)
            if label is not None:
                categories[label] += 1
        return categories

    def endpoint_patterns(self) -> dict[str, int]:
        """Extract common endpoint patterns with path parameters.

        Returns:
            Dictionary mapping patterns to counts, most frequent first
        """
        patterns: dict[str, int] = defaultdict(int)
        for entry in self.entries:
            pattern = self._extract_pattern(entry.endpoint)
            patterns[pattern] += 1
        return dict(sorted(patterns.items(), key=lambda x: x[1], reverse=True))

    def _normalize_endpoint(self, endpoint: str) -> str:
        """Normalize endpoint by replacing IDs and removing versions.

        Whole UUID segments become ``{uuid}``, whole all-digit segments become
        ``{id}``, and version segments such as ``/v1`` or ``/v1.2`` are
        removed. Segments that merely start with digits (``/2fa``) are left
        untouched.

        Args:
            endpoint: Raw endpoint path

        Returns:
            Normalized endpoint, or ``"/"`` when nothing is left after
            normalization
        """
        # UUIDs are replaced first so their digit groups can never be taken
        # for numeric IDs.
        cleaned = self._UUID_SEGMENT_RE.sub("/{uuid}", endpoint)
        cleaned = self._ID_SEGMENT_RE.sub("/{id}", cleaned)
        cleaned = self._VERSION_SEGMENT_RE.sub("", cleaned)
        # A version-only path such as "/v1" would otherwise collapse to "".
        return cleaned or "/"

    def _extract_pattern(self, endpoint: str) -> str:
        """Extract endpoint pattern with parameter placeholders.

        The endpoint is split on ``/`` and each segment is classified in
        order: all digits -> ``{id}``, UUID -> ``{uuid}``, long hex string ->
        ``{hash}``; anything else, including empty segments, is kept as is.

        Args:
            endpoint: Raw endpoint path

        Returns:
            Endpoint with variable segments replaced by placeholders
        """
        normalized_parts = []
        for part in endpoint.split("/"):
            if not part:
                normalized_parts.append("")
            elif part.isdigit():
                normalized_parts.append("{id}")
            elif self._is_uuid(part):
                normalized_parts.append("{uuid}")
            elif self._is_hash(part):
                normalized_parts.append("{hash}")
            else:
                normalized_parts.append(part)
        return "/".join(normalized_parts)

    def _is_uuid(self, s: str) -> bool:
        """Check if string looks like a UUID (8-4-4-4-12 hex, any case)."""
        return self._UUID_RE.fullmatch(s) is not None

    def _is_hash(self, s: str) -> bool:
        """Check if string looks like a hash (32 or more hex characters)."""
        return self._HASH_RE.fullmatch(s) is not None

    def render_table(self, stats: "TrafficStats | None" = None) -> "Table":
        """Render statistics as a Rich table.

        Args:
            stats: Pre-generated stats, or None to generate new

        Returns:
            Rich Table object
        """
        if stats is None:
            stats = self.generate()

        table = Table(title="Traffic Statistics")
        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")
        table.add_row("Total Requests", str(stats.total_requests))

        method_rows = [f"{m}: {c}" for m, c in sorted(stats.method_distribution.items())]
        table.add_row("Methods", ", ".join(method_rows) if method_rows else "N/A")

        status_rows = [f"{s}: {c}" for s, c in sorted(stats.status_breakdown.items())]
        table.add_row("Status Codes", ", ".join(status_rows) if status_rows else "N/A")

        # response_time_stats reports all zeros when no durations were
        # recorded, so the timing rows are only shown for a positive average.
        rt = stats.response_time_stats
        if rt["avg"] > 0:
            table.add_row(
                "Response Time (avg)",
                f"{rt['avg']:.2f}ms",
            )
            table.add_row(
                "Response Time (p95)",
                f"{rt['p95']:.2f}ms",
            )

        # endpoint_count is ordered most frequent first, so the first five
        # items are the top five endpoints.
        top_endpoints = list(stats.endpoint_count.items())[:5]
        endpoint_rows = [f"{e}: {c}" for e, c in top_endpoints]
        table.add_row("Top Endpoints", ", ".join(endpoint_rows) if endpoint_rows else "N/A")
        return table

    def to_dict(self, stats: "TrafficStats | None" = None) -> dict[str, Any]:
        """Convert stats to dictionary.

        ``status_code_categories`` is always computed from ``self.entries``,
        even when pre-generated ``stats`` are supplied.

        Args:
            stats: Pre-generated stats, or None to generate new

        Returns:
            Dictionary representation of stats
        """
        if stats is None:
            stats = self.generate()
        return {
            "total_requests": stats.total_requests,
            "endpoint_count": stats.endpoint_count,
            "method_distribution": stats.method_distribution,
            "status_breakdown": stats.status_breakdown,
            "content_type_distribution": stats.content_type_distribution,
            "response_time_stats": stats.response_time_stats,
            "hosts": stats.hosts,
            "status_code_categories": self.status_code_categories(),
        }