Add testing module (ab_test, metrics, validator)

2026-02-04 12:32:35 +00:00
parent ad4c01dac7
commit d8a7115822


@@ -0,0 +1,86 @@
from dataclasses import dataclass, field
from typing import Dict, List, Optional


@dataclass
class TestMetrics:
    """Metrics captured for a single test run of a prompt against a provider/model."""

    test_id: str
    prompt_name: str
    provider: str
    model: str
    latency_ms: float
    success: bool
    tokens_used: int = 0
    cost_estimate: float = 0.0
    error_message: Optional[str] = None


@dataclass
class ComparisonResult:
    """Aggregated statistics over all recorded runs of a single prompt."""

    prompt_name: str
    total_runs: int
    successful_runs: int
    failed_runs: int
    avg_latency_ms: float
    min_latency_ms: float
    max_latency_ms: float
    avg_tokens: float
    avg_cost: float
    success_rate: float
    all_metrics: List[TestMetrics] = field(default_factory=list)


class MetricsCollector:
    """Collects per-run metrics and aggregates them per prompt."""

    def __init__(self):
        self.metrics: List[TestMetrics] = []

    def add(self, metrics: TestMetrics) -> None:
        self.metrics.append(metrics)

    def compare(self, prompt_name: str, metrics_list: List[TestMetrics]) -> ComparisonResult:
        # Empty input: return a zeroed result instead of dividing by zero below.
        if not metrics_list:
            return ComparisonResult(
                prompt_name=prompt_name,
                total_runs=0,
                successful_runs=0,
                failed_runs=0,
                avg_latency_ms=0,
                min_latency_ms=0,
                max_latency_ms=0,
                avg_tokens=0,
                avg_cost=0,
                success_rate=0,
            )

        successful = [m for m in metrics_list if m.success]
        failed = [m for m in metrics_list if not m.success]
        # Latency, token, and cost statistics are computed over successful runs only.
        latencies = [m.latency_ms for m in successful]
        tokens = [m.tokens_used for m in successful]
        costs = [m.cost_estimate for m in successful]

        return ComparisonResult(
            prompt_name=prompt_name,
            total_runs=len(metrics_list),
            successful_runs=len(successful),
            failed_runs=len(failed),
            avg_latency_ms=sum(latencies) / len(latencies) if latencies else 0,
            min_latency_ms=min(latencies) if latencies else 0,
            max_latency_ms=max(latencies) if latencies else 0,
            avg_tokens=sum(tokens) / len(tokens) if tokens else 0,
            avg_cost=sum(costs) / len(costs) if costs else 0,
            success_rate=len(successful) / len(metrics_list) if metrics_list else 0,
            all_metrics=metrics_list,
        )

    def get_summary(self) -> Dict[str, ComparisonResult]:
        # Group collected metrics by prompt name, then aggregate each group.
        by_prompt: Dict[str, List[TestMetrics]] = {}
        for m in self.metrics:
            if m.prompt_name not in by_prompt:
                by_prompt[m.prompt_name] = []
            by_prompt[m.prompt_name].append(m)
        return {
            name: self.compare(name, metrics)
            for name, metrics in by_prompt.items()
        }
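
A minimal usage sketch of the collector added in this commit. The import path, prompt name, provider, model, and numeric values below are illustrative placeholders, not part of the committed code; only the TestMetrics/MetricsCollector API shown above is assumed.

from metrics import MetricsCollector, TestMetrics  # assumed module path for this new file

collector = MetricsCollector()

# Record two illustrative runs of the same prompt (placeholder values).
collector.add(TestMetrics(
    test_id="run-1", prompt_name="summarize_v1", provider="openai",
    model="gpt-4o", latency_ms=820.5, success=True,
    tokens_used=312, cost_estimate=0.0041,
))
collector.add(TestMetrics(
    test_id="run-2", prompt_name="summarize_v1", provider="openai",
    model="gpt-4o", latency_ms=0.0, success=False,
    error_message="timeout",
))

# Aggregate per prompt: success rate, latency stats, token/cost averages.
summary = collector.get_summary()
result = summary["summarize_v1"]
print(result.success_rate, result.avg_latency_ms)  # 0.5 820.5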