Add testing module (ab_test, metrics, validator)
src/promptforge/testing/metrics.py (new file, 86 lines)
@@ -0,0 +1,86 @@
from dataclasses import dataclass, field
from typing import Dict, List, Optional


@dataclass
class TestMetrics:
    """Metrics recorded for a single prompt test run."""

    test_id: str
    prompt_name: str
    provider: str
    model: str
    latency_ms: float
    success: bool
    tokens_used: int = 0
    cost_estimate: float = 0.0
    error_message: Optional[str] = None


@dataclass
class ComparisonResult:
    """Aggregated statistics across all runs of one prompt."""

    prompt_name: str
    total_runs: int
    successful_runs: int
    failed_runs: int
    avg_latency_ms: float
    min_latency_ms: float
    max_latency_ms: float
    avg_tokens: float
    avg_cost: float
    success_rate: float
    all_metrics: List[TestMetrics] = field(default_factory=list)


class MetricsCollector:
    """Collects TestMetrics and aggregates them per prompt."""

    def __init__(self):
        self.metrics: List[TestMetrics] = []

    def add(self, metrics: TestMetrics) -> None:
        self.metrics.append(metrics)

    def compare(self, prompt_name: str, metrics_list: List[TestMetrics]) -> ComparisonResult:
        # An empty run set yields an all-zero result.
        if not metrics_list:
            return ComparisonResult(
                prompt_name=prompt_name,
                total_runs=0,
                successful_runs=0,
                failed_runs=0,
                avg_latency_ms=0,
                min_latency_ms=0,
                max_latency_ms=0,
                avg_tokens=0,
                avg_cost=0,
                success_rate=0,
            )

        successful = [m for m in metrics_list if m.success]
        failed = [m for m in metrics_list if not m.success]

        # Latency, token, and cost averages consider successful runs only.
        latencies = [m.latency_ms for m in successful]
        tokens = [m.tokens_used for m in successful]
        costs = [m.cost_estimate for m in successful]

        return ComparisonResult(
            prompt_name=prompt_name,
            total_runs=len(metrics_list),
            successful_runs=len(successful),
            failed_runs=len(failed),
            avg_latency_ms=sum(latencies) / len(latencies) if latencies else 0,
            min_latency_ms=min(latencies) if latencies else 0,
            max_latency_ms=max(latencies) if latencies else 0,
            avg_tokens=sum(tokens) / len(tokens) if tokens else 0,
            avg_cost=sum(costs) / len(costs) if costs else 0,
            success_rate=len(successful) / len(metrics_list) if metrics_list else 0,
            all_metrics=metrics_list,
        )

    def get_summary(self) -> Dict[str, ComparisonResult]:
        # Group collected metrics by prompt name, then aggregate each group.
        by_prompt: Dict[str, List[TestMetrics]] = {}
        for m in self.metrics:
            if m.prompt_name not in by_prompt:
                by_prompt[m.prompt_name] = []
            by_prompt[m.prompt_name].append(m)

        return {
            name: self.compare(name, metrics)
            for name, metrics in by_prompt.items()
        }
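Below is a minimal usage sketch (not part of this commit) showing how the collector above might be exercised. The import path assumes the package is importable as promptforge; the prompt name, provider, model, latency, token, and cost values are made-up placeholders.

from promptforge.testing.metrics import MetricsCollector, TestMetrics

collector = MetricsCollector()

# Two hypothetical runs of the same prompt: one success, one failure.
collector.add(TestMetrics(
    test_id="t1", prompt_name="summarize_v1", provider="openai",
    model="gpt-4o-mini", latency_ms=420.0, success=True,
    tokens_used=812, cost_estimate=0.0012,
))
collector.add(TestMetrics(
    test_id="t2", prompt_name="summarize_v1", provider="openai",
    model="gpt-4o-mini", latency_ms=950.0, success=False,
    error_message="timeout",
))

summary = collector.get_summary()
result = summary["summarize_v1"]
print(result.total_runs, result.success_rate, result.avg_latency_ms)  # 2 0.5 420.0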