From d8a71158223f5aac043993bcb8711f2fdbd89271 Mon Sep 17 00:00:00 2001
From: 7000pctAUTO
Date: Wed, 4 Feb 2026 12:32:35 +0000
Subject: [PATCH] Add testing module (ab_test, metrics, validator)

---
 src/promptforge/testing/metrics.py | 86 ++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 src/promptforge/testing/metrics.py

diff --git a/src/promptforge/testing/metrics.py b/src/promptforge/testing/metrics.py
new file mode 100644
index 0000000..8076788
--- /dev/null
+++ b/src/promptforge/testing/metrics.py
@@ -0,0 +1,86 @@
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+
+@dataclass
+class TestMetrics:
+    test_id: str
+    prompt_name: str
+    provider: str
+    model: str
+    latency_ms: float
+    success: bool
+    tokens_used: int = 0
+    cost_estimate: float = 0.0
+    error_message: Optional[str] = None
+
+
+@dataclass
+class ComparisonResult:
+    prompt_name: str
+    total_runs: int
+    successful_runs: int
+    failed_runs: int
+    avg_latency_ms: float
+    min_latency_ms: float
+    max_latency_ms: float
+    avg_tokens: float
+    avg_cost: float
+    success_rate: float
+    all_metrics: List[TestMetrics] = field(default_factory=list)
+
+
+class MetricsCollector:
+    def __init__(self):
+        self.metrics: List[TestMetrics] = []
+
+    def add(self, metrics: TestMetrics) -> None:
+        self.metrics.append(metrics)
+
+    def compare(self, prompt_name: str, metrics_list: List[TestMetrics]) -> ComparisonResult:
+        if not metrics_list:
+            return ComparisonResult(
+                prompt_name=prompt_name,
+                total_runs=0,
+                successful_runs=0,
+                failed_runs=0,
+                avg_latency_ms=0,
+                min_latency_ms=0,
+                max_latency_ms=0,
+                avg_tokens=0,
+                avg_cost=0,
+                success_rate=0,
+            )
+
+        successful = [m for m in metrics_list if m.success]
+        failed = [m for m in metrics_list if not m.success]
+
+        latencies = [m.latency_ms for m in successful]
+        tokens = [m.tokens_used for m in successful]
+        costs = [m.cost_estimate for m in successful]
+
+        return ComparisonResult(
+            prompt_name=prompt_name,
+            total_runs=len(metrics_list),
+            successful_runs=len(successful),
+            failed_runs=len(failed),
+            avg_latency_ms=sum(latencies) / len(latencies) if latencies else 0,
+            min_latency_ms=min(latencies) if latencies else 0,
+            max_latency_ms=max(latencies) if latencies else 0,
+            avg_tokens=sum(tokens) / len(tokens) if tokens else 0,
+            avg_cost=sum(costs) / len(costs) if costs else 0,
+            success_rate=len(successful) / len(metrics_list) if metrics_list else 0,
+            all_metrics=metrics_list,
+        )
+
+    def get_summary(self) -> Dict[str, ComparisonResult]:
+        by_prompt: Dict[str, List[TestMetrics]] = {}
+        for m in self.metrics:
+            if m.prompt_name not in by_prompt:
+                by_prompt[m.prompt_name] = []
+            by_prompt[m.prompt_name].append(m)
+
+        return {
+            name: self.compare(name, metrics)
+            for name, metrics in by_prompt.items()
+        }
\ No newline at end of file
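
Note (not part of the patch): a minimal usage sketch of the added collector, assuming the package is importable as promptforge.testing.metrics; the prompt name, provider, model, and numeric values below are hypothetical.

    # Record a successful and a failed run for one prompt, then summarize.
    from promptforge.testing.metrics import MetricsCollector, TestMetrics

    collector = MetricsCollector()
    collector.add(TestMetrics(
        test_id="run-1",
        prompt_name="summarize_v1",
        provider="openai",
        model="gpt-4o",
        latency_ms=812.5,
        success=True,
        tokens_used=420,
        cost_estimate=0.0063,
    ))
    collector.add(TestMetrics(
        test_id="run-2",
        prompt_name="summarize_v1",
        provider="openai",
        model="gpt-4o",
        latency_ms=0.0,
        success=False,
        error_message="timeout",
    ))

    # get_summary() groups metrics by prompt_name; latency/token/cost averages
    # are computed over successful runs only, per compare() above.
    for name, result in collector.get_summary().items():
        print(name, f"{result.success_rate:.0%}", f"{result.avg_latency_ms:.0f} ms")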