fix: resolve CI linting and type errors
This commit is contained in:
@@ -1,33 +1,139 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional
|
||||
"""Test results and formatting."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from .ab_test import ABTestSummary
|
||||
from .metrics import MetricsSummary
|
||||
|
||||
|
||||
@dataclass
class TestResult:
    """Result of a single test run of one prompt against one provider.

    The merged-diff original declared ``metrics`` twice (an unresolved
    ``"TestMetrics"`` forward reference and a dict) and carried both
    ``error`` and ``error_message``; the superseded old-version fields
    are removed here so each field is declared exactly once.
    """

    test_id: str  # unique identifier of this individual test run
    prompt_name: str  # name of the prompt that was exercised
    provider: str  # provider/backend the prompt was run against
    success: bool  # True when the run passed
    response: str  # raw model response text
    metrics: Dict[str, Any] = field(default_factory=dict)  # free-form metric name -> value
    validation_results: Dict[str, bool] = field(default_factory=dict)  # validator name -> passed
    error_message: Optional[str] = None  # populated when success is False
    # NOTE(review): datetime.utcnow() returns a *naive* datetime and is deprecated
    # in Python 3.12; kept as-is for consistency with existing serialization.
    timestamp: datetime = field(default_factory=datetime.utcnow)
||||
@dataclass
class ComparisonResult:
    """Aggregated statistics for repeated runs of a single prompt."""

    prompt_name: str  # prompt these statistics were computed for
    total_runs: int  # number of runs attempted
    successful_runs: int  # runs that succeeded
    failed_runs: int  # runs that failed
    avg_latency_ms: float  # mean latency across runs, in milliseconds
    min_latency_ms: float  # fastest observed run, in milliseconds
    max_latency_ms: float  # slowest observed run, in milliseconds
    avg_tokens: float  # mean token count per run
    avg_cost: float  # mean cost per run; presumably USD — TODO confirm at producer
    success_rate: float  # presumably successful_runs / total_runs in [0, 1] — verify
    # NOTE(review): "TestMetrics" is not imported in this module's visible
    # imports — confirm the forward reference resolves where it is consumed.
    all_metrics: List["TestMetrics"]  # raw per-run metrics backing the averages
|
||||
@dataclass
class TestReport:
    """Aggregated per-prompt comparison report (legacy reporting shape)."""

    test_id: str  # identifier of the test run this report describes
    # NOTE(review): unlike TestSessionResults, this timestamp is a plain
    # string (presumably ISO-8601) — confirm against the producer.
    timestamp: str
    results: Dict[str, ComparisonResult]  # prompt name -> comparison statistics
    summary: Dict[str, float]  # aggregate numeric summary metrics


@dataclass
class TestSessionResults:
    """Collection of test results for one test session.

    The merged-diff original dropped the ``@dataclass`` decorator while the
    fields use ``field(...)`` defaults, and interleaved ``TestReport``'s
    fields into this body; both classes are restored as coherent dataclasses.
    """

    # NOTE(review): ResultFormatter.format_json reads ``results.test_id``,
    # so this field is assumed to belong here — confirm against callers.
    test_id: str
    name: str  # human-readable session name
    results: List[TestResult] = field(default_factory=list)
    metrics: MetricsSummary = field(default_factory=lambda: MetricsSummary(name=""))
    ab_comparisons: Dict[str, ABTestSummary] = field(default_factory=dict)
    # NOTE(review): datetime.utcnow() is naive and deprecated in 3.12; kept
    # for consistency with TestResult.timestamp.
    start_time: datetime = field(default_factory=datetime.utcnow)
    end_time: Optional[datetime] = None  # set when the session finishes

    # Prevent pytest from collecting this class as a test case
    # (its name starts with "Test").
    __test__ = False

    @property
    def duration_seconds(self) -> float:
        """Get test duration in seconds (0.0 while the session is running)."""
        if self.end_time is None:
            return 0.0
        return (self.end_time - self.start_time).total_seconds()

    @property
    def success_count(self) -> int:
        """Count of successful tests."""
        return sum(1 for r in self.results if r.success)

    @property
    def failure_count(self) -> int:
        """Count of failed tests."""
        return sum(1 for r in self.results if not r.success)

    @property
    def pass_rate(self) -> float:
        """Calculate pass rate; 0.0 when there are no results."""
        if not self.results:
            return 0.0
        return self.success_count / len(self.results)
||||
class ResultFormatter:
    """Render test session results as text, JSON, or A/B comparison tables."""

    @staticmethod
    def format_text(results: TestSessionResults) -> str:
        """Format results as plain text."""
        out = [
            f"Test Results: {results.name}",
            f"Duration: {results.duration_seconds:.2f}s",
            f"Passed: {results.success_count}/{len(results.results)} ({results.pass_rate:.1%})",
            "",
        ]
        for item in results.results:
            out.append(f"[{'PASS' if item.success else 'FAIL'}] {item.prompt_name}")
            if item.error_message:
                out.append(f" Error: {item.error_message}")
            if item.metrics:
                pairs = ", ".join(f"{key}: {value}" for key, value in item.metrics.items())
                out.append(f" Metrics: {pairs}")
        return "\n".join(out)

    @staticmethod
    def format_json(results: TestSessionResults) -> str:
        """Format results as JSON."""
        import json
        from datetime import datetime

        def default(obj):
            # json cannot serialize datetimes natively; emit ISO-8601.
            if isinstance(obj, datetime):
                return obj.isoformat()
            raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

        def one_result(r):
            # Flatten a single TestResult; cap the echoed response at 500 chars.
            return {
                "test_id": r.test_id,
                "prompt_name": r.prompt_name,
                "provider": r.provider,
                "success": r.success,
                "response": r.response[:500] if r.response else "",
                "metrics": r.metrics,
                "validation_results": r.validation_results,
                "error_message": r.error_message,
                "timestamp": r.timestamp.isoformat(),
            }

        payload = {
            "test_id": results.test_id,
            "name": results.name,
            "duration_seconds": results.duration_seconds,
            "summary": {
                "total": len(results.results),
                "passed": results.success_count,
                "failed": results.failure_count,
                "pass_rate": results.pass_rate,
            },
            "results": [one_result(r) for r in results.results],
        }
        return json.dumps(payload, default=default, indent=2)

    @staticmethod
    def format_ab_comparison(comparisons: Dict[str, ABTestSummary]) -> str:
        """Format A/B test comparisons."""
        report = ["A/B Test Comparison", "=" * 40]
        for prompt, stats in comparisons.items():
            report.extend(
                [
                    f"\nPrompt: {prompt}",
                    f" Runs: {stats.successful_runs}/{stats.total_runs}",
                    f" Avg Latency: {stats.avg_latency_ms:.2f}ms",
                    f" Avg Tokens: {stats.avg_tokens:.0f}",
                    f" Avg Cost: ${stats.avg_cost:.4f}",
                ]
            )
        return "\n".join(report)
|
||||
|
||||
Reference in New Issue
Block a user