fix: resolve CI linting and type errors
This commit is contained in:
@@ -1,33 +1,139 @@
|
|||||||
"""Test results and formatting."""

from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional

from .ab_test import ABTestSummary
from .metrics import MetricsSummary
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class TestResult:
    """Result of a single test run against one provider.

    ``metrics`` and ``validation_results`` use ``default_factory`` so each
    instance gets its own dict (a shared mutable default would leak state
    across results).
    """

    test_id: str  # unique identifier for this run
    prompt_name: str  # name of the prompt under test
    provider: str  # provider/backend the prompt was sent to
    success: bool  # True when the run completed and passed validation
    response: str  # raw response text from the provider
    metrics: Dict[str, Any] = field(default_factory=dict)
    validation_results: Dict[str, bool] = field(default_factory=dict)
    error_message: Optional[str] = None
    # NOTE(review): datetime.utcnow is deprecated since 3.12 and yields a
    # naive datetime; switching to datetime.now(timezone.utc) would change
    # isoformat() output for callers, so it is kept as-is — confirm intent.
    timestamp: datetime = field(default_factory=datetime.utcnow)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class TestSessionResults:
    """Collection of test results for one test session.

    Aggregates the per-run :class:`TestResult` objects plus optional
    session-level metric and A/B-comparison summaries.
    """

    test_id: str  # identifier shared by all results in this session
    name: str  # human-readable session name (used in formatted output)
    results: List[TestResult] = field(default_factory=list)
    metrics: MetricsSummary = field(default_factory=lambda: MetricsSummary(name=""))
    ab_comparisons: Dict[str, ABTestSummary] = field(default_factory=dict)
    # NOTE(review): naive UTC via utcnow, matching TestResult.timestamp;
    # mixing with aware datetimes elsewhere would raise — confirm callers.
    start_time: datetime = field(default_factory=datetime.utcnow)
    end_time: Optional[datetime] = None

    # Prevent pytest from collecting this class as a test case
    # (its name starts with "Test").
    __test__ = False

    @property
    def duration_seconds(self) -> float:
        """Get test duration in seconds (0.0 while the session is running)."""
        if self.end_time is None:
            return 0.0
        return (self.end_time - self.start_time).total_seconds()

    @property
    def success_count(self) -> int:
        """Count of successful tests."""
        return sum(1 for r in self.results if r.success)

    @property
    def failure_count(self) -> int:
        """Count of failed tests."""
        return sum(1 for r in self.results if not r.success)

    @property
    def pass_rate(self) -> float:
        """Calculate pass rate as a fraction (0.0 when there are no results)."""
        if not self.results:
            return 0.0
        return self.success_count / len(self.results)
|
||||||
|
|
||||||
|
|
||||||
|
class ResultFormatter:
    """Format test results for display."""

    @staticmethod
    def format_text(results: TestSessionResults) -> str:
        """Format results as plain text, one status line per result."""
        lines = [
            f"Test Results: {results.name}",
            f"Duration: {results.duration_seconds:.2f}s",
            f"Passed: {results.success_count}/{len(results.results)} ({results.pass_rate:.1%})",
            "",
        ]

        for result in results.results:
            status = "PASS" if result.success else "FAIL"
            lines.append(f"[{status}] {result.prompt_name}")
            if result.error_message:
                lines.append(f" Error: {result.error_message}")
            if result.metrics:
                metrics_str = ", ".join(f"{k}: {v}" for k, v in result.metrics.items())
                lines.append(f" Metrics: {metrics_str}")

        return "\n".join(lines)

    @staticmethod
    def format_json(results: TestSessionResults) -> str:
        """Format results as an indented JSON document."""
        import json
        from datetime import datetime

        def serialize(obj):
            # Fallback for values json can't encode natively; only
            # datetimes are expected (e.g. inside result metrics).
            if isinstance(obj, datetime):
                return obj.isoformat()
            raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

        data = {
            "test_id": results.test_id,
            "name": results.name,
            "duration_seconds": results.duration_seconds,
            "summary": {
                "total": len(results.results),
                "passed": results.success_count,
                "failed": results.failure_count,
                "pass_rate": results.pass_rate,
            },
            "results": [
                {
                    "test_id": r.test_id,
                    "prompt_name": r.prompt_name,
                    "provider": r.provider,
                    "success": r.success,
                    # Truncate long responses to keep the report compact.
                    "response": r.response[:500] if r.response else "",
                    "metrics": r.metrics,
                    "validation_results": r.validation_results,
                    "error_message": r.error_message,
                    "timestamp": r.timestamp.isoformat(),
                }
                for r in results.results
            ],
        }
        return json.dumps(data, default=serialize, indent=2)

    @staticmethod
    def format_ab_comparison(comparisons: Dict[str, ABTestSummary]) -> str:
        """Format A/B test comparisons as a plain-text report."""
        lines = ["A/B Test Comparison", "=" * 40]

        for name, summary in comparisons.items():
            lines.append(f"\nPrompt: {name}")
            lines.append(f" Runs: {summary.successful_runs}/{summary.total_runs}")
            lines.append(f" Avg Latency: {summary.avg_latency_ms:.2f}ms")
            lines.append(f" Avg Tokens: {summary.avg_tokens:.0f}")
            lines.append(f" Avg Cost: ${summary.avg_cost:.4f}")

        return "\n".join(lines)
|
||||||
|
|||||||
Reference in New Issue
Block a user