Initial upload: Add repohealth-cli project with CI/CD workflow
Some checks failed
CI / lint (push) Has been cancelled
CI / build (push) Has been cancelled
CI / test (push) Has been cancelled

This commit is contained in:
2026-02-05 17:13:58 +00:00
parent 61ac71fbc1
commit ca2ff9e040

View File

@@ -0,0 +1,221 @@
"""Bus factor calculation module."""
from typing import Optional
import math
from repohealth.models.file_stats import FileAnalysis
from repohealth.models.result import RepositoryResult
class BusFactorCalculator:
    """Calculator for bus factor scores based on author distribution.

    The bus factor of a file is derived from the Gini coefficient of its
    per-author commit counts: an even spread of commits over many authors
    yields a high score, while concentration in one author pushes the score
    toward 1.0.
    """

    # Exclusive upper bounds on the bus factor for each risk level, checked
    # from most to least severe. "low" is never looked up; it documents the
    # open-ended top band.
    RISK_THRESHOLDS = {
        "critical": 1.0,
        "high": 1.5,
        "medium": 2.0,
        "low": float("inf"),
    }

    def __init__(self, risk_threshold: float = 0.7):
        """Initialize the calculator.

        Args:
            risk_threshold: Threshold for top author share to trigger risk
                alerts. Stored as ``self.risk_threshold`` for callers; the
                calculations in this class do not consult it.
        """
        self.risk_threshold = risk_threshold

    def calculate_gini(self, values: list[float]) -> float:
        """Calculate the Gini coefficient for a list of values.

        The Gini coefficient measures inequality among values.
        0 = perfect equality, 1 = maximum inequality.

        Args:
            values: List of numeric values (e.g., commit counts per author).

        Returns:
            Gini coefficient clamped to the range [0, 1]. Returns 0.0 when
            fewer than two values are given or when the values sum to zero,
            since inequality is not meaningful in either case.
        """
        if not values or len(values) < 2:
            return 0.0

        ordered = sorted(values)
        total = sum(ordered)
        if total == 0:
            return 0.0

        n = len(ordered)
        # Rank-weighted sum over the ascending values (ranks start at 1).
        rank_weighted = sum(
            rank * value for rank, value in enumerate(ordered, start=1)
        )
        gini = (2 * rank_weighted) / (n * total) - (n + 1) / n

        # Clamp so rounding error or unexpected negative inputs cannot leak
        # a value outside the documented range.
        return max(0.0, min(1.0, gini))

    def calculate_file_bus_factor(self, analysis: FileAnalysis) -> float:
        """Calculate bus factor for a single file.

        Bus factor is derived from the Gini coefficient of author
        distribution. A lower bus factor indicates higher risk
        (concentration of ownership).

        Args:
            analysis: FileAnalysis with authorship data. Reads
                ``total_commits``, ``num_authors`` and ``author_commits``.

        Returns:
            Bus factor score (lower = more risky). The score is 1.0 for
            files with no commits or with at most one author, and is never
            larger than the number of authors.
        """
        # Degenerate cases share the 1.0 floor. ``<= 1`` (rather than
        # ``== 1``) keeps a record with commits but zero recorded authors
        # from producing a score of 0.0 via the formula below.
        if analysis.total_commits == 0 or analysis.num_authors <= 1:
            return 1.0

        gini = self.calculate_gini(list(analysis.author_commits.values()))

        # Perfectly even ownership (gini == 0) scores ``num_authors``;
        # rising inequality scales the contribution of the additional
        # authors down toward zero.
        bus_factor = 1.0 + (1.0 - gini) * (analysis.num_authors - 1)
        return min(bus_factor, float(analysis.num_authors))

    def calculate_repository_bus_factor(
        self,
        files: list[FileAnalysis],
        weights: Optional[dict[str, float]] = None,
    ) -> float:
        """Calculate overall repository bus factor.

        The result is the weighted mean of the per-file bus factors.

        Args:
            files: List of FileAnalysis objects.
            weights: Optional weights per file (e.g., by importance), keyed
                by ``FileAnalysis.path``. Files missing from the mapping,
                and all files when the mapping is omitted or empty, get a
                weight of 1.0.

        Returns:
            Overall bus factor score; 1.0 when ``files`` is empty or the
            weights sum to zero.
        """
        if not files:
            return 1.0

        total_weight = 0.0
        weighted_sum = 0.0
        for analysis in files:
            weight = weights.get(analysis.path, 1.0) if weights else 1.0
            weighted_sum += self.calculate_file_bus_factor(analysis) * weight
            total_weight += weight

        # All-zero weights would otherwise divide by zero.
        if total_weight == 0:
            return 1.0
        return weighted_sum / total_weight

    def calculate_module_bus_factors(
        self,
        files: list[FileAnalysis],
    ) -> dict[str, dict]:
        """Calculate bus factor for each module/directory.

        Args:
            files: List of FileAnalysis objects. Files whose ``module`` is
                empty or ``None`` are grouped under ``"root"``.

        Returns:
            Dictionary mapping module name to a stats dictionary with keys
            ``"bus_factor"`` (unweighted mean of the module's file bus
            factors), ``"gini_coefficient"`` (Gini coefficient of the
            per-file commit totals within the module), ``"file_count"`` and
            ``"total_commits"``.
        """
        grouped: dict[str, list[FileAnalysis]] = {}
        for analysis in files:
            grouped.setdefault(analysis.module or "root", []).append(analysis)

        module_stats = {}
        for module, module_files in grouped.items():
            commit_totals = [f.total_commits for f in module_files]
            module_stats[module] = {
                "bus_factor": self.calculate_repository_bus_factor(
                    module_files
                ),
                # Note: this measures how unevenly commits are spread over
                # the module's files, not over its authors.
                "gini_coefficient": self.calculate_gini(commit_totals),
                "file_count": len(module_files),
                "total_commits": sum(commit_totals),
            }
        return module_stats

    def assign_risk_levels(
        self,
        files: list[FileAnalysis],
    ) -> list[FileAnalysis]:
        """Assign risk levels to files based on bus factor.

        Each file is updated in place: ``bus_factor`` is set to the computed
        score and ``risk_level`` to one of ``"unknown"``, ``"critical"``,
        ``"high"``, ``"medium"`` or ``"low"``.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            The same list that was passed in, with its items updated.
        """
        for analysis in files:
            bus_factor = self.calculate_file_bus_factor(analysis)
            analysis.bus_factor = bus_factor

            # Order matters: the structural checks (no history, at most one
            # author) take precedence over the numeric thresholds.
            # NOTE(review): assuming num_authors == len(author_commits), a
            # multi-author file scores at least 2 - 1/n >= 1.5, so the
            # numeric "critical" and "high" bands are not expected to
            # trigger - confirm whether the thresholds need retuning.
            if analysis.total_commits == 0:
                analysis.risk_level = "unknown"
            elif analysis.num_authors <= 1:
                analysis.risk_level = "critical"
            elif bus_factor < self.RISK_THRESHOLDS["critical"]:
                analysis.risk_level = "critical"
            elif bus_factor < self.RISK_THRESHOLDS["high"]:
                analysis.risk_level = "high"
            elif bus_factor < self.RISK_THRESHOLDS["medium"]:
                analysis.risk_level = "medium"
            else:
                analysis.risk_level = "low"
        return files

    def calculate_repository_gini(
        self,
        files: list[FileAnalysis],
    ) -> float:
        """Calculate overall repository Gini coefficient.

        Measures how evenly commits are distributed across authors.
        High Gini means commits are concentrated in few authors.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Gini coefficient of the per-author commit totals summed over all
            files; 0.0 when there are no files or fewer than two distinct
            authors. A perfectly even distribution reports 0.0.
        """
        if not files:
            return 0.0

        commits_by_author: dict[str, int] = {}
        for analysis in files:
            for author, commits in analysis.author_commits.items():
                commits_by_author[author] = (
                    commits_by_author.get(author, 0) + commits
                )

        # calculate_gini already returns 0.0 for fewer than two authors.
        # The computed value is returned as-is: an exactly even split is a
        # legitimate 0.0 and must not be replaced by a placeholder.
        return self.calculate_gini(list(commits_by_author.values()))