Initial upload: Add repohealth-cli project with CI/CD workflow
This commit is contained in:
221
src/repohealth/analyzers/bus_factor.py
Normal file
221
src/repohealth/analyzers/bus_factor.py
Normal file
@@ -0,0 +1,221 @@
|
||||
"""Bus factor calculation module."""
|
||||
|
||||
from typing import Optional
|
||||
import math
|
||||
|
||||
from repohealth.models.file_stats import FileAnalysis
|
||||
from repohealth.models.result import RepositoryResult
|
||||
|
||||
|
||||
class BusFactorCalculator:
    """Calculator for bus factor scores based on author distribution.

    A file's bus factor is derived from the Gini coefficient of its
    per-author commit counts: evenly shared files score close to their
    author count, files dominated by one author score close to 1.
    """

    # Exclusive upper bounds on a file's bus factor for each risk level.
    # They are checked in the order critical -> high -> medium; anything
    # at or above the "medium" bound is classified as "low".
    RISK_THRESHOLDS = {
        "critical": 1.0,
        "high": 1.5,
        "medium": 2.0,
        "low": float('inf')
    }

    def __init__(self, risk_threshold: float = 0.7):
        """Initialize the calculator.

        Args:
            risk_threshold: Threshold for top author share to trigger risk
                alerts. It is stored on the instance; none of the methods
                of this class read it.
        """
        self.risk_threshold = risk_threshold

    def calculate_gini(self, values: list[float]) -> float:
        """Calculate the Gini coefficient for a list of values.

        The Gini coefficient measures inequality among values.
        0 = perfect equality, 1 = maximum inequality.

        Args:
            values: List of numeric values (e.g., commit counts per author).

        Returns:
            Gini coefficient clamped to [0, 1]. Returns 0.0 when there are
            fewer than two values or when the values sum to zero.
        """
        n = len(values)
        if n < 2:
            return 0.0

        ordered = sorted(values)
        total = sum(ordered)
        if total == 0:
            return 0.0

        # Rank-weighted sum over the ascending values: sum(rank_i * x_i).
        # Kept as an explicit loop so float results do not depend on how
        # the built-in sum() accumulates.
        rank_weighted = 0.0
        for rank, value in enumerate(ordered, start=1):
            rank_weighted += value * rank

        gini = (2 * rank_weighted) / (n * total) - (n + 1) / n

        # Clamp: negative inputs or rounding can push the raw value
        # outside the documented range.
        return max(0.0, min(1.0, gini))

    def calculate_file_bus_factor(self, analysis: FileAnalysis) -> float:
        """Calculate bus factor for a single file.

        Bus factor is derived from the Gini coefficient of author
        distribution: ``1 + (1 - gini) * (num_authors - 1)``, capped at
        ``num_authors``. A lower bus factor indicates higher risk
        (concentration of ownership).

        Args:
            analysis: FileAnalysis with authorship data. Reads
                ``total_commits``, ``num_authors`` and ``author_commits``.

        Returns:
            Bus factor score (lower = more risky). Files with no commits
            or exactly one author score 1.0.
        """
        if analysis.total_commits == 0 or analysis.num_authors == 1:
            return 1.0

        author_count = analysis.num_authors
        gini = self.calculate_gini(list(analysis.author_commits.values()))

        bus_factor = 1.0 + (1.0 - gini) * (author_count - 1)
        return min(bus_factor, float(author_count))

    def calculate_repository_bus_factor(
        self,
        files: list[FileAnalysis],
        weights: Optional[dict[str, float]] = None
    ) -> float:
        """Calculate overall repository bus factor.

        The result is the weighted mean of the per-file bus factors.

        Args:
            files: List of FileAnalysis objects.
            weights: Optional weights per file, keyed by ``analysis.path``
                (e.g., by importance). Files without an entry weigh 1.0.

        Returns:
            Overall bus factor score; 1.0 when there are no files or the
            weights sum to zero.
        """
        if not files:
            return 1.0

        total_weight = 0.0
        weighted_sum = 0.0

        for analysis in files:
            weight = weights.get(analysis.path, 1.0) if weights else 1.0
            weighted_sum += self.calculate_file_bus_factor(analysis) * weight
            total_weight += weight

        if total_weight == 0:
            return 1.0

        return weighted_sum / total_weight

    def calculate_module_bus_factors(
        self,
        files: list[FileAnalysis]
    ) -> dict[str, dict]:
        """Calculate bus factor for each module/directory.

        Files are grouped by ``analysis.module``; a falsy module is
        grouped under ``"root"``.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Dictionary mapping module to stats: ``bus_factor`` (unweighted
            mean of the module's file bus factors), ``gini_coefficient``
            (Gini of the per-file commit totals), ``file_count`` and
            ``total_commits``.
        """
        modules: dict[str, list[FileAnalysis]] = {}
        for analysis in files:
            modules.setdefault(analysis.module or "root", []).append(analysis)

        module_stats = {}
        for module, module_files in modules.items():
            commit_totals = [f.total_commits for f in module_files]
            module_stats[module] = {
                "bus_factor": self.calculate_repository_bus_factor(module_files),
                "gini_coefficient": self.calculate_gini(commit_totals),
                "file_count": len(module_files),
                "total_commits": sum(commit_totals)
            }

        return module_stats

    def _classify_risk(self, analysis: FileAnalysis, bus_factor: float) -> str:
        """Map one file's authorship data and bus factor to a risk label."""
        if analysis.total_commits == 0:
            return "unknown"
        if analysis.num_authors == 1:
            return "critical"

        for level in ("critical", "high", "medium"):
            if bus_factor < self.RISK_THRESHOLDS[level]:
                return level
        return "low"

    def assign_risk_levels(
        self,
        files: list[FileAnalysis]
    ) -> list[FileAnalysis]:
        """Assign risk levels to files based on bus factor.

        Each object is mutated in place: ``bus_factor`` and ``risk_level``
        are set on it. Files without commits are labelled ``"unknown"``
        and single-author files ``"critical"``; all others are classified
        against ``RISK_THRESHOLDS``.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            The same list, with the updated FileAnalysis objects.
        """
        for analysis in files:
            bus_factor = self.calculate_file_bus_factor(analysis)
            analysis.bus_factor = bus_factor
            analysis.risk_level = self._classify_risk(analysis, bus_factor)

        return files

    def calculate_repository_gini(
        self,
        files: list[FileAnalysis]
    ) -> float:
        """Calculate overall repository Gini coefficient.

        Measures how evenly commits are distributed across authors.
        High Gini means commits are concentrated in few authors.

        Commit counts are summed per author across all files before the
        coefficient is computed. A perfectly even distribution yields 0.0;
        no substitute value is reported in that case.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Overall Gini coefficient; 0.0 when there are no files or fewer
            than two distinct authors.
        """
        if not files:
            return 0.0

        total_commits_by_author: dict[str, int] = {}
        for analysis in files:
            for author, commits in analysis.author_commits.items():
                total_commits_by_author[author] = (
                    total_commits_by_author.get(author, 0) + commits
                )

        # calculate_gini already returns 0.0 for fewer than two values.
        return self.calculate_gini(list(total_commits_by_author.values()))
|
||||
Reference in New Issue
Block a user