fix: resolve CI issues - remove unused imports and fix code quality
This commit is contained in:
217
repohealth-cli/src/repohealth/analyzers/bus_factor.py
Normal file
217
repohealth-cli/src/repohealth/analyzers/bus_factor.py
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from repohealth.models.file_stats import FileAnalysis
|
||||||
|
|
||||||
|
|
||||||
|
class BusFactorCalculator:
    """Compute bus-factor and Gini scores from per-file authorship data.

    Every method works on ``FileAnalysis`` objects and reads only these
    attributes from them: ``total_commits``, ``num_authors``,
    ``author_commits`` (a mapping of author to commit count), ``path`` and
    ``module``.  ``assign_risk_levels`` additionally writes ``bus_factor``
    and ``risk_level`` back onto each object.
    """

    # Exclusive upper bounds on the bus factor for each risk level.  They are
    # checked from most to least severe; a score that is not below the
    # "medium" bound is classified as "low".
    RISK_THRESHOLDS = {
        "critical": 1.0,
        "high": 1.5,
        "medium": 2.0,
        "low": float("inf"),
    }

    def __init__(self, risk_threshold: float = 0.7):
        """Initialize the calculator.

        Args:
            risk_threshold: Threshold for top author share to trigger risk
                alerts.  It is only stored on the instance here; no method
                of this class reads it (NOTE(review): presumably consumed
                by callers -- confirm).
        """
        self.risk_threshold = risk_threshold

    def calculate_gini(self, values: list[float]) -> float:
        """Calculate the Gini coefficient for a list of values.

        The Gini coefficient measures inequality among values:
        0 means perfect equality, larger values mean more inequality.

        Args:
            values: Numeric values (e.g., commit counts per author).

        Returns:
            The coefficient clamped to the range [0.0, 1.0].  Returns 0.0
            when fewer than two values are given or when they sum to zero.
        """
        if not values or len(values) < 2:
            return 0.0

        ordered = sorted(values)
        total = sum(ordered)
        if total == 0:
            return 0.0

        n = len(ordered)
        # Rank-weighted sum over the ascending values (ranks start at 1).
        rank_weighted = sum(
            rank * value for rank, value in enumerate(ordered, start=1)
        )
        gini = (2 * rank_weighted) / (n * total) - (n + 1) / n

        # Clamp so rounding error (or unexpected negative inputs) can never
        # push the result outside [0, 1].
        return max(0.0, min(1.0, gini))

    def calculate_file_bus_factor(self, analysis: FileAnalysis) -> float:
        """Calculate the bus factor for a single file.

        The score is ``1 + (1 - gini) * (num_authors - 1)``, capped at
        ``num_authors``, where ``gini`` is the Gini coefficient of the
        per-author commit counts.  A lower score means ownership is more
        concentrated (higher risk).

        Args:
            analysis: FileAnalysis with authorship data.

        Returns:
            Bus factor score (lower = more risky).  Files with no commits
            or exactly one author score 1.0.
        """
        if analysis.total_commits == 0 or analysis.num_authors == 1:
            return 1.0

        gini = self.calculate_gini(list(analysis.author_commits.values()))

        # NOTE(review): for non-negative counts the Gini of n values is at
        # most (n - 1) / n, so -- assuming num_authors equals
        # len(author_commits) -- a multi-author file never scores below 1.5
        # and the "high" risk band looks unreachable.  Confirm whether a
        # normalised Gini was intended.
        score = 1.0 + (1.0 - gini) * (analysis.num_authors - 1)
        return min(score, float(analysis.num_authors))

    def calculate_repository_bus_factor(
        self,
        files: list[FileAnalysis],
        weights: Optional[dict[str, float]] = None
    ) -> float:
        """Calculate the overall repository bus factor.

        This is the weighted mean of the per-file bus factors.

        Args:
            files: List of FileAnalysis objects.
            weights: Optional weight per file, keyed by ``analysis.path``.
                Files missing from the mapping get weight 1.0; when the
                mapping is omitted or empty every file gets weight 1.0.

        Returns:
            Weighted average bus factor, or 1.0 when ``files`` is empty or
            the weights sum to zero.
        """
        if not files:
            return 1.0

        weighted_sum = 0.0
        total_weight = 0.0
        for analysis in files:
            weight = weights.get(analysis.path, 1.0) if weights else 1.0
            weighted_sum += self.calculate_file_bus_factor(analysis) * weight
            total_weight += weight

        # Avoid dividing by zero when every supplied weight is 0.
        if total_weight == 0:
            return 1.0
        return weighted_sum / total_weight

    def calculate_module_bus_factors(
        self,
        files: list[FileAnalysis]
    ) -> dict[str, dict]:
        """Calculate bus factor statistics for each module/directory.

        Files are grouped by ``analysis.module``; a falsy module is grouped
        under ``"root"``.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Dictionary mapping module name to a dict with the keys
            ``bus_factor`` (unweighted mean of the module's file bus
            factors), ``gini_coefficient`` (Gini of the files' total commit
            counts), ``file_count`` and ``total_commits``.
        """
        grouped: dict[str, list[FileAnalysis]] = {}
        for analysis in files:
            grouped.setdefault(analysis.module or "root", []).append(analysis)

        module_stats: dict[str, dict] = {}
        for module, members in grouped.items():
            commit_counts = [member.total_commits for member in members]
            module_stats[module] = {
                "bus_factor": self.calculate_repository_bus_factor(members),
                "gini_coefficient": self.calculate_gini(commit_counts),
                "file_count": len(members),
                "total_commits": sum(commit_counts),
            }
        return module_stats

    def _risk_level_for(self, analysis: FileAnalysis, bus_factor: float) -> str:
        """Map one file's commit/author counts and bus factor to a risk label."""
        if analysis.total_commits == 0:
            return "unknown"
        # A single-author file scores exactly 1.0, which is not *below* the
        # "critical" bound, so it has to be classified explicitly.
        if analysis.num_authors == 1:
            return "critical"
        for level in ("critical", "high", "medium"):
            if bus_factor < self.RISK_THRESHOLDS[level]:
                return level
        return "low"

    def assign_risk_levels(
        self,
        files: list[FileAnalysis]
    ) -> list[FileAnalysis]:
        """Assign a bus factor and risk level to every file, in place.

        Sets ``analysis.bus_factor`` and ``analysis.risk_level`` on each
        object.  The level is ``"unknown"`` for files without commits,
        ``"critical"`` for single-author files, and otherwise the first of
        ``"critical"``, ``"high"``, ``"medium"`` whose RISK_THRESHOLDS bound
        the bus factor is below, falling back to ``"low"``.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            The same list that was passed in, with its items updated.
        """
        for analysis in files:
            bus_factor = self.calculate_file_bus_factor(analysis)
            analysis.bus_factor = bus_factor
            analysis.risk_level = self._risk_level_for(analysis, bus_factor)
        return files

    def calculate_repository_gini(
        self,
        files: list[FileAnalysis]
    ) -> float:
        """Calculate the overall repository Gini coefficient.

        Commit counts are summed per author across all files and the Gini
        coefficient of those totals is returned.  A high value means commits
        are concentrated in few authors; an even distribution is reported
        as 0.0 and is never replaced by a placeholder value.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Overall Gini coefficient; 0.0 when there are no files or fewer
            than two distinct authors.
        """
        if not files:
            return 0.0

        commits_by_author: dict[str, int] = {}
        for analysis in files:
            for author, commits in analysis.author_commits.items():
                commits_by_author[author] = (
                    commits_by_author.get(author, 0) + commits
                )

        # calculate_gini already returns 0.0 for fewer than two values.
        return self.calculate_gini(list(commits_by_author.values()))
|
||||||
Reference in New Issue
Block a user