"""Bus factor calculation module."""

import math
from collections import Counter, defaultdict
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Used only in type annotations, so they are not imported at runtime.
    from repohealth.models.file_stats import FileAnalysis
    from repohealth.models.result import RepositoryResult


class BusFactorCalculator:
    """Calculator for bus factor scores based on author distribution.

    A file's bus factor is derived from the Gini coefficient of its
    per-author commit counts::

        bus_factor = 1 + (1 - gini) * (num_authors - 1)

    so a file owned by one author scores 1.0 and a file whose commits are
    spread perfectly evenly over ``n`` authors scores ``n``.
    """

    # Exclusive upper bounds on the bus factor for each risk level, checked
    # in the order critical -> high -> medium by ``assign_risk_levels``;
    # anything not below the "medium" bound is "low".
    RISK_THRESHOLDS = {
        "critical": 1.0,
        "high": 1.5,
        "medium": 2.0,
        "low": float("inf"),
    }

    def __init__(self, risk_threshold: float = 0.7):
        """Initialize the calculator.

        Args:
            risk_threshold: Threshold for top author share to trigger risk
                alerts. It is stored on the instance; none of the methods
                of this class read it.
        """
        self.risk_threshold = risk_threshold

    def calculate_gini(self, values: list[float]) -> float:
        """Calculate the Gini coefficient for a list of values.

        The Gini coefficient measures inequality among values.
        0 = perfect equality, 1 = maximum inequality.

        Args:
            values: List of numeric values (e.g., commit counts per author).

        Returns:
            Gini coefficient clamped to the range [0, 1]. Returns 0.0 when
            fewer than two values are given or when the values sum to zero.
        """
        if not values or len(values) < 2:
            return 0.0

        ordered = sorted(values)
        count = len(ordered)
        total = sum(ordered)

        if total == 0:
            return 0.0

        # Sum of value * rank, with ranks starting at 1 for the smallest.
        rank_weighted_sum = sum(
            rank * value for rank, value in enumerate(ordered, start=1)
        )

        # Same formula as 2*S/(n*T) - (n+1)/n, written as a single quotient
        # so integer inputs are combined exactly before the one division.
        gini = (2 * rank_weighted_sum - (count + 1) * total) / (count * total)

        return max(0.0, min(1.0, gini))

    def calculate_file_bus_factor(self, analysis: "FileAnalysis") -> float:
        """Calculate bus factor for a single file.

        Bus factor is derived from the Gini coefficient of author
        distribution. A lower bus factor indicates higher risk
        (concentration of ownership).

        Args:
            analysis: FileAnalysis with authorship data. Reads
                ``total_commits``, ``num_authors`` and ``author_commits``.

        Returns:
            Bus factor score (lower = more risky). 1.0 for files with no
            commits or a single author; otherwise
            ``1 + (1 - gini) * (num_authors - 1)``, capped at
            ``num_authors``.
        """
        if analysis.total_commits == 0 or analysis.num_authors == 1:
            return 1.0

        gini = self.calculate_gini(list(analysis.author_commits.values()))
        bus_factor = 1.0 + (1.0 - gini) * (analysis.num_authors - 1)

        return min(bus_factor, float(analysis.num_authors))

    def calculate_repository_bus_factor(
        self,
        files: "list[FileAnalysis]",
        weights: Optional[dict[str, float]] = None
    ) -> float:
        """Calculate overall repository bus factor.

        The result is the weighted mean of the per-file bus factors.

        Args:
            files: List of FileAnalysis objects.
            weights: Optional weights per file keyed by ``analysis.path``
                (e.g., by importance). Files missing from the mapping, and
                all files when no mapping is given, get weight 1.0.

        Returns:
            Overall bus factor score, or 1.0 when ``files`` is empty or the
            weights sum to zero.
        """
        if not files:
            return 1.0

        total_weight = 0.0
        weighted_sum = 0.0

        for analysis in files:
            weight = weights.get(analysis.path, 1.0) if weights else 1.0
            weighted_sum += self.calculate_file_bus_factor(analysis) * weight
            total_weight += weight

        # Guard the division: every file may carry an explicit zero weight.
        if total_weight == 0:
            return 1.0

        return weighted_sum / total_weight

    def calculate_module_bus_factors(
        self,
        files: "list[FileAnalysis]"
    ) -> dict[str, dict]:
        """Calculate bus factor for each module/directory.

        Files are grouped by ``analysis.module``; files with a falsy module
        are grouped under ``"root"``.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Dictionary mapping module to a stats dict with the keys
            ``"bus_factor"`` (unweighted mean of the module's file bus
            factors), ``"gini_coefficient"`` (Gini of the per-file commit
            totals within the module), ``"file_count"`` and
            ``"total_commits"``.
        """
        files_by_module: "defaultdict[str, list[FileAnalysis]]" = defaultdict(list)
        for analysis in files:
            files_by_module[analysis.module or "root"].append(analysis)

        module_stats = {}
        for module, module_files in files_by_module.items():
            commit_totals = [f.total_commits for f in module_files]
            module_stats[module] = {
                "bus_factor": self.calculate_repository_bus_factor(module_files),
                # Note: inequality across files, not across authors.
                "gini_coefficient": self.calculate_gini(commit_totals),
                "file_count": len(module_files),
                "total_commits": sum(commit_totals),
            }

        return module_stats

    def assign_risk_levels(
        self,
        files: "list[FileAnalysis]"
    ) -> "list[FileAnalysis]":
        """Assign risk levels to files based on bus factor.

        Sets ``bus_factor`` and ``risk_level`` on every object in place.
        Files with no commits are ``"unknown"``, single-author files are
        ``"critical"``, and the rest are classified by comparing the bus
        factor against ``RISK_THRESHOLDS``.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            The same list, with the FileAnalysis objects updated.
        """
        # NOTE(review): assuming num_authors == len(author_commits) and
        # non-negative counts, a file with n >= 2 authors scores at least
        # 2 - 1/n >= 1.5, so the "high" branch and the threshold-based
        # "critical" branch look unreachable in practice - confirm whether
        # the thresholds or the formula should change.
        for analysis in files:
            bus_factor = self.calculate_file_bus_factor(analysis)
            analysis.bus_factor = bus_factor

            if analysis.total_commits == 0:
                analysis.risk_level = "unknown"
            elif analysis.num_authors == 1:
                analysis.risk_level = "critical"
            elif bus_factor < self.RISK_THRESHOLDS["critical"]:
                analysis.risk_level = "critical"
            elif bus_factor < self.RISK_THRESHOLDS["high"]:
                analysis.risk_level = "high"
            elif bus_factor < self.RISK_THRESHOLDS["medium"]:
                analysis.risk_level = "medium"
            else:
                analysis.risk_level = "low"

        return files

    def calculate_repository_gini(
        self,
        files: "list[FileAnalysis]"
    ) -> float:
        """Calculate overall repository Gini coefficient.

        Measures how evenly commits are distributed across authors.
        High Gini means commits are concentrated in few authors.

        Per-author commit counts are summed over all files and the Gini
        coefficient of those totals is returned unmodified; a perfectly
        even distribution yields 0.0.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Overall Gini coefficient, or 0.0 when there are no files or
            fewer than two distinct authors.
        """
        commits_by_author: Counter = Counter()
        for analysis in files:
            # Counter.update adds the counts of a mapping to existing ones.
            commits_by_author.update(analysis.author_commits)

        # calculate_gini already returns 0.0 for fewer than two values.
        return self.calculate_gini(list(commits_by_author.values()))