Initial upload: Add repohealth-cli project with CI/CD workflow

2026-02-05 17:13:58 +00:00
parent 61ac71fbc1
commit ca2ff9e040
1 changed files with 221 additions and 0 deletions
--- a/src/repohealth/analyzers/bus_factor.py
+++ b/src/repohealth/analyzers/bus_factor.py
@@ -0,0 +1,221 @@
+"""Bus factor calculation module."""
+
+from typing import Optional
+import math
+
+from repohealth.models.file_stats import FileAnalysis
+from repohealth.models.result import RepositoryResult
+
+
+class BusFactorCalculator:
+    """Calculator for bus factor scores based on author distribution."""
+
+    RISK_THRESHOLDS = {
+        "critical": 1.0,
+        "high": 1.5,
+        "medium": 2.0,
+        "low": float('inf')
+    }
+
+    def __init__(self, risk_threshold: float = 0.7):
+        """Create a calculator and remember its alert threshold.
+
+        Args:
+            risk_threshold: Top-author share used as the risk-alert threshold.
+        """
+        self.risk_threshold = risk_threshold
+
+    def calculate_gini(self, values: list[float]) -> float:
+        """Compute the Gini coefficient of a collection of numbers.
+
+        The coefficient expresses how unevenly the total is shared:
+        0 means every entry is equal, larger values mean more inequality.
+
+        Args:
+            values: Numeric values (e.g., commit counts per author).
+
+        Returns:
+            Gini coefficient clamped to [0, 1]; 0.0 when fewer than two
+            values are given or when the values sum to zero.
+        """
+        if not (values and len(values) >= 2):
+            return 0.0
+
+        ordered = sorted(values)
+        count = len(ordered)
+        grand_total = sum(ordered)
+        if grand_total == 0:
+            return 0.0
+
+        # Rank-weighted sum: smallest value gets rank 1, largest rank n.
+        rank_weighted = 0.0
+        for rank, amount in enumerate(ordered, start=1):
+            rank_weighted += amount * rank
+
+        raw = (2 * rank_weighted) / (count * grand_total) - (count + 1) / count
+        capped = min(1.0, raw)
+        return max(0.0, capped)
+
+    def calculate_file_bus_factor(self, analysis: FileAnalysis) -> float:
+        """Score how widely a single file's commits are shared among authors.
+
+        The score is 1 plus (1 - Gini) times the number of extra authors:
+        an even split gives the author count and skew lowers the score.
+
+        Args:
+            analysis: FileAnalysis with authorship data.
+
+        Returns:
+            Bus factor score (lower = more risky); 1.0 for files with no
+            commits or exactly one author.
+        """
+        if analysis.total_commits == 0 or analysis.num_authors == 1:
+            return 1.0
+
+        author_count = analysis.num_authors
+        # Gini over per-author commit counts measures ownership concentration.
+        per_author = list(analysis.author_commits.values())
+        inequality = self.calculate_gini(per_author)
+
+        spread = 1.0 + (1.0 - inequality) * (author_count - 1)
+        # Never report more than the number of authors.
+        return min(spread, float(author_count))
+
+    def calculate_repository_bus_factor(
+        self,
+        files: list[FileAnalysis],
+        weights: Optional[dict[str, float]] = None
+    ) -> float:
+        """Compute the weighted mean of the per-file bus factors.
+
+        Args:
+            files: List of FileAnalysis objects.
+            weights: Optional mapping of file path to weight; files without
+                an entry (or all files, when omitted) count with weight 1.0.
+
+        Returns:
+            Weighted average bus factor, or 1.0 when there are no files or
+            the weights sum to zero.
+        """
+        if not files:
+            return 1.0
+
+        numerator = denominator = 0.0
+        for item in files:
+            score = self.calculate_file_bus_factor(item)
+            # An empty or missing weights mapping means uniform weighting.
+            if weights:
+                share = weights.get(item.path, 1.0)
+            else:
+                share = 1.0
+            numerator += score * share
+            denominator += share
+
+        return numerator / denominator if denominator != 0 else 1.0
+
+    def calculate_module_bus_factors(
+        self,
+        files: list[FileAnalysis]
+    ) -> dict[str, dict]:
+        """Group files by module and summarise each group.
+
+        Files whose ``module`` is empty or None are grouped under "root".
+
+        Args:
+            files: List of FileAnalysis objects.
+
+        Returns:
+            Dictionary mapping module name to its bus factor, the Gini
+            coefficient of per-file commit totals, file count and commits.
+        """
+        grouped: dict[str, list[FileAnalysis]] = {}
+        for item in files:
+            grouped.setdefault(item.module or "root", []).append(item)
+
+        summary = {}
+        for name, members in grouped.items():
+            # Unweighted mean of the member files' bus factors.
+            mean_score = self.calculate_repository_bus_factor(members)
+            # Inequality is measured across files here, not across authors.
+            commit_totals = [member.total_commits for member in members]
+            inequality = self.calculate_gini(commit_totals)
+
+            summary[name] = {
+                "bus_factor": mean_score,
+                "gini_coefficient": inequality,
+                "file_count": len(members),
+                "total_commits": sum(commit_totals),
+            }
+
+        return summary
+
+    def assign_risk_levels(
+        self,
+        files: list[FileAnalysis]
+    ) -> list[FileAnalysis]:
+        """Annotate every file in place with its bus factor and risk level.
+
+        Args:
+            files: List of FileAnalysis objects; each one is mutated.
+
+        Returns:
+            The same list, with bus_factor and risk_level set on each item.
+        """
+        limits = self.RISK_THRESHOLDS
+        for item in files:
+            score = self.calculate_file_bus_factor(item)
+            item.bus_factor = score
+
+            if item.total_commits == 0:
+                level = "unknown"
+            elif item.num_authors == 1 or score < limits["critical"]:
+                level = "critical"
+            elif score < limits["high"]:
+                level = "high"
+            elif score < limits["medium"]:
+                level = "medium"
+            else:
+                level = "low"
+            item.risk_level = level
+
+        return files
+
+    def calculate_repository_gini(
+        self,
+        files: list[FileAnalysis]
+    ) -> float:
+        """Calculate overall repository Gini coefficient.
+
+        Commits are summed per author across all files, and the Gini
+        coefficient of those per-author totals is returned. A high value
+        means commits are concentrated in few authors.
+
+        The result is always the true coefficient: an even split across
+        authors yields 0.0 no matter how many files are involved. No
+        placeholder value is substituted for a zero result, because that
+        would report balanced repositories as moderately unequal.
+
+        Args:
+            files: List of FileAnalysis objects.
+
+        Returns:
+            Gini coefficient between 0 and 1; 0.0 when there are no files
+            or fewer than two distinct authors.
+        """
+        if not files:
+            return 0.0
+
+        # Aggregate commit counts per author over the whole repository.
+        commits_by_author: dict[str, int] = {}
+        for analysis in files:
+            for author, commits in analysis.author_commits.items():
+                commits_by_author[author] = (
+                    commits_by_author.get(author, 0) + commits
+                )
+
+        totals = list(commits_by_author.values())
+        # With fewer than two authors there is no distribution to measure.
+        if len(totals) < 2:
+            return 0.0
+
+        return self.calculate_gini(totals)