fix: update CI workflow with proper checkout paths
Some checks failed
CI / lint (push) Successful in 9m27s
CI / test (push) Failing after 4m46s
CI / build (push) Has been skipped

This commit is contained in:
Developer
2026-02-05 18:03:00 +00:00
commit 5b74fccad8
28 changed files with 3461 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
"""Analysis modules for repository health assessment."""
from repohealth.analyzers.bus_factor import BusFactorCalculator
from repohealth.analyzers.git_analyzer import GitAnalyzer
from repohealth.analyzers.risk_analyzer import RiskAnalyzer
__all__ = ["GitAnalyzer", "BusFactorCalculator", "RiskAnalyzer"]

View File

@@ -0,0 +1,219 @@
"""Bus factor calculation module."""
from typing import Optional
from repohealth.models.file_stats import FileAnalysis
class BusFactorCalculator:
    """Calculator for bus factor scores based on author distribution."""

    # Bus-factor thresholds (exclusive upper bounds) per risk label; a file
    # whose bus factor falls below a threshold lands in that bucket.
    RISK_THRESHOLDS = {
        "critical": 1.0,
        "high": 1.5,
        "medium": 2.0,
        "low": float('inf')
    }

    def __init__(self, risk_threshold: float = 0.7):
        """Initialize the calculator.

        Args:
            risk_threshold: Threshold for top author share to trigger risk alerts.
        """
        self.risk_threshold = risk_threshold

    def calculate_gini(self, values: list[float]) -> float:
        """Calculate the Gini coefficient for a list of values.

        The Gini coefficient measures inequality among values.
        0 = perfect equality, 1 = maximum inequality.

        Args:
            values: List of numeric values (e.g., commit counts per author).

        Returns:
            Gini coefficient between 0 and 1.
        """
        if not values or len(values) < 2:
            return 0.0
        sorted_values = sorted(values)
        n = len(sorted_values)
        total = sum(sorted_values)
        if total == 0:
            return 0.0
        # G = 2 * sum(rank_i * x_i) / (n * sum(x)) - (n + 1) / n, with x
        # sorted ascending and rank_i the 1-based rank.
        weighted = sum(
            value * rank for rank, value in enumerate(sorted_values, start=1)
        )
        gini = (2 * weighted) / (n * total) - (n + 1) / n
        # Clamp to [0, 1] to absorb floating-point drift.
        return max(0.0, min(1.0, gini))

    def calculate_file_bus_factor(self, analysis: FileAnalysis) -> float:
        """Calculate bus factor for a single file.

        Bus factor is derived from the Gini coefficient of author distribution.
        A lower bus factor indicates higher risk (concentration of ownership).

        Args:
            analysis: FileAnalysis with authorship data.

        Returns:
            Bus factor score (lower = more risky).
        """
        # No history or a single author bottoms out at the minimum score.
        if analysis.total_commits == 0 or analysis.num_authors == 1:
            return 1.0
        gini = self.calculate_gini(list(analysis.author_commits.values()))
        # Perfect equality (gini == 0) yields num_authors; total inequality
        # (gini == 1) collapses to 1.0.
        bus_factor = 1.0 + (1.0 - gini) * (analysis.num_authors - 1)
        return min(bus_factor, float(analysis.num_authors))

    def calculate_repository_bus_factor(
        self,
        files: list[FileAnalysis],
        weights: Optional[dict[str, float]] = None
    ) -> float:
        """Calculate overall repository bus factor.

        Args:
            files: List of FileAnalysis objects.
            weights: Optional weights per file path (e.g., by importance);
                files not listed default to weight 1.0.

        Returns:
            Overall (weighted-average) bus factor score.
        """
        if not files:
            return 1.0
        total_weight = 0.0
        weighted_sum = 0.0
        for analysis in files:
            weight = weights.get(analysis.path, 1.0) if weights else 1.0
            weighted_sum += self.calculate_file_bus_factor(analysis) * weight
            total_weight += weight
        if total_weight == 0:
            # All-zero weights: avoid division by zero, report neutral score.
            return 1.0
        return weighted_sum / total_weight

    def calculate_module_bus_factors(
        self,
        files: list[FileAnalysis]
    ) -> dict[str, dict]:
        """Calculate bus factor for each module/directory.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Dictionary mapping module to stats including bus factor.
        """
        # Group files by module; files with no module fall into "root".
        modules: dict[str, list[FileAnalysis]] = {}
        for analysis in files:
            modules.setdefault(analysis.module or "root", []).append(analysis)
        module_stats = {}
        for module, module_files in modules.items():
            module_stats[module] = {
                "bus_factor": self.calculate_repository_bus_factor(module_files),
                "gini_coefficient": self.calculate_gini(
                    [f.total_commits for f in module_files]
                ),
                "file_count": len(module_files),
                "total_commits": sum(f.total_commits for f in module_files)
            }
        return module_stats

    def assign_risk_levels(
        self,
        files: list[FileAnalysis]
    ) -> list[FileAnalysis]:
        """Assign risk levels to files based on bus factor.

        Mutates each FileAnalysis in place (bus_factor and risk_level).

        Args:
            files: List of FileAnalysis objects.

        Returns:
            The same FileAnalysis objects, updated with risk levels.
        """
        for analysis in files:
            bus_factor = self.calculate_file_bus_factor(analysis)
            analysis.bus_factor = bus_factor
            if analysis.total_commits == 0:
                # No history at all: risk cannot be assessed.
                analysis.risk_level = "unknown"
            elif analysis.num_authors == 1:
                # Sole ownership is always critical regardless of bus factor.
                analysis.risk_level = "critical"
            elif bus_factor < self.RISK_THRESHOLDS["critical"]:
                analysis.risk_level = "critical"
            elif bus_factor < self.RISK_THRESHOLDS["high"]:
                analysis.risk_level = "high"
            elif bus_factor < self.RISK_THRESHOLDS["medium"]:
                analysis.risk_level = "medium"
            else:
                analysis.risk_level = "low"
        return files

    def calculate_repository_gini(
        self,
        files: list[FileAnalysis]
    ) -> float:
        """Calculate overall repository Gini coefficient.

        Measures how evenly commits are distributed across authors.
        High Gini means commits are concentrated in few authors.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Overall Gini coefficient.
        """
        if not files:
            return 0.0
        # Aggregate per-author commit totals across every file.
        total_commits_by_author: dict[str, int] = {}
        for analysis in files:
            for author, commits in analysis.author_commits.items():
                total_commits_by_author[author] = (
                    total_commits_by_author.get(author, 0) + commits
                )
        values = list(total_commits_by_author.values())
        if len(values) < 2:
            # Zero or one author: inequality is undefined; report equality.
            return 0.0
        # Fix: a perfectly even distribution previously had its true Gini of
        # 0.0 overridden with an arbitrary 0.5, contradicting the documented
        # contract. Return the actual coefficient.
        return self.calculate_gini(values)

View File

@@ -0,0 +1,230 @@
"""Git repository analyzer using GitPython."""
from collections.abc import Generator
from datetime import datetime
from pathlib import Path
from typing import Optional
from git import Commit, Repo
from git.exc import InvalidGitRepositoryError, NoSuchPathError
from repohealth.models.author import AuthorStats
from repohealth.models.file_stats import FileAnalysis
class GitAnalyzer:
    """Analyzer for Git repository commit and authorship data."""

    def __init__(self, repo_path: str):
        """Initialize the analyzer with a repository path.

        Args:
            repo_path: Path to the Git repository.
        """
        self.repo_path = Path(repo_path)
        # Populated by validate_repository(); all query methods degrade to
        # empty results until a valid repository has been opened.
        self.repo: Optional[Repo] = None
        self._authors: dict[str, AuthorStats] = {}

    def validate_repository(self) -> bool:
        """Validate that the path is a valid (non-bare) Git repository.

        Returns:
            True if valid, False otherwise.
        """
        try:
            self.repo = Repo(self.repo_path)
            return not self.repo.bare
        except (InvalidGitRepositoryError, NoSuchPathError):
            return False

    def get_commit_count(self) -> int:
        """Get total commit count in the repository.

        Returns:
            Total number of commits (0 when no repository is loaded).
        """
        if not self.repo:
            return 0
        # Count lazily rather than materializing every Commit in a list.
        return sum(1 for _ in self.repo.iter_commits())

    def get_unique_authors(self) -> dict[str, AuthorStats]:
        """Get all unique authors in the repository.

        Returns:
            Dictionary mapping author email to AuthorStats.
        """
        if not self.repo:
            return {}
        authors: dict[str, AuthorStats] = {}
        for commit in self.repo.iter_commits():
            author_key = commit.author.email
            if author_key not in authors:
                authors[author_key] = AuthorStats(
                    name=commit.author.name,
                    email=commit.author.email
                )
            stats = authors[author_key]
            stats.total_commits += 1
            # Fix: iter_commits() yields newest-first, so the first commit
            # seen is the author's most recent and the final one seen is the
            # earliest. The previous logic recorded them swapped.
            if not stats.last_commit:
                stats.last_commit = commit.authored_datetime
            stats.first_commit = commit.authored_datetime
        self._authors = authors
        return authors

    def iter_file_commits(
        self,
        path: Optional[str] = None,
        extensions: Optional[list[str]] = None,
        depth: Optional[int] = None
    ) -> Generator[tuple[str, Commit], None, None]:
        """Iterate through commits with file information.

        Args:
            path: Optional path prefix to filter files.
            extensions: Optional list of file extensions (no dot) to include.
            depth: Optional limit on commit history depth.

        Yields:
            Tuples of (file_path, commit).
        """
        if not self.repo:
            return
        commit_count = 0
        for commit in self.repo.iter_commits():
            # Fix: compare against None so an explicit depth of 0 is honored
            # instead of being treated as "unlimited".
            if depth is not None and commit_count >= depth:
                break
            try:
                for touched_path in commit.stats.files.keys():
                    if path and not touched_path.startswith(path):
                        continue
                    if extensions:
                        ext = Path(touched_path).suffix.lstrip('.')
                        if ext not in extensions:
                            continue
                    yield touched_path, commit
            except (ValueError, KeyError):
                # Stats may be unavailable for some commits; skip them
                # rather than aborting the whole iteration.
                continue
            commit_count += 1

    def analyze_file_authors(
        self,
        file_path: str,
        depth: Optional[int] = None
    ) -> FileAnalysis:
        """Analyze authorship for a single file.

        Args:
            file_path: Path to the file.
            depth: Optional limit on commit history depth.

        Returns:
            FileAnalysis with authorship statistics (empty when no
            repository is loaded).
        """
        author_commits: dict[str, int] = {}
        first_commit: Optional[datetime] = None
        last_commit: Optional[datetime] = None
        total_commits = 0
        commit_count = 0
        # Fix: guard against an unvalidated repository — every sibling
        # method checks self.repo before use, this one did not.
        commits = self.repo.iter_commits(paths=file_path) if self.repo else []
        for commit in commits:
            # Fix: honor an explicit depth of 0 (see iter_file_commits).
            if depth is not None and commit_count >= depth:
                break
            total_commits += 1
            author_email = commit.author.email
            author_commits[author_email] = author_commits.get(author_email, 0) + 1
            # Fix: commits arrive newest-first, so the first one seen is the
            # most recent (last_commit) and first_commit keeps updating until
            # it holds the earliest. The previous logic had them swapped.
            if not last_commit:
                last_commit = commit.authored_datetime
            first_commit = commit.authored_datetime
            commit_count += 1
        return FileAnalysis(
            path=file_path,
            total_commits=total_commits,
            author_commits=author_commits,
            first_commit=first_commit,
            last_commit=last_commit,
            module=str(Path(file_path).parent),
            extension=Path(file_path).suffix.lstrip('.')
        )

    def get_all_files(
        self,
        extensions: Optional[list[str]] = None
    ) -> list[str]:
        """Get all tracked files in the repository.

        Args:
            extensions: Optional list of file extensions (no dot) to include.

        Returns:
            List of file paths.
        """
        if not self.repo:
            return []
        files = []
        for item in self.repo.tree().traverse():
            if item.type != 'blob':
                continue
            if extensions:
                ext = Path(item.path).suffix.lstrip('.')
                if ext not in extensions:
                    continue
            files.append(item.path)
        return files

    def get_file_modules(self) -> dict[str, list[str]]:
        """Group tracked files by their module/directory.

        Returns:
            Dictionary mapping module (parent directory) to list of files.
        """
        modules: dict[str, list[str]] = {}
        for file_path in self.get_all_files():
            module = str(Path(file_path).parent)
            modules.setdefault(module, []).append(file_path)
        return modules

    def get_head_commit(self) -> Optional[Commit]:
        """Get the HEAD commit of the repository.

        Returns:
            HEAD Commit, or None when the repository is missing or empty.
        """
        if not self.repo:
            return None
        try:
            return self.repo.head.commit
        except ValueError:
            # An empty repository has no HEAD to resolve.
            return None

    def get_branch_count(self) -> int:
        """Get the number of branches in the repository.

        Returns:
            Number of branches (0 when no repository is loaded).
        """
        if not self.repo:
            return 0
        return len(list(self.repo.branches))

View File

@@ -0,0 +1,309 @@
"""Risk analysis and hotspot identification module."""
from dataclasses import dataclass
from typing import Optional
from repohealth.analyzers.bus_factor import BusFactorCalculator
from repohealth.models.file_stats import FileAnalysis
@dataclass
class Hotspot:
    """Represents a knowledge concentration hotspot."""
    # Repository-relative path of the affected file.
    file_path: str
    # Risk label assigned by RiskAnalyzer ("critical" / "high" / ...).
    risk_level: str
    # Bus factor score for the file; lower means riskier.
    bus_factor: float
    # Author with the most commits to the file.
    top_author: str
    # Fraction of the file's commits made by top_author (0.0-1.0).
    top_author_share: float
    # Total number of commits touching the file.
    total_commits: int
    # Number of distinct authors who touched the file.
    num_authors: int
    # Module/directory the file belongs to.
    module: str
    # Human-readable diversification advice; may be empty.
    suggestion: str = ""
@dataclass
class DiversificationSuggestion:
    """Represents a suggestion for code ownership diversification."""
    # Repository-relative path of the file to diversify.
    file_path: str
    # The author who currently dominates ownership.
    current_author: str
    # Other authors recommended to take on work for this file.
    suggested_authors: list[str]
    # "critical" / "high" / "medium" (see RiskAnalyzer.generate_suggestions).
    priority: str
    # Human-readable explanation of why the suggestion was raised.
    reason: str
    # Concrete recommended action (e.g. assigning reviewers).
    action: str
class RiskAnalyzer:
    """Analyzer for knowledge concentration and risk assessment."""

    # Top-author-share thresholds for classifying per-file risk.
    CRITICAL_THRESHOLD = 0.8
    HIGH_THRESHOLD = 0.6
    MEDIUM_THRESHOLD = 0.4

    # Explicit sort rank for risk labels: lower value = riskier.
    _RISK_ORDER = {"critical": 0, "high": 1, "medium": 2, "low": 3}

    def __init__(self, risk_threshold: float = 0.7):
        """Initialize the analyzer.

        Args:
            risk_threshold: Threshold for risk detection.
        """
        self.risk_threshold = risk_threshold
        self.bus_factor_calculator = BusFactorCalculator(risk_threshold)

    def identify_hotspots(
        self,
        files: list[FileAnalysis],
        limit: int = 20
    ) -> list[Hotspot]:
        """Identify knowledge concentration hotspots.

        Only files whose top-author share classifies as "critical" or
        "high" are reported.

        Args:
            files: List of FileAnalysis objects.
            limit: Maximum number of hotspots to return.

        Returns:
            List of Hotspot objects sorted riskiest-first.
        """
        hotspots = []
        for analysis in files:
            if analysis.total_commits == 0:
                continue
            top_author_data = analysis.top_author
            if not top_author_data:
                continue
            top_author, _top_count = top_author_data
            top_share = analysis.top_author_share
            if top_share >= self.CRITICAL_THRESHOLD:
                risk_level = "critical"
            elif top_share >= self.HIGH_THRESHOLD:
                risk_level = "high"
            elif top_share >= self.MEDIUM_THRESHOLD:
                risk_level = "medium"
            else:
                risk_level = "low"
            if risk_level in ("critical", "high"):
                hotspots.append(Hotspot(
                    file_path=analysis.path,
                    risk_level=risk_level,
                    bus_factor=analysis.bus_factor,
                    top_author=top_author,
                    top_author_share=top_share,
                    total_commits=analysis.total_commits,
                    num_authors=analysis.num_authors,
                    module=analysis.module,
                    suggestion=self._generate_suggestion(analysis, top_author)
                ))
        # Fix: sort by explicit risk rank, then by *ascending* bus factor so
        # the most concentrated (lowest bus factor) files lead each level.
        # The previous key negated bus_factor, listing the least risky files
        # first, and relied on alphabetical luck for the level ordering.
        hotspots.sort(key=lambda h: (self._RISK_ORDER[h.risk_level], h.bus_factor))
        return hotspots[:limit]

    def _generate_suggestion(
        self,
        analysis: FileAnalysis,
        top_author: str
    ) -> str:
        """Generate a diversification suggestion for a file.

        Args:
            analysis: FileAnalysis for the file.
            top_author: The primary author.

        Returns:
            Suggestion string.
        """
        if analysis.num_authors == 1:
            return (
                f"This file is entirely owned by {top_author}. "
                "Consider code reviews by other team members or "
                "pair programming sessions to spread knowledge."
            )
        # Consistency: use the shared constant instead of a bare 0.8 literal.
        elif analysis.top_author_share >= self.CRITICAL_THRESHOLD:
            return (
                f"This file is {analysis.top_author_share:.0%} owned by {top_author}. "
                "Encourage other developers to contribute to this file."
            )
        else:
            return (
                f"Primary ownership by {top_author} at {analysis.top_author_share:.0%}. "
                "Gradually increase contributions from other team members."
            )

    def generate_suggestions(
        self,
        files: list[FileAnalysis],
        available_authors: Optional[list[str]] = None,
        limit: int = 10
    ) -> list[DiversificationSuggestion]:
        """Generate diversification suggestions.

        Only files at or above CRITICAL_THRESHOLD ownership are considered.

        Args:
            files: List of FileAnalysis objects.
            available_authors: List of available authors to suggest.
            limit: Maximum number of suggestions to return.

        Returns:
            List of DiversificationSuggestion objects, highest priority first.
        """
        suggestions = []
        for analysis in files:
            if analysis.total_commits == 0:
                continue
            top_author_data = analysis.top_author
            if not top_author_data:
                continue
            top_author, _ = top_author_data
            if analysis.top_author_share < self.CRITICAL_THRESHOLD:
                continue
            if available_authors:
                # Prefer available authors who have already touched the file;
                # pad with other available authors until we have two.
                other_authors = [
                    a for a in available_authors
                    if a != top_author and a in analysis.author_commits
                ]
                if len(other_authors) < 2:
                    # Fix: exclude authors already selected so padding cannot
                    # suggest the same reviewer twice.
                    extras = [
                        a for a in available_authors
                        if a != top_author and a not in other_authors
                    ]
                    other_authors.extend(extras[:2 - len(other_authors)])
            else:
                other_authors = [
                    a for a in analysis.author_commits.keys()
                    if a != top_author
                ][:3]
            if not other_authors:
                continue
            if analysis.top_author_share >= 0.9:
                priority = "critical"
            elif analysis.top_author_share >= self.CRITICAL_THRESHOLD:
                priority = "high"
            else:
                priority = "medium"
            reason = (
                f"File has {analysis.top_author_share:.0%} ownership by {top_author} "
                f"across {analysis.total_commits} commits with {analysis.num_authors} authors."
            )
            action = (
                f"Assign code reviews to {', '.join(other_authors[:2])} "
                f"for changes to {analysis.path}"
            )
            suggestions.append(DiversificationSuggestion(
                file_path=analysis.path,
                current_author=top_author,
                suggested_authors=other_authors,
                priority=priority,
                reason=reason,
                action=action
            ))
        # Highest priority first; ties broken by path for determinism.
        suggestions.sort(key=lambda s: (self._RISK_ORDER[s.priority], s.file_path))
        return suggestions[:limit]

    def calculate_risk_summary(
        self,
        files: list[FileAnalysis]
    ) -> dict:
        """Calculate a summary of repository risk.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Dictionary with per-level counts, percentage_critical,
            percentage_high, and an overall_risk label.
        """
        if not files:
            # Fix: keep the same shape as the non-empty result so callers
            # can read the percentage keys unconditionally.
            return {
                "critical": 0,
                "high": 0,
                "medium": 0,
                "low": 0,
                "unknown": 0,
                "percentage_critical": 0,
                "percentage_high": 0,
                "overall_risk": "unknown"
            }
        risk_counts = {"critical": 0, "high": 0, "medium": 0, "low": 0, "unknown": 0}
        for analysis in files:
            # Robustness: bucket unrecognized labels as "unknown" instead of
            # raising KeyError on an unexpected risk_level value.
            label = analysis.risk_level
            risk_counts[label if label in risk_counts else "unknown"] += 1
        total = len(files)
        if risk_counts["critical"] >= total * 0.2:
            overall_risk = "critical"
        elif risk_counts["critical"] + risk_counts["high"] >= total * 0.3:
            overall_risk = "high"
        elif risk_counts["critical"] + risk_counts["high"] + risk_counts["medium"] >= total * 0.4:
            overall_risk = "medium"
        else:
            overall_risk = "low"
        # total >= 1 here, so the divisions are safe.
        risk_counts["percentage_critical"] = risk_counts["critical"] / total * 100
        risk_counts["percentage_high"] = risk_counts["high"] / total * 100
        risk_counts["overall_risk"] = overall_risk
        return risk_counts

    def analyze_module_risk(
        self,
        files: list[FileAnalysis]
    ) -> dict:
        """Analyze risk at the module level.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Dictionary mapping modules to risk statistics.
        """
        # Group files by module; files with no module fall into "root".
        modules: dict[str, list[FileAnalysis]] = {}
        for analysis in files:
            modules.setdefault(analysis.module or "root", []).append(analysis)
        module_risk = {}
        for module, module_files in modules.items():
            module_risk[module] = {
                "bus_factor": self.bus_factor_calculator.calculate_repository_bus_factor(
                    module_files
                ),
                "file_count": len(module_files),
                "risk_summary": self.calculate_risk_summary(module_files),
                "hotspot_count": sum(
                    1 for f in module_files
                    if f.risk_level in ["critical", "high"]
                )
            }
        return module_risk