fix: update CI workflow with proper checkout paths
Some checks failed
CI / lint (push) Successful in 9m27s
CI / test (push) Failing after 4m46s
CI / build (push) Has been skipped

This commit is contained in:
Developer
2026-02-05 18:03:00 +00:00
commit 5b74fccad8
28 changed files with 3461 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
"""Analysis modules for repository health assessment."""
from repohealth.analyzers.bus_factor import BusFactorCalculator
from repohealth.analyzers.git_analyzer import GitAnalyzer
from repohealth.analyzers.risk_analyzer import RiskAnalyzer
__all__ = ["GitAnalyzer", "BusFactorCalculator", "RiskAnalyzer"]

View File

@@ -0,0 +1,219 @@
"""Bus factor calculation module."""
from typing import Optional
from repohealth.models.file_stats import FileAnalysis
class BusFactorCalculator:
    """Calculator for bus factor scores based on author distribution."""

    # Bus-factor thresholds (exclusive upper bounds) per risk label; a file
    # whose bus factor falls below a threshold lands in that bucket.
    RISK_THRESHOLDS = {
        "critical": 1.0,
        "high": 1.5,
        "medium": 2.0,
        "low": float('inf')
    }

    def __init__(self, risk_threshold: float = 0.7):
        """Initialize the calculator.

        Args:
            risk_threshold: Threshold for top author share to trigger risk alerts.
        """
        self.risk_threshold = risk_threshold

    def calculate_gini(self, values: list[float]) -> float:
        """Calculate the Gini coefficient for a list of values.

        The Gini coefficient measures inequality among values.
        0 = perfect equality, 1 = maximum inequality.

        Args:
            values: List of numeric values (e.g., commit counts per author).

        Returns:
            Gini coefficient between 0 and 1.
        """
        if not values or len(values) < 2:
            return 0.0
        sorted_values = sorted(values)
        n = len(sorted_values)
        total = sum(sorted_values)
        if total == 0:
            return 0.0
        # G = 2 * sum(rank_i * x_i) / (n * sum(x)) - (n + 1) / n, with x
        # sorted ascending and rank_i the 1-based rank.
        weighted = sum(
            value * rank for rank, value in enumerate(sorted_values, start=1)
        )
        gini = (2 * weighted) / (n * total) - (n + 1) / n
        # Clamp to [0, 1] to absorb floating-point drift.
        return max(0.0, min(1.0, gini))

    def calculate_file_bus_factor(self, analysis: FileAnalysis) -> float:
        """Calculate bus factor for a single file.

        Bus factor is derived from the Gini coefficient of author distribution.
        A lower bus factor indicates higher risk (concentration of ownership).

        Args:
            analysis: FileAnalysis with authorship data.

        Returns:
            Bus factor score (lower = more risky).
        """
        # No history or a single author bottoms out at the minimum score.
        if analysis.total_commits == 0 or analysis.num_authors == 1:
            return 1.0
        gini = self.calculate_gini(list(analysis.author_commits.values()))
        # Perfect equality (gini == 0) yields num_authors; total inequality
        # (gini == 1) collapses to 1.0.
        bus_factor = 1.0 + (1.0 - gini) * (analysis.num_authors - 1)
        return min(bus_factor, float(analysis.num_authors))

    def calculate_repository_bus_factor(
        self,
        files: list[FileAnalysis],
        weights: Optional[dict[str, float]] = None
    ) -> float:
        """Calculate overall repository bus factor.

        Args:
            files: List of FileAnalysis objects.
            weights: Optional weights per file path (e.g., by importance);
                files not listed default to weight 1.0.

        Returns:
            Overall (weighted-average) bus factor score.
        """
        if not files:
            return 1.0
        total_weight = 0.0
        weighted_sum = 0.0
        for analysis in files:
            weight = weights.get(analysis.path, 1.0) if weights else 1.0
            weighted_sum += self.calculate_file_bus_factor(analysis) * weight
            total_weight += weight
        if total_weight == 0:
            # All-zero weights: avoid division by zero, report neutral score.
            return 1.0
        return weighted_sum / total_weight

    def calculate_module_bus_factors(
        self,
        files: list[FileAnalysis]
    ) -> dict[str, dict]:
        """Calculate bus factor for each module/directory.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Dictionary mapping module to stats including bus factor.
        """
        # Group files by module; files with no module fall into "root".
        modules: dict[str, list[FileAnalysis]] = {}
        for analysis in files:
            modules.setdefault(analysis.module or "root", []).append(analysis)
        module_stats = {}
        for module, module_files in modules.items():
            module_stats[module] = {
                "bus_factor": self.calculate_repository_bus_factor(module_files),
                "gini_coefficient": self.calculate_gini(
                    [f.total_commits for f in module_files]
                ),
                "file_count": len(module_files),
                "total_commits": sum(f.total_commits for f in module_files)
            }
        return module_stats

    def assign_risk_levels(
        self,
        files: list[FileAnalysis]
    ) -> list[FileAnalysis]:
        """Assign risk levels to files based on bus factor.

        Mutates each FileAnalysis in place (bus_factor and risk_level).

        Args:
            files: List of FileAnalysis objects.

        Returns:
            The same FileAnalysis objects, updated with risk levels.
        """
        for analysis in files:
            bus_factor = self.calculate_file_bus_factor(analysis)
            analysis.bus_factor = bus_factor
            if analysis.total_commits == 0:
                # No history at all: risk cannot be assessed.
                analysis.risk_level = "unknown"
            elif analysis.num_authors == 1:
                # Sole ownership is always critical regardless of bus factor.
                analysis.risk_level = "critical"
            elif bus_factor < self.RISK_THRESHOLDS["critical"]:
                analysis.risk_level = "critical"
            elif bus_factor < self.RISK_THRESHOLDS["high"]:
                analysis.risk_level = "high"
            elif bus_factor < self.RISK_THRESHOLDS["medium"]:
                analysis.risk_level = "medium"
            else:
                analysis.risk_level = "low"
        return files

    def calculate_repository_gini(
        self,
        files: list[FileAnalysis]
    ) -> float:
        """Calculate overall repository Gini coefficient.

        Measures how evenly commits are distributed across authors.
        High Gini means commits are concentrated in few authors.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Overall Gini coefficient.
        """
        if not files:
            return 0.0
        # Aggregate per-author commit totals across every file.
        total_commits_by_author: dict[str, int] = {}
        for analysis in files:
            for author, commits in analysis.author_commits.items():
                total_commits_by_author[author] = (
                    total_commits_by_author.get(author, 0) + commits
                )
        values = list(total_commits_by_author.values())
        if len(values) < 2:
            # Zero or one author: inequality is undefined; report equality.
            return 0.0
        # Fix: a perfectly even distribution previously had its true Gini of
        # 0.0 overridden with an arbitrary 0.5, contradicting the documented
        # contract. Return the actual coefficient.
        return self.calculate_gini(values)

View File

@@ -0,0 +1,230 @@
"""Git repository analyzer using GitPython."""
from collections.abc import Generator
from datetime import datetime
from pathlib import Path
from typing import Optional
from git import Commit, Repo
from git.exc import InvalidGitRepositoryError, NoSuchPathError
from repohealth.models.author import AuthorStats
from repohealth.models.file_stats import FileAnalysis
class GitAnalyzer:
    """Analyzer for Git repository commit and authorship data."""

    def __init__(self, repo_path: str):
        """Initialize the analyzer with a repository path.

        Args:
            repo_path: Path to the Git repository.
        """
        self.repo_path = Path(repo_path)
        # Populated by validate_repository(); all query methods degrade to
        # empty results until a valid repository has been opened.
        self.repo: Optional[Repo] = None
        self._authors: dict[str, AuthorStats] = {}

    def validate_repository(self) -> bool:
        """Validate that the path is a valid (non-bare) Git repository.

        Returns:
            True if valid, False otherwise.
        """
        try:
            self.repo = Repo(self.repo_path)
            return not self.repo.bare
        except (InvalidGitRepositoryError, NoSuchPathError):
            return False

    def get_commit_count(self) -> int:
        """Get total commit count in the repository.

        Returns:
            Total number of commits (0 when no repository is loaded).
        """
        if not self.repo:
            return 0
        # Count lazily rather than materializing every Commit in a list.
        return sum(1 for _ in self.repo.iter_commits())

    def get_unique_authors(self) -> dict[str, AuthorStats]:
        """Get all unique authors in the repository.

        Returns:
            Dictionary mapping author email to AuthorStats.
        """
        if not self.repo:
            return {}
        authors: dict[str, AuthorStats] = {}
        for commit in self.repo.iter_commits():
            author_key = commit.author.email
            if author_key not in authors:
                authors[author_key] = AuthorStats(
                    name=commit.author.name,
                    email=commit.author.email
                )
            stats = authors[author_key]
            stats.total_commits += 1
            # Fix: iter_commits() yields newest-first, so the first commit
            # seen is the author's most recent and the final one seen is the
            # earliest. The previous logic recorded them swapped.
            if not stats.last_commit:
                stats.last_commit = commit.authored_datetime
            stats.first_commit = commit.authored_datetime
        self._authors = authors
        return authors

    def iter_file_commits(
        self,
        path: Optional[str] = None,
        extensions: Optional[list[str]] = None,
        depth: Optional[int] = None
    ) -> Generator[tuple[str, Commit], None, None]:
        """Iterate through commits with file information.

        Args:
            path: Optional path prefix to filter files.
            extensions: Optional list of file extensions (no dot) to include.
            depth: Optional limit on commit history depth.

        Yields:
            Tuples of (file_path, commit).
        """
        if not self.repo:
            return
        commit_count = 0
        for commit in self.repo.iter_commits():
            # Fix: compare against None so an explicit depth of 0 is honored
            # instead of being treated as "unlimited".
            if depth is not None and commit_count >= depth:
                break
            try:
                for touched_path in commit.stats.files.keys():
                    if path and not touched_path.startswith(path):
                        continue
                    if extensions:
                        ext = Path(touched_path).suffix.lstrip('.')
                        if ext not in extensions:
                            continue
                    yield touched_path, commit
            except (ValueError, KeyError):
                # Stats may be unavailable for some commits; skip them
                # rather than aborting the whole iteration.
                continue
            commit_count += 1

    def analyze_file_authors(
        self,
        file_path: str,
        depth: Optional[int] = None
    ) -> FileAnalysis:
        """Analyze authorship for a single file.

        Args:
            file_path: Path to the file.
            depth: Optional limit on commit history depth.

        Returns:
            FileAnalysis with authorship statistics (empty when no
            repository is loaded).
        """
        author_commits: dict[str, int] = {}
        first_commit: Optional[datetime] = None
        last_commit: Optional[datetime] = None
        total_commits = 0
        commit_count = 0
        # Fix: guard against an unvalidated repository — every sibling
        # method checks self.repo before use, this one did not.
        commits = self.repo.iter_commits(paths=file_path) if self.repo else []
        for commit in commits:
            # Fix: honor an explicit depth of 0 (see iter_file_commits).
            if depth is not None and commit_count >= depth:
                break
            total_commits += 1
            author_email = commit.author.email
            author_commits[author_email] = author_commits.get(author_email, 0) + 1
            # Fix: commits arrive newest-first, so the first one seen is the
            # most recent (last_commit) and first_commit keeps updating until
            # it holds the earliest. The previous logic had them swapped.
            if not last_commit:
                last_commit = commit.authored_datetime
            first_commit = commit.authored_datetime
            commit_count += 1
        return FileAnalysis(
            path=file_path,
            total_commits=total_commits,
            author_commits=author_commits,
            first_commit=first_commit,
            last_commit=last_commit,
            module=str(Path(file_path).parent),
            extension=Path(file_path).suffix.lstrip('.')
        )

    def get_all_files(
        self,
        extensions: Optional[list[str]] = None
    ) -> list[str]:
        """Get all tracked files in the repository.

        Args:
            extensions: Optional list of file extensions (no dot) to include.

        Returns:
            List of file paths.
        """
        if not self.repo:
            return []
        files = []
        for item in self.repo.tree().traverse():
            if item.type != 'blob':
                continue
            if extensions:
                ext = Path(item.path).suffix.lstrip('.')
                if ext not in extensions:
                    continue
            files.append(item.path)
        return files

    def get_file_modules(self) -> dict[str, list[str]]:
        """Group tracked files by their module/directory.

        Returns:
            Dictionary mapping module (parent directory) to list of files.
        """
        modules: dict[str, list[str]] = {}
        for file_path in self.get_all_files():
            module = str(Path(file_path).parent)
            modules.setdefault(module, []).append(file_path)
        return modules

    def get_head_commit(self) -> Optional[Commit]:
        """Get the HEAD commit of the repository.

        Returns:
            HEAD Commit, or None when the repository is missing or empty.
        """
        if not self.repo:
            return None
        try:
            return self.repo.head.commit
        except ValueError:
            # An empty repository has no HEAD to resolve.
            return None

    def get_branch_count(self) -> int:
        """Get the number of branches in the repository.

        Returns:
            Number of branches (0 when no repository is loaded).
        """
        if not self.repo:
            return 0
        return len(list(self.repo.branches))

View File

@@ -0,0 +1,309 @@
"""Risk analysis and hotspot identification module."""
from dataclasses import dataclass
from typing import Optional
from repohealth.analyzers.bus_factor import BusFactorCalculator
from repohealth.models.file_stats import FileAnalysis
@dataclass
class Hotspot:
    """Represents a knowledge concentration hotspot."""
    # Repository-relative path of the affected file.
    file_path: str
    # Risk label assigned by RiskAnalyzer ("critical" / "high" / ...).
    risk_level: str
    # Bus factor score for the file; lower means riskier.
    bus_factor: float
    # Author with the most commits to the file.
    top_author: str
    # Fraction of the file's commits made by top_author (0.0-1.0).
    top_author_share: float
    # Total number of commits touching the file.
    total_commits: int
    # Number of distinct authors who touched the file.
    num_authors: int
    # Module/directory the file belongs to.
    module: str
    # Human-readable diversification advice; may be empty.
    suggestion: str = ""
@dataclass
class DiversificationSuggestion:
    """Represents a suggestion for code ownership diversification."""
    # Repository-relative path of the file to diversify.
    file_path: str
    # The author who currently dominates ownership.
    current_author: str
    # Other authors recommended to take on work for this file.
    suggested_authors: list[str]
    # "critical" / "high" / "medium" (see RiskAnalyzer.generate_suggestions).
    priority: str
    # Human-readable explanation of why the suggestion was raised.
    reason: str
    # Concrete recommended action (e.g. assigning reviewers).
    action: str
class RiskAnalyzer:
    """Analyzer for knowledge concentration and risk assessment."""

    # Top-author-share thresholds for classifying per-file risk.
    CRITICAL_THRESHOLD = 0.8
    HIGH_THRESHOLD = 0.6
    MEDIUM_THRESHOLD = 0.4

    # Explicit sort rank for risk labels: lower value = riskier.
    _RISK_ORDER = {"critical": 0, "high": 1, "medium": 2, "low": 3}

    def __init__(self, risk_threshold: float = 0.7):
        """Initialize the analyzer.

        Args:
            risk_threshold: Threshold for risk detection.
        """
        self.risk_threshold = risk_threshold
        self.bus_factor_calculator = BusFactorCalculator(risk_threshold)

    def identify_hotspots(
        self,
        files: list[FileAnalysis],
        limit: int = 20
    ) -> list[Hotspot]:
        """Identify knowledge concentration hotspots.

        Only files whose top-author share classifies as "critical" or
        "high" are reported.

        Args:
            files: List of FileAnalysis objects.
            limit: Maximum number of hotspots to return.

        Returns:
            List of Hotspot objects sorted riskiest-first.
        """
        hotspots = []
        for analysis in files:
            if analysis.total_commits == 0:
                continue
            top_author_data = analysis.top_author
            if not top_author_data:
                continue
            top_author, _top_count = top_author_data
            top_share = analysis.top_author_share
            if top_share >= self.CRITICAL_THRESHOLD:
                risk_level = "critical"
            elif top_share >= self.HIGH_THRESHOLD:
                risk_level = "high"
            elif top_share >= self.MEDIUM_THRESHOLD:
                risk_level = "medium"
            else:
                risk_level = "low"
            if risk_level in ("critical", "high"):
                hotspots.append(Hotspot(
                    file_path=analysis.path,
                    risk_level=risk_level,
                    bus_factor=analysis.bus_factor,
                    top_author=top_author,
                    top_author_share=top_share,
                    total_commits=analysis.total_commits,
                    num_authors=analysis.num_authors,
                    module=analysis.module,
                    suggestion=self._generate_suggestion(analysis, top_author)
                ))
        # Fix: sort by explicit risk rank, then by *ascending* bus factor so
        # the most concentrated (lowest bus factor) files lead each level.
        # The previous key negated bus_factor, listing the least risky files
        # first, and relied on alphabetical luck for the level ordering.
        hotspots.sort(key=lambda h: (self._RISK_ORDER[h.risk_level], h.bus_factor))
        return hotspots[:limit]

    def _generate_suggestion(
        self,
        analysis: FileAnalysis,
        top_author: str
    ) -> str:
        """Generate a diversification suggestion for a file.

        Args:
            analysis: FileAnalysis for the file.
            top_author: The primary author.

        Returns:
            Suggestion string.
        """
        if analysis.num_authors == 1:
            return (
                f"This file is entirely owned by {top_author}. "
                "Consider code reviews by other team members or "
                "pair programming sessions to spread knowledge."
            )
        # Consistency: use the shared constant instead of a bare 0.8 literal.
        elif analysis.top_author_share >= self.CRITICAL_THRESHOLD:
            return (
                f"This file is {analysis.top_author_share:.0%} owned by {top_author}. "
                "Encourage other developers to contribute to this file."
            )
        else:
            return (
                f"Primary ownership by {top_author} at {analysis.top_author_share:.0%}. "
                "Gradually increase contributions from other team members."
            )

    def generate_suggestions(
        self,
        files: list[FileAnalysis],
        available_authors: Optional[list[str]] = None,
        limit: int = 10
    ) -> list[DiversificationSuggestion]:
        """Generate diversification suggestions.

        Only files at or above CRITICAL_THRESHOLD ownership are considered.

        Args:
            files: List of FileAnalysis objects.
            available_authors: List of available authors to suggest.
            limit: Maximum number of suggestions to return.

        Returns:
            List of DiversificationSuggestion objects, highest priority first.
        """
        suggestions = []
        for analysis in files:
            if analysis.total_commits == 0:
                continue
            top_author_data = analysis.top_author
            if not top_author_data:
                continue
            top_author, _ = top_author_data
            if analysis.top_author_share < self.CRITICAL_THRESHOLD:
                continue
            if available_authors:
                # Prefer available authors who have already touched the file;
                # pad with other available authors until we have two.
                other_authors = [
                    a for a in available_authors
                    if a != top_author and a in analysis.author_commits
                ]
                if len(other_authors) < 2:
                    # Fix: exclude authors already selected so padding cannot
                    # suggest the same reviewer twice.
                    extras = [
                        a for a in available_authors
                        if a != top_author and a not in other_authors
                    ]
                    other_authors.extend(extras[:2 - len(other_authors)])
            else:
                other_authors = [
                    a for a in analysis.author_commits.keys()
                    if a != top_author
                ][:3]
            if not other_authors:
                continue
            if analysis.top_author_share >= 0.9:
                priority = "critical"
            elif analysis.top_author_share >= self.CRITICAL_THRESHOLD:
                priority = "high"
            else:
                priority = "medium"
            reason = (
                f"File has {analysis.top_author_share:.0%} ownership by {top_author} "
                f"across {analysis.total_commits} commits with {analysis.num_authors} authors."
            )
            action = (
                f"Assign code reviews to {', '.join(other_authors[:2])} "
                f"for changes to {analysis.path}"
            )
            suggestions.append(DiversificationSuggestion(
                file_path=analysis.path,
                current_author=top_author,
                suggested_authors=other_authors,
                priority=priority,
                reason=reason,
                action=action
            ))
        # Highest priority first; ties broken by path for determinism.
        suggestions.sort(key=lambda s: (self._RISK_ORDER[s.priority], s.file_path))
        return suggestions[:limit]

    def calculate_risk_summary(
        self,
        files: list[FileAnalysis]
    ) -> dict:
        """Calculate a summary of repository risk.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Dictionary with per-level counts, percentage_critical,
            percentage_high, and an overall_risk label.
        """
        if not files:
            # Fix: keep the same shape as the non-empty result so callers
            # can read the percentage keys unconditionally.
            return {
                "critical": 0,
                "high": 0,
                "medium": 0,
                "low": 0,
                "unknown": 0,
                "percentage_critical": 0,
                "percentage_high": 0,
                "overall_risk": "unknown"
            }
        risk_counts = {"critical": 0, "high": 0, "medium": 0, "low": 0, "unknown": 0}
        for analysis in files:
            # Robustness: bucket unrecognized labels as "unknown" instead of
            # raising KeyError on an unexpected risk_level value.
            label = analysis.risk_level
            risk_counts[label if label in risk_counts else "unknown"] += 1
        total = len(files)
        if risk_counts["critical"] >= total * 0.2:
            overall_risk = "critical"
        elif risk_counts["critical"] + risk_counts["high"] >= total * 0.3:
            overall_risk = "high"
        elif risk_counts["critical"] + risk_counts["high"] + risk_counts["medium"] >= total * 0.4:
            overall_risk = "medium"
        else:
            overall_risk = "low"
        # total >= 1 here, so the divisions are safe.
        risk_counts["percentage_critical"] = risk_counts["critical"] / total * 100
        risk_counts["percentage_high"] = risk_counts["high"] / total * 100
        risk_counts["overall_risk"] = overall_risk
        return risk_counts

    def analyze_module_risk(
        self,
        files: list[FileAnalysis]
    ) -> dict:
        """Analyze risk at the module level.

        Args:
            files: List of FileAnalysis objects.

        Returns:
            Dictionary mapping modules to risk statistics.
        """
        # Group files by module; files with no module fall into "root".
        modules: dict[str, list[FileAnalysis]] = {}
        for analysis in files:
            modules.setdefault(analysis.module or "root", []).append(analysis)
        module_risk = {}
        for module, module_files in modules.items():
            module_risk[module] = {
                "bus_factor": self.bus_factor_calculator.calculate_repository_bus_factor(
                    module_files
                ),
                "file_count": len(module_files),
                "risk_summary": self.calculate_risk_summary(module_files),
                "hotspot_count": sum(
                    1 for f in module_files
                    if f.risk_level in ["critical", "high"]
                )
            }
        return module_risk