144 lines
5.6 KiB
Python
144 lines
5.6 KiB
Python
from pathlib import Path
|
|
from typing import List, Optional, Dict, Any
|
|
from dataclasses import dataclass, field
|
|
import yaml
|
|
|
|
|
|
@dataclass
class ChunkingConfig:
    """Settings that control which files are chunked and how.

    Every field has a default, so ``ChunkingConfig()`` is a complete
    configuration. List defaults are built by factories, so each instance
    owns independent lists.
    """

    # Glob-style file patterns to include; presumably matched against file
    # paths by the chunker (the matching code is not visible here).
    include_patterns: List[str] = field(default_factory=lambda: [
        "*.py", "*.js", "*.ts", "*.go", "*.rs", "*.java", "*.cpp", "*.c", "*.h",
    ])
    # Glob-style patterns to exclude (tests, caches, vendored/VCS/venv dirs).
    exclude_patterns: List[str] = field(default_factory=lambda: [
        "**/test_*.py", "**/__pycache__/**", "**/node_modules/**",
        "**/.git/**", "**/venv/**", "**/.env/**",
    ])
    # Upper and lower chunk size bounds.
    # NOTE(review): the unit (lines vs. tokens) is not visible here - confirm.
    max_chunk_size: int = 500
    min_chunk_size: int = 3
    preserve_docstrings: bool = True
    remove_comments: bool = False
    # Regular expressions identifying boilerplate constructs.
    boilerplate_patterns: List[str] = field(default_factory=lambda: [
        # Header of a property getter. A real getter is ``def x(self):``, so
        # an empty-parentheses-only pattern could never match one; ``self``
        # and a return annotation are accepted (both optional, so the
        # previous ``def x():`` form still matches).
        r"@property\s*\n\s*def\s+\w+\s*\(\s*(?:self\s*)?\)\s*(?:->\s*[^:\n]+)?:",
        r"@abstractmethod",
        r"@staticmethod",
        r"@classmethod",
    ])
|
|
|
|
|
|
@dataclass
class PrioritizationConfig:
    """Settings used when ranking files for inclusion.

    All fields are optional; list-valued defaults are created per instance.
    """

    # Keywords of interest; presumably matched against file or symbol names
    # by the prioritizer (not visible here).
    keywords: List[str] = field(
        default_factory=lambda: [
            "main",
            "core",
            "handler",
            "controller",
            "service",
            "model",
        ]
    )
    # NOTE(review): unit of the limit (bytes vs. lines) is not visible here.
    size_limit: int = 10_000
    # Glob-style patterns for files left out of prioritization (test files).
    exclude_patterns: List[str] = field(
        default_factory=lambda: [
            "**/test_*.py",
            "**/*_test.py",
            "**/conftest.py",
        ]
    )
    # Empty by default.
    include_only: List[str] = field(default_factory=list)
    weight_by_depth: bool = True
|
|
|
|
|
|
@dataclass
class OutputConfig:
    """Settings describing how results are rendered."""

    # Name of the output format; "markdown" unless overridden.
    format: str = "markdown"
    # Token budget for the output.
    # NOTE(review): how the budget is enforced is not visible here.
    max_tokens: int = 8_192
    include_metadata: bool = True
    syntax_highlighting: bool = True
|
|
|
|
|
|
@dataclass
class Config:
    """Top-level configuration grouping the per-area sections.

    Each section defaults to a freshly constructed instance of its own
    dataclass, so ``Config()`` yields a full default configuration.
    """

    chunking: ChunkingConfig = field(default_factory=ChunkingConfig)
    prioritization: PrioritizationConfig = field(
        default_factory=PrioritizationConfig
    )
    output: OutputConfig = field(default_factory=OutputConfig)
    # String-to-string overrides; starts empty.
    # NOTE(review): nothing visible here populates this mapping.
    env_overrides: Dict[str, str] = field(default_factory=dict)
|
|
|
|
|
|
def _config_section(data: Dict[str, Any], name: str) -> Dict[str, Any]:
    """Return the mapping stored under *name*, or ``{}`` when absent or null."""
    raw = data.get(name)
    if raw is None:
        # Covers both a missing key and ``name:`` with no value in the YAML.
        return {}
    if not isinstance(raw, dict):
        raise ValueError(
            f"Error loading config file: section '{name}' must be a mapping, "
            f"got {type(raw).__name__}"
        )
    return raw


def _section_from_mapping(section_cls: type, raw: Dict[str, Any]) -> Any:
    """Build *section_cls* from *raw*, defaulting every field not present."""
    from dataclasses import fields

    known = {f.name for f in fields(section_cls) if f.init}
    # Unknown keys are ignored; missing keys fall back to dataclass defaults.
    return section_cls(**{key: value for key, value in raw.items() if key in known})


def load_config(config_path: Optional[str] = None) -> Config:
    """Load configuration from a YAML file.

    Args:
        config_path: Path of the YAML file. When ``None``, the file
            ``.codechunk.yaml`` in the current working directory is used.

    Returns:
        A ``Config`` whose ``chunking``, ``prioritization`` and ``output``
        sections take values from the file where given and dataclass
        defaults otherwise. A missing or empty file yields ``Config()``.
        ``env_overrides`` is never read from the file.

    Raises:
        ValueError: If the file cannot be read or parsed, or if the top
            level or one of the sections is not a mapping.
    """
    if config_path is None:
        config_file = Path.cwd() / ".codechunk.yaml"
    else:
        config_file = Path(config_path)

    if not config_file.exists():
        return Config()

    try:
        # Explicit encoding: do not depend on the platform's locale default.
        with open(config_file, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except yaml.YAMLError as e:
        raise ValueError(f"Invalid YAML in config file: {e}") from e
    except Exception as e:
        # Boundary handler: any other read/parse failure is reported to
        # callers as ValueError, with the original exception chained.
        raise ValueError(f"Error loading config file: {e}") from e

    if data is None:
        # Empty file (or only comments): nothing to override.
        return Config()
    if not isinstance(data, dict):
        raise ValueError(
            "Error loading config file: expected a mapping at the top level, "
            f"got {type(data).__name__}"
        )

    return Config(
        chunking=_section_from_mapping(
            ChunkingConfig, _config_section(data, "chunking")
        ),
        prioritization=_section_from_mapping(
            PrioritizationConfig, _config_section(data, "prioritization")
        ),
        output=_section_from_mapping(
            OutputConfig, _config_section(data, "output")
        ),
    )
|
|
|
|
|
|
def save_config(config: Config, config_path: str = ".codechunk.yaml") -> None:
    """Save configuration to a YAML file.

    Writes the ``chunking``, ``prioritization`` and ``output`` sections of
    *config* in block style. ``env_overrides`` is not written.

    Args:
        config: Configuration to persist.
        config_path: Destination file; overwritten if it exists.
    """
    chunking = config.chunking
    prioritization = config.prioritization
    output = config.output

    data = {
        "chunking": {
            "include_patterns": chunking.include_patterns,
            "exclude_patterns": chunking.exclude_patterns,
            "max_chunk_size": chunking.max_chunk_size,
            "min_chunk_size": chunking.min_chunk_size,
            "preserve_docstrings": chunking.preserve_docstrings,
            "remove_comments": chunking.remove_comments,
            "boilerplate_patterns": chunking.boilerplate_patterns,
        },
        "prioritization": {
            "keywords": prioritization.keywords,
            "size_limit": prioritization.size_limit,
            "exclude_patterns": prioritization.exclude_patterns,
            "include_only": prioritization.include_only,
            "weight_by_depth": prioritization.weight_by_depth,
        },
        "output": {
            "format": output.format,
            "max_tokens": output.max_tokens,
            "include_metadata": output.include_metadata,
            "syntax_highlighting": output.syntax_highlighting,
        },
    }

    # Serialize before touching the destination: opening with "w" truncates
    # the file, so a failure during dumping would otherwise leave an existing
    # config empty or half-written.
    # NOTE(review): load_config parses with yaml.safe_load; consider
    # yaml.safe_dump here so Python-specific tags can never be emitted.
    text = yaml.dump(data, default_flow_style=False, indent=2)

    with open(config_path, "w", encoding="utf-8") as f:
        f.write(text)
|