"""Configuration dataclasses plus helpers to load/save them as YAML."""

from dataclasses import asdict, dataclass, field, fields
from pathlib import Path
from typing import Any, Optional

import yaml


@dataclass
class ChunkingConfig:
    """Settings that control which files are chunked and how."""

    # Glob patterns of files to include.
    include_patterns: list[str] = field(default_factory=lambda: [
        "*.py", "*.js", "*.ts", "*.go", "*.rs", "*.java", "*.cpp", "*.c", "*.h"
    ])
    # Glob patterns of files to exclude.
    exclude_patterns: list[str] = field(default_factory=lambda: [
        "**/test_*.py", "**/__pycache__/**", "**/node_modules/**",
        "**/.git/**", "**/venv/**", "**/.env/**"
    ])
    max_chunk_size: int = 500
    min_chunk_size: int = 3
    preserve_docstrings: bool = True
    remove_comments: bool = False
    # Regular expressions describing boilerplate constructs.
    boilerplate_patterns: list[str] = field(default_factory=lambda: [
        r"@property\s*\n\s*def\s+\w+\s*\(\s*\)\s*:",
        r"@abstractmethod",
        r"@staticmethod",
        r"@classmethod"
    ])


@dataclass
class PrioritizationConfig:
    """Settings that control how files are prioritized."""

    keywords: list[str] = field(default_factory=lambda: [
        "main", "core", "handler", "controller", "service", "model"
    ])
    size_limit: int = 10000
    exclude_patterns: list[str] = field(default_factory=lambda: [
        "**/test_*.py", "**/*_test.py", "**/conftest.py"
    ])
    include_only: list[str] = field(default_factory=list)
    weight_by_depth: bool = True


@dataclass
class OutputConfig:
    """Settings that control the generated output."""

    format: str = "markdown"
    max_tokens: int = 8192
    include_metadata: bool = True
    syntax_highlighting: bool = True


@dataclass
class Config:
    """Top-level configuration aggregating all sections."""

    chunking: ChunkingConfig = field(default_factory=ChunkingConfig)
    prioritization: PrioritizationConfig = field(default_factory=PrioritizationConfig)
    output: OutputConfig = field(default_factory=OutputConfig)
    # Not read by load_config and not written by save_config.
    env_overrides: dict[str, str] = field(default_factory=dict)


def _build_section(section_cls: type, raw: Any, name: str) -> Any:
    """Instantiate *section_cls* from the raw YAML value of section *name*.

    Keys of *raw* that match a dataclass field override that field's default;
    unknown keys are ignored. A missing or empty section (``None``) yields
    the defaults.

    Raises:
        ValueError: If *raw* is neither ``None`` nor a mapping.
    """
    if raw is None:
        # ``chunking:`` with no body parses to None; treat it as "no overrides".
        return section_cls()
    if not isinstance(raw, dict):
        raise ValueError(
            f"Error loading config file: section '{name}' must be a mapping, "
            f"got {type(raw).__name__}"
        )
    overrides = {f.name: raw[f.name] for f in fields(section_cls) if f.name in raw}
    return section_cls(**overrides)


def load_config(config_path: Optional[str] = None) -> Config:
    """Load configuration from a YAML file.

    Args:
        config_path: Path of the YAML file. When ``None``, ``.codechunk.yaml``
            in the current working directory is used.

    Returns:
        A ``Config`` whose ``chunking``, ``prioritization`` and ``output``
        sections carry the values found in the file, falling back to the
        dataclass defaults for anything not specified. A default ``Config``
        is returned when the file does not exist or is empty.

    Raises:
        ValueError: If the file cannot be read, is not valid YAML, or its
            top level / one of its sections is not a mapping.
    """
    if config_path is None:
        config_file = Path.cwd() / ".codechunk.yaml"
    else:
        config_file = Path(config_path)

    if not config_file.exists():
        return Config()

    # Only reading/parsing sits inside the try block, so the validation
    # errors raised further down are not re-wrapped.
    try:
        with open(config_file, encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except yaml.YAMLError as e:
        raise ValueError(f"Invalid YAML in config file: {e}") from e
    except Exception as e:
        # Kept broad on purpose: callers rely on every load failure
        # (I/O, decoding, ...) surfacing as ValueError.
        raise ValueError(f"Error loading config file: {e}") from e

    if data is None:
        return Config()
    if not isinstance(data, dict):
        raise ValueError(
            "Error loading config file: top-level YAML value must be a "
            f"mapping, got {type(data).__name__}"
        )

    return Config(
        chunking=_build_section(ChunkingConfig, data.get("chunking"), "chunking"),
        prioritization=_build_section(
            PrioritizationConfig, data.get("prioritization"), "prioritization"
        ),
        output=_build_section(OutputConfig, data.get("output"), "output"),
    )


def save_config(config: Config, config_path: str = ".codechunk.yaml") -> None:
    """Save configuration to a YAML file.

    Writes the ``chunking``, ``prioritization`` and ``output`` sections of
    *config*; ``env_overrides`` is not written.

    Args:
        config: Configuration to serialize.
        config_path: Destination file; overwritten if it already exists.
    """
    data: dict[str, Any] = {
        "chunking": asdict(config.chunking),
        "prioritization": asdict(config.prioritization),
        "output": asdict(config.output),
    }

    with open(config_path, "w", encoding="utf-8") as f:
        yaml.dump(data, f, default_flow_style=False, indent=2)