From 160970f039f17e719d768f6d075ea55718870e68 Mon Sep 17 00:00:00 2001
From: 7000pctAUTO
Date: Sun, 1 Feb 2026 23:42:00 +0000
Subject: [PATCH] Add codechunk package files

---
 codechunk/config.py | 143 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 codechunk/config.py

diff --git a/codechunk/config.py b/codechunk/config.py
new file mode 100644
index 0000000..7048fbc
--- /dev/null
+++ b/codechunk/config.py
@@ -0,0 +1,143 @@
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+from dataclasses import dataclass, field
+import yaml
+
+
+@dataclass
+class ChunkingConfig:
+    include_patterns: List[str] = field(default_factory=lambda: [
+        "*.py", "*.js", "*.ts", "*.go", "*.rs", "*.java", "*.cpp", "*.c", "*.h"
+    ])
+    exclude_patterns: List[str] = field(default_factory=lambda: [
+        "**/test_*.py", "**/__pycache__/**", "**/node_modules/**",
+        "**/.git/**", "**/venv/**", "**/.env/**"
+    ])
+    max_chunk_size: int = 500
+    min_chunk_size: int = 3
+    preserve_docstrings: bool = True
+    remove_comments: bool = False
+    boilerplate_patterns: List[str] = field(default_factory=lambda: [
+        r"@property\s*\n\s*def\s+\w+\s*\(\s*\)\s*:",
+        r"@abstractmethod",
+        r"@staticmethod",
+        r"@classmethod"
+    ])
+
+
+@dataclass
+class PrioritizationConfig:
+    keywords: List[str] = field(default_factory=lambda: [
+        "main", "core", "handler", "controller", "service", "model"
+    ])
+    size_limit: int = 10000
+    exclude_patterns: List[str] = field(default_factory=lambda: [
+        "**/test_*.py", "**/*_test.py", "**/conftest.py"
+    ])
+    include_only: List[str] = field(default_factory=list)
+    weight_by_depth: bool = True
+
+
+@dataclass
+class OutputConfig:
+    format: str = "markdown"
+    max_tokens: int = 8192
+    include_metadata: bool = True
+    syntax_highlighting: bool = True
+
+
+@dataclass
+class Config:
+    chunking: ChunkingConfig = field(default_factory=ChunkingConfig)
+    prioritization: PrioritizationConfig = field(default_factory=PrioritizationConfig)
+    output: OutputConfig = field(default_factory=OutputConfig)
+    env_overrides: Dict[str, str] = field(default_factory=dict)
+
+
+def load_config(config_path: Optional[str] = None) -> Config:
+    """Load configuration from YAML file."""
+    if config_path is None:
+        config_path = Path.cwd() / ".codechunk.yaml"
+
+    config_file = Path(config_path)
+
+    if not config_file.exists():
+        return Config()
+
+    try:
+        with open(config_file, 'r') as f:
+            data = yaml.safe_load(f)
+
+        if data is None:
+            return Config()
+
+        config = Config()
+
+        if "chunking" in data:
+            chunking_data = data["chunking"]
+            config.chunking = ChunkingConfig(
+                include_patterns=chunking_data.get("include_patterns", config.chunking.include_patterns),
+                exclude_patterns=chunking_data.get("exclude_patterns", config.chunking.exclude_patterns),
+                max_chunk_size=chunking_data.get("max_chunk_size", config.chunking.max_chunk_size),
+                min_chunk_size=chunking_data.get("min_chunk_size", config.chunking.min_chunk_size),
+                preserve_docstrings=chunking_data.get("preserve_docstrings", config.chunking.preserve_docstrings),
+                remove_comments=chunking_data.get("remove_comments", config.chunking.remove_comments),
+                boilerplate_patterns=chunking_data.get("boilerplate_patterns", config.chunking.boilerplate_patterns)
+            )
+
+        if "prioritization" in data:
+            prio_data = data["prioritization"]
+            config.prioritization = PrioritizationConfig(
+                keywords=prio_data.get("keywords", config.prioritization.keywords),
+                size_limit=prio_data.get("size_limit", config.prioritization.size_limit),
+                exclude_patterns=prio_data.get("exclude_patterns", config.prioritization.exclude_patterns),
+                include_only=prio_data.get("include_only", config.prioritization.include_only),
+                weight_by_depth=prio_data.get("weight_by_depth", config.prioritization.weight_by_depth)
+            )
+
+        if "output" in data:
+            out_data = data["output"]
+            config.output = OutputConfig(
+                format=out_data.get("format", config.output.format),
+                max_tokens=out_data.get("max_tokens", config.output.max_tokens),
+                include_metadata=out_data.get("include_metadata", config.output.include_metadata),
+                syntax_highlighting=out_data.get("syntax_highlighting", config.output.syntax_highlighting)
+            )
+
+        return config
+
+    except yaml.YAMLError as e:
+        raise ValueError(f"Invalid YAML in config file: {e}")
+    except Exception as e:
+        raise ValueError(f"Error loading config file: {e}")
+
+
+def save_config(config: Config, config_path: str = ".codechunk.yaml") -> None:
+    """Save configuration to YAML file."""
+    data = {
+        "chunking": {
+            "include_patterns": config.chunking.include_patterns,
+            "exclude_patterns": config.chunking.exclude_patterns,
+            "max_chunk_size": config.chunking.max_chunk_size,
+            "min_chunk_size": config.chunking.min_chunk_size,
+            "preserve_docstrings": config.chunking.preserve_docstrings,
+            "remove_comments": config.chunking.remove_comments,
+            "boilerplate_patterns": config.chunking.boilerplate_patterns
+        },
+        "prioritization": {
+            "keywords": config.prioritization.keywords,
+            "size_limit": config.prioritization.size_limit,
+            "exclude_patterns": config.prioritization.exclude_patterns,
+            "include_only": config.prioritization.include_only,
+            "weight_by_depth": config.prioritization.weight_by_depth
+        },
+        "output": {
+            "format": config.output.format,
+            "max_tokens": config.output.max_tokens,
+            "include_metadata": config.output.include_metadata,
+            "syntax_highlighting": config.output.syntax_highlighting
+        }
+    }
+
+    with open(config_path, 'w') as f:
+        yaml.dump(data, f, default_flow_style=False, indent=2)
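
Usage note (not part of the diff above): a minimal sketch of how the two entry points this patch adds, load_config() and save_config(), might be exercised. It assumes codechunk is importable as a package (i.e. an __init__.py sits alongside config.py); the field names and the overriding .codechunk.yaml keys shown in comments simply mirror the dataclasses in the patch.

    # example_usage.py -- illustrative sketch only, not included in the patch
    from codechunk.config import load_config, save_config

    # With no argument, load_config() looks for .codechunk.yaml in the current
    # working directory and falls back to the built-in defaults if it is absent.
    config = load_config()

    # Adjust a couple of fields defined by the dataclasses in the patch.
    config.chunking.max_chunk_size = 300
    config.output.max_tokens = 4096

    # Persist the full configuration back to .codechunk.yaml.
    save_config(config, ".codechunk.yaml")

    # A hand-written .codechunk.yaml only needs the keys it overrides, e.g.:
    #
    #   chunking:
    #     max_chunk_size: 300
    #   output:
    #     max_tokens: 4096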