Add parsers module

2026-02-02 02:38:27 +00:00
parent 15e580f23e
commit c56dd18794
1 changed files with 170 additions and 0 deletions
--- a/src/parsers/python.py
+++ b/src/parsers/python.py
@@ -0,0 +1,170 @@
+from pathlib import Path
+import re
+
+from src.parsers.base import BaseParser, ParserResult, Entity, EntityType
+
+
+class PythonParser(BaseParser):
+    """Best-effort, regex-based parser for Python source files.
+
+    Definitions are found line by line and their extent is inferred from
+    indentation, so signatures spanning several lines are not detected.
+    """
+
+    SUPPORTED_EXTENSIONS = [".py", ".pyi"]
+
+    # Header of a (possibly async) def: the lazy parameter group backtracks past
+    # nested parentheses and a "-> annotation" may precede the colon.
+    _DEF_RE = re.compile(r'^(\s*)(?:async\s+)?def\s+([a-zA-Z_]\w*)\s*\((.*?)\)\s*(?:->[^:]*)?:')
+    _CLASS_RE = re.compile(r'^(\s*)class\s+([a-zA-Z_]\w*)\s*(?:\(.*?\))?\s*:')
+    # Group 1 is set when the name is being defined (def/class), not called.
+    _CALL_RE = re.compile(r'(?:\b(def|class)\s+)?\b([a-zA-Z_]\w*)\s*\(')
+    # Keywords that can precede "(" plus builtins that are never reported as calls.
+    _NOT_CALLS = frozenset({
+        'if', 'while', 'for', 'with', 'assert', 'return', 'print', 'len', 'str', 'int',
+        'list', 'dict', 'range', 'open', 'super', 'elif', 'else', 'and', 'or', 'not',
+        'in', 'is', 'except', 'yield', 'await', 'raise', 'del', 'from', 'import', 'lambda',
+    })
+
+    def __init__(self):
+        # Not read anywhere in this class.
+        self._use_simple_parsing = True
+
+    def parse(self, file_path: Path, content: str) -> ParserResult:
+        """Parse ``content``; extraction failures are recorded in ``result.errors``."""
+        result = ParserResult(file_path=file_path, language="python")
+        try:
+            result.entities = self.extract_entities(content, file_path)
+            result.imports = self.extract_imports(content)
+        except Exception as e:  # parser boundary: report instead of propagating
+            result.errors.append(f"Parse error: {e}")
+        return result
+
+    def extract_entities(self, content: str, file_path: Path) -> list[Entity]:
+        """Return every function entity followed by every class entity."""
+        functions = self._extract_functions(content, file_path)
+        return functions + self._extract_classes(content, file_path)
+
+    def _extract_functions(self, content: str, file_path: Path) -> list[Entity]:
+        """Return a FUNCTION entity for every ``def`` line, methods and nested defs included."""
+        lines = content.split('\n')
+        return [
+            self._build_callable(lines, i, match, EntityType.FUNCTION, file_path)
+            for i, line in enumerate(lines)
+            if (match := self._DEF_RE.match(line))
+        ]
+
+    def _extract_classes(self, content: str, file_path: Path) -> list[Entity]:
+        """Return a CLASS entity per ``class`` line, with its methods as children."""
+        classes = []
+        lines = content.split('\n')
+        for i, line in enumerate(lines):
+            match = self._CLASS_RE.match(line)
+            if not match:
+                continue
+            indent = match.group(1)
+            end_line = self._find_block_end(lines, i, indent)
+            body = lines[i + 1:end_line]
+            # Use the indentation the body really has (2 spaces, tabs, ...);
+            # fall back to four spaces when the class has no code lines.
+            code_lines = (ln for ln in body if ln.strip() and not ln.lstrip().startswith('#'))
+            first = next(code_lines, '')
+            body_indent = first[:len(first) - len(first.lstrip())] or indent + '    '
+            classes.append(Entity(
+                name=match.group(2),
+                entity_type=EntityType.CLASS,
+                file_path=file_path,
+                start_line=i + 1,
+                end_line=end_line,
+                code='\n'.join(lines[i:end_line]),
+                children=self._extract_methods_from_content(
+                    '\n'.join(body), file_path, body_indent, line_offset=i + 1
+                ),
+            ))
+        return classes
+
+    def _extract_methods_from_content(
+        self, content: str, file_path: Path, base_indent: str, line_offset: int = 0
+    ) -> list[Entity]:
+        """Return a METHOD entity for each ``def`` indented exactly ``base_indent``.
+
+        Deeper defs (functions nested in methods) are skipped. ``line_offset`` is
+        the number of file lines before ``content``; adding it makes the reported
+        line numbers refer to the file rather than to ``content``.
+        """
+        lines = content.split('\n')
+        return [
+            self._build_callable(lines, i, match, EntityType.METHOD, file_path, line_offset)
+            for i, line in enumerate(lines)
+            if (match := self._DEF_RE.match(line)) and match.group(1) == base_indent
+        ]
+
+    def _build_callable(self, lines, index, match, entity_type, file_path, line_offset=0):
+        """Build the entity for the ``def`` header matched at ``lines[index]``."""
+        end = self._find_block_end(lines, index, match.group(1))
+        code = '\n'.join(lines[index:end])
+        return Entity(
+            name=match.group(2),
+            entity_type=entity_type,
+            file_path=file_path,
+            start_line=line_offset + index + 1,  # 1-based, the def line itself
+            end_line=line_offset + end,  # 1-based, last line of the body
+            code=code,
+            attributes={"parameters": self._split_parameters(match.group(3))},
+            calls=self._extract_function_calls(code),
+        )
+
+    @staticmethod
+    def _split_parameters(params: str) -> list[str]:
+        """Split a parameter list on commas that are not inside brackets."""
+        parts, depth = [''], 0
+        for ch in params:
+            depth += (ch in '([{') - (ch in ')]}')
+            if ch == ',' and depth == 0:
+                parts.append('')
+            else:
+                parts[-1] += ch
+        return [p.strip() for p in parts if p.strip()]
+
+    def _extract_function_calls(self, code: str) -> list[str]:
+        """Return the distinct names followed by ``(`` in ``code``, first seen first.
+
+        Names introduced by ``def``/``class`` and members of ``_NOT_CALLS`` are skipped.
+        """
+        names = (m.group(2) for m in self._CALL_RE.finditer(code) if not m.group(1))
+        return list(dict.fromkeys(n for n in names if n not in self._NOT_CALLS))
+
+    def _find_block_end(self, lines: list[str], start_index: int, indent: str) -> int:
+        """Return the exclusive end index of the block opened at ``start_index``.
+
+        The block runs through the last line indented deeper than ``indent`` and
+        stops at the first other line that is neither blank nor a comment.
+        """
+        end = start_index + 1
+        for i in range(start_index + 1, len(lines)):
+            stripped = lines[i].lstrip()
+            if not stripped:
+                continue
+            if len(lines[i]) - len(stripped) > len(indent):
+                end = i + 1
+            elif not stripped.startswith('#'):
+                break
+        return end
+
+    def extract_imports(self, content: str) -> list[str]:
+        """Return imported module names in source order, without aliases."""
+        imports = []
+        for raw in content.split('\n'):
+            line = raw.split('#', 1)[0].strip()  # drop trailing comments
+            if line.startswith('import '):
+                names = (imp.split(' as ')[0].strip() for imp in line[7:].split(','))
+                imports.extend(name for name in names if name)
+            elif line.startswith('from '):
+                match = re.match(r'from\s+([\w.]+)\s+import', line)
+                if match:
+                    imports.append(match.group(1))
+        return imports
+
+    def extract_calls(self, content: str) -> list[str]:
+        """Return the names called anywhere in ``content``."""
+        return self._extract_function_calls(content)