"""Regex-based parser that extracts entities and imports from Python source."""
# Origin: src/parsers/python.py, added by patch c56dd18 "Add parsers module".

import keyword
import re
from pathlib import Path

from src.parsers.base import BaseParser, ParserResult, Entity, EntityType


class PythonParser(BaseParser):
    """Line-oriented, regex-driven parser for ``.py`` / ``.pyi`` sources.

    The parser never executes or compiles the input. It scans the text line
    by line, so a ``def`` or ``class`` header must fit on a single line to be
    recognised, and block boundaries are derived purely from indentation.
    """

    SUPPORTED_EXTENSIONS = [".py", ".pyi"]

    # ``def`` / ``async def`` header. The parameter group is lazy so that it
    # may contain nested parentheses (``x=(1, 2)``) while still stopping at
    # the first ``)`` that is followed by an optional ``-> annotation`` and
    # the terminating colon.
    _DEF_RE = re.compile(
        r'^(?P<indent>\s*)(?:async\s+)?def\s+(?P<name>[a-zA-Z_][a-zA-Z0-9_]*)\s*'
        r'\((?P<params>.*?)\)\s*(?:->\s*[^:]+?)?\s*:'
    )

    # ``class`` header with an optional base list (nested parentheses allowed).
    _CLASS_RE = re.compile(
        r'^(?P<indent>\s*)class\s+(?P<name>[a-zA-Z_][a-zA-Z0-9_]*)\s*(?:\(.*?\))?\s*:'
    )

    # ``name(``, optionally preceded by ``def`` / ``class`` so that definition
    # headers can be told apart from real calls.
    _CALL_RE = re.compile(
        r'(?:\b(?P<definition>def|class)\s+)?\b(?P<name>[a-zA-Z_][a-zA-Z0-9_]*)\s*\('
    )

    _FROM_IMPORT_RE = re.compile(r'from\s+([\w.]+)\s+import')

    # Builtins that are never reported as calls (keywords are filtered
    # separately through ``keyword.iskeyword``).
    _IGNORED_CALL_NAMES = frozenset(
        {'print', 'len', 'str', 'int', 'list', 'dict', 'range', 'open', 'super'}
    )

    def __init__(self):
        self._use_simple_parsing = True

    def parse(self, file_path: Path, content: str) -> ParserResult:
        """Parse *content* and return a populated ``ParserResult``.

        Args:
            file_path: Path recorded on the result and on every entity.
            content: Full source text of the file.

        Returns:
            A result whose ``entities`` and ``imports`` are filled in. If
            extraction raises, the exception text is appended to
            ``result.errors`` and the partially filled result is returned.
        """
        result = ParserResult(file_path=file_path, language="python")
        try:
            result.entities = self.extract_entities(content, file_path)
            result.imports = self.extract_imports(content)
        except Exception as e:
            # Deliberate best-effort boundary: report the failure on the
            # result instead of propagating it to the caller.
            result.errors.append(f"Parse error: {e}")
        return result

    def extract_entities(self, content: str, file_path: Path) -> list[Entity]:
        """Return all function entities followed by all class entities."""
        entities = []
        entities.extend(self._extract_functions(content, file_path))
        entities.extend(self._extract_classes(content, file_path))
        return entities

    def _extract_functions(self, content: str, file_path: Path) -> list[Entity]:
        """Return a FUNCTION entity for every ``def`` found in *content*.

        Headers are matched at any indentation level, so methods and nested
        functions are returned here as well.
        """
        return self._extract_defs(content.split('\n'), file_path, EntityType.FUNCTION)

    def _extract_classes(self, content: str, file_path: Path) -> list[Entity]:
        """Return a CLASS entity (with METHOD children) per ``class`` header."""
        classes = []
        lines = content.split('\n')

        for index, line in enumerate(lines):
            match = self._CLASS_RE.match(line)
            if match is None:
                continue

            indent = match.group('indent')
            end_index = self._find_block_end(lines, index, indent)
            body_lines = lines[index + 1:end_index]

            methods = self._extract_methods_from_content(
                '\n'.join(body_lines),
                file_path,
                # Use the indentation the class body actually has, so that
                # 2-space or tab-indented classes are handled too.
                self._body_indent(body_lines, indent + '    '),
                # The body starts right after the header line, so this many
                # file lines precede it.
                line_offset=index + 1,
            )

            classes.append(
                Entity(
                    name=match.group('name'),
                    entity_type=EntityType.CLASS,
                    file_path=file_path,
                    start_line=index + 1,
                    end_line=end_index,
                    code='\n'.join(lines[index:end_index]),
                    children=methods,
                )
            )

        return classes

    def _extract_methods_from_content(
        self,
        content: str,
        file_path: Path,
        base_indent: str,
        line_offset: int = 0,
    ) -> list[Entity]:
        """Return a METHOD entity for every ``def`` inside a class body.

        Args:
            content: Text of the class body (header line excluded).
            file_path: Path recorded on every entity.
            base_indent: Whitespace prefix a ``def`` line must start with to
                be considered; deeper-indented ``def`` lines match as well.
            line_offset: Number of file lines that precede *content*. It is
                added to the reported line numbers so they refer to the file
                rather than to *content*. Defaults to 0.

        Returns:
            Entities whose ``start_line`` / ``end_line`` are 1-based and
            inclusive, the same convention ``_extract_functions`` uses.
        """
        return self._extract_defs(
            content.split('\n'),
            file_path,
            EntityType.METHOD,
            required_indent=base_indent,
            line_offset=line_offset,
        )

    def _extract_defs(
        self,
        lines: list[str],
        file_path: Path,
        entity_type: EntityType,
        required_indent: str = "",
        line_offset: int = 0,
    ) -> list[Entity]:
        """Build one entity of *entity_type* per ``def`` header in *lines*.

        Only headers whose indentation starts with *required_indent* are
        used. *line_offset* is added to the 1-based line numbers.
        """
        found = []

        for index, line in enumerate(lines):
            match = self._DEF_RE.match(line)
            if match is None:
                continue
            indent = match.group('indent')
            if not indent.startswith(required_indent):
                continue

            end_index = self._find_block_end(lines, index, indent)
            code = '\n'.join(lines[index:end_index])

            found.append(
                Entity(
                    name=match.group('name'),
                    entity_type=entity_type,
                    file_path=file_path,
                    start_line=line_offset + index + 1,
                    # ``end_index`` is exclusive and 0-based, which equals the
                    # 1-based number of the last line of the block.
                    end_line=line_offset + end_index,
                    code=code,
                    attributes={"parameters": self._split_parameters(match.group('params'))},
                    calls=self._extract_function_calls(code),
                )
            )

        return found

    @staticmethod
    def _split_parameters(params: str) -> list[str]:
        """Split a parameter list on commas that are not inside brackets.

        Commas nested in ``()``, ``[]`` or ``{}`` (for example in
        ``x: dict[str, int]``) do not separate parameters. Empty pieces are
        dropped and every piece is stripped.
        """
        pieces = []
        current = []
        depth = 0
        for char in params:
            if char in '([{':
                depth += 1
            elif char in ')]}':
                depth = max(depth - 1, 0)
            if char == ',' and depth == 0:
                pieces.append(''.join(current))
                current = []
            else:
                current.append(char)
        pieces.append(''.join(current))
        return [piece.strip() for piece in pieces if piece.strip()]

    @staticmethod
    def _body_indent(body_lines: list[str], default: str) -> str:
        """Return the leading whitespace of the first code line in a body.

        Blank lines and comment lines are skipped; *default* is returned
        when the body contains no code line.
        """
        for line in body_lines:
            stripped = line.strip()
            if stripped and not stripped.startswith('#'):
                return line[:len(line) - len(line.lstrip())]
        return default

    def _extract_function_calls(self, code: str) -> list[str]:
        """Return the distinct names that are followed by ``(`` in *code*.

        Python keywords, a fixed set of common builtins, and the names in
        ``def`` / ``class`` headers are excluded. Names are returned in
        order of first appearance.
        """
        names = []
        for match in self._CALL_RE.finditer(code):
            if match.group('definition'):
                # ``def foo(`` / ``class Foo(`` declares a name, it is not a call.
                continue
            name = match.group('name')
            if keyword.iskeyword(name) or name in self._IGNORED_CALL_NAMES:
                continue
            names.append(name)
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(names))

    def _find_block_end(self, lines: list[str], start_index: int, indent: str) -> int:
        """Return the exclusive end index of the block opened at *start_index*.

        Args:
            lines: All lines being scanned.
            start_index: Index of the ``def`` / ``class`` header line.
            indent: Leading whitespace of that header line.

        Returns:
            Index one past the last line belonging to the block. The block
            ends before the first code line indented no deeper than the
            header. Blank lines never end a block, and comment lines at or
            above the header's indentation are skipped; neither is counted
            as part of the block when trailing.
        """
        header_width = len(indent)
        last_body_index = start_index

        for index in range(start_index + 1, len(lines)):
            line = lines[index]
            stripped = line.strip()
            if not stripped:
                continue

            # Compare indentation widths rather than using startswith(indent):
            # for a top-level header the indent is "" and every line would
            # trivially "start with" it, so the block would never end.
            width = len(line) - len(line.lstrip())
            if stripped.startswith('#'):
                if width > header_width:
                    last_body_index = index
                continue
            if width <= header_width:
                break
            last_body_index = index

        return last_body_index + 1

    def extract_imports(self, content: str) -> list[str]:
        """Return the module names imported by *content*.

        ``import a.b as c, d`` yields ``a.b`` and ``d``; ``from x.y import z``
        yields ``x.y``. Trailing ``#`` comments are ignored. Names are
        returned in source order and may repeat.
        """
        imports = []

        for raw_line in content.split('\n'):
            line = raw_line.split('#', 1)[0].strip()
            if line.startswith('import '):
                for clause in line[len('import '):].split(','):
                    tokens = clause.split()
                    if tokens:
                        # First token is the module; "as alias" is dropped.
                        imports.append(tokens[0])
            elif line.startswith('from '):
                match = self._FROM_IMPORT_RE.match(line)
                if match:
                    imports.append(match.group(1))

        return imports

    def extract_calls(self, content: str) -> list[str]:
        """Return the call names found in *content* (see ``_extract_function_calls``)."""
        return self._extract_function_calls(content)