Initial upload with CI/CD workflow
This commit is contained in:
251
codesnap/core/parser.py
Normal file
251
codesnap/core/parser.py
Normal file
@@ -0,0 +1,251 @@
|
||||
"""Code parsing module using tree-sitter."""
|
||||
|
||||
import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

from tree_sitter import Language, Parser

from .language_detection import LanguageDetector
|
||||
|
||||
|
||||
@dataclass
class ParsedFile:
    """Outcome of attempting to parse one source file.

    Attributes:
        path: Location of the file that was parsed.
        language: Detected language name, or ``"unknown"`` when the file
            could not be read or no language was detected.
        content: Text of the file (empty when reading it failed).
        ast: Syntax tree returned by the parser, or ``None`` when no tree
            was produced.
        error: Message of the exception that stopped reading or parsing,
            or ``None`` when nothing went wrong.
    """

    # Required fields come first; the optional ones default to "absent".
    path: Path
    language: str
    content: str
    ast: Optional[object] = None
    error: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class ImportStatement:
    """Represents a single import/require statement found in a source file.

    Attributes:
        module: Name or path of the imported module.
        alias: Local name the module is bound to, when the statement
            declares one; ``None`` otherwise.
        line_number: 1-based line on which the statement starts
            (``0`` when unknown).
        is_from: ``True`` for ``from ... import``-style statements,
            ``False`` for plain imports.
        names: Individual names imported from ``module``.
    """

    module: str
    alias: Optional[str] = None
    line_number: int = 0
    # Was annotated as ``False`` (a value, not a type); the field is a flag.
    is_from: bool = False
    # default_factory gives every instance its own list.
    names: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
# Cache of tree-sitter parsers keyed by language name; filled lazily by get_parser().
LANGUAGE_PARSERS: dict[str, Parser] = {}
|
||||
|
||||
|
||||
def get_parser(language: str) -> Parser:
    """Get or create a tree-sitter parser for a language.

    Parsers are created on first use and kept in ``LANGUAGE_PARSERS`` so
    later calls for the same language return the same object.

    Args:
        language: Name of the language to parse.

    Returns:
        A parser configured for ``language``.

    Raises:
        ValueError: If the grammar cannot be loaded or the parser cannot be
            configured. The underlying exception is attached as
            ``__cause__``.
    """
    cached = LANGUAGE_PARSERS.get(language)
    if cached is not None:
        return cached

    try:
        # NOTE(review): confirm that `Language.for_language` and
        # `Parser.set_language` exist in the pinned tree_sitter version;
        # if they do not, every call ends up in the ValueError below.
        lang = Language.for_language(language)
        parser = Parser()
        parser.set_language(lang)
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise ValueError(f"Failed to load parser for {language}: {e}") from e

    # Cache only once the parser is fully configured.
    LANGUAGE_PARSERS[language] = parser
    return parser
|
||||
|
||||
|
||||
class CodeParser:
    """Parses source code files using tree-sitter.

    Language detection is delegated to ``LanguageDetector`` and syntax trees
    come from ``get_parser``. Import extraction is text based: it scans the
    file content line by line instead of walking the syntax tree.
    """

    # JS/TS: quoted module specifier on an ``import`` line, optionally
    # preceded by a single-line ``import <bindings> from`` clause.
    _JS_IMPORT_RE = re.compile(
        r"(?:import\s+(?:\{[^}]*\}|\*|[\w$]+)(?:\s+as\s+[\w$]+)?\s+from\s+)?['\"]([^'\"]+)['\"]"
    )
    # JS/TS: ``from '<module>'`` on the closing line of a multi-line import.
    _JS_FROM_RE = re.compile(r"\bfrom\s*['\"]([^'\"]+)['\"]")
    # JS/TS: ``require('<module>')`` anywhere on a line.
    _JS_REQUIRE_RE = re.compile(r"\brequire\(\s*['\"]([^'\"]+)['\"]")
    # Go: opening line of a grouped import declaration.
    _GO_BLOCK_OPEN_RE = re.compile(r"^import\s*\($")
    # Go: one spec inside a grouped import, with an optional alias
    # (identifier, ``_`` or ``.``) in front of the quoted path.
    _GO_SPEC_RE = re.compile(r'^(?:([\w.]+)\s+)?"([^"]+)"')
    # Go: single-line import declaration, with the same optional alias.
    _GO_SINGLE_RE = re.compile(r'^import\s+(?:([\w.]+)\s+)?"([^"]+)"')

    def __init__(self) -> None:
        """Create a parser with its own language detector."""
        self.language_detector = LanguageDetector()

    def parse_file(self, path: Path, content: Optional[str] = None) -> ParsedFile:
        """Parse a single file.

        Args:
            path: Location of the file; also passed to language detection.
            content: Text to parse. When ``None`` the file is read from
                ``path`` as UTF-8.

        Returns:
            A ``ParsedFile``. Failures are reported through its ``error``
            field instead of being raised; an undetected language yields
            ``language="unknown"`` with no tree and no error.
        """
        if content is None:
            try:
                content = path.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError) as e:
                return ParsedFile(path=path, language="unknown", content="", error=str(e))

        language = self.language_detector.detect(path, content)
        if language is None:
            return ParsedFile(path=path, language="unknown", content=content)

        try:
            tree = get_parser(language).parse(content.encode("utf-8"))
        except Exception as e:
            # Deliberately broad: one unparsable file must not abort a
            # whole directory scan, so the failure is recorded instead.
            return ParsedFile(path=path, language=language, content=content, error=str(e))
        return ParsedFile(path=path, language=language, content=content, ast=tree)

    def parse_directory(
        self, directory: Path, max_files: int = 1000
    ) -> list[ParsedFile]:
        """Parse all source files in a directory.

        The tree under ``directory`` is walked recursively and every file is
        handed to ``parse_file``. Directories and files are visited in
        sorted order so that the subset kept when ``max_files`` is reached
        does not depend on the platform's directory listing order.

        Args:
            directory: Root of the tree to scan.
            max_files: Upper bound on the number of files processed;
                values of zero or less produce an empty result.

        Returns:
            One ``ParsedFile`` per processed file, at most ``max_files``.
        """
        parsed_files: list[ParsedFile] = []
        if max_files <= 0:
            return parsed_files

        for root, dirnames, filenames in os.walk(directory):
            # Sorting in place makes os.walk descend in a stable order.
            dirnames.sort()
            for filename in sorted(filenames):
                parsed_files.append(self.parse_file(Path(root) / filename))
                if len(parsed_files) >= max_files:
                    return parsed_files

        return parsed_files

    def extract_imports(self, parsed_file: ParsedFile) -> list[ImportStatement]:
        """Extract import statements from a parsed file.

        Args:
            parsed_file: Result of ``parse_file``.

        Returns:
            The imports found for Python, JavaScript, TypeScript and Go
            files. The list is empty for any other language and for files
            that have no syntax tree.
        """
        if parsed_file.ast is None:
            return []

        language = parsed_file.language
        content = parsed_file.content

        if language == "python":
            return self._extract_python_imports(content, parsed_file.path)
        if language in ("javascript", "typescript"):
            return self._extract_js_imports(content, parsed_file.path)
        if language == "go":
            return self._extract_go_imports(content, parsed_file.path)
        return []

    @staticmethod
    def _split_alias(clause: str) -> tuple[str, Optional[str]]:
        """Split ``name`` or ``name as alias`` into ``(name, alias)``.

        Returns ``("", None)`` for a blank clause.
        """
        tokens = clause.split()
        if not tokens:
            return "", None
        if len(tokens) >= 3 and tokens[1] == "as":
            return tokens[0], tokens[2]
        return tokens[0], None

    @staticmethod
    def _python_import_lines(content: str) -> list[tuple[int, str]]:
        """Collect Python import statements as ``(line_number, text)`` pairs.

        Trailing comments are removed. A statement continued with a
        trailing backslash, or a ``from ... import (`` whose parenthesis is
        still open, is joined with the following lines; the reported line
        number is the one the statement starts on.
        """
        physical = content.split("\n")
        statements: list[tuple[int, str]] = []
        index = 0
        while index < len(physical):
            line_number = index + 1
            text = physical[index].split("#", 1)[0].strip()
            index += 1
            if not text.startswith(("import ", "from ")):
                continue
            while index < len(physical) and (
                text.endswith("\\")
                # Only follow an open parenthesis on a real from-import so
                # prose such as "from the start (see ..." cannot swallow
                # the rest of the file.
                or (" import" in text and text.count("(") > text.count(")"))
            ):
                continuation = physical[index].split("#", 1)[0].strip()
                text = f"{text.rstrip(chr(92)).rstrip()} {continuation}"
                index += 1
            statements.append((line_number, text))
        return statements

    def _extract_python_imports(
        self, content: str, path: Path
    ) -> list[ImportStatement]:
        """Extract Python import statements.

        ``import a.b, c as d`` yields one entry per module, holding the
        top-level package name and the alias when present.
        ``from m import x, y as z`` yields one entry with ``module="m"``
        and ``names=["x", "y"]``. Lines that start with ``from`` but
        contain no ``import`` keyword are ignored.

        Args:
            content: Text of the Python file.
            path: Unused; kept so all extractors share one signature.

        Returns:
            The import statements in file order.
        """
        imports: list[ImportStatement] = []

        for line_number, text in self._python_import_lines(content):
            if text.startswith("import "):
                for clause in text[7:].split(","):
                    name, alias = self._split_alias(clause)
                    if name:
                        imports.append(
                            ImportStatement(
                                module=name.split(".")[0],
                                alias=alias,
                                line_number=line_number,
                                is_from=False,
                            )
                        )
                continue

            # from-import: parentheses only group the name list, so they
            # are turned into whitespace before tokenising.
            parts = text[5:].replace("(", " ").replace(")", " ").split()
            if "import" not in parts[1:]:
                continue
            keyword_at = parts.index("import", 1)
            names_text = " ".join(parts[keyword_at + 1:])
            names = [
                name
                for name, _alias in map(self._split_alias, names_text.split(","))
                if name
            ]
            imports.append(
                ImportStatement(
                    module=parts[0],
                    line_number=line_number,
                    is_from=True,
                    names=names,
                )
            )

        return imports

    def _extract_js_imports(
        self, content: str, path: Path
    ) -> list[ImportStatement]:
        """Extract JavaScript/TypeScript import statements.

        Recognises ``import ... from 'm'`` and ``import 'm'`` (reported
        with ``is_from=True``), including named imports whose braces span
        several lines, and ``require('m')`` calls anywhere on a line
        (reported with ``is_from=False``). Lines starting with ``//`` are
        not searched for ``require``.

        Args:
            content: Text of the JavaScript/TypeScript file.
            path: Unused; kept so all extractors share one signature.

        Returns:
            The import statements in file order. A multi-line import is
            reported on the line where it starts.
        """
        imports: list[ImportStatement] = []
        # Line on which a still-open ``import {`` started, if any.
        open_import_line: Optional[int] = None

        for line_number, raw_line in enumerate(content.split("\n"), start=1):
            line = raw_line.strip()

            if open_import_line is not None:
                # The module specifier follows the closing brace.
                if "}" in line:
                    match = self._JS_FROM_RE.search(line)
                    if match:
                        imports.append(
                            ImportStatement(
                                module=match.group(1),
                                line_number=open_import_line,
                                is_from=True,
                            )
                        )
                    open_import_line = None
                continue

            if line.startswith("import "):
                match = self._JS_IMPORT_RE.search(line)
                if match:
                    imports.append(
                        ImportStatement(
                            module=match.group(1),
                            line_number=line_number,
                            is_from=True,
                        )
                    )
                elif "{" in line and "}" not in line:
                    open_import_line = line_number
                continue

            if line.startswith("//"):
                continue
            for match in self._JS_REQUIRE_RE.finditer(line):
                imports.append(
                    ImportStatement(
                        module=match.group(1),
                        line_number=line_number,
                        is_from=False,
                    )
                )

        return imports

    def _extract_go_imports(
        self, content: str, path: Path
    ) -> list[ImportStatement]:
        """Extract Go import statements.

        Handles single-line declarations (``import "fmt"``,
        ``import f "fmt"``) and grouped ``import ( ... )`` declarations.
        An alias in front of the path (identifier, ``_`` or ``.``) is
        stored in ``alias``.

        Args:
            content: Text of the Go file.
            path: Unused; kept so all extractors share one signature.

        Returns:
            The import statements in file order, all with
            ``is_from=False``.
        """
        imports: list[ImportStatement] = []
        in_import_block = False

        for line_number, raw_line in enumerate(content.split("\n"), start=1):
            line = raw_line.strip()

            if in_import_block:
                if line.startswith(")"):
                    in_import_block = False
                    continue
                match = self._GO_SPEC_RE.match(line)
            elif self._GO_BLOCK_OPEN_RE.match(line):
                in_import_block = True
                continue
            else:
                match = self._GO_SINGLE_RE.match(line)

            if match:
                imports.append(
                    ImportStatement(
                        module=match.group(2),
                        alias=match.group(1),
                        line_number=line_number,
                        is_from=False,
                    )
                )

        return imports
|
||||
Reference in New Issue
Block a user