Initial upload with CI/CD workflow

This commit is contained in:
2026-01-30 22:12:49 +00:00
parent 70cc6415f7
commit 3a3a91f709

251
codesnap/core/parser.py Normal file
View File

@@ -0,0 +1,251 @@
"""Code parsing module using tree-sitter."""
import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

from tree_sitter import Language, Parser

from .language_detection import LanguageDetector
@dataclass
class ParsedFile:
    """Outcome of attempting to parse one source file.

    Attributes:
        path: Location of the file.
        language: Name of the language the file was treated as.
        content: Text of the file.
        ast: Parse result when one is available, otherwise ``None``.
        error: Failure message when one was recorded, otherwise ``None``.
    """

    # Required fields: always supplied by the caller.
    path: Path
    language: str
    content: str

    # Optional fields: both default to "nothing recorded".
    ast: Optional[object] = None
    error: Optional[str] = None
@dataclass
class ImportStatement:
    """Represents a single import/require statement found in a source file.

    Attributes:
        module: Name or path of the imported module.
        alias: Local name the module is bound to, or ``None`` when no alias
            was recorded.
        line_number: Line on which the statement appears; ``0`` when unset.
        is_from: ``True`` when the statement uses a ``from``-style form.
        names: Individual names pulled in by the statement (empty by default).
    """

    module: str
    alias: Optional[str] = None
    line_number: int = 0
    # The annotation must be the type ``bool``; the value ``False`` is only
    # the default.
    is_from: bool = False
    # default_factory gives every instance its own list.
    names: list[str] = field(default_factory=list)
# Cache of configured parsers keyed by language name, so a parser is built
# at most once per language for the lifetime of this module.
LANGUAGE_PARSERS: dict[str, Parser] = {}


def get_parser(language: str) -> Parser:
    """Return the tree-sitter parser for ``language``, creating it on first use.

    Args:
        language: Language name; used as the cache key and to load the grammar.

    Returns:
        The parser stored in ``LANGUAGE_PARSERS`` for ``language``.

    Raises:
        ValueError: If the grammar cannot be loaded or the parser cannot be
            configured. The underlying exception is chained as ``__cause__``.
    """
    if language in LANGUAGE_PARSERS:
        return LANGUAGE_PARSERS[language]

    try:
        # NOTE(review): confirm that ``Language.for_language`` and
        # ``Parser.set_language`` exist in the pinned tree-sitter release;
        # if they do not, every call ends up in the ValueError path below.
        lang = Language.for_language(language)
        parser = Parser()
        parser.set_language(lang)
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise ValueError(f"Failed to load parser for {language}: {e}") from e

    # Only cache parsers that were fully configured.
    LANGUAGE_PARSERS[language] = parser
    return parser
class CodeParser:
    """Parse source files with tree-sitter and extract their import statements.

    Read and parse failures are reported through ``ParsedFile.error`` rather
    than raised. Import extraction is line-based text matching over the file
    content; the syntax tree only gates it (files without a tree yield no
    imports).
    """

    # Quoted module specifier on an ES-module import line. The leading clause
    # is optional so bare side-effect imports (``import "x"``) match too.
    _JS_IMPORT_RE = re.compile(
        r"(?:import\s+(?:\{[^}]*\}|\*|[\w$]+)(?:\s+as\s+[\w$]+)?\s+from\s+)?['\"]([^'\"]+)['\"]"
    )
    # CommonJS ``require("x")`` call anywhere on a line.
    _JS_REQUIRE_RE = re.compile(r"\brequire\(\s*['\"]([^'\"]+)['\"]")
    # One Go import spec: optional alias (a name, ``_`` or ``.``), then the path.
    _GO_SPEC_RE = re.compile(r'(?:(?P<alias>[\w.]+)\s+)?"(?P<module>[^"]+)"')
    # Single-line Go import declaration, with the same optional alias.
    _GO_SINGLE_RE = re.compile(
        r'import\s+(?:(?P<alias>[\w.]+)\s+)?"(?P<module>[^"]+)"'
    )

    def __init__(self) -> None:
        """Create the language detector used by ``parse_file``."""
        self.language_detector = LanguageDetector()

    def parse_file(self, path: Path, content: Optional[str] = None) -> ParsedFile:
        """Parse a single file.

        Args:
            path: File to parse; read as UTF-8 when ``content`` is not given.
            content: Text to parse instead of reading ``path``.

        Returns:
            A ``ParsedFile``. Its language is ``"unknown"`` when the file
            cannot be read or the detector returns ``None``; ``error`` is set
            when reading or parsing failed; ``ast`` is set on success.
        """
        if content is None:
            try:
                content = path.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError) as e:
                return ParsedFile(
                    path=path,
                    language="unknown",
                    content="",
                    error=str(e),
                )

        language = self.language_detector.detect(path, content)
        if language is None:
            return ParsedFile(path=path, language="unknown", content=content)

        try:
            parser = get_parser(language)
            tree = parser.parse(content.encode("utf-8"))
        except Exception as e:
            # Best-effort: any parser failure is recorded on the result so
            # one bad file does not abort a directory scan.
            return ParsedFile(
                path=path,
                language=language,
                content=content,
                error=str(e),
            )
        return ParsedFile(path=path, language=language, content=content, ast=tree)

    def parse_directory(
        self, directory: Path, max_files: int = 1000
    ) -> list[ParsedFile]:
        """Parse the files found under ``directory``, recursively.

        Every file yielded by ``os.walk`` is passed to ``parse_file``; no
        filtering by name or type happens here.

        Args:
            directory: Root of the tree to walk.
            max_files: Upper bound on the number of files processed.

        Returns:
            One ``ParsedFile`` per processed file, in walk order.
        """
        parsed_files: list[ParsedFile] = []
        for root, _, filenames in os.walk(directory):
            if len(parsed_files) >= max_files:
                break
            for filename in filenames:
                if len(parsed_files) >= max_files:
                    break
                parsed_files.append(self.parse_file(Path(root) / filename))
        return parsed_files

    def extract_imports(self, parsed_file: ParsedFile) -> list[ImportStatement]:
        """Extract import statements from a parsed file.

        Args:
            parsed_file: Result of ``parse_file``.

        Returns:
            The imports found for Python, JavaScript, TypeScript and Go files;
            an empty list when the file has no syntax tree or its language is
            not one of those.
        """
        if parsed_file.ast is None:
            return []
        extractors = {
            "python": self._extract_python_imports,
            "javascript": self._extract_js_imports,
            "typescript": self._extract_js_imports,
            "go": self._extract_go_imports,
        }
        extractor = extractors.get(parsed_file.language)
        if extractor is None:
            return []
        return extractor(parsed_file.content, parsed_file.path)

    @staticmethod
    def _python_clause(text: str) -> str:
        """Drop a trailing ``# comment`` or ``; statement`` from an import line."""
        return text.split("#", 1)[0].split(";", 1)[0]

    def _extract_python_imports(
        self, content: str, path: Path
    ) -> list[ImportStatement]:
        """Extract Python import statements, one line at a time.

        ``import a.b, c as d`` yields one entry per module, with dotted names
        reduced to their top-level package and ``as`` aliases recorded.
        ``from m import x as y, z`` yields one entry for ``m`` whose ``names``
        are the imported names without aliases. Names that continue on
        following lines (parenthesised or backslash-continued) are not
        collected.

        Args:
            content: Source text to scan.
            path: Unused; kept so all extractors share one signature.

        Returns:
            The import statements found, in source order.
        """
        imports: list[ImportStatement] = []
        for line_number, raw_line in enumerate(content.split("\n"), start=1):
            line = raw_line.strip()
            if line.startswith("import "):
                # A single statement may import several comma-separated modules.
                for spec in self._python_clause(line[7:]).split(","):
                    tokens = spec.split()
                    if not tokens:
                        continue
                    has_alias = len(tokens) >= 3 and tokens[1] == "as"
                    imports.append(
                        ImportStatement(
                            module=tokens[0].split(".")[0],
                            alias=tokens[2] if has_alias else None,
                            line_number=line_number,
                            is_from=False,
                        )
                    )
            elif line.startswith("from "):
                parts = self._python_clause(line[5:]).split()
                if not parts:
                    continue
                names: list[str] = []
                if "import" in parts:
                    # Re-join everything after ``import`` so that names
                    # separated by ", " are all seen, not just the first.
                    imported = " ".join(parts[parts.index("import") + 1:])
                    for spec in imported.split(","):
                        tokens = spec.strip("()\\ \t").split()
                        if tokens:
                            names.append(tokens[0])
                imports.append(
                    ImportStatement(
                        module=parts[0],
                        line_number=line_number,
                        is_from=True,
                        names=names,
                    )
                )
        return imports

    def _extract_js_imports(
        self, content: str, path: Path
    ) -> list[ImportStatement]:
        """Extract JavaScript/TypeScript import statements, one line at a time.

        Lines starting with ``import `` contribute the first quoted module
        specifier (``is_from=True``). On any other line, every
        ``require("...")`` call contributes an entry (``is_from=False``), so
        forms such as ``const x = require("y")`` are found.

        Args:
            content: Source text to scan.
            path: Unused; kept so all extractors share one signature.

        Returns:
            The import statements found, in source order.
        """
        imports: list[ImportStatement] = []
        for line_number, raw_line in enumerate(content.split("\n"), start=1):
            line = raw_line.strip()
            if line.startswith("import "):
                match = self._JS_IMPORT_RE.search(line)
                if match:
                    imports.append(
                        ImportStatement(
                            module=match.group(1),
                            line_number=line_number,
                            is_from=True,
                        )
                    )
                continue
            for match in self._JS_REQUIRE_RE.finditer(line):
                imports.append(
                    ImportStatement(
                        module=match.group(1),
                        line_number=line_number,
                        is_from=False,
                    )
                )
        return imports

    def _extract_go_imports(
        self, content: str, path: Path
    ) -> list[ImportStatement]:
        """Extract Go import statements, one line at a time.

        Handles both ``import "pkg"`` declarations and ``import ( ... )``
        blocks. An alias preceding the path (a name, ``_`` or ``.``) is
        recorded in ``alias``. Inside a block, a line must begin with the
        import spec, so commented-out entries are skipped.

        Args:
            content: Source text to scan.
            path: Unused; kept so all extractors share one signature.

        Returns:
            The import statements found, in source order.
        """
        imports: list[ImportStatement] = []
        in_import_block = False
        for line_number, raw_line in enumerate(content.split("\n"), start=1):
            line = raw_line.strip()
            if line == "import (":
                in_import_block = True
                continue
            if in_import_block:
                if line == ")":
                    in_import_block = False
                    continue
                match = self._GO_SPEC_RE.match(line)
            else:
                match = self._GO_SINGLE_RE.match(line)
            if match:
                imports.append(
                    ImportStatement(
                        module=match.group("module"),
                        alias=match.group("alias"),
                        line_number=line_number,
                        is_from=False,
                    )
                )
        return imports