Add parsers: Python, Go, and JavaScript docstring parsers

2026-01-31 00:56:37 +00:00
parent fb2fb893aa
commit 3ac81c4290
1 changed files with 316 additions and 0 deletions
--- a/doc2man/parsers/javascript.py
+++ b/doc2man/parsers/javascript.py
@@ -0,0 +1,316 @@
 """JavaScript docstring parser for Doc2Man."""
 import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 class JavaScriptDocstringParser:
    """Parser for JSDoc comments from JavaScript/TypeScript files.

    The parser is line-oriented and regex-based: it does not build a syntax
    tree. It recognises ``/** ... */`` blocks, the tags inside them, and
    function/class declarations that start on a single source line.
    """

    # One complete ``/** ... */`` comment; group 1 is the text between the
    # delimiters.
    JSDOC_PATTERN = re.compile(
        r'/\*\*([^*]*(?:\*(?!/)[^*]*)*)\*/',
        re.DOTALL
    )
    # A tag line. Only group 1 (the tag name) is used by this class; the
    # remaining groups are kept so the public pattern stays unchanged.
    TAG_PATTERN = re.compile(
        r'@(\w+)(?:\s*(\{[^\{\}]+\}))?(?:\s*(\S+))?(?:\s+(.*))?',
        re.DOTALL
    )

    # Lines that start like this are comment text, never declarations.
    _COMMENT_PREFIXES = ("*", "/*", "//")
    # Directive prologues that may legitimately precede a file-level comment.
    _DIRECTIVES = ('"use strict";', "'use strict';", '"use strict"', "'use strict'")
    # Tags whose body is source code and must keep its line structure.
    _EXAMPLE_TAGS = ("example", "examples")
    # Tried in order against each stripped source line; group 1 is the name.
    # ``\b`` after ``function``/``async`` keeps identifiers such as
    # ``asyncFetch`` from being mistaken for the keyword.
    _FUNCTION_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r'export\s+default\s+(?:async\s+)?function\s+(\w+)',
        r'export\s+async\s+function\s+(\w+)',
        r'export\s+function\s+(\w+)',
        # Exported bindings are listed whatever value they are assigned.
        r'export\s+(?:const|let)\s+(\w+)\s*=',
        r'async\s+function\s+(\w+)',
        r'^function\s+(\w+)',
        r'\b(?:const|let)\s+(\w+)\s*=\s*(?:function|async)\b',
        r'\b(?:const|let)\s+(\w+)\s*=\s*\([^)]*\)\s*=>',
    ))
    _CLASS_PATTERN = re.compile(
        r'(?:export\s+(?:default\s+)?)?(?:declare\s+)?(?:abstract\s+)?class\s+(\w+)'
    )

    def __init__(self):
        """Initialize the parser (it keeps no per-instance state)."""

    def parse_file(self, file_path: Path) -> Dict[str, Any]:
        """Parse a JavaScript/TypeScript file and extract documentation.

        The file is read as UTF-8, falling back to Latin-1 when the bytes are
        not valid UTF-8.

        Args:
            file_path: Path of the source file to read.

        Returns:
            The dictionary produced by :meth:`parse_content`.

        Raises:
            ValueError: If the file cannot be read or parsed. The underlying
                exception is attached as ``__cause__``.
        """
        try:
            try:
                with open(file_path, "r", encoding="utf-8") as handle:
                    content = handle.read()
            except UnicodeDecodeError:
                with open(file_path, "r", encoding="latin-1") as handle:
                    content = handle.read()
            return self.parse_content(content, str(file_path))
        except Exception as exc:
            # Callers rely on ValueError as the single failure type, so every
            # error is wrapped; chaining keeps the original traceback.
            raise ValueError(f"Error reading file {file_path}: {exc}") from exc

    def parse_content(self, content: str, file_path: str = "") -> Dict[str, Any]:
        """Parse JavaScript/TypeScript content and extract documentation.

        Args:
            content: Full source text.
            file_path: Path recorded verbatim in the result.

        Returns:
            A dict with the keys ``title``, ``description``,
            ``module_docstring``, ``functions``, ``classes``, ``file_path``
            and ``language``.
        """
        result = {
            "title": None,
            "description": None,
            "module_docstring": None,
            "functions": [],
            "classes": [],
            "file_path": file_path,
            "language": "javascript",
        }
        module_doc = self._extract_module_documentation(content)
        if module_doc:
            tags = module_doc.get("tags") or {}
            description = module_doc.get("description")
            if not description:
                # A header made only of tags usually carries its text in
                # @file (or one of its synonyms).
                description = next(
                    (tags[name] for name in ("file", "fileoverview", "overview")
                     if tags.get(name)),
                    description,
                )
            result["module_docstring"] = description
            result["description"] = description
            # The comment parser emits no "title" key, so use @module's name.
            result["title"] = module_doc.get("title") or tags.get("module") or None
        result["functions"] = self._extract_functions(content)
        result["classes"] = self._extract_classes(content)
        return result

    def _extract_module_documentation(self, content: str) -> Optional[Dict[str, Any]]:
        """Extract module/file-level documentation.

        Only a JSDoc block that appears before any code counts. Blank lines,
        ``//`` comments, a shebang, ``"use strict"`` and plain ``/* */``
        comments may precede it.

        Returns:
            The parsed comment, or ``None`` when the file has no such block.
        """
        in_block_comment = False
        for index, line in enumerate(content.split("\n")):
            text = line.strip()
            if in_block_comment:
                in_block_comment = "*/" not in text
                continue
            if not text or text.startswith(("//", "#!")) or text in self._DIRECTIVES:
                continue
            if text.startswith("/**") and not text.startswith("/**/"):
                comment = self._extract_jsdoc_block(content, index)
                return self._parse_jsdoc_comment(comment) if comment else None
            if text.startswith("/*"):
                # Plain block comment (e.g. a licence header): skip past it.
                in_block_comment = "*/" not in text
                continue
            # First real code line (import, export or anything else).
            break
        return None

    def _extract_jsdoc_block(self, content: str, start: int) -> Optional[str]:
        """Extract the JSDoc block that opens on line index ``start``.

        Args:
            content: Full source text.
            start: Zero-based index of the line expected to begin with ``/**``.

        Returns:
            The comment text without its delimiters, stripped, or ``None``
            when that line does not open a terminated JSDoc block.
        """
        lines = content.split("\n")
        if start < 0 or start >= len(lines):
            return None
        opener = lines[start].strip()
        if not opener.startswith("/**"):
            return None
        end = start
        # Look for the terminator after the three opening characters so that
        # the "*/" overlapping "/**/" is not mistaken for one.
        if "*/" not in opener[3:]:
            end = start + 1
            while end < len(lines) and "*/" not in lines[end]:
                end += 1
        match = self.JSDOC_PATTERN.search("\n".join(lines[start:end + 1]))
        if match:
            return match.group(1).strip()
        return None

    def _find_preceding_jsdoc(self, lines: List[str], decl_index: int) -> Optional[str]:
        """Return the JSDoc text that documents the declaration at ``decl_index``.

        The comment must end on the closest non-blank line above the
        declaration; lines starting with ``@`` (decorators) are skipped too.

        Args:
            lines: The source split into lines.
            decl_index: Zero-based index of the declaration line.

        Returns:
            The comment text without delimiters, or ``None`` when the
            declaration has no JSDoc block directly above it.
        """
        end = decl_index - 1
        while end >= 0:
            text = lines[end].strip()
            if text and not text.startswith("@"):
                break
            end -= 1
        if end < 0 or not lines[end].rstrip().endswith("*/"):
            return None
        start = end
        while not lines[start].lstrip().startswith("/*"):
            start -= 1
            # Another terminator before any opener means the "*/" we started
            # from does not close a comment that begins its own line.
            if start < 0 or "*/" in lines[start]:
                return None
        if not lines[start].lstrip().startswith("/**"):
            return None  # plain block comment, not JSDoc
        match = self.JSDOC_PATTERN.search("\n".join(lines[start:end + 1]))
        if match:
            return match.group(1).strip()
        return None

    def _parse_jsdoc_comment(self, comment: str) -> Dict[str, Any]:
        """Parse a JSDoc comment and extract its components.

        Args:
            comment: Comment text without the ``/**`` and ``*/`` delimiters.

        Returns:
            A dict with ``description`` (str), ``params`` (list of dicts),
            ``returns`` (dict or ``None``), ``examples`` (list of str) and
            ``tags`` (every other tag name mapped to its text, ``""`` for a
            bare tag such as ``@deprecated``).
        """
        result = {
            "description": "",
            "params": [],
            "returns": None,
            "examples": [],
            "tags": {},
        }
        description_lines = []
        current_tag = None
        tag_lines = []
        for raw_line in comment.split("\n"):
            text = raw_line.strip()
            if text.startswith("*"):
                # Drop the decorative asterisk and the single space after it;
                # deeper indentation is kept for example code.
                text = text[1:]
                if text.startswith(" "):
                    text = text[1:]
            text = text.rstrip()
            stripped = text.strip()
            if stripped.startswith("@"):
                if current_tag is not None:
                    self._add_tag_content(result, current_tag, tag_lines)
                match = self.TAG_PATTERN.match(stripped)
                if match:
                    current_tag = match.group(1)
                    # Keep the tag body verbatim; the per-tag parsers split
                    # out the type, name and description themselves.
                    remainder = stripped[match.end(1):].strip()
                    tag_lines = [remainder] if remainder else []
                else:
                    current_tag = None
                    tag_lines = []
            elif current_tag is not None:
                keep_indent = current_tag in self._EXAMPLE_TAGS
                tag_lines.append(text if keep_indent else stripped)
            elif stripped:
                description_lines.append(stripped)
        if current_tag is not None:
            self._add_tag_content(result, current_tag, tag_lines)
        result["description"] = "\n".join(description_lines).strip()
        return result

    def _add_tag_content(self, result: Dict, tag: str, content: List[str]) -> None:
        """Add parsed tag content to the result.

        Args:
            result: Dict being built by :meth:`_parse_jsdoc_comment`; mutated
                in place.
            tag: Tag name without the leading ``@``.
            content: Lines that make up the tag body.
        """
        combined = " ".join(part.strip() for part in content if part.strip())
        if tag in ("param", "arg", "argument"):
            param = self._parse_param_tag(combined)
            if param:
                result["params"].append(param)
        elif tag in ("returns", "return"):
            ret = self._parse_returns_tag(combined)
            if ret:
                result["returns"] = ret
        elif tag == "example":
            example = self._join_example_lines(content)
            if example:
                result["examples"].append(example)
        elif tag == "examples":
            # Each non-blank line is treated as a separate example.
            result["examples"].extend(
                line.strip() for line in content if line.strip()
            )
        else:
            result["tags"][tag] = combined

    @staticmethod
    def _join_example_lines(content: List[str]) -> str:
        """Join example lines with newlines, removing their common indent."""
        body = [line.rstrip() for line in content]
        indents = [len(line) - len(line.lstrip()) for line in body if line]
        margin = min(indents) if indents else 0
        return "\n".join(line[margin:] for line in body).strip("\n")

    @staticmethod
    def _split_type_expression(content: str) -> tuple:
        """Split a leading ``{type}`` expression off a tag body.

        Braces are matched by depth so record types such as
        ``{{a: number}}`` stay in one piece.

        Returns:
            A ``(type, rest)`` pair; ``type`` is ``""`` when the body does
            not start with a balanced brace expression.
        """
        text = content.lstrip()
        if not text.startswith("{"):
            return "", text
        depth = 0
        for index, char in enumerate(text):
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    return text[1:index].strip(), text[index + 1:]
        return "", text  # unbalanced braces: treat the body as untyped

    def _parse_param_tag(self, content: str) -> Optional[Dict[str, str]]:
        """Parse a @param tag body of the form ``{type} name - description``.

        Returns:
            A dict with ``name``, ``type`` and ``description``, or ``None``
            when no parameter name is present.
        """
        if not content:
            return None
        type_expr, rest = self._split_type_expression(content)
        parts = rest.split(None, 1)
        if not parts:
            return None
        description = parts[1].lstrip() if len(parts) > 1 else ""
        if description.startswith("-"):
            description = description[1:]  # optional "name - text" separator
        return {
            "name": parts[0],
            "type": type_expr,
            "description": description.strip(),
        }

    def _parse_returns_tag(self, content: str) -> Optional[Dict[str, str]]:
        """Parse a @returns tag body of the form ``{type} description``.

        Returns:
            A dict with ``type`` and ``description``, or ``None`` when the
            body is empty.
        """
        if not content:
            return None
        type_expr, rest = self._split_type_expression(content)
        description = rest.lstrip()
        if description.startswith("-"):
            description = description[1:]
        return {"type": type_expr, "description": description.strip()}

    def _extract_functions(self, content: str) -> List[Dict[str, Any]]:
        """Extract function documentation from source.

        Names starting with ``_`` are treated as private and skipped.

        Returns:
            One dict per declaration with ``name``, ``description``,
            ``args``, ``returns``, ``examples`` and the 1-based
            ``line_number``.
        """
        functions = []
        lines = content.split("\n")
        for line_num, line in enumerate(lines):
            stripped = line.strip()
            if stripped.startswith(self._COMMENT_PREFIXES):
                # Prose such as "* wraps the async function fetch" must not
                # be read as a declaration.
                continue
            for pattern in self._FUNCTION_PATTERNS:
                match = pattern.search(stripped)
                if not match:
                    continue
                func_name = match.group(1)
                if func_name.startswith("_"):
                    continue
                func_doc = {
                    "name": func_name,
                    "description": "",
                    "args": [],
                    "returns": None,
                    "examples": [],
                    "line_number": line_num + 1,
                }
                jsdoc = self._find_preceding_jsdoc(lines, line_num)
                if jsdoc:
                    parsed = self._parse_jsdoc_comment(jsdoc)
                    func_doc["description"] = parsed.get("description", "")
                    func_doc["args"] = parsed.get("params", [])
                    func_doc["returns"] = parsed.get("returns")
                    func_doc["examples"] = parsed.get("examples", [])
                functions.append(func_doc)
                break
        return functions

    def _extract_classes(self, content: str) -> List[Dict[str, Any]]:
        """Extract class documentation from source.

        Returns:
            One dict per class declaration with ``name``, ``description``,
            ``methods`` (always empty; methods are not extracted) and the
            1-based ``line_number``.
        """
        classes = []
        lines = content.split("\n")
        for line_num, line in enumerate(lines):
            stripped = line.strip()
            if stripped.startswith(self._COMMENT_PREFIXES):
                continue
            match = self._CLASS_PATTERN.match(stripped)
            if not match:
                continue
            class_doc = {
                "name": match.group(1),
                "description": "",
                "methods": [],
                "line_number": line_num + 1,
            }
            jsdoc = self._find_preceding_jsdoc(lines, line_num)
            if jsdoc:
                parsed = self._parse_jsdoc_comment(jsdoc)
                class_doc["description"] = parsed.get("description", "")
            classes.append(class_doc)
        return classes
 def parse_javascript_file(file_path: Path) -> Dict[str, Any]:
    """Convenience wrapper: parse one JavaScript/TypeScript file.

    Creates a fresh JavaScriptDocstringParser and returns whatever its
    ``parse_file`` method returns for ``file_path``.
    """
    return JavaScriptDocstringParser().parse_file(file_path)