# man-card/man_card/man_parser.py

"""Man page parser module."""

import os
import re
import subprocess
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class Option:
    """A single command-line option (flag) extracted from a man page."""
    # Flag text as it appears in the page, e.g. "-v" or "-v, --verbose".
    flag: str
    # Human-readable description captured for the flag.
    description: str
    # Placeholder for the option's argument, if any; None when not recorded.
    argument: Optional[str] = None


@dataclass
class CommandInfo:
    """Structured information extracted from a man page.

    Only ``name`` is required; every other field starts out empty and is
    filled in from the matching man page section when that section is found.
    """
    # Command name; replaced by the name found in the NAME section, if any.
    name: str
    # Non-blank lines of the SYNOPSIS section joined with single spaces.
    synopsis: str = ""
    # Non-blank lines of the DESCRIPTION section joined with single spaces.
    description: str = ""
    # Options recognised in the OPTIONS section, in page order.
    options: list[Option] = field(default_factory=list)
    # Lines of the EXAMPLES section, excluding blanks and "#" comment lines.
    examples: list[str] = field(default_factory=list)
    # Manual section number as a string; defaults to "1".
    section: str = "1"


class ManPageParser:
    """Parser for Unix man pages.

    ``parse`` renders a page with the system ``man`` command and hands the
    text to ``_parse_content``, which splits it into top-level sections and
    fills a ``CommandInfo`` from the NAME, SYNOPSIS, DESCRIPTION, OPTIONS and
    EXAMPLES sections. Every other section is ignored.
    """

    # Section heading: a line consisting only of capitals and whitespace.
    SECTION_PATTERN = re.compile(r'^([A-Z][A-Z\s]+)$')
    # "name - one line summary", as found in the NAME section.
    NAME_PATTERN = re.compile(r'^([a-zA-Z0-9_-]+)\s*-\s*(.+)$')
    # Not used by the parser itself; kept because it is a class attribute
    # that other code may reference.
    SYNOPSIS_PATTERN = re.compile(r'^\s*(?:\\fB)?(.+?)(?:\\fR)?\s*$')
    # Option header line. Groups: 1/"flag" = flag spec including any attached
    # "=ARG" or "[=ARG]", 2/"long" = long form when it follows a short flag,
    # 3/"desc" = inline description (None when the description is on the
    # following lines).
    OPTION_PATTERN = re.compile(
        r'^(?P<flag>'
        r'(?:-[a-zA-Z0-9](?:,?\s*(?P<long>--[a-zA-Z0-9-]+))?|--[a-zA-Z0-9-]+)'
        r'(?:\[=[^\]\s]+\]|=\S+)?'
        r')'
        r'(?:\s+(?P<desc>.+))?$'
    )
    BLANK_LINE = re.compile(r'^\s*$')
    FONT_PATTERN = re.compile(r'\\f[BIRP]')
    # "x<backspace>" pairs used by nroff-style output for bold and underline.
    OVERSTRIKE_PATTERN = re.compile(r'.\x08')

    # Sections whose content is extracted; all others are skipped.
    _KNOWN_SECTIONS = frozenset(
        {"NAME", "SYNOPSIS", "OPTIONS", "DESCRIPTION", "EXAMPLES"}
    )

    def parse(self, command: str, section: str = "1") -> "CommandInfo":
        """Fetch and parse the man page for ``command``.

        Args:
            command: Name of the page to look up.
            section: Manual section to search, as a string (default "1").

        Returns:
            The parsed ``CommandInfo``; its ``section`` is the one requested.

        Raises:
            ValueError: If ``man`` cannot be started, exits with a non-zero
                status, or does not finish within 30 seconds.
        """
        # Disable paging and render wide so lines are not wrapped early.
        env = {**os.environ, "PAGER": "", "COLUMNS": "200"}
        try:
            result = subprocess.run(
                ["man", "-P", "cat", section, command],
                capture_output=True,
                text=True,
                env=env,
                timeout=30,
            )
        except subprocess.TimeoutExpired as exc:
            raise ValueError(
                f"man: {command}: timeout while fetching man page"
            ) from exc
        except FileNotFoundError as exc:
            # The `man` executable itself is missing; report it through the
            # same exception type as every other lookup failure.
            raise ValueError(
                f"man: {command}: 'man' executable not found"
            ) from exc

        if result.returncode != 0:
            raise ValueError(f"man: {command}: command not found")

        info = self._parse_content(result.stdout, command)
        info.section = section
        return info

    def _parse_content(self, content: str, command: str) -> "CommandInfo":
        """Parse rendered man page text into a ``CommandInfo``.

        Args:
            content: Full text of the rendered page.
            command: Name used for the result until a NAME section
                provides one.

        Returns:
            A ``CommandInfo`` populated from the recognised sections.
        """
        # Drop bold/underline overstrikes so headings and flags match.
        content = self.OVERSTRIKE_PATTERN.sub('', content)
        info = CommandInfo(name=command)

        for section_name, section_lines in self._split_sections(content):
            self._process_section(info, section_name, section_lines)

        if not info.synopsis:
            # No SYNOPSIS section: fall back to the command name taken from
            # the first "name - summary" line anywhere in the page.
            for line in content.split('\n'):
                name_match = self.NAME_PATTERN.match(line.strip())
                if name_match:
                    info.synopsis = name_match.group(1)
                    break

        return info

    def _split_sections(self, content: str) -> list[tuple[str, list[str]]]:
        """Split page text into ``(section name, lines)`` pairs.

        Only recognised sections that contain at least one non-blank line
        are returned, in page order. Lines are stripped and have font
        escapes removed; blank lines inside a section are kept as "".
        """
        sections: list[tuple[str, list[str]]] = []
        current_name = ""
        current_lines: list[str] = []
        heading_indent: Optional[int] = None

        for raw_line in content.split('\n'):
            stripped = raw_line.strip()
            indent = len(raw_line) - len(raw_line.lstrip())
            heading = self.SECTION_PATTERN.match(stripped)

            # An all-caps line indented deeper than the headings seen so far
            # is body text (e.g. a tagged paragraph), not a new section.
            if heading and (heading_indent is None or indent <= heading_indent):
                heading_indent = indent
                if current_name and current_lines:
                    sections.append((current_name, current_lines))
                name = heading.group(1).strip()
                current_name = name if name in self._KNOWN_SECTIONS else ""
                current_lines = []
                continue

            if not current_name:
                continue

            text = self.FONT_PATTERN.sub('', stripped).strip()
            if text:
                current_lines.append(text)
            elif current_lines:
                # Keep paragraph breaks, but never lead with a blank line.
                current_lines.append("")

        if current_name and current_lines:
            sections.append((current_name, current_lines))
        return sections

    def _process_section(
        self,
        info: "CommandInfo",
        section: str,
        buffer: list[str],
        options_buffer: Optional[list[tuple]] = None,
    ):
        """Store the content of one section on ``info``.

        Args:
            info: Object to update in place.
            section: Section name; unrecognised names are ignored.
            buffer: Lines of the section ("" marks a blank line).
            options_buffer: Unused; accepted for backward compatibility.
        """
        if section == "NAME":
            for line in buffer:
                match = self.NAME_PATTERN.match(line.strip())
                if match:
                    info.name = match.group(1)
                    break
        elif section == "SYNOPSIS":
            synopsis_lines = [line for line in buffer if line.strip()]
            if synopsis_lines:
                info.synopsis = ' '.join(synopsis_lines)
        elif section == "OPTIONS":
            info.options.extend(self._parse_options(buffer))
        elif section == "DESCRIPTION":
            info.description = ' '.join(
                line for line in buffer if line.strip()
            )
        elif section == "EXAMPLES":
            info.examples.extend(
                line for line in buffer
                if line.strip() and not line.strip().startswith('#')
            )

    def _parse_options(self, lines: list[str]) -> list["Option"]:
        """Build ``Option`` objects from the lines of an OPTIONS section.

        A line matching ``OPTION_PATTERN`` starts a new option. The lines
        that follow, up to the next blank line or option header, are
        appended to its description.
        """
        options = []
        current = None

        for raw_line in lines:
            line = raw_line.strip()
            if not line:
                # A blank line ends the current option's paragraph.
                current = None
                continue

            match = self.OPTION_PATTERN.match(line)
            if match:
                # Separate an attached "=ARG" / "[=ARG]" from the flag text.
                flag, has_arg, arg = match.group("flag").partition("=")
                current = Option(
                    flag=flag.rstrip("[").strip(),
                    description=(match.group("desc") or "").strip(),
                    argument=arg.rstrip("]") if has_arg else None,
                )
                options.append(current)
            elif current is not None:
                current.description = (
                    f"{current.description} {line}".strip()
                )

        return options