curl-converter-cli/curlconverter/parser.py

"""Parser module for curl commands."""

import re
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ParsedCurl:
    """Structured view of the request described by a curl command line.

    Attributes:
        url: Target URL of the request.
        method: HTTP method name; defaults to ``"GET"``.
        headers: Mapping of header name to header value.
        data: Request body, or ``None`` when no body was given.
        auth: Credentials, typically a ``(user, password)`` pair, or
            ``None`` when no credentials were given.
        cookies: Raw cookie string, or ``None``.
        user_agent: User-Agent string, or ``None``.
    """

    # Only the URL is mandatory; every other component has a default.
    url: str
    method: str = "GET"
    # default_factory gives each instance its own dict rather than a shared one.
    headers: dict = field(default_factory=dict)
    data: Optional[str] = None
    auth: Optional[tuple] = None
    cookies: Optional[str] = None
    user_agent: Optional[str] = None


def unquote(s: str) -> str:
    """Remove one pair of matching outer quotes from a string.

    Args:
        s: The string to unquote.

    Returns:
        ``s`` without its first and last character when both are the same
        quote character (``"`` or ``'``); otherwise ``s`` unchanged. Empty
        input is returned as-is.
    """
    # A quoted string needs at least an opening and a closing quote; a lone
    # quote character is not a quoted (empty) string and must be kept.
    if not s or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s


# curl options that take a value the parser does not use. The value must still
# be consumed so that it is not mistaken for the URL.
_IGNORED_VALUE_OPTIONS = frozenset({
    "-o", "--output",
    "-m", "--max-time",
    "--connect-timeout",
    "-x", "--proxy",
    "-e", "--referer",
    "-w", "--write-out",
    "-c", "--cookie-jar",
    "-E", "--cert",
    "--key",
    "--cacert",
    "--retry",
})

# Options whose value is interpreted by parse_curl, grouped by meaning.
_METHOD_OPTIONS = ("-X", "--request")
_HEADER_OPTIONS = ("-H", "--header")
_DATA_OPTIONS = ("-d", "--data", "--data-raw", "--data-ascii", "--data-binary")
_USER_OPTIONS = ("-u", "--user")
_COOKIE_OPTIONS = ("-b", "--cookie")
_USER_AGENT_OPTIONS = ("-A", "--user-agent")
_PARSED_VALUE_OPTIONS = frozenset(
    _METHOD_OPTIONS
    + _HEADER_OPTIONS
    + _DATA_OPTIONS
    + _USER_OPTIONS
    + _COOKIE_OPTIONS
    + _USER_AGENT_OPTIONS
)


def _basic_auth_from_headers(headers: dict) -> Optional[tuple]:
    """Return ``(user, password)`` decoded from a Basic Authorization header.

    The header name is matched case-insensitively. Returns ``None`` when there
    is no such header, it is not a ``Basic`` credential, or it cannot be
    decoded into a ``user:password`` string.
    """
    import base64

    for name, value in headers.items():
        if name.lower() != "authorization" or not value.startswith("Basic "):
            continue
        try:
            # binascii.Error and UnicodeDecodeError are both ValueError
            # subclasses; a malformed header is simply not used for auth.
            decoded = base64.b64decode(value[len("Basic "):]).decode("utf-8")
        except ValueError:
            return None
        user, sep, password = decoded.partition(":")
        return (user, password) if sep else None
    return None


def parse_curl(curl_command: str) -> ParsedCurl:
    """Parse a curl command string into structured data.

    The command is split with ``tokenize_command``; a leading ``curl`` token
    is dropped and the remaining tokens are scanned once. The first token that
    is not an option (and not an option's value) is taken as the URL.
    Options that are not recognised are treated as value-less flags and
    skipped.

    Args:
        curl_command: The curl command string to parse.

    Returns:
        ParsedCurl object with extracted components. ``method`` defaults to
        ``GET`` and becomes ``POST`` when data is supplied without an explicit
        ``-X/--request``. ``auth`` is always a ``(user, password)`` tuple when
        set; ``-u user`` without a colon yields ``(user, "")``. A decodable
        ``Authorization: Basic ...`` header takes precedence over ``-u``.
        A URL without a scheme is prefixed with ``https://``.

    Raises:
        ValueError: If the command is empty, contains no URL, or ends with an
            option that requires a value.
    """
    command = curl_command.strip()
    if not command:
        raise ValueError("Empty curl command")

    tokens = tokenize_command(command)
    # Drop the program name only when it is a whole token, so that a URL such
    # as "curl.se" is not truncated.
    if tokens and tokens[0] == "curl":
        tokens = tokens[1:]

    url = ""
    method = "GET"
    method_given = False
    headers = {}
    data = None
    auth = None
    cookies = None
    user_agent = None

    i = 0
    while i < len(tokens):
        token = tokens[i]
        # Advance unconditionally so that no branch can stall the loop.
        i += 1

        if not token.startswith("-"):
            # Only the first positional token is used as the URL.
            if not url:
                url = token
            continue

        if token not in _PARSED_VALUE_OPTIONS and token not in _IGNORED_VALUE_OPTIONS:
            # Boolean or unknown flag (-L, -s, -k, ...): nothing to consume.
            continue

        if i >= len(tokens):
            raise ValueError(f"Option {token} requires a value")
        value = tokens[i]
        i += 1

        if token in _METHOD_OPTIONS:
            method = value.upper()
            method_given = True
        elif token in _HEADER_OPTIONS:
            name, sep, header_value = value.partition(":")
            # Values without a colon are not "Name: value" headers; skip them.
            if sep:
                headers[name.strip()] = header_value.strip()
        elif token in _DATA_OPTIONS:
            # curl merges repeated data options with "&".
            data = value if data is None else data + "&" + value
        elif token in _USER_OPTIONS:
            user, _, password = value.partition(":")
            auth = (user, password)
        elif token in _COOKIE_OPTIONS:
            cookies = value
        elif token in _USER_AGENT_OPTIONS:
            user_agent = value
        # Otherwise the option is in _IGNORED_VALUE_OPTIONS: value discarded.

    # Data implies POST, but never overrides a method the user asked for.
    if data is not None and not method_given:
        method = "POST"

    if not url:
        raise ValueError("No URL found in curl command")

    # Prefix a default scheme only when the URL has none at all.
    if not re.match(r"[A-Za-z][A-Za-z0-9+.\-]*://", url):
        url = "https://" + url

    header_auth = _basic_auth_from_headers(headers)
    if header_auth is not None:
        auth = header_auth

    return ParsedCurl(
        url=url,
        method=method,
        headers=headers,
        data=data,
        auth=auth,
        cookies=cookies,
        user_agent=user_agent,
    )


def tokenize_command(cmd: str) -> list:
    """Tokenize a curl command into components, handling quotes and escapes.

    Splitting follows POSIX shell quoting rules:

    * Unquoted spaces, tabs, carriage returns and newlines separate tokens.
    * Inside single quotes every character is literal.
    * Inside double quotes a backslash only escapes ``"``, ``\\``, ``$`` and
      a backquote; before any other character the backslash is kept (so JSON
      escapes such as ``\\n`` survive).
    * Outside quotes a backslash makes the next character literal.
    * A backslash followed by a newline is a line continuation and is
      removed, except inside single quotes.
    * Empty quoted strings (``''`` or ``""``) produce an empty token.

    An unterminated quote and a trailing backslash are tolerated: the text
    collected so far is emitted and a dangling backslash is dropped.

    Args:
        cmd: The command line text to split.

    Returns:
        The list of tokens with quotes and escape characters removed.
    """
    tokens = []
    buf = []          # characters of the token currently being built
    in_token = False  # True once a token has started, even if still empty
    quote = None      # the active quote character, or None when unquoted

    i = 0
    n = len(cmd)
    while i < n:
        char = cmd[i]
        i += 1

        if quote == "'":
            # Single quotes: everything up to the closing quote is literal.
            if char == "'":
                quote = None
            else:
                buf.append(char)
            continue

        if char == "\\":
            if i >= n:
                break  # dangling backslash at end of input: nothing to escape
            escaped = cmd[i]
            i += 1
            # Line continuation (LF or CRLF): the pair vanishes entirely.
            if escaped == "\n":
                continue
            if escaped == "\r" and i < n and cmd[i] == "\n":
                i += 1
                continue
            # In double quotes the backslash is only special before a few
            # characters; otherwise it is part of the text.
            if quote == '"' and escaped not in ('"', "\\", "$", "`"):
                buf.append("\\")
            buf.append(escaped)
            in_token = True
            continue

        if quote == '"':
            if char == '"':
                quote = None
            else:
                buf.append(char)
            continue

        # Unquoted context from here on.
        if char in ("'", '"'):
            quote = char
            in_token = True  # so that '' and "" yield an empty token
            continue

        if char in (" ", "\t", "\r", "\n"):
            if in_token:
                tokens.append("".join(buf))
                buf = []
                in_token = False
            continue

        buf.append(char)
        in_token = True

    if in_token:
        tokens.append("".join(buf))

    return tokens