Re-upload: CI infrastructure issue resolved, all tests verified passing
This commit is contained in:
76
http_log_explorer/parsers/__init__.py
Normal file
76
http_log_explorer/parsers/__init__.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""Parser interface for HTTP log formats."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from http_log_explorer.models import HTTPEntry
|
||||
|
||||
|
||||
class ParserInterface(ABC):
    """Common contract implemented by every HTTP log parser."""

    @abstractmethod
    def parse(self, content: str | bytes, source_file: str | None = None) -> list[HTTPEntry]:
        """Convert raw log content into HTTP entries.

        Args:
            content: Raw log data, either text or bytes.
            source_file: Name of the originating file, if known.

        Returns:
            The parsed HTTPEntry objects.

        Raises:
            ValueError: If the content cannot be parsed.
        """

    @abstractmethod
    def can_parse(self, content: str | bytes) -> bool:
        """Report whether this parser recognizes the given content.

        Args:
            content: Raw log data to sniff.

        Returns:
            True when the content looks parseable by this parser.
        """

    @staticmethod
    def get_parser_name() -> str:
        """Return a short human-readable name for this parser."""
        return "unknown"
||||
def get_parser(content: str | bytes) -> ParserInterface:
    """Select a parser whose ``can_parse`` accepts *content*.

    Args:
        content: The content to parse.

    Returns:
        An appropriate parser instance.

    Raises:
        ValueError: If no suitable parser is found.
    """
    # Imported lazily: the concrete parser modules themselves import
    # ParserInterface from this package, so a top-level import would cycle.
    from http_log_explorer.parsers.curl_parser import CurlParser
    from http_log_explorer.parsers.devtools_parser import DevToolsParser
    from http_log_explorer.parsers.har_parser import HARParser

    candidates: list[ParserInterface] = [HARParser(), CurlParser(), DevToolsParser()]
    matched = next((p for p in candidates if p.can_parse(content)), None)
    if matched is None:
        raise ValueError(
            "Unsupported format. Supported formats are: HAR files, curl -v output, and Chrome DevTools network exports."
        )
    return matched
||||
# Explicit public API of the parsers package.
__all__ = ["ParserInterface", "get_parser"]
|
||||
140
http_log_explorer/parsers/curl_parser.py
Normal file
140
http_log_explorer/parsers/curl_parser.py
Normal file
@@ -0,0 +1,140 @@
|
||||
"""Parser for curl -v output."""
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from http_log_explorer.models import HTTPEntry, Request, Response
|
||||
from http_log_explorer.parsers import ParserInterface
|
||||
|
||||
|
||||
class CurlParser(ParserInterface):
    """Parser for curl -v verbose output."""

    # "> GET /path HTTP/1.1" style request lines.
    REQUEST_LINE_RE = re.compile(r"^> (\w+) (\S+) (HTTP/[\d.]+)$", re.MULTILINE)
    # "< HTTP/1.1 200 OK" style response status lines.
    RESPONSE_LINE_RE = re.compile(r"^< (HTTP/[\d.]+) (\d+) (.+)$", re.MULTILINE)
    # Header lines in either direction: "> Name: value" or "< Name: value".
    # The previous pattern, r"^(> |<) ([^:]+): (.+)$", demanded two spaces
    # after ">" and captured the direction WITH a trailing space, so request
    # headers could never match nor compare equal to ">".
    HEADER_RE = re.compile(r"^([<>]) ([^:]+): (.+)$")
    TIMING_RE = re.compile(r"^\* time_conditional check:.*$")  # NOTE: currently unused

    @staticmethod
    def get_parser_name() -> str:
        """Return the name of this parser."""
        return "curl"

    def can_parse(self, content: str | bytes) -> bool:
        """Check if content appears to be curl -v output.

        Requires at least one "> METHOD URL HTTP/x" request line and one
        "< HTTP/x STATUS TEXT" response line.
        """
        if isinstance(content, bytes):
            content = content.decode("utf-8", errors="ignore")
        has_request = bool(self.REQUEST_LINE_RE.search(content))
        has_response = bool(self.RESPONSE_LINE_RE.search(content))
        return has_request and has_response

    def parse(self, content: str | bytes, source_file: str | None = None) -> list[HTTPEntry]:
        """Parse curl -v output into HTTPEntry objects.

        Args:
            content: Raw curl -v output (string or bytes).
            source_file: Optional source file name for reference.

        Returns:
            One HTTPEntry per complete request/response pair found.
        """
        if isinstance(content, bytes):
            content = content.decode("utf-8", errors="replace")

        entries: list[HTTPEntry] = []
        for idx, block in enumerate(self._split_blocks(content)):
            try:
                entry = self._parse_block(block, idx, source_file)
                if entry:
                    entries.append(entry)
            except Exception:
                # Best-effort: skip a malformed block instead of failing the file.
                continue

        return entries

    def _split_blocks(self, content: str) -> list[dict[str, Any]]:
        """Split curl output into request/response blocks."""
        blocks: list[dict[str, Any]] = []
        current_block: dict[str, Any] = {}

        for line in content.split("\n"):
            request_match = self.REQUEST_LINE_RE.match(line)
            if request_match:
                # A new request line closes the previous block.
                if current_block.get("request"):
                    blocks.append(current_block)
                current_block = {
                    "request": {
                        "method": request_match.group(1),
                        "url": request_match.group(2),
                        "http_version": request_match.group(3),
                    },
                    "headers": [],  # request headers as (name, value) pairs
                    "body": None,   # request body (curl -v does not echo it here)
                    "response": None,
                }
                continue

            response_match = self.RESPONSE_LINE_RE.match(line)
            if response_match:
                if current_block.get("request"):
                    current_block["response"] = {
                        "http_version": response_match.group(1),
                        "status": int(response_match.group(2)),
                        "status_text": response_match.group(3),
                        "headers": [],  # response headers as (name, value) pairs
                    }
                continue

            header_match = self.HEADER_RE.match(line)
            if header_match:
                direction, name, value = header_match.groups()
                if direction == ">" and "headers" in current_block:
                    current_block["headers"].append((name, value))
                elif direction == "<" and current_block.get("response"):
                    # Previously response headers were matched but discarded.
                    current_block["response"]["headers"].append((name, value))
                continue

            # curl informational lines ("* Connected to ...") are not body text.
            if line.startswith("*"):
                continue

            # Any remaining non-blank line after the status line is body content.
            if current_block and current_block.get("response") and line.strip():
                if current_block["response"].get("body") is None:
                    current_block["response"]["body"] = ""
                current_block["response"]["body"] += line + "\n"

        if current_block.get("request"):
            blocks.append(current_block)

        return blocks

    def _parse_block(
        self, block: dict[str, Any], idx: int, source_file: str | None
    ) -> HTTPEntry | None:
        """Build an HTTPEntry from one block, or None if it is incomplete."""
        if not block.get("request") or not block.get("response"):
            return None

        req_data = block["request"]
        resp_data = block["response"]

        request_headers = dict(block.get("headers", []))
        response_headers = dict(resp_data.get("headers", []))

        request = Request(
            method=req_data.get("method", "GET"),
            url=req_data.get("url", "/"),
            http_version=req_data.get("http_version", "HTTP/1.1"),
            headers=request_headers,
            body=block.get("body"),
        )

        response_body = resp_data.get("body", "")
        if response_body:
            response_body = response_body.strip()

        response = Response(
            status=resp_data.get("status", 0),
            status_text=resp_data.get("status_text", ""),
            http_version=resp_data.get("http_version", "HTTP/1.1"),
            headers=response_headers,
            body=response_body if response_body else None,
            # Content type belongs to the response; the old code looked it up
            # in the request headers.
            content_type=response_headers.get("Content-Type") or response_headers.get("content-type"),
        )

        return HTTPEntry(
            id=f"curl-{idx}",
            request=request,
            response=response,
            # curl -v output carries no timestamps, so record the parse time.
            timestamp=datetime.now(),
            source_file=source_file,
        )
|
||||
133
http_log_explorer/parsers/devtools_parser.py
Normal file
133
http_log_explorer/parsers/devtools_parser.py
Normal file
@@ -0,0 +1,133 @@
|
||||
"""Parser for Chrome DevTools network export format."""
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from http_log_explorer.models import HTTPEntry, Request, Response
|
||||
from http_log_explorer.parsers import ParserInterface
|
||||
|
||||
|
||||
class DevToolsParser(ParserInterface):
    """Parser for Chrome DevTools network export JSON."""

    @staticmethod
    def get_parser_name() -> str:
        """Return the name of this parser."""
        return "DevTools"

    def can_parse(self, content: str | bytes) -> bool:
        """Check if content appears to be a DevTools network export.

        Accepts either a JSON list of request/response entry dicts, or a
        HAR-like {"log": {"entries": ...}} document WITHOUT "creator"
        (a real HAR file carries "creator" and is handled by HARParser).
        """
        if isinstance(content, bytes):
            content = content.decode("utf-8", errors="ignore")
        try:
            data = json.loads(content)
        except json.JSONDecodeError:
            return False
        if isinstance(data, list):
            # Require a non-empty list of dicts: the old check silently
            # accepted empty lists and lists of non-dict items because
            # all() over an empty generator is True.
            return bool(data) and all(
                isinstance(item, dict) and "request" in item and "response" in item
                for item in data[:3]
            )
        if isinstance(data, dict):
            log = data.get("log", {})
            return "log" in data and "entries" in log and "creator" not in log
        return False

    def parse(self, content: str | bytes, source_file: str | None = None) -> list[HTTPEntry]:
        """Parse DevTools network export into HTTPEntry objects.

        Args:
            content: The export content (string or bytes).
            source_file: Optional source file name for reference.

        Returns:
            List of HTTPEntry objects.

        Raises:
            ValueError: If the content is not valid JSON or not a
                recognized DevTools structure.
        """
        if isinstance(content, bytes):
            content = content.decode("utf-8", errors="replace")

        try:
            data = json.loads(content)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON format: {e}") from e

        if isinstance(data, dict) and "log" in data:
            entries_data = data.get("log", {}).get("entries", [])
        elif isinstance(data, list):
            entries_data = data
        else:
            raise ValueError("Unrecognized DevTools format")

        entries: list[HTTPEntry] = []
        for idx, entry_data in enumerate(entries_data):
            try:
                entry = self._convert_entry(entry_data, idx, source_file)
                if entry:
                    entries.append(entry)
            except Exception:
                # Skip malformed entries rather than failing the whole export.
                continue

        return entries

    def _convert_entry(
        self, entry_data: dict[str, Any], idx: int, source_file: str | None
    ) -> HTTPEntry | None:
        """Convert one DevTools entry dict to HTTPEntry, or None if incomplete."""
        request_data = entry_data.get("request", {})
        response_data = entry_data.get("response", {})

        if not request_data or not response_data:
            return None

        post_data = request_data.get("postData")
        request = Request(
            method=request_data.get("method", "GET"),
            url=request_data.get("url", ""),
            http_version=request_data.get("httpVersion", "HTTP/1.1"),
            headers=self._parse_headers(request_data.get("headers", {})),
            body=post_data.get("text") if post_data else None,
            query_params=self._parse_query_params(request_data.get("queryString", [])),
        )

        # "content" is usually a dict with "text"/"mimeType"; tolerate other shapes.
        content_obj = response_data.get("content")
        content_dict = content_obj if isinstance(content_obj, dict) else {}
        response = Response(
            status=response_data.get("status", 0),
            status_text=response_data.get("statusText", ""),
            http_version=response_data.get("httpVersion", "HTTP/1.1"),
            headers=self._parse_headers(response_data.get("headers", {})),
            body=content_dict.get("text"),
            content_type=content_dict.get("mimeType"),
            response_time_ms=self._parse_time(entry_data),
        )

        return HTTPEntry(
            id=f"devtools-{idx}",
            request=request,
            response=response,
            timestamp=self._parse_timestamp(entry_data),
            server_ip=entry_data.get("serverIPAddress"),
            connection=entry_data.get("connection"),
            source_file=source_file,
        )

    def _parse_headers(self, headers: dict[str, Any] | list) -> dict[str, str]:
        """Normalize headers (mapping or HAR-style name/value list) to a dict."""
        if isinstance(headers, dict):
            return dict(headers)
        if isinstance(headers, list):
            return {h.get("name", ""): h.get("value", "") for h in headers}
        return {}

    def _parse_query_params(self, query_string: list[dict[str, Any]]) -> dict[str, str]:
        """Convert a HAR-style queryString list to a name->value dict."""
        if isinstance(query_string, list):
            return {p.get("name", ""): p.get("value", "") for p in query_string}
        return {}

    def _parse_time(self, entry_data: dict[str, Any]) -> float | None:
        """Extract the total entry time in milliseconds, if present."""
        if "time" in entry_data:
            return float(entry_data["time"])
        return None

    def _parse_timestamp(self, entry_data: dict[str, Any]) -> datetime | None:
        """Parse the "startedDateTime" ISO-8601 string, tolerating a Z suffix."""
        if "startedDateTime" in entry_data:
            try:
                return datetime.fromisoformat(entry_data["startedDateTime"].replace("Z", "+00:00"))
            except (ValueError, AttributeError):
                pass
        return None
|
||||
47
http_log_explorer/parsers/factory.py
Normal file
47
http_log_explorer/parsers/factory.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""Parser factory for creating appropriate parsers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from http_log_explorer.parsers.curl_parser import CurlParser
|
||||
from http_log_explorer.parsers.devtools_parser import DevToolsParser
|
||||
from http_log_explorer.parsers.har_parser import HARParser
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from http_log_explorer.parsers import ParserInterface
|
||||
|
||||
|
||||
def get_parser(content: str | bytes) -> ParserInterface:
    """Select the parser whose ``can_parse`` accepts *content*.

    Args:
        content: The content to parse.

    Returns:
        An appropriate parser instance.

    Raises:
        ValueError: If no suitable parser is found.
    """
    for candidate in (HARParser(), CurlParser(), DevToolsParser()):
        if candidate.can_parse(content):
            return candidate

    raise ValueError(
        "Unsupported format. Supported formats are: HAR files, curl -v output, and Chrome DevTools network exports."
    )
||||
def get_all_parsers() -> list[ParserInterface]:
    """Return one fresh instance of every available parser."""
    parser_classes = (HARParser, CurlParser, DevToolsParser)
    return [cls() for cls in parser_classes]
|
||||
|
||||
# Explicit public API of the factory module.
__all__ = ["get_parser", "get_all_parsers"]
|
||||
146
http_log_explorer/parsers/har_parser.py
Normal file
146
http_log_explorer/parsers/har_parser.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""HAR file parser using haralyzer."""
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from haralyzer import HarParser
|
||||
|
||||
from http_log_explorer.models import HTTPEntry, Request, Response
|
||||
from http_log_explorer.parsers import ParserInterface
|
||||
|
||||
|
||||
class HARParser(ParserInterface):
    """Parser for HAR (HTTP Archive) files."""

    @staticmethod
    def get_parser_name() -> str:
        """Return the name of this parser."""
        return "HAR"

    def can_parse(self, content: str | bytes) -> bool:
        """Check if content appears to be a HAR file.

        A HAR file is JSON whose "log" object contains both "entries" and
        "creator"; "creator" distinguishes it from bare DevTools network
        exports, which lack it.
        """
        if isinstance(content, bytes):
            content = content.decode("utf-8", errors="ignore")
        try:
            data = json.loads(content)
            log = data.get("log", {})
            return "log" in data and "entries" in log and "creator" in log
        except (json.JSONDecodeError, AttributeError):
            return False

    def parse(self, content: str | bytes, source_file: str | None = None) -> list[HTTPEntry]:
        """Parse HAR content into HTTPEntry objects.

        Args:
            content: The HAR content (string or bytes).
            source_file: Optional source file name for reference.

        Returns:
            List of HTTPEntry objects.

        Raises:
            ValueError: If the content is not valid JSON or haralyzer
                rejects it as a HAR document.
        """
        if isinstance(content, bytes):
            content = content.decode("utf-8", errors="replace")

        try:
            data = json.loads(content)
            har_parser = HarParser(data)
        except Exception as e:
            # Covers both JSON decoding errors and haralyzer validation
            # errors; the original code raised the same message for both.
            raise ValueError(f"Invalid HAR format: {e}") from e

        # NOTE(review): haralyzer appears to expose the HAR "log" object as
        # har_data, so "entries" should live directly under it — fall back to
        # the raw document in case har_data is the full top-level dict.
        har_entries = har_parser.har_data.get("entries") or data.get("log", {}).get("entries", [])

        entries: list[HTTPEntry] = []
        for idx, har_entry in enumerate(har_entries):
            try:
                entry = self._convert_har_entry(har_entry, idx, source_file)
                if entry:
                    entries.append(entry)
            except Exception:
                # Skip malformed entries rather than failing the whole file.
                continue

        return entries

    def _convert_har_entry(
        self, har_entry: Any, idx: int, source_file: str | None
    ) -> HTTPEntry | None:
        """Convert one HAR entry dict to an HTTPEntry, or None if incomplete."""
        request_data = har_entry.get("request")
        response_data = har_entry.get("response")

        if not request_data or not response_data:
            return None

        request = Request(
            method=request_data.get("method", "GET"),
            url=self._build_url(request_data),
            http_version=request_data.get("httpVersion", "HTTP/1.1"),
            headers=self._parse_headers(request_data.get("headers", [])),
            body=self._get_request_body(request_data),
            query_params=self._parse_query_params(request_data.get("queryString", [])),
        )

        response = Response(
            status=response_data.get("status", 0),
            status_text=response_data.get("statusText", ""),
            http_version=response_data.get("httpVersion", "HTTP/1.1"),
            headers=self._parse_headers(response_data.get("headers", [])),
            body=self._get_response_body(response_data),
            content_type=self._get_content_type(response_data.get("content", {})),
            response_time_ms=har_entry.get("time", None),
        )

        return HTTPEntry(
            id=f"har-{idx}",
            request=request,
            response=response,
            timestamp=self._parse_timestamp(har_entry),
            server_ip=har_entry.get("serverIPAddress", None),
            connection=har_entry.get("connection", None),
            source_file=source_file,
        )

    def _build_url(self, request_data: dict[str, Any]) -> str:
        """Return the request URL, synthesizing one from the Host header if absent."""
        url = request_data.get("url", "")
        if not url:
            host = ""
            for header in request_data.get("headers", []):
                if header.get("name", "").lower() == "host":
                    host = header.get("value", "")
                    break
            url = f"http://{host}/"
        return url

    def _parse_headers(self, headers: list[dict[str, Any]]) -> dict[str, str]:
        """Convert a HAR-style name/value header list to a dict."""
        return {h.get("name", ""): h.get("value", "") for h in headers}

    def _parse_query_params(self, query_string: list[dict[str, Any]]) -> dict[str, str]:
        """Convert a HAR-style queryString list to a name->value dict."""
        return {p.get("name", ""): p.get("value", "") for p in query_string}

    def _get_request_body(self, request_data: dict[str, Any]) -> str | None:
        """Extract the request body text from "postData", if present."""
        post_data = request_data.get("postData", {})
        if post_data:
            if isinstance(post_data, dict):
                return post_data.get("text", None)
            return str(post_data)
        return None

    def _get_response_body(self, response_data: dict[str, Any]) -> str | None:
        """Extract the response body text from "content", if present."""
        content = response_data.get("content", {})
        if isinstance(content, dict):
            return content.get("text", None)
        return None

    def _get_content_type(self, content: dict[str, Any]) -> str | None:
        """Extract the MIME type from a HAR "content" object."""
        if isinstance(content, dict):
            return content.get("mimeType", None)
        return None

    def _parse_timestamp(self, har_entry: Any) -> datetime | None:
        """Parse the timestamp of a HAR entry.

        Entries iterated in parse() are plain dicts, so read the raw
        "startedDateTime" ISO-8601 string (the old code read a
        "started_datetime" attribute that dicts never have, yielding None
        for every entry). Keep the attribute lookup as a fallback for
        haralyzer HarEntry-like objects.
        """
        if isinstance(har_entry, dict):
            raw = har_entry.get("startedDateTime")
            if raw:
                try:
                    return datetime.fromisoformat(raw.replace("Z", "+00:00"))
                except (ValueError, AttributeError):
                    return None
            return None
        return getattr(har_entry, "started_datetime", None)
|
||||
Reference in New Issue
Block a user