Add source files: models and parser

This commit is contained in:
2026-02-02 13:56:57 +00:00
parent 9e7fd72429
commit 283e4d0c08

203
src/gdiffer/parser.py Normal file
View File

@@ -0,0 +1,203 @@
"""Diff parser for unified git diff format."""
import re
from typing import Optional
from gdiffer.models import DiffFile, DiffHunk
class DiffParser:
    """Parser for unified diff format (as produced by ``git diff``).

    The parser scans for ``diff --git`` header lines, creates one
    ``DiffFile`` per header and attaches a ``DiffHunk`` for every
    ``@@ -a,b +c,d @@`` section found beneath it.

    Attributes:
        files: ``DiffFile`` objects produced by the most recent ``parse`` call.
        errors: Human-readable messages describing malformed input that was
            skipped during the most recent ``parse`` call.

    Known limitation: the two paths on the ``diff --git`` line are separated
    by splitting on spaces, so paths that themselves contain spaces are not
    recovered correctly.
    """

    # Groups: old start, old count (may be empty), new start, new count
    # (may be empty). An omitted count means 1 in the unified format.
    HUNK_PATTERN = re.compile(r'^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@')

    def __init__(self):
        """Create a parser with empty result and error lists."""
        self.files: list[DiffFile] = []
        self.errors: list[str] = []

    def parse(self, diff_content: str) -> list[DiffFile]:
        """Parse *diff_content* and return the files it describes.

        Results and errors from any previous call are discarded. Empty or
        whitespace-only input yields an empty list.
        """
        self.files = []
        self.errors = []
        if not diff_content.strip():
            return self.files
        self._parse_lines(diff_content.splitlines())
        return self.files

    def _parse_lines(self, lines: list[str]) -> None:
        """Find every ``diff --git`` header in *lines* and parse its section."""
        for index, raw in enumerate(lines):
            if not raw.startswith('diff --git'):
                continue
            file_obj = self._parse_file(lines, index)
            if file_obj is not None:
                self.files.append(file_obj)

    @staticmethod
    def _strip_path_prefix(path: str) -> str:
        """Remove one leading single-character prefix such as ``a/`` or ``b/``.

        The prefix is stripped exactly once so that a real top-level
        directory named ``a`` or ``b`` is preserved, and a path without such
        a prefix is returned unchanged.
        """
        if path[1:2] == '/':
            return path[2:]
        return path

    def _parse_file(self, lines: list[str], start: int) -> Optional[DiffFile]:
        """Parse the file section whose ``diff --git`` header is ``lines[start]``.

        Reads metadata lines and hunks until the next ``diff --git`` header or
        the end of *lines*.

        Returns:
            The populated ``DiffFile``, or ``None`` when ``start`` is out of
            range or the header line is not a usable ``diff --git`` line.
        """
        if start >= len(lines):
            return None
        header = lines[start].rstrip('\n')
        if not header.startswith('diff --git'):
            return None
        # Expected shape: ['diff', '--git', '<old>', '<new>'].
        parts = header.split(' ', 3)
        if len(parts) < 4:
            return None

        file_obj = DiffFile(
            old_path=self._strip_path_prefix(parts[2]),
            new_path=self._strip_path_prefix(parts[3]),
        )

        i = start + 1
        n = len(lines)
        while i < n:
            line = lines[i].rstrip('\n')
            if line.startswith('diff --git'):
                # Start of the next file section.
                break
            if line.startswith('@@'):
                hunk, consumed = self._parse_hunk(lines, i)
                if hunk is not None:
                    file_obj.hunks.append(hunk)
                else:
                    self.errors.append(
                        f"line {i + 1}: malformed hunk header: {line!r}"
                    )
                # Always advance by at least one line; a malformed header
                # reports zero consumed lines and would otherwise spin forever.
                i += max(consumed, 1)
                continue

            if line.startswith('new file mode '):
                file_obj.new_file_mode = line.split()[-1]
                file_obj.change_type = "add"
            elif line.startswith('deleted file mode '):
                file_obj.deleted_file_mode = line.split()[-1]
                file_obj.change_type = "delete"
            elif line.startswith('similarity index '):
                file_obj.similarity_index = line.split()[-1].rstrip('%')
            elif line.startswith('rename from '):
                file_obj.rename_from = line.removeprefix('rename from ')
            elif line.startswith('rename to '):
                file_obj.rename_to = line.removeprefix('rename to ')
                file_obj.change_type = "rename"
            # Anything else ('---' / '+++' path lines, 'index ...', unknown
            # metadata) carries nothing this parser records and is skipped.
            i += 1
        return file_obj

    def _parse_hunk(self, lines: list[str], start: int) -> tuple[Optional[DiffHunk], int]:
        """Parse the hunk whose ``@@`` header is ``lines[start]``.

        Body lines are consumed until the line counts announced in the header
        are satisfied, or until a new hunk header, a new ``diff --git`` header
        or an unrecognised line is reached. Context lines are stored in both
        the old and the new content lists; every stored line keeps its
        leading ``+``, ``-`` or space marker.

        Returns:
            ``(hunk, consumed)`` where ``consumed`` is the number of lines
            used including the header, or ``(None, 0)`` when ``start`` is out
            of range or the header does not match ``HUNK_PATTERN``.
        """
        if start >= len(lines):
            return None, 0
        match = self.HUNK_PATTERN.match(lines[start].rstrip('\n'))
        if not match:
            return None, 0

        old_start, old_count, new_start, new_count = match.groups()
        old_lines = int(old_count) if old_count else 1
        new_lines = int(new_count) if new_count else 1
        hunk = DiffHunk(
            old_start=int(old_start),
            old_lines=old_lines,
            new_start=int(new_start),
            new_lines=new_lines,
        )

        old_content: list[str] = []
        new_content: list[str] = []
        i = start + 1
        n = len(lines)
        while i < n:
            if len(old_content) >= old_lines and len(new_content) >= new_lines:
                break
            line = lines[i].rstrip('\n')
            if line.startswith(('@@', 'diff --git')):
                break
            if line.startswith('\\'):
                # "\ No newline at end of file" marker: belongs to the hunk
                # but is not a content line, so skip it without counting.
                i += 1
                continue
            # While the header counts are unsatisfied, the first character
            # alone decides the line kind. A removed line whose text begins
            # with "--" (seen as "---...") or an added line beginning with
            # "++" is ordinary content, not a file header.
            if line.startswith('+'):
                new_content.append(line)
            elif line.startswith('-'):
                old_content.append(line)
            elif line.startswith(' ') or line == '':
                # Context line (an empty line is a context line whose single
                # leading space was stripped).
                old_content.append(line)
                new_content.append(line)
            else:
                break
            i += 1

        hunk.old_lines_content = old_content
        hunk.new_lines_content = new_content
        return hunk, i - start
def parse_diff(diff_content: str) -> list[DiffFile]:
    """Parse unified diff text with a fresh ``DiffParser``.

    Args:
        diff_content: The diff text to parse.

    Returns:
        The list returned by ``DiffParser.parse`` for *diff_content*.
    """
    return DiffParser().parse(diff_content)
def parse_diff_from_file(filepath: str, encoding: str = 'utf-8') -> list[DiffFile]:
    """Read a diff file and parse its contents.

    Args:
        filepath: Path of the file containing the diff text.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's locale default.

    Returns:
        The list produced by ``parse_diff`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Diffs can carry bytes from files in arbitrary encodings; substitute
    # undecodable bytes rather than aborting the whole parse.
    with open(filepath, 'r', encoding=encoding, errors='replace') as f:
        content = f.read()
    return parse_diff(content)