# Provenance: src/gdiffer/parser.py as added by commit
# 283e4d0c0876cdf5e73174903eeacecea95eb930 ("Add source files: models and
# parser", 7000pctAUTO, Mon, 2 Feb 2026 13:56:57 +0000).
"""Diff parser for unified git diff format."""

import re
from typing import Optional

from gdiffer.models import DiffFile, DiffHunk


class DiffParser:
    """Parser for unified diff format (as produced by ``git diff``).

    Each ``diff --git`` section becomes one ``DiffFile``; each ``@@`` header
    inside it becomes one ``DiffHunk``. Problems that do not stop parsing
    (currently: malformed hunk headers) are recorded in ``self.errors``.
    """

    HUNK_PATTERN = re.compile(r'^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@')

    # Marker that opens every per-file section of a git diff.
    _GIT_HEADER = 'diff --git'

    # Path prefixes git puts in front of file names: the default a/ and b/,
    # plus the mnemonic ones (commit, index, object, work tree).
    _PATH_PREFIXES = ('a/', 'b/', 'c/', 'i/', 'o/', 'w/')

    def __init__(self):
        self.files: list[DiffFile] = []
        self.errors: list[str] = []

    def parse(self, diff_content: str) -> list[DiffFile]:
        """Parse *diff_content* and return the files it describes.

        Resets ``self.files`` and ``self.errors`` before parsing, so one
        parser instance can be reused.

        Args:
            diff_content: Full text of a git diff.

        Returns:
            The list stored in ``self.files`` (empty for blank input).
        """
        self.files = []
        self.errors = []

        if not diff_content.strip():
            return self.files

        self._parse_lines(self._split_lines(diff_content))
        return self.files

    @staticmethod
    def _split_lines(text: str) -> list[str]:
        """Split *text* on LF / CRLF only.

        ``str.splitlines`` is deliberately avoided: it also breaks on form
        feed, vertical tab, U+2028 and friends, which would cut a single
        diff content line in two and throw off the hunk line counts.
        """
        lines = text.split('\n')
        # A trailing newline yields one empty tail element; it is not a line.
        if lines[-1] == '':
            lines.pop()
        return [ln[:-1] if ln.endswith('\r') else ln for ln in lines]

    def _parse_lines(self, lines: list[str]) -> None:
        """Find every ``diff --git`` header in *lines* and parse its section."""
        for index, line in enumerate(lines):
            if not line.startswith(self._GIT_HEADER):
                continue
            file_obj = self._parse_file(lines, index)
            if file_obj is not None:
                self.files.append(file_obj)

    @classmethod
    def _strip_prefix(cls, path: str) -> str:
        """Remove one leading git path prefix (``a/``, ``b/``, ...) if present."""
        return path[2:] if path.startswith(cls._PATH_PREFIXES) else path

    def _split_header_paths(self, header: str) -> Optional[tuple[str, str]]:
        """Extract the raw (old, new) paths from a ``diff --git`` header.

        Returns ``None`` when the header carries fewer than two paths.
        """
        parts = header.split(' ', 3)
        if len(parts) < 4:
            return None

        # When the file was not renamed both halves are the same path, so the
        # separator is the exact middle character. Splitting there keeps paths
        # that contain spaces intact.
        rest = f"{parts[2]} {parts[3]}"
        mid, odd = divmod(len(rest), 2)
        if odd and rest[mid] == ' ':
            left, right = rest[:mid], rest[mid + 1:]
            if self._strip_prefix(left) == self._strip_prefix(right):
                return left, right

        # Renames (or anything else): fall back to splitting at the first space.
        return parts[2], parts[3]

    @staticmethod
    def _apply_metadata(file_obj: DiffFile, line: str) -> None:
        """Record one extended-header line on *file_obj*.

        Lines that are not recognised (including ``---`` / ``+++`` and
        ``index``) are ignored.
        """
        if line.startswith('new file mode '):
            file_obj.new_file_mode = line.split()[-1]
            file_obj.change_type = "add"
        elif line.startswith('deleted file mode '):
            file_obj.deleted_file_mode = line.split()[-1]
            file_obj.change_type = "delete"
        elif line.startswith('similarity index '):
            file_obj.similarity_index = line.split()[-1].rstrip('%')
        elif line.startswith('rename from '):
            file_obj.rename_from = line.removeprefix('rename from ')
        elif line.startswith('rename to '):
            file_obj.rename_to = line.removeprefix('rename to ')
            file_obj.change_type = "rename"

    def _parse_file(self, lines: list[str], start: int) -> Optional[DiffFile]:
        """Parse the file section whose ``diff --git`` header is at *start*.

        Args:
            lines: All lines of the diff.
            start: Index of the ``diff --git`` header line.

        Returns:
            The populated ``DiffFile``, or ``None`` if *start* is out of
            range, is not a ``diff --git`` line, or lacks two paths.
        """
        if start >= len(lines):
            return None

        # rstrip tolerates callers that pass newline-terminated lines.
        header = lines[start].rstrip('\n')
        if not header.startswith(self._GIT_HEADER):
            return None

        raw_paths = self._split_header_paths(header)
        if raw_paths is None:
            return None

        # Strip the prefix exactly once so a real top-level directory named
        # "a" or "b" survives (header "a/a/x.py" -> path "a/x.py").
        old_path = self._strip_prefix(raw_paths[0])
        new_path = self._strip_prefix(raw_paths[1])

        file_obj = DiffFile(old_path=old_path, new_path=new_path)

        i = start + 1
        n = len(lines)
        while i < n:
            line = lines[i].rstrip('\n')

            if line.startswith(self._GIT_HEADER):
                break  # next file section begins

            if line.startswith('@@'):
                hunk, consumed = self._parse_hunk(lines, i)
                if hunk is not None:
                    file_obj.hunks.append(hunk)
                else:
                    self.errors.append(
                        f"line {i + 1}: malformed hunk header {line!r}"
                    )
                # Always advance: a rejected header reports 0 lines consumed
                # and would otherwise be re-read forever.
                i += max(consumed, 1)
                continue

            self._apply_metadata(file_obj, line)
            i += 1

        return file_obj

    def _parse_hunk(self, lines: list[str], start: int) -> tuple[Optional[DiffHunk], int]:
        """Parse the hunk whose ``@@`` header is at *start*.

        Content lines are stored with their leading marker character:
        removed and context lines go to ``old_lines_content``, added and
        context lines go to ``new_lines_content``.

        Args:
            lines: All lines of the diff.
            start: Index of the ``@@ -a,b +c,d @@`` header line.

        Returns:
            ``(hunk, consumed)`` where *consumed* counts the header plus the
            content lines read, or ``(None, 0)`` if *start* is out of range
            or the header does not match ``HUNK_PATTERN``.
        """
        if start >= len(lines):
            return None, 0

        match = self.HUNK_PATTERN.match(lines[start].rstrip('\n'))
        if not match:
            return None, 0

        # An omitted count in the header means exactly one line.
        old_start = int(match.group(1))
        old_lines = int(match.group(2)) if match.group(2) else 1
        new_start = int(match.group(3))
        new_lines = int(match.group(4)) if match.group(4) else 1

        hunk = DiffHunk(
            old_start=old_start,
            old_lines=old_lines,
            new_start=new_start,
            new_lines=new_lines,
        )

        old_content: list[str] = []
        new_content: list[str] = []
        old_seen = 0
        new_seen = 0

        i = start + 1
        n = len(lines)
        while i < n:
            line = lines[i].rstrip('\n')

            if line.startswith('\\'):
                # "\ No newline at end of file": belongs to the hunk but is
                # not a content line, so it is consumed without being counted.
                i += 1
                continue

            if old_seen >= old_lines and new_seen >= new_lines:
                break  # header counts satisfied: hunk is complete

            if line.startswith(('@@', self._GIT_HEADER)):
                break  # truncated hunk: next hunk or file starts early

            # Classify by the first character only. While the counts are
            # unmet, "--- x" is a removed line "-- x" and "+++x" is an added
            # line "++x", not a file header.
            if line.startswith('+'):
                new_content.append(line)
                new_seen += 1
            elif line.startswith('-'):
                old_content.append(line)
                old_seen += 1
            elif line.startswith(' ') or line == '':
                # Context line (an empty line is a context line whose single
                # leading space was stripped in transit).
                old_content.append(line)
                new_content.append(line)
                old_seen += 1
                new_seen += 1
            else:
                break

            i += 1

        hunk.old_lines_content = old_content
        hunk.new_lines_content = new_content

        return hunk, i - start


def parse_diff(diff_content: str) -> list[DiffFile]:
    """Parse diff content and return list of DiffFile objects."""
    parser = DiffParser()
    return parser.parse(diff_content)


def parse_diff_from_file(filepath: str) -> list[DiffFile]:
    """Read a diff file and parse its contents.

    The file is decoded as UTF-8; undecodable bytes are replaced rather than
    raising, since diffs may embed content in other encodings.
    """
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()
    return parse_diff(content)