ladybird/Userland/Libraries/LibDiff/Generator.cpp
Shannon Booth f690807c5a LibDiff: Change underlying representation of Hunk to allow context
The existing hunk data structure does not contain any way to easily
store information about context surrounding the additions and removals
in a hunk. While this does work fine for normal diffs (where there is
never any surrounding context) this data structure is quite limiting for
other use cases.

Without support for surrounding context it is not possible to:
 * Add support for unified or context format to the diff utility to
   output surrounding context.
 * Be able to implement a patch utility that uses the surrounding
   context to reliably locate where to apply a patch when a hunk range
   does not apply perfectly.

This patch changes Diff::Hunk such that its data structure more closely
resembles a unified diff. Each line in a hunk is now either a change,
removal, addition or context.

Allowing hunks to have context inside of them exposes that HackStudio
heavily relies on there being no context in the hunks that it uses for
its git gutter implementation. The fix here is simple - ask git to
produce us a diff that has no context in it!
2023-07-02 11:18:11 -06:00

137 lines
4.3 KiB
C++

/*
* Copyright (c) 2021, Mustafa Quraish <mustafa@serenityos.org>
* Copyright (c) 2023, Shannon Booth <shannon.ml.booth@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "Generator.h"
namespace Diff {

// Computes a line-based diff between `old_text` and `new_text`.
//
// Both inputs are split into lines and a longest-common-subsequence table over those
// lines is built. Walking that table yields a sequence of additions and removals, and
// every maximal run of consecutive additions/removals becomes one Hunk. The hunks
// produced here contain only Addition and Removal lines.
//
// The table has (old line count + 1) * (new line count + 1) cells, so both time and
// memory grow with the product of the two line counts.
//
// Returns the hunks in the order they occur in the texts. Any failure reported by the
// table allocation, by appending to a vector, or by String::from_utf8() is propagated
// to the caller.
ErrorOr<Vector<Hunk>> from_text(StringView old_text, StringView new_text)
{
    auto old_lines = old_text.lines();
    auto new_lines = new_text.lines();

    /**
     * This is a simple implementation of the Longest Common Subsequence algorithm (over
     * the lines of the text as opposed to the characters). A Dynamic programming approach
     * is used here.
     */

    enum class Direction {
        Down,     // Added a new line
        Right,    // Removed a line
        Diagonal, // Line remained the same
    };

    // A single cell in the DP-matrix. The table is filled from the end towards the start,
    // so cell (i, j) describes the longest common sub-sequence of lines between the
    // suffixes old_lines[i : ] and new_lines[j : ], together with the step to take from
    // (i, j) in order to follow that sub-sequence.
    struct Cell {
        size_t length;
        Direction direction;
    };

    auto dp_matrix = Vector<Cell>();
    TRY(dp_matrix.try_resize((old_lines.size() + 1) * (new_lines.size() + 1)));

    // The matrix is stored flat; `i` indexes old lines and `j` indexes new lines.
    auto dp = [&dp_matrix, width = old_lines.size() + 1](size_t i, size_t j) -> Cell& {
        return dp_matrix[i + width * j];
    };

    // Initialize the border cells, where one of the two suffixes is empty and the common
    // sub-sequence therefore has length zero.
    for (size_t i = 0; i <= old_lines.size(); ++i)
        dp(i, new_lines.size()) = { 0, Direction::Right };
    for (size_t j = 0; j <= new_lines.size(); ++j)
        dp(old_lines.size(), j) = { 0, Direction::Down };

    // Fill in the rest of the DP table, from the last line of each text back to the first.
    // `index-- > 0` counts down to zero without relying on a signed loop variable.
    for (size_t i = old_lines.size(); i-- > 0;) {
        for (size_t j = new_lines.size(); j-- > 0;) {
            if (old_lines[i] == new_lines[j]) {
                dp(i, j) = { dp(i + 1, j + 1).length + 1, Direction::Diagonal };
                continue;
            }

            auto down = dp(i, j + 1).length;
            auto right = dp(i + 1, j).length;
            // Ties are resolved in favour of Right, i.e. removals are emitted before additions.
            if (down > right)
                dp(i, j) = { down, Direction::Down };
            else
                dp(i, j) = { right, Direction::Right };
        }
    }

    Vector<Hunk> hunks;
    Hunk cur_hunk;
    bool in_hunk = false;

    // Records one added (Down) or removed (Right) line in the current hunk, opening a new
    // hunk at zero-based position (i, j) if none is open. Other directions add nothing.
    auto update_hunk = [&](size_t i, size_t j, Direction direction) -> ErrorOr<void> {
        if (!in_hunk) {
            HunkLocation location;
            location.old_range.start_line = i;
            location.new_range.start_line = j;
            in_hunk = true;
            cur_hunk = { location, {} };
        }

        if (direction == Direction::Down) {
            TRY(cur_hunk.lines.try_append(Line { Line::Operation::Addition, TRY(String::from_utf8(new_lines[j])) }));
            cur_hunk.location.new_range.number_of_lines++;
        } else if (direction == Direction::Right) {
            TRY(cur_hunk.lines.try_append(Line { Line::Operation::Removal, TRY(String::from_utf8(old_lines[i])) }));
            cur_hunk.location.old_range.number_of_lines++;
        }

        return {};
    };

    // Closes the hunk that is currently open (if any) and appends it to `hunks`.
    auto flush_hunk = [&]() -> ErrorOr<void> {
        if (!in_hunk)
            return {};

        // Start lines were recorded zero-based; convert them to one-based here.
        // A file with no content has a zero indexed start line, so a range that starts at
        // zero and covers no lines is left untouched.
        if (cur_hunk.location.new_range.start_line != 0 || cur_hunk.location.new_range.number_of_lines != 0)
            cur_hunk.location.new_range.start_line++;
        if (cur_hunk.location.old_range.start_line != 0 || cur_hunk.location.old_range.number_of_lines != 0)
            cur_hunk.location.old_range.start_line++;

        TRY(hunks.try_append(cur_hunk));
        in_hunk = false;
        return {};
    };

    // Walk the table from the top-left corner, following the recorded directions.
    size_t i = 0;
    size_t j = 0;
    while (i < old_lines.size() && j < new_lines.size()) {
        auto const direction = dp(i, j).direction;
        if (direction == Direction::Down) {
            TRY(update_hunk(i, j, direction));
            ++j;
        } else if (direction == Direction::Right) {
            TRY(update_hunk(i, j, direction));
            ++i;
        } else {
            // An unchanged line terminates whatever hunk was being collected.
            ++i;
            ++j;
            TRY(flush_hunk());
        }
    }

    // At most one of the texts still has lines left. If a hunk has to be opened here, the
    // position on the exhausted side is pinned to its last line index (or 0 if it is empty).
    while (i < old_lines.size()) {
        TRY(update_hunk(i, new_lines.is_empty() ? 0 : new_lines.size() - 1, Direction::Right)); // Remove a line
        ++i;
    }
    while (j < new_lines.size()) {
        TRY(update_hunk(old_lines.is_empty() ? 0 : old_lines.size() - 1, j, Direction::Down)); // Add a line
        ++j;
    }

    TRY(flush_hunk());

    return hunks;
}

}