mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-04-25 14:05:15 +00:00
LibRegex: Make codegen+optimisation for alternatives much faster
Just a little thinking outside the box, and we can now parse and optimise a million copies of "a|" chained together in just a second :^)
This commit is contained in:
parent
4be7239626
commit
97a333608e
Notes:
sideshowbarker
2024-07-17 18:29:29 +09:00
Author: https://github.com/alimpfard
Commit: https://github.com/SerenityOS/serenity/commit/97a333608e
Pull-request: https://github.com/SerenityOS/serenity/pull/12653
Issue: https://github.com/SerenityOS/serenity/issues/12373
Issue: https://github.com/SerenityOS/serenity/issues/12615
Reviewed-by: https://github.com/trflynn89
5 changed files with 150 additions and 65 deletions
|
@ -498,7 +498,7 @@ TEST_CASE(posix_extended_nested_capture_group)
|
||||||
EXPECT_EQ(result.capture_group_matches[0][2].view, "llo"sv);
|
EXPECT_EQ(result.capture_group_matches[0][2].view, "llo"sv);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto parse_test_case_long_disjunction_chain = String::repeated("a|"sv, 10000);
|
auto parse_test_case_long_disjunction_chain = String::repeated("a|"sv, 100000);
|
||||||
|
|
||||||
TEST_CASE(ECMA262_parse)
|
TEST_CASE(ECMA262_parse)
|
||||||
{
|
{
|
||||||
|
|
|
@ -14,6 +14,7 @@ namespace regex {
|
||||||
class Optimizer {
|
class Optimizer {
|
||||||
public:
|
public:
|
||||||
static void append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right);
|
static void append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right);
|
||||||
|
static void append_alternation(ByteCode& target, Span<ByteCode> alternatives);
|
||||||
static void append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs);
|
static void append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -35,9 +35,12 @@ public:
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void print_bytecode(Regex<T> const& regex) const
|
void print_bytecode(Regex<T> const& regex) const
|
||||||
{
|
{
|
||||||
MatchState state;
|
print_bytecode(regex.parser_result.bytecode);
|
||||||
auto& bytecode = regex.parser_result.bytecode;
|
}
|
||||||
|
|
||||||
|
void print_bytecode(ByteCode const& bytecode) const
|
||||||
|
{
|
||||||
|
MatchState state;
|
||||||
for (;;) {
|
for (;;) {
|
||||||
auto& opcode = bytecode.get_opcode(state);
|
auto& opcode = bytecode.get_opcode(state);
|
||||||
print_opcode("PrintBytecode", opcode, state);
|
print_opcode("PrintBytecode", opcode, state);
|
||||||
|
|
|
@ -9,6 +9,10 @@
|
||||||
#include <AK/Stack.h>
|
#include <AK/Stack.h>
|
||||||
#include <LibRegex/Regex.h>
|
#include <LibRegex/Regex.h>
|
||||||
#include <LibRegex/RegexBytecodeStreamOptimizer.h>
|
#include <LibRegex/RegexBytecodeStreamOptimizer.h>
|
||||||
|
#if REGEX_DEBUG
|
||||||
|
# include <AK/ScopeGuard.h>
|
||||||
|
# include <AK/ScopeLogger.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace regex {
|
namespace regex {
|
||||||
|
|
||||||
|
@ -444,78 +448,166 @@ void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&
|
||||||
|
|
||||||
void Optimizer::append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right)
|
void Optimizer::append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right)
|
||||||
{
|
{
|
||||||
auto left_is_empty = left.is_empty();
|
Array<ByteCode, 2> alternatives;
|
||||||
auto right_is_empty = right.is_empty();
|
alternatives[0] = move(left);
|
||||||
if (left_is_empty || right_is_empty) {
|
alternatives[1] = move(right);
|
||||||
if (left_is_empty && right_is_empty)
|
|
||||||
return;
|
|
||||||
|
|
||||||
// ForkJump left (+ 2 + right.size())
|
append_alternation(target, alternatives);
|
||||||
// (right)
|
}
|
||||||
// Jump end (+ left.size())
|
|
||||||
// (left)
|
void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives)
|
||||||
// LABEL end
|
{
|
||||||
target.append(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
|
if (alternatives.size() == 0)
|
||||||
target.append(2 + right.size());
|
|
||||||
target.extend(move(right));
|
|
||||||
target.append(static_cast<ByteCodeValueType>(OpCodeId::Jump));
|
|
||||||
target.append(left.size());
|
|
||||||
target.extend(move(left));
|
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
if (alternatives.size() == 1)
|
||||||
|
return target.extend(move(alternatives[0]));
|
||||||
|
|
||||||
|
if (all_of(alternatives, [](auto& x) { return x.is_empty(); }))
|
||||||
|
return;
|
||||||
|
|
||||||
|
for (auto& entry : alternatives)
|
||||||
|
entry.flatten();
|
||||||
|
|
||||||
|
#if REGEX_DEBUG
|
||||||
|
ScopeLogger<true> log;
|
||||||
|
warnln("Alternations:");
|
||||||
|
RegexDebug dbg;
|
||||||
|
for (auto& entry : alternatives) {
|
||||||
|
warnln("----------");
|
||||||
|
dbg.print_bytecode(entry);
|
||||||
}
|
}
|
||||||
|
ScopeGuard print_at_end {
|
||||||
|
[&] {
|
||||||
|
warnln("======================");
|
||||||
|
RegexDebug dbg;
|
||||||
|
dbg.print_bytecode(target);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
left.flatten();
|
Vector<Vector<Detail::Block>> basic_blocks;
|
||||||
right.flatten();
|
basic_blocks.ensure_capacity(alternatives.size());
|
||||||
|
|
||||||
auto left_blocks = Regex<PosixBasicParser>::split_basic_blocks(left);
|
for (auto& entry : alternatives)
|
||||||
auto right_blocks = Regex<PosixBasicParser>::split_basic_blocks(right);
|
basic_blocks.append(Regex<PosixBasicParser>::split_basic_blocks(entry));
|
||||||
|
|
||||||
size_t left_skip = 0;
|
size_t left_skip = 0;
|
||||||
|
size_t shared_block_count = basic_blocks.first().size();
|
||||||
|
for (auto& entry : basic_blocks)
|
||||||
|
shared_block_count = min(shared_block_count, entry.size());
|
||||||
|
|
||||||
MatchState state;
|
MatchState state;
|
||||||
for (size_t block_index = 0; block_index < left_blocks.size() && block_index < right_blocks.size(); block_index++) {
|
for (size_t block_index = 0; block_index < shared_block_count; block_index++) {
|
||||||
auto& left_block = left_blocks[block_index];
|
auto& left_block = basic_blocks.first()[block_index];
|
||||||
auto& right_block = right_blocks[block_index];
|
auto left_end = block_index + 1 == basic_blocks.first().size() ? left_block.end : basic_blocks.first()[block_index + 1].start;
|
||||||
auto left_end = block_index + 1 == left_blocks.size() ? left_block.end : left_blocks[block_index + 1].start;
|
auto can_continue = true;
|
||||||
auto right_end = block_index + 1 == right_blocks.size() ? right_block.end : right_blocks[block_index + 1].start;
|
for (size_t i = 1; i < alternatives.size(); ++i) {
|
||||||
|
auto& right_blocks = basic_blocks[i];
|
||||||
|
auto& right_block = right_blocks[block_index];
|
||||||
|
auto right_end = block_index + 1 == right_blocks.size() ? right_block.end : right_blocks[block_index + 1].start;
|
||||||
|
|
||||||
if (left_end - left_block.start != right_end - right_block.start)
|
if (left_end - left_block.start != right_end - right_block.start) {
|
||||||
|
can_continue = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (alternatives[0].spans().slice(left_block.start, left_end - left_block.start) != alternatives[i].spans().slice(right_block.start, right_end - right_block.start)) {
|
||||||
|
can_continue = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!can_continue)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
if (left.spans().slice(left_block.start, left_end - left_block.start) != right.spans().slice(right_block.start, right_end - right_block.start))
|
size_t i = 0;
|
||||||
break;
|
for (auto& entry : alternatives) {
|
||||||
|
auto& blocks = basic_blocks[i];
|
||||||
state.instruction_position = 0;
|
auto& block = blocks[block_index];
|
||||||
while (state.instruction_position < left_end) {
|
auto end = block_index + 1 == blocks.size() ? block.end : blocks[block_index + 1].start;
|
||||||
auto& opcode = left.get_opcode(state);
|
state.instruction_position = block.start;
|
||||||
left_skip = state.instruction_position;
|
size_t skip = 0;
|
||||||
state.instruction_position += opcode.size();
|
while (state.instruction_position < end) {
|
||||||
|
auto& opcode = entry.get_opcode(state);
|
||||||
|
state.instruction_position += opcode.size();
|
||||||
|
skip = state.instruction_position;
|
||||||
|
}
|
||||||
|
left_skip = min(skip, left_skip);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
dbgln_if(REGEX_DEBUG, "Skipping {}/{} bytecode entries from {}/{}", left_skip, 0, left.size(), right.size());
|
dbgln_if(REGEX_DEBUG, "Skipping {}/{} bytecode entries from {}", left_skip, 0, alternatives[0].size());
|
||||||
|
|
||||||
if (left_skip > 0) {
|
if (left_skip > 0) {
|
||||||
target.extend(left.release_slice(left_blocks.first().start, left_skip));
|
target.extend(alternatives[0].release_slice(basic_blocks.first().first().start, left_skip));
|
||||||
right = right.release_slice(left_skip);
|
auto first = true;
|
||||||
|
for (auto& entry : alternatives) {
|
||||||
|
if (first) {
|
||||||
|
first = false;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
entry = entry.release_slice(left_skip);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto left_size = left.size();
|
if (all_of(alternatives, [](auto& entry) { return entry.is_empty(); }))
|
||||||
|
return;
|
||||||
|
|
||||||
target.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
|
size_t patch_start = target.size();
|
||||||
target.empend(right.size() + (left_size > 0 ? 2 : 0)); // Jump to the _ALT label
|
for (size_t i = 1; i < alternatives.size(); ++i) {
|
||||||
|
target.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
|
||||||
|
target.empend(0u); // To be filled later.
|
||||||
|
}
|
||||||
|
|
||||||
target.extend(move(right));
|
size_t size_to_jump = 0;
|
||||||
|
bool seen_one_empty = false;
|
||||||
|
for (size_t i = alternatives.size(); i > 0; --i) {
|
||||||
|
auto& entry = alternatives[i - 1];
|
||||||
|
if (entry.is_empty()) {
|
||||||
|
if (seen_one_empty)
|
||||||
|
continue;
|
||||||
|
seen_one_empty = true;
|
||||||
|
}
|
||||||
|
|
||||||
if (left_size != 0) {
|
auto is_first = i == 1;
|
||||||
|
auto instruction_size = entry.size() + (is_first ? 0 : 2); // Jump; -> +2
|
||||||
|
size_to_jump += instruction_size;
|
||||||
|
|
||||||
|
if (!is_first)
|
||||||
|
target[patch_start + (i - 2) * 2 + 1] = size_to_jump + (alternatives.size() - i) * 2;
|
||||||
|
|
||||||
|
dbgln_if(REGEX_DEBUG, "{} size = {}, cum={}", i - 1, instruction_size, size_to_jump);
|
||||||
|
}
|
||||||
|
|
||||||
|
seen_one_empty = false;
|
||||||
|
for (size_t i = alternatives.size(); i > 0; --i) {
|
||||||
|
auto& chunk = alternatives[i - 1];
|
||||||
|
if (chunk.is_empty()) {
|
||||||
|
if (seen_one_empty)
|
||||||
|
continue;
|
||||||
|
seen_one_empty = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
ByteCode* previous_chunk = nullptr;
|
||||||
|
size_t j = i - 1;
|
||||||
|
auto seen_one_empty_before = chunk.is_empty();
|
||||||
|
while (j >= 1) {
|
||||||
|
--j;
|
||||||
|
auto& candidate_chunk = alternatives[j];
|
||||||
|
if (candidate_chunk.is_empty()) {
|
||||||
|
if (seen_one_empty_before)
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
previous_chunk = &candidate_chunk;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_to_jump -= chunk.size() + (previous_chunk ? 2 : 0);
|
||||||
|
|
||||||
|
target.extend(move(chunk));
|
||||||
target.empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
|
target.empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
|
||||||
target.empend(left.size()); // Jump to the _END label
|
target.empend(size_to_jump); // Jump to the _END label
|
||||||
}
|
}
|
||||||
|
|
||||||
// LABEL _ALT = bytecode.size() + 2
|
|
||||||
|
|
||||||
target.extend(move(left));
|
|
||||||
|
|
||||||
// LABEL _END = alterantive_bytecode.size
|
|
||||||
}
|
}
|
||||||
|
|
||||||
enum class LookupTableInsertionOutcome {
|
enum class LookupTableInsertionOutcome {
|
||||||
|
|
|
@ -958,7 +958,7 @@ bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_mini
|
||||||
{
|
{
|
||||||
size_t total_match_length_minimum = NumericLimits<size_t>::max();
|
size_t total_match_length_minimum = NumericLimits<size_t>::max();
|
||||||
Vector<ByteCode> alternatives;
|
Vector<ByteCode> alternatives;
|
||||||
do {
|
while (true) {
|
||||||
ByteCode alternative_stack;
|
ByteCode alternative_stack;
|
||||||
size_t alternative_minimum_length = 0;
|
size_t alternative_minimum_length = 0;
|
||||||
auto alt_ok = parse_alternative(alternative_stack, alternative_minimum_length, unicode, named);
|
auto alt_ok = parse_alternative(alternative_stack, alternative_minimum_length, unicode, named);
|
||||||
|
@ -971,20 +971,9 @@ bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_mini
|
||||||
if (!match(TokenType::Pipe))
|
if (!match(TokenType::Pipe))
|
||||||
break;
|
break;
|
||||||
consume();
|
consume();
|
||||||
} while (true);
|
|
||||||
|
|
||||||
Optional<ByteCode> alternative_stack {};
|
|
||||||
for (auto& alternative : alternatives) {
|
|
||||||
if (alternative_stack.has_value()) {
|
|
||||||
ByteCode target_stack;
|
|
||||||
target_stack.insert_bytecode_alternation(alternative_stack.release_value(), move(alternative));
|
|
||||||
alternative_stack = move(target_stack);
|
|
||||||
} else {
|
|
||||||
alternative_stack = move(alternative);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
stack.extend(alternative_stack.release_value());
|
Optimizer::append_alternation(stack, alternatives.span());
|
||||||
match_length_minimum = total_match_length_minimum;
|
match_length_minimum = total_match_length_minimum;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue