diff --git a/Libraries/LibWeb/HTML/Parser/Entities.cpp b/Libraries/LibWeb/HTML/Parser/Entities.cpp
index f41536bfce7..f80c4cc260b 100644
--- a/Libraries/LibWeb/HTML/Parser/Entities.cpp
+++ b/Libraries/LibWeb/HTML/Parser/Entities.cpp
@@ -4,26 +4,96 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
-#include
+#include
+#include
+#include
#include
#include
namespace Web::HTML {
+static u8 ascii_alphabetic_to_index(u8 c)
+{
+ ASSERT(AK::is_ascii_alpha(c));
+ return c <= 'Z' ? (c - 'A') : (c - 'a' + 26);
+}
+
bool NamedCharacterReferenceMatcher::try_consume_ascii_char(u8 c)
{
- auto child_index = named_character_reference_child_index(m_node_index);
- auto maybe_updated_index = named_character_reference_find_sibling_and_update_unique_index(child_index, c, m_pending_unique_index);
- if (!maybe_updated_index.has_value())
- return false;
- m_overconsumed_code_points++;
- m_node_index = maybe_updated_index.value();
- if (currently_matches()) {
- m_last_matched_unique_index = m_pending_unique_index;
- m_ends_with_semicolon = c == ';';
- m_overconsumed_code_points = 0;
+ switch (m_search_state_tag) {
+ case NamedCharacterReferenceMatcher::SearchStateTag::Init: {
+ if (!AK::is_ascii_alpha(c))
+ return false;
+ auto index = ascii_alphabetic_to_index(c);
+ m_search_state_tag = NamedCharacterReferenceMatcher::SearchStateTag::FirstToSecondLayer;
+ m_search_state = { .first_to_second_layer = g_named_character_reference_first_to_second_layer[index] };
+ m_pending_unique_index = g_named_character_reference_first_layer[index].number;
+ m_overconsumed_code_points++;
+ return true;
+ }
+ case NamedCharacterReferenceMatcher::SearchStateTag::FirstToSecondLayer: {
+ if (!AK::is_ascii_alpha(c))
+ return false;
+ auto bit_index = ascii_alphabetic_to_index(c);
+ if (((1ull << bit_index) & m_search_state.first_to_second_layer.mask) == 0)
+ return false;
+
+ // Get the second layer node by re-using the first_to_second_layer.mask.
+ // For example, if the first character is 'n' and the second character is 'o':
+ //
+ // This is the first_to_second_layer.mask when the first character is 'n':
+ // 0001111110110110111111111100001000100000100001000000
+ // └ bit_index of 'o'
+ //
+ // Create a mask where all of the less significant bits than the
+ // bit index of the current character ('o') are set:
+ // 0000000000001111111111111111111111111111111111111111
+ // └ bit_index of 'o'
+ //
+ // Bitwise AND this new mask with the first_to_second_layer.mask
+ // to get only the set bits less significant than the bit index of the
+ // current character:
+ // 0000000000000110111111111100001000100000100001000000
+ //
+ // Take the popcount of this to get the index of the node within the
+ // second layer. In this case, there are 16 bits set, so the index
+ // of 'o' in the second layer is first_to_second_layer.second_layer_offset + 16.
+ u64 mask = (1ull << bit_index) - 1;
+ u8 char_index = AK::popcount(m_search_state.first_to_second_layer.mask & mask);
+ auto const& node = g_named_character_reference_second_layer[m_search_state.first_to_second_layer.second_layer_offset + char_index];
+
+ m_pending_unique_index += node.number;
+ m_overconsumed_code_points++;
+ if (node.end_of_word) {
+ m_pending_unique_index++;
+ m_last_matched_unique_index = m_pending_unique_index;
+ m_ends_with_semicolon = c == ';';
+ m_overconsumed_code_points = 0;
+ }
+ m_search_state_tag = NamedCharacterReferenceMatcher::SearchStateTag::DafsaChildren;
+ m_search_state = { .dafsa_children = { &g_named_character_reference_nodes[node.child_index], node.children_len } };
+ return true;
+ }
+ case NamedCharacterReferenceMatcher::SearchStateTag::DafsaChildren: {
+ for (auto const& node : m_search_state.dafsa_children) {
+ if (node.character == c) {
+ m_pending_unique_index += node.number;
+ m_overconsumed_code_points++;
+ if (node.end_of_word) {
+ m_pending_unique_index++;
+ m_last_matched_unique_index = m_pending_unique_index;
+ m_ends_with_semicolon = c == ';';
+ m_overconsumed_code_points = 0;
+ }
+ m_search_state = { .dafsa_children = { &g_named_character_reference_nodes[node.child_index], node.children_len } };
+ return true;
+ }
+ }
+ return false;
+ }
+ default:
+ VERIFY_NOT_REACHED();
}
- return true;
}
}
diff --git a/Libraries/LibWeb/HTML/Parser/Entities.h b/Libraries/LibWeb/HTML/Parser/Entities.h
index 565e5ebe4fc..9aabb44af92 100644
--- a/Libraries/LibWeb/HTML/Parser/Entities.h
+++ b/Libraries/LibWeb/HTML/Parser/Entities.h
@@ -7,7 +7,7 @@
#pragma once
#include
-#include
+#include
#include
namespace Web::HTML {
@@ -31,9 +31,6 @@ public:
// Otherwise, the `node_index` is unchanged and the function returns false.
bool try_consume_ascii_char(u8 c);
- // Returns true if the current `node_index` is marked as the end of a word
- bool currently_matches() const { return named_character_reference_is_end_of_word(m_node_index); }
-
// Returns the code points associated with the last match, if any.
Optional code_points() const { return named_character_reference_codepoints_from_unique_index(m_last_matched_unique_index); }
@@ -42,7 +39,18 @@ public:
u8 overconsumed_code_points() const { return m_overconsumed_code_points; }
private:
- u16 m_node_index { 0 };
+ enum class SearchStateTag : u8 {
+ Init,
+ FirstToSecondLayer,
+ DafsaChildren,
+ };
+ union SearchState {
+ NamedCharacterReferenceFirstToSecondLayerLink first_to_second_layer;
+ ReadonlySpan dafsa_children;
+ };
+
+ SearchStateTag m_search_state_tag { SearchStateTag::Init };
+ SearchState m_search_state { { 0, 0 } };
u16 m_last_matched_unique_index { 0 };
u16 m_pending_unique_index { 0 };
u8 m_overconsumed_code_points { 0 };
diff --git a/Meta/Lagom/Tools/CodeGenerators/LibWeb/GenerateNamedCharacterReferences.cpp b/Meta/Lagom/Tools/CodeGenerators/LibWeb/GenerateNamedCharacterReferences.cpp
index b9fa96dc704..db45f2a45fd 100644
--- a/Meta/Lagom/Tools/CodeGenerators/LibWeb/GenerateNamedCharacterReferences.cpp
+++ b/Meta/Lagom/Tools/CodeGenerators/LibWeb/GenerateNamedCharacterReferences.cpp
@@ -6,12 +6,74 @@
#include "GeneratorUtil.h"
#include
+#include
#include
#include
#include
#include
#include
+// The goal is to encode the necessary data compactly while still allowing for fast matching of
+// named character references, and taking full advantage of the note in the spec[1] that:
+//
+// > This list [of named character references] is static and will not be expanded or changed in the future.
+//
+// An overview of the approach taken (see [2] for more background/context):
+//
+// First, a deterministic acyclic finite state automaton (DAFSA) [3] is constructed from the set of
+// named character references. The nodes in the DAFSA are populated with a "number" field that
+// represents the count of all possible valid words from that node. This "number" field allows for
+// minimal perfect hashing, where each word in the set corresponds to a unique index. The unique
+// index of a word in the set is calculated during traversal/search of the DAFSA:
+// - For any non-matching node that is iterated when searching a list of children, add their number
+// to the unique index
+// - For nodes that match the current character, if the node is a valid end-of-word, add 1 to the
+// unique index
+// Note that "searching a list of children" is assumed to use a linear scan, so, for example, if
+// a list of children contained 'a', 'b', 'c', and 'd' (in that order), and the character 'c' was
+// being searched for, then the "number" of both 'a' and 'b' would get added to the unique index,
+// and then 1 would be added after matching 'c' (this minimal perfect hashing strategy comes from [4]).
+//
+// Something worth noting is that a DAFSA can be used with the set of named character references
+// (with minimal perfect hashing) while keeping the nodes of the DAFSA <= 32-bits. This is a property
+// that really matters, since any increase over 32-bits would immediately double the size of the data
+// due to padding bits when storing the nodes in a contiguous array.
+//
+// There are also a few modifications made to the DAFSA to increase performance:
+// - The 'first layer' of nodes is extracted out and replaced with a lookup table. This turns
+// the search for the first character from O(n) to O(1), and doesn't increase the data size because
+// all first characters in the set of named character references have the values 'a'-'z'/'A'-'Z',
+// so a lookup array of exactly 52 elements can be used. The lookup table stores the cumulative
+// "number" fields that would be calculated by a linear scan that matches a given node, thus allowing
+// the unique index to be built-up as normal with a O(1) search instead of a linear scan.
+// - The 'second layer' of nodes is also extracted out and searches of the second layer are done
+// using a bit field of 52 bits (the set bits of the bit field depend on the first character's value),
+// where each set bit corresponds to one of 'a'-'z'/'A'-'Z' (similar to the first layer, the second
+// layer can only contain ASCII alphabetic characters). The bit field is then re-used (along with
+// an offset) to get the index into the array of second layer nodes. This technique ultimately
+// allows for storing the minimum number of nodes in the second layer, and therefore only increasing the
+// size of the data by the size of the 'first to second layer link' info which is 52 * 8 = 416 bytes.
+// - After the second layer, the rest of the data is stored using a mostly-normal DAFSA, but there
+// are still a few differences:
+// - The "number" field is cumulative, in the same way that the first/second layer store a
+// cumulative "number" field. This cuts down slightly on the amount of work done during
+// the search of a list of children, and we can get away with it because the cumulative
+// "number" fields of the remaining nodes in the DAFSA (after the first and second layer
+// nodes were extracted out) happens to require few enough bits that we can store the
+// cumulative version while staying under our 32-bit budget.
+// - Instead of storing a 'last sibling' flag to denote the end of a list of children, the
+// length of each node's list of children is stored. Again, this is mostly done just because
+// there are enough bits available to do so while keeping the DAFSA node within 32 bits.
+// - Note: Together, these modifications open up the possibility of using a binary search instead
+// of a linear search over the children, but due to the consistently small lengths of the lists
+// of children in the remaining DAFSA, a linear search actually seems to be the better option.
+//
+// [1]: https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
+// [2]: https://www.ryanliptak.com/blog/better-named-character-reference-tokenization/
+// [3]: https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
+// [4]: Applications of finite automata representing large vocabularies (Cláudio L. Lucchesi,
+// Tomasz Kowaltowski, 1993) https://doi.org/10.1002/spe.4380230103
+
ErrorOr generate_header_file(Core::File& file);
ErrorOr generate_implementation_file(JsonObject& named_character_reference_data, Core::File& file);
@@ -81,7 +143,8 @@ ErrorOr generate_header_file(Core::File& file)
namespace Web::HTML {
-enum class NamedCharacterReferenceSecondCodepoint {
+// Uses u32 to match the `first` field of NamedCharacterReferenceCodepoints for bit-field packing purposes.
+enum class NamedCharacterReferenceSecondCodepoint : u32 {
None,
CombiningLongSolidusOverlay, // U+0338
CombiningLongVerticalLineOverlay, // U+20D2
@@ -121,19 +184,73 @@ inline Optional named_character_reference_second_codepoint_value(NamedChara
// Note: The first codepoint could fit in 17 bits, and the second could fit in 4 (if unsigned).
// However, to get any benefit from minimizing the struct size, it would need to be accompanied by
-// bit-packing the g_named_character_reference_codepoints_lookup array, and then either
-// using 5 bits for the second field (since enum bitfields are signed), or using a 4-bit wide
-// unsigned integer type.
+// bit-packing the g_named_character_reference_codepoints_lookup array.
struct NamedCharacterReferenceCodepoints {
u32 first : 24; // Largest value is U+1D56B
NamedCharacterReferenceSecondCodepoint second : 8;
};
static_assert(sizeof(NamedCharacterReferenceCodepoints) == 4);
-u16 named_character_reference_child_index(u16 node_index);
-bool named_character_reference_is_end_of_word(u16 node_index);
+struct NamedCharacterReferenceFirstLayerNode {
+ // Really only needs 12 bits.
+ u16 number;
+};
+static_assert(sizeof(NamedCharacterReferenceFirstLayerNode) == 2);
+
+struct NamedCharacterReferenceFirstToSecondLayerLink {
+ u64 mask : 52;
+ u64 second_layer_offset : 12;
+};
+static_assert(sizeof(NamedCharacterReferenceFirstToSecondLayerLink) == 8);
+
+// Note: It is possible to fit this information within 24 bits, which could then allow for tightly
+// bit-packing the second layer array. This would reduce the size of the array by 630 bytes.
+struct NamedCharacterReferenceSecondLayerNode {
+ // Could be 10 bits
+ u16 child_index;
+ u8 number;
+ // Could be 4 bits
+ u8 children_len : 7;
+ bool end_of_word : 1;
+};
+static_assert(sizeof(NamedCharacterReferenceSecondLayerNode) == 4);
+
+struct NamedCharacterReferenceNode {
+ // The actual alphabet of characters used in the list of named character references only
+ // includes 61 unique characters ('1'...'8', ';', 'a'...'z', 'A'...'Z').
+ u8 character;
+ // Typically, nodes are numbered with "an integer which gives the number of words that
+ // would be accepted by the automaton starting from that state." This numbering
+ // allows calculating "a one-to-one correspondence between the integers 1 to L
+ // (L is the number of words accepted by the automaton) and the words themselves."
+ //
+ // This allows us to have a minimal perfect hashing scheme such that it's possible to store
+ // and lookup the codepoint transformations of each named character reference using a separate
+ // array.
+ //
+ // This uses that idea, but instead of storing a per-node number that gets built up while
+ // searching a list of children, the cumulative number that would result from adding together
+ // the numbers of all the previous sibling nodes is stored instead. This cuts down on a bit
+ // of work done while searching while keeping the minimal perfect hashing strategy intact.
+ //
+ // Empirically, the largest number in our DAFSA is 51, so all number values could fit in a u6.
+ u8 number : 7;
+ bool end_of_word : 1;
+ // Index of the first child of this node.
+ // There are 3190 nodes in our DAFSA after the first and second layers were extracted out, so
+ // all indexes can fit in a u12 (there would be 3872 nodes with the first/second layers
+ // included, so still a u12).
+ u16 child_index : 12;
+ u16 children_len : 4;
+};
+static_assert(sizeof(NamedCharacterReferenceNode) == 4);
+
+extern NamedCharacterReferenceNode g_named_character_reference_nodes[];
+extern NamedCharacterReferenceFirstLayerNode g_named_character_reference_first_layer[];
+extern NamedCharacterReferenceFirstToSecondLayerLink g_named_character_reference_first_to_second_layer[];
+extern NamedCharacterReferenceSecondLayerNode g_named_character_reference_second_layer[];
+
Optional named_character_reference_codepoints_from_unique_index(u16 unique_index);
-Optional named_character_reference_find_sibling_and_update_unique_index(u16 first_child_index, u8 character, u16& unique_index);
} // namespace Web::HTML
@@ -143,6 +260,12 @@ Optional named_character_reference_find_sibling_and_update_unique_index(u16
return {};
}
+static u8 ascii_alphabetic_to_index(u8 c)
+{
+ ASSERT(AK::is_ascii_alpha(c));
+ return c <= 'Z' ? (c - 'A') : (c - 'a' + 26);
+}
+
class Node final : public RefCounted {
private:
struct NonnullRefPtrNodeTraits {
@@ -196,6 +319,17 @@ public:
return num;
}
+ u64 get_ascii_alphabetic_bit_mask()
+ {
+ u64 mask = 0;
+ for (int i = 0; i < 128; i++) {
+ if (m_children[i] == nullptr)
+ continue;
+ mask |= ((u64)1) << ascii_alphabetic_to_index(i);
+ }
+ return mask;
+ }
+
Array, 128>& children() { return m_children; }
void set_as_terminal() { m_is_terminal = true; }
@@ -323,16 +457,21 @@ private:
StringView m_previous_word = { m_previous_word_buf, 0 };
};
-static u16 write_children(NonnullRefPtr node, SourceGenerator& generator, Vector>& queue, HashMap& child_indexes, u16 first_available_index)
+struct NodeData {
+ u8 character;
+ u8 number;
+ bool end_of_word;
+ u16 child_index;
+ u8 children_len;
+};
+
+static u16 queue_children(NonnullRefPtr const& node, Vector>& queue, HashMap& child_indexes, u16 first_available_index)
{
auto current_available_index = first_available_index;
- auto num_children = node->num_direct_children();
- u16 child_i = 0;
for (u8 c = 0; c < 128; c++) {
if (node->children().at(c) == nullptr)
continue;
- auto child = node->children().at(c).release_nonnull();
- auto is_last_child = child_i == num_children - 1;
+ auto child = NonnullRefPtr(*node->children().at(c));
if (!child_indexes.contains(child.ptr())) {
auto child_num_children = child->num_direct_children();
@@ -342,22 +481,57 @@ static u16 write_children(NonnullRefPtr node, SourceGenerator& generator,
}
queue.append(child);
}
-
- auto member_generator = generator.fork();
- member_generator.set("char", StringView(&c, 1));
- member_generator.set("number", String::number(child->number()));
- member_generator.set("end_of_word", MUST(String::formatted("{}", child->is_terminal())));
- member_generator.set("end_of_list", MUST(String::formatted("{}", is_last_child)));
- auto child_index = child_indexes.get(child).value_or(0);
- member_generator.set("child_index", String::number(child_index));
- member_generator.append(R"~~~( { '@char@', @number@, @end_of_word@, @end_of_list@, @child_index@ },
-)~~~");
-
- child_i++;
}
return current_available_index;
}
+static u16 write_children_data(NonnullRefPtr const& node, Vector& node_data, Vector>& queue, HashMap& child_indexes, u16 first_available_index)
+{
+ auto current_available_index = first_available_index;
+ u8 unique_index_tally = 0;
+ for (u8 c = 0; c < 128; c++) {
+ if (node->children().at(c) == nullptr)
+ continue;
+ auto child = NonnullRefPtr(*node->children().at(c));
+ auto child_num_children = child->num_direct_children();
+
+ if (!child_indexes.contains(child.ptr())) {
+ if (child_num_children > 0) {
+ child_indexes.set(child, current_available_index);
+ current_available_index += child_num_children;
+ }
+ queue.append(child);
+ }
+
+ node_data.append({ c, unique_index_tally, child->is_terminal(), child_indexes.get(child).value_or(0), child_num_children });
+
+ unique_index_tally += child->number();
+ }
+ return current_available_index;
+}
+
+// Does not include the root node
+static void write_node_data(DafsaBuilder& dafsa_builder, Vector& node_data, HashMap& child_indexes)
+{
+ Vector> queue;
+
+ u16 first_available_index = 1;
+ first_available_index = queue_children(dafsa_builder.root(), queue, child_indexes, first_available_index);
+
+ child_indexes.clear_with_capacity();
+ first_available_index = 1;
+ auto second_layer_length = queue.size();
+ for (size_t i = 0; i < second_layer_length; i++) {
+ auto node = queue.take_first();
+ first_available_index = queue_children(node, queue, child_indexes, first_available_index);
+ }
+
+ while (queue.size() > 0) {
+ auto node = queue.take_first();
+ first_available_index = write_children_data(node, node_data, queue, child_indexes, first_available_index);
+ }
+}
+
ErrorOr generate_implementation_file(JsonObject& named_character_reference_data, Core::File& file)
{
StringBuilder builder;
@@ -412,68 +586,103 @@ static NamedCharacterReferenceCodepoints g_named_character_reference_codepoints_
)~~~");
}
+ Vector node_data;
+ HashMap child_indexes;
+ write_node_data(dafsa_builder, node_data, child_indexes);
+
generator.append(R"~~~(};
-struct __attribute__((packed)) DafsaNode {
- // The actual alphabet of characters used in the list of named character references only
- // includes 61 unique characters ('1'...'8', ';', 'a'...'z', 'A'...'Z'), but we have
- // bits to spare and encoding this as a `u8` allows us to avoid the need for converting
- // between an `enum(u6)` containing only the alphabet and the actual `u8` character value.
- u8 character;
- // Nodes are numbered with "an integer which gives the number of words that
- // would be accepted by the automaton starting from that state." This numbering
- // allows calculating "a one-to-one correspondence between the integers 1 to L
- // (L is the number of words accepted by the automaton) and the words themselves."
- //
- // Essentially, this allows us to have a minimal perfect hashing scheme such that
- // it's possible to store & lookup the codepoint transformations of each named character
- // reference using a separate array.
- //
- // Empirically, the largest number in our DAFSA is 168, so all number values fit in a u8.
- u8 number;
- // If true, this node is the end of a valid named character reference.
- // Note: This does not necessarily mean that this node does not have child nodes.
- bool end_of_word : 1;
- // If true, this node is the end of a sibling list.
- // If false, then (index + 1) will contain the next sibling.
- bool end_of_list : 1;
- // Index of the first child of this node.
- // There are 3872 nodes in our DAFSA, so all indexes could fit in a u12.
- u16 child_index : 14;
-};
-#if !defined(AK_OS_WINDOWS)
- static_assert(sizeof(DafsaNode) == 4);
-#else
- static_assert(sizeof(DafsaNode) == 5);
-#endif
-
-static DafsaNode g_named_character_reference_dafsa[] = {
- { 0, 0, false, true, 1 },
+NamedCharacterReferenceNode g_named_character_reference_nodes[] = {
+ { 0, 0, false, 0, 0 },
)~~~");
- Vector> queue;
- HashMap child_indexes;
-
- u16 first_available_index = dafsa_builder.root()->num_direct_children() + 1;
-
- NonnullRefPtr node = dafsa_builder.root();
- while (true) {
- first_available_index = write_children(node, generator, queue, child_indexes, first_available_index);
-
- if (queue.size() == 0)
- break;
- node = queue.take_first();
+ for (auto data : node_data) {
+ auto member_generator = generator.fork();
+ member_generator.set("char", StringView(&data.character, 1));
+ member_generator.set("number", String::number(data.number));
+ member_generator.set("end_of_word", MUST(String::formatted("{}", data.end_of_word)));
+ member_generator.set("child_index", String::number(data.child_index));
+ member_generator.set("children_len", String::number(data.children_len));
+ member_generator.append(R"~~~( { '@char@', @number@, @end_of_word@, @child_index@, @children_len@ },
+)~~~");
}
generator.append(R"~~~(};
-u16 named_character_reference_child_index(u16 node_index) {
- return g_named_character_reference_dafsa[node_index].child_index;
-}
+NamedCharacterReferenceFirstLayerNode g_named_character_reference_first_layer[] = {
+)~~~");
-bool named_character_reference_is_end_of_word(u16 node_index) {
- return g_named_character_reference_dafsa[node_index].end_of_word;
-}
+ auto num_children = dafsa_builder.root()->num_direct_children();
+ VERIFY(num_children == 52); // A-Z, a-z exactly
+ u16 unique_index_tally = 0;
+ for (u8 c = 0; c < 128; c++) {
+ if (dafsa_builder.root()->children().at(c) == nullptr)
+ continue;
+ VERIFY(AK::is_ascii_alpha(c));
+ auto child = dafsa_builder.root()->children().at(c);
+
+ auto member_generator = generator.fork();
+ member_generator.set("number", String::number(unique_index_tally));
+ member_generator.append(R"~~~( { @number@ },
+)~~~");
+
+ unique_index_tally += child->number();
+ }
+
+ generator.append(R"~~~(};
+
+NamedCharacterReferenceFirstToSecondLayerLink g_named_character_reference_first_to_second_layer[] = {
+)~~~");
+
+ u16 second_layer_offset = 0;
+ for (u8 c = 0; c < 128; c++) {
+ if (dafsa_builder.root()->children().at(c) == nullptr)
+ continue;
+ VERIFY(AK::is_ascii_alpha(c));
+ auto child = dafsa_builder.root()->children().at(c);
+ auto bit_mask = child->get_ascii_alphabetic_bit_mask();
+
+ auto member_generator = generator.fork();
+ member_generator.set("bit_mask", String::number(bit_mask));
+ member_generator.set("second_layer_offset", String::number(second_layer_offset));
+ member_generator.append(R"~~~( { @bit_mask@ull, @second_layer_offset@ },
+)~~~");
+
+ second_layer_offset += child->num_direct_children();
+ }
+
+ generator.append(R"~~~(};
+
+NamedCharacterReferenceSecondLayerNode g_named_character_reference_second_layer[] = {
+)~~~");
+
+ for (u8 c = 0; c < 128; c++) {
+ if (dafsa_builder.root()->children().at(c) == nullptr)
+ continue;
+ VERIFY(AK::is_ascii_alpha(c));
+ auto first_layer_node = dafsa_builder.root()->children().at(c);
+
+ u8 unique_index_tally = 0;
+ for (u8 child_c = 0; child_c < 128; child_c++) {
+ if (first_layer_node->children().at(child_c) == nullptr)
+ continue;
+ VERIFY(AK::is_ascii_alpha(child_c));
+ auto second_layer_node = first_layer_node->children().at(child_c);
+ auto child_num_children = second_layer_node->num_direct_children();
+ auto child_index = child_indexes.get(second_layer_node).value_or(0);
+
+ auto member_generator = generator.fork();
+ member_generator.set("child_index", String::number(child_index));
+ member_generator.set("number", String::number(unique_index_tally));
+ member_generator.set("children_len", String::number(child_num_children));
+ member_generator.set("end_of_word", MUST(String::formatted("{}", second_layer_node->is_terminal())));
+ member_generator.append(R"~~~( { @child_index@, @number@, @children_len@, @end_of_word@ },
+)~~~");
+ unique_index_tally += second_layer_node->number();
+ }
+ }
+
+ generator.append(R"~~~(};
// Note: The unique index is 1-based.
Optional named_character_reference_codepoints_from_unique_index(u16 unique_index) {
@@ -481,25 +690,6 @@ Optional named_character_reference_codepoints
return g_named_character_reference_codepoints_lookup[unique_index - 1];
}
-// Search `first_child_index` and siblings of `first_child_index` for a node with the value `character`.
-// If found, returns the index of the node within the `dafsa` array. Otherwise, returns `null`.
-// Updates `unique_index` as the array is traversed
-Optional named_character_reference_find_sibling_and_update_unique_index(u16 first_child_index, u8 character, u16& unique_index) {
- auto index = first_child_index;
- while (true) {
- if (g_named_character_reference_dafsa[index].character < character) {
- unique_index += g_named_character_reference_dafsa[index].number;
- }
- if (g_named_character_reference_dafsa[index].character == character) {
- if (g_named_character_reference_dafsa[index].end_of_word) unique_index++;
- return index;
- }
- if (g_named_character_reference_dafsa[index].end_of_list) return {};
- index += 1;
- }
- VERIFY_NOT_REACHED();
-}
-
} // namespace Web::HTML
)~~~");