From 89c5f25016e3486ae9104c1ecbda9e1341573377 Mon Sep 17 00:00:00 2001 From: Sam Atkins Date: Fri, 26 Jul 2024 15:11:57 +0100 Subject: [PATCH] LibWeb/CSS: Remove tiny-oom propagation from CSS Tokenizer --- .../Libraries/LibWeb/CSS/Parser/Parser.cpp | 2 +- .../Libraries/LibWeb/CSS/Parser/Tokenizer.cpp | 130 +++++++++--------- .../Libraries/LibWeb/CSS/Parser/Tokenizer.h | 18 +-- .../SyntaxHighlighter/SyntaxHighlighter.cpp | 2 +- 4 files changed, 76 insertions(+), 76 deletions(-) diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Parser.cpp b/Userland/Libraries/LibWeb/CSS/Parser/Parser.cpp index b8eea907469..7bc74d3d8f3 100644 --- a/Userland/Libraries/LibWeb/CSS/Parser/Parser.cpp +++ b/Userland/Libraries/LibWeb/CSS/Parser/Parser.cpp @@ -95,7 +95,7 @@ namespace Web::CSS::Parser { ErrorOr Parser::create(ParsingContext const& context, StringView input, StringView encoding) { - auto tokens = TRY(Tokenizer::tokenize(input, encoding)); + auto tokens = Tokenizer::tokenize(input, encoding); return Parser { context, move(tokens) }; } diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp index 1c692fbc332..f1951279bfb 100644 --- a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp +++ b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp @@ -195,14 +195,14 @@ static inline bool is_E(u32 code_point) return code_point == 0x45; } -ErrorOr> Tokenizer::tokenize(StringView input, StringView encoding) +Vector Tokenizer::tokenize(StringView input, StringView encoding) { // https://www.w3.org/TR/css-syntax-3/#css-filter-code-points - auto filter_code_points = [](StringView input, auto encoding) -> ErrorOr { + auto filter_code_points = [](StringView input, auto encoding) -> String { auto decoder = TextCodec::decoder_for(encoding); VERIFY(decoder.has_value()); - auto decoded_input = TRY(decoder->to_utf8(input)); + auto decoded_input = MUST(decoder->to_utf8(input)); // OPTIMIZATION: If the input doesn't contain any CR or FF, we can skip the filtering bool const contains_cr_or_ff = [&] { @@ -227,25 +227,25 @@ ErrorOr> Tokenizer::tokenize(StringView input, StringView encoding // in input by a single U+000A LINE FEED (LF) code point. if (code_point == '\r') { if (last_was_carriage_return) { - TRY(builder.try_append('\n')); + builder.append('\n'); } else { last_was_carriage_return = true; } } else { if (last_was_carriage_return) - TRY(builder.try_append('\n')); + builder.append('\n'); if (code_point == '\n') { if (!last_was_carriage_return) - TRY(builder.try_append('\n')); + builder.append('\n'); } else if (code_point == '\f') { - TRY(builder.try_append('\n')); + builder.append('\n'); // Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT CHARACTER (�). } else if (code_point == 0x00 || (code_point >= 0xD800 && code_point <= 0xDFFF)) { - TRY(builder.try_append_code_point(REPLACEMENT_CHARACTER)); + builder.append_code_point(REPLACEMENT_CHARACTER); } else { - TRY(builder.try_append_code_point(code_point)); + builder.append_code_point(code_point); } last_was_carriage_return = false; @@ -254,7 +254,7 @@ ErrorOr> Tokenizer::tokenize(StringView input, StringView encoding return builder.to_string_without_validation(); }; - Tokenizer tokenizer { TRY(filter_code_points(input, encoding)) }; + Tokenizer tokenizer { filter_code_points(input, encoding) }; return tokenizer.tokenize(); } @@ -265,15 +265,15 @@ Tokenizer::Tokenizer(String decoded_input) { } -ErrorOr> Tokenizer::tokenize() +Vector Tokenizer::tokenize() { Vector tokens; for (;;) { auto token_start = m_position; - auto token = TRY(consume_a_token()); + auto token = consume_a_token(); token.m_start_position = token_start; token.m_end_position = m_position; - TRY(tokens.try_append(token)); + tokens.append(token); if (token.is(Token::Type::EndOfFile)) { return tokens; @@ -441,14 +441,14 @@ u32 Tokenizer::consume_escaped_code_point() } // https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token -ErrorOr Tokenizer::consume_an_ident_like_token() +Token Tokenizer::consume_an_ident_like_token() { // This section describes how to consume an ident-like token from a stream of code points. // It returns an , , , or . // Consume an ident sequence, and let string be the result. auto start_byte_offset = current_byte_offset(); - auto string = TRY(consume_an_ident_sequence()); + auto string = consume_an_ident_sequence(); // If string’s value is an ASCII case-insensitive match for "url", and the next input code // point is U+0028 LEFT PARENTHESIS ((), consume it. @@ -470,7 +470,7 @@ ErrorOr Tokenizer::consume_an_ident_like_token() // with its value set to string and return it. auto next_two = peek_twin(); if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) { - return create_value_token(Token::Type::Function, move(string), TRY(input_since(start_byte_offset))); + return create_value_token(Token::Type::Function, move(string), input_since(start_byte_offset)); } // Otherwise, consume a url token, and return it. @@ -482,11 +482,11 @@ ErrorOr Tokenizer::consume_an_ident_like_token() (void)next_code_point(); // Create a with its value set to string and return it. - return create_value_token(Token::Type::Function, move(string), TRY(input_since(start_byte_offset))); + return create_value_token(Token::Type::Function, move(string), input_since(start_byte_offset)); } // Otherwise, create an with its value set to string and return it. - return create_value_token(Token::Type::Ident, move(string), TRY(input_since(start_byte_offset))); + return create_value_token(Token::Type::Ident, move(string), input_since(start_byte_offset)); } // https://www.w3.org/TR/css-syntax-3/#consume-number @@ -595,7 +595,7 @@ double Tokenizer::convert_a_string_to_a_number(StringView string) } // https://www.w3.org/TR/css-syntax-3/#consume-name -ErrorOr Tokenizer::consume_an_ident_sequence() +FlyString Tokenizer::consume_an_ident_sequence() { // This section describes how to consume an ident sequence from a stream of code points. // It returns a string containing the largest name that can be formed from adjacent @@ -619,14 +619,14 @@ ErrorOr Tokenizer::consume_an_ident_sequence() // name code point if (is_ident_code_point(input)) { // Append the code point to result. - TRY(result.try_append_code_point(input)); + result.append_code_point(input); continue; } // the stream starts with a valid escape if (is_valid_escape_sequence(start_of_input_stream_twin())) { // Consume an escaped code point. Append the returned code point to result. - TRY(result.try_append_code_point(consume_escaped_code_point())); + result.append_code_point(consume_escaped_code_point()); continue; } @@ -640,7 +640,7 @@ ErrorOr Tokenizer::consume_an_ident_sequence() } // https://www.w3.org/TR/css-syntax-3/#consume-url-token -ErrorOr Tokenizer::consume_a_url_token() +Token Tokenizer::consume_a_url_token() { // This section describes how to consume a url token from a stream of code points. // It returns either a or a . @@ -659,9 +659,9 @@ ErrorOr Tokenizer::consume_a_url_token() // 2. Consume as much whitespace as possible. consume_as_much_whitespace_as_possible(); - auto make_token = [&]() -> ErrorOr { - token.m_value = TRY(FlyString::from_utf8(builder.string_view())); - token.m_representation = TRY(input_since(start_byte_offset)); + auto make_token = [&]() -> Token { + token.m_value = builder.to_fly_string_without_validation(); + token.m_representation = input_since(start_byte_offset); return token; }; @@ -705,7 +705,7 @@ ErrorOr Tokenizer::consume_a_url_token() // otherwise, consume the remnants of a bad url, create a , and return it. consume_the_remnants_of_a_bad_url(); auto bad_url_token = create_new_token(Token::Type::BadUrl); - bad_url_token.m_representation = TRY(input_since(start_byte_offset)); + bad_url_token.m_representation = input_since(start_byte_offset); return bad_url_token; } @@ -718,7 +718,7 @@ ErrorOr Tokenizer::consume_a_url_token() log_parse_error(); consume_the_remnants_of_a_bad_url(); auto bad_url_token = create_new_token(Token::Type::BadUrl); - bad_url_token.m_representation = TRY(input_since(start_byte_offset)); + bad_url_token.m_representation = input_since(start_byte_offset); return bad_url_token; } @@ -735,7 +735,7 @@ ErrorOr Tokenizer::consume_a_url_token() // Consume the remnants of a bad url, create a , and return it. consume_the_remnants_of_a_bad_url(); auto bad_url_token = create_new_token(Token::Type::BadUrl); - bad_url_token.m_representation = TRY(input_since(start_byte_offset)); + bad_url_token.m_representation = input_since(start_byte_offset); return bad_url_token; } } @@ -792,7 +792,7 @@ void Tokenizer::reconsume_current_input_code_point() } // https://www.w3.org/TR/css-syntax-3/#consume-numeric-token -ErrorOr Tokenizer::consume_a_numeric_token() +Token Tokenizer::consume_a_numeric_token() { // This section describes how to consume a numeric token from a stream of code points. // It returns either a , , or . @@ -810,13 +810,13 @@ ErrorOr Tokenizer::consume_a_numeric_token() token.m_number_value = number; // 2. Consume an ident sequence. Set the ’s unit to the returned value. - auto unit = TRY(consume_an_ident_sequence()); + auto unit = consume_an_ident_sequence(); VERIFY(!unit.is_empty()); // NOTE: We intentionally store this in the `value`, to save space. token.m_value = move(unit); // 3. Return the . - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } @@ -827,14 +827,14 @@ ErrorOr Tokenizer::consume_a_numeric_token() // Create a with the same value as number, and return it. auto token = create_new_token(Token::Type::Percentage); token.m_number_value = number; - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } // Otherwise, create a with the same value and type flag as number, and return it. auto token = create_new_token(Token::Type::Number); token.m_number_value = number; - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } @@ -949,7 +949,7 @@ bool Tokenizer::would_start_an_ident_sequence(U32Triplet values) } // https://www.w3.org/TR/css-syntax-3/#consume-string-token -ErrorOr Tokenizer::consume_string_token(u32 ending_code_point) +Token Tokenizer::consume_string_token(u32 ending_code_point) { // This section describes how to consume a string token from a stream of code points. // It returns either a or . @@ -963,9 +963,9 @@ ErrorOr Tokenizer::consume_string_token(u32 ending_code_point) auto token = create_new_token(Token::Type::String); StringBuilder builder; - auto make_token = [&]() -> ErrorOr { - token.m_value = TRY(FlyString::from_utf8(builder.string_view())); - token.m_representation = TRY(input_since(start_byte_offset)); + auto make_token = [&]() -> Token { + token.m_value = builder.to_fly_string_without_validation(); + token.m_representation = input_since(start_byte_offset); return token; }; @@ -990,7 +990,7 @@ ErrorOr Tokenizer::consume_string_token(u32 ending_code_point) // , and return it. reconsume_current_input_code_point(); auto bad_string_token = create_new_token(Token::Type::BadString); - bad_string_token.m_representation = TRY(input_since(start_byte_offset)); + bad_string_token.m_representation = input_since(start_byte_offset); return bad_string_token; } @@ -1059,7 +1059,7 @@ start: } // https://www.w3.org/TR/css-syntax-3/#consume-token -ErrorOr Tokenizer::consume_a_token() +Token Tokenizer::consume_a_token() { // This section describes how to consume a token from a stream of code points. // It will return a single token of any type. @@ -1077,7 +1077,7 @@ ErrorOr Tokenizer::consume_a_token() // Consume as much whitespace as possible. Return a . consume_as_much_whitespace_as_possible(); auto token = create_new_token(Token::Type::Whitespace); - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } @@ -1107,16 +1107,16 @@ ErrorOr Tokenizer::consume_a_token() token.m_hash_type = Token::HashType::Id; // 3. Consume an ident sequence, and set the ’s value to the returned string. - auto name = TRY(consume_an_ident_sequence()); + auto name = consume_an_ident_sequence(); token.m_value = move(name); // 4. Return the . - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } // Otherwise, return a with its value set to the current input code point. - return create_value_token(Token::Type::Delim, input, TRY(input_since(start_byte_offset))); + return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset)); } // U+0027 APOSTROPHE (') @@ -1131,7 +1131,7 @@ ErrorOr Tokenizer::consume_a_token() dbgln_if(CSS_TOKENIZER_DEBUG, "is left paren"); // Return a <(-token>. Token token = create_new_token(Token::Type::OpenParen); - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } @@ -1140,7 +1140,7 @@ ErrorOr Tokenizer::consume_a_token() dbgln_if(CSS_TOKENIZER_DEBUG, "is right paren"); // Return a <)-token>. Token token = create_new_token(Token::Type::CloseParen); - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } @@ -1155,7 +1155,7 @@ ErrorOr Tokenizer::consume_a_token() } // Otherwise, return a with its value set to the current input code point. - return create_value_token(Token::Type::Delim, input, TRY(input_since(start_byte_offset))); + return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset)); } // U+002C COMMA (,) @@ -1163,7 +1163,7 @@ ErrorOr Tokenizer::consume_a_token() dbgln_if(CSS_TOKENIZER_DEBUG, "is comma"); // Return a . Token token = create_new_token(Token::Type::Comma); - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } @@ -1185,7 +1185,7 @@ ErrorOr Tokenizer::consume_a_token() (void)next_code_point(); Token token = create_new_token(Token::Type::CDC); - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } @@ -1197,7 +1197,7 @@ ErrorOr Tokenizer::consume_a_token() } // Otherwise, return a with its value set to the current input code point. - return create_value_token(Token::Type::Delim, input, TRY(input_since(start_byte_offset))); + return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset)); } // U+002E FULL STOP (.) @@ -1211,7 +1211,7 @@ ErrorOr Tokenizer::consume_a_token() } // Otherwise, return a with its value set to the current input code point. - return create_value_token(Token::Type::Delim, input, TRY(input_since(start_byte_offset))); + return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset)); } // U+003A COLON (:) @@ -1219,7 +1219,7 @@ ErrorOr Tokenizer::consume_a_token() dbgln_if(CSS_TOKENIZER_DEBUG, "is colon"); // Return a . Token token = create_new_token(Token::Type::Colon); - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } @@ -1228,7 +1228,7 @@ ErrorOr Tokenizer::consume_a_token() dbgln_if(CSS_TOKENIZER_DEBUG, "is semicolon"); // Return a . Token token = create_new_token(Token::Type::Semicolon); - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } @@ -1244,12 +1244,12 @@ ErrorOr Tokenizer::consume_a_token() (void)next_code_point(); Token token = create_new_token(Token::Type::CDO); - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } // Otherwise, return a with its value set to the current input code point. - return create_value_token(Token::Type::Delim, input, TRY(input_since(start_byte_offset))); + return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset)); } // U+0040 COMMERCIAL AT (@) @@ -1258,12 +1258,12 @@ ErrorOr Tokenizer::consume_a_token() // If the next 3 input code points would start an ident sequence, consume an ident sequence, create // an with its value set to the returned value, and return it. if (would_start_an_ident_sequence(peek_triplet())) { - auto name = TRY(consume_an_ident_sequence()); - return create_value_token(Token::Type::AtKeyword, move(name), TRY(input_since(start_byte_offset))); + auto name = consume_an_ident_sequence(); + return create_value_token(Token::Type::AtKeyword, move(name), input_since(start_byte_offset)); } // Otherwise, return a with its value set to the current input code point. - return create_value_token(Token::Type::Delim, input, TRY(input_since(start_byte_offset))); + return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset)); } // U+005B LEFT SQUARE BRACKET ([) @@ -1271,7 +1271,7 @@ ErrorOr Tokenizer::consume_a_token() dbgln_if(CSS_TOKENIZER_DEBUG, "is open square"); // Return a <[-token>. Token token = create_new_token(Token::Type::OpenSquare); - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } @@ -1288,7 +1288,7 @@ ErrorOr Tokenizer::consume_a_token() // Otherwise, this is a parse error. Return a with its value set to the // current input code point. log_parse_error(); - return create_value_token(Token::Type::Delim, input, TRY(input_since(start_byte_offset))); + return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset)); } // U+005D RIGHT SQUARE BRACKET (]) @@ -1296,7 +1296,7 @@ ErrorOr Tokenizer::consume_a_token() dbgln_if(CSS_TOKENIZER_DEBUG, "is closed square"); // Return a <]-token>. Token token = create_new_token(Token::Type::CloseSquare); - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } @@ -1305,7 +1305,7 @@ ErrorOr Tokenizer::consume_a_token() dbgln_if(CSS_TOKENIZER_DEBUG, "is open curly"); // Return a <{-token>. Token token = create_new_token(Token::Type::OpenCurly); - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } @@ -1314,7 +1314,7 @@ ErrorOr Tokenizer::consume_a_token() dbgln_if(CSS_TOKENIZER_DEBUG, "is closed curly"); // Return a <}-token>. Token token = create_new_token(Token::Type::CloseCurly); - token.m_representation = TRY(input_since(start_byte_offset)); + token.m_representation = input_since(start_byte_offset); return token; } @@ -1343,7 +1343,7 @@ ErrorOr Tokenizer::consume_a_token() // anything else dbgln_if(CSS_TOKENIZER_DEBUG, "is delimiter"); // Return a with its value set to the current input code point. - return create_value_token(Token::Type::Delim, input, TRY(input_since(start_byte_offset))); + return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset)); } size_t Tokenizer::current_byte_offset() const @@ -1351,9 +1351,9 @@ size_t Tokenizer::current_byte_offset() const return m_utf8_iterator.ptr() - m_utf8_view.bytes(); } -ErrorOr Tokenizer::input_since(size_t offset) const +String Tokenizer::input_since(size_t offset) const { - return m_decoded_input.substring_from_byte_offset_with_shared_superstring(offset, current_byte_offset() - offset); + return MUST(m_decoded_input.substring_from_byte_offset_with_shared_superstring(offset, current_byte_offset() - offset)); } } diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h index 353f92adc54..a5e0d306bf0 100644 --- a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h +++ b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h @@ -59,17 +59,17 @@ public: class Tokenizer { public: - static ErrorOr> tokenize(StringView input, StringView encoding); + static Vector tokenize(StringView input, StringView encoding); [[nodiscard]] static Token create_eof_token(); private: explicit Tokenizer(String decoded_input); - [[nodiscard]] ErrorOr> tokenize(); + [[nodiscard]] Vector tokenize(); size_t current_byte_offset() const; - ErrorOr input_since(size_t offset) const; + String input_since(size_t offset) const; [[nodiscard]] u32 next_code_point(); [[nodiscard]] u32 peek_code_point(size_t offset = 0) const; @@ -82,15 +82,15 @@ private: [[nodiscard]] static Token create_new_token(Token::Type); [[nodiscard]] static Token create_value_token(Token::Type, FlyString&& value, String&& representation); [[nodiscard]] static Token create_value_token(Token::Type, u32 value, String&& representation); - [[nodiscard]] ErrorOr consume_a_token(); - [[nodiscard]] ErrorOr consume_string_token(u32 ending_code_point); - [[nodiscard]] ErrorOr consume_a_numeric_token(); - [[nodiscard]] ErrorOr consume_an_ident_like_token(); + [[nodiscard]] Token consume_a_token(); + [[nodiscard]] Token consume_string_token(u32 ending_code_point); + [[nodiscard]] Token consume_a_numeric_token(); + [[nodiscard]] Token consume_an_ident_like_token(); [[nodiscard]] Number consume_a_number(); [[nodiscard]] double convert_a_string_to_a_number(StringView); - [[nodiscard]] ErrorOr consume_an_ident_sequence(); + [[nodiscard]] FlyString consume_an_ident_sequence(); [[nodiscard]] u32 consume_escaped_code_point(); - [[nodiscard]] ErrorOr consume_a_url_token(); + [[nodiscard]] Token consume_a_url_token(); void consume_the_remnants_of_a_bad_url(); void consume_comments(); void consume_as_much_whitespace_as_possible(); diff --git a/Userland/Libraries/LibWeb/CSS/SyntaxHighlighter/SyntaxHighlighter.cpp b/Userland/Libraries/LibWeb/CSS/SyntaxHighlighter/SyntaxHighlighter.cpp index cb993b3a201..203adc3805c 100644 --- a/Userland/Libraries/LibWeb/CSS/SyntaxHighlighter/SyntaxHighlighter.cpp +++ b/Userland/Libraries/LibWeb/CSS/SyntaxHighlighter/SyntaxHighlighter.cpp @@ -45,7 +45,7 @@ void SyntaxHighlighter::rehighlight(Palette const& palette) false); }; - auto tokens = CSS::Parser::Tokenizer::tokenize(text, "utf-8"sv).release_value_but_fixme_should_propagate_errors(); + auto tokens = CSS::Parser::Tokenizer::tokenize(text, "utf-8"sv); for (auto const& token : tokens) { if (token.is(Parser::Token::Type::EndOfFile)) break;