From bd1009f3c19f19c8fe4d43253028320ccb4014f5 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 4 Dec 2024 16:41:02 -0500 Subject: [PATCH] LibJS: Extract some JS lexer helpers to free functions We will need these in an upcoming proposal. --- Libraries/LibJS/Lexer.cpp | 47 ++++++++++++++++++++++++++++++++++----- Libraries/LibJS/Lexer.h | 4 ++++ 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/Libraries/LibJS/Lexer.cpp b/Libraries/LibJS/Lexer.cpp index c5fb732857c..58717cee259 100644 --- a/Libraries/LibJS/Lexer.cpp +++ b/Libraries/LibJS/Lexer.cpp @@ -481,13 +481,13 @@ bool Lexer::is_eof() const ALWAYS_INLINE bool Lexer::is_line_terminator() const { + // OPTIMIZATION: Fast-path for ASCII characters. if (m_current_char == '\n' || m_current_char == '\r') return true; if (!is_unicode_character()) return false; - auto code_point = current_code_point(); - return code_point == LINE_SEPARATOR || code_point == PARAGRAPH_SEPARATOR; + return JS::is_line_terminator(current_code_point()); } ALWAYS_INLINE bool Lexer::is_unicode_character() const @@ -511,14 +511,13 @@ ALWAYS_INLINE u32 Lexer::current_code_point() const bool Lexer::is_whitespace() const { + // OPTIMIZATION: Fast-path for ASCII characters. if (is_ascii_space(m_current_char)) return true; if (!is_unicode_character()) return false; - auto code_point = current_code_point(); - if (code_point == NO_BREAK_SPACE || code_point == ZERO_WIDTH_NO_BREAK_SPACE) - return true; - return Unicode::code_point_has_space_separator_general_category(code_point); + + return JS::is_whitespace(current_code_point()); } // UnicodeEscapeSequence :: https://tc39.es/ecma262/#prod-UnicodeEscapeSequence @@ -1059,4 +1058,40 @@ TokenType Lexer::consume_regex_literal() return TokenType::UnterminatedRegexLiteral; } +// https://tc39.es/ecma262/#prod-SyntaxCharacter +bool is_syntax_character(u32 code_point) +{ + // SyntaxCharacter :: one of + // ^ $ \ . * + ? ( ) [ ] { } | + static constexpr Utf8View syntax_characters { "^$\\.*+?()[]{}|"sv }; + return syntax_characters.contains(code_point); +} + +// https://tc39.es/ecma262/#prod-WhiteSpace +bool is_whitespace(u32 code_point) +{ + // WhiteSpace :: + // + // + // + // + // + if (is_ascii_space(code_point)) + return true; + if (code_point == NO_BREAK_SPACE || code_point == ZERO_WIDTH_NO_BREAK_SPACE) + return true; + return Unicode::code_point_has_space_separator_general_category(code_point); +} + +// https://tc39.es/ecma262/#prod-LineTerminator +bool is_line_terminator(u32 code_point) +{ + // LineTerminator :: + // + // + // + // + return code_point == '\n' || code_point == '\r' || code_point == LINE_SEPARATOR || code_point == PARAGRAPH_SEPARATOR; +} + } diff --git a/Libraries/LibJS/Lexer.h b/Libraries/LibJS/Lexer.h index b36c81f9733..7f0df05c693 100644 --- a/Libraries/LibJS/Lexer.h +++ b/Libraries/LibJS/Lexer.h @@ -91,4 +91,8 @@ private: RefPtr m_parsed_identifiers; }; +bool is_syntax_character(u32 code_point); +bool is_whitespace(u32 code_point); +bool is_line_terminator(u32 code_point); + }