LibJS: Port the JS lexer and parser to UTF-16

This ports the lexer to UTF-16 and deals with the immediate fallout up to the AST. The AST will be dealt with in upcoming commits. The lexer will still accept UTF-8 strings as input, and will transcode them to UTF-16 for lexing. This doesn't actually incur a new allocation, as we were already converting the input StringView to a ByteString for each lexer. One immediate logical benefit here is that we do not need to know off-hand how many UTF-8 bytes some special code points occupy. They all happen to be a single UTF-16 code unit. So instead of advancing the lexer by 3 positions in some cases, we can just always advance by 1.
Author: https://github.com/trflynn89 Commit: 00182a2405 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5762
2025-10-16 04:59:23 +00:00 · 2025-08-06 07:18:45 -04:00 · 2025-08-06 07:18:45 -04:00 · 00182a2405 · 2025-08-13 13:57:27 +00:00
commit 00182a2405
parent eb74781a2d
14 changed files with 467 additions and 474 deletions
--- a/Libraries/LibJS/SourceCode.h
+++ b/Libraries/LibJS/SourceCode.h
@@ -7,6 +7,7 @@
 #pragma once

 #include <AK/String.h>
+#include <AK/Utf16String.h>
 #include <AK/Vector.h>
 #include <LibJS/Export.h>
 #include <LibJS/Forward.h>
@@ -16,18 +17,18 @@ namespace JS {

 class JS_API SourceCode : public RefCounted<SourceCode> {
 public:
-    static NonnullRefPtr<SourceCode const> create(String filename, String code);
+    static NonnullRefPtr<SourceCode const> create(String filename, Utf16String code);

-    String const& filename() const;
-    String const& code() const;
+    String const& filename() const { return m_filename; }
+    Utf16String const& code() const { return m_code; }

    SourceRange range_from_offsets(u32 start_offset, u32 end_offset) const;

 private:
-    SourceCode(String filename, String code);
+    SourceCode(String filename, Utf16String code);

    String m_filename;
-    String m_code;
+    Utf16String m_code;

    // For fast mapping of offsets to line/column numbers, we build a list of
    // starting points (with byte offsets into the source string) and which