LibJS: Port the JS lexer and parser to UTF-16

This ports the lexer to UTF-16 and deals with the immediate fallout up
to the AST. The AST will be dealt with in upcoming commits.

The lexer will still accept UTF-8 strings as input, and will transcode
them to UTF-16 for lexing. This doesn't actually incur a new allocation,
as we were already converting the input StringView to a ByteString for
each lexer.

One immediate logical benefit here is that we do not need to know off-
hand how many UTF-8 bytes some special code points occupy. They all
happen to be a single UTF-16 code unit. So instead of advancing the
lexer by 3 positions in some cases, we can just always advance by 1.
This commit is contained in:
Timothy Flynn 2025-08-06 07:18:45 -04:00 committed by Tim Flynn
commit 00182a2405
Notes: github-actions[bot] 2025-08-13 13:57:27 +00:00
14 changed files with 467 additions and 474 deletions

View file

@ -199,8 +199,10 @@ static ErrorOr<bool> parse_and_run(JS::Realm& realm, StringView source, StringVi
if (!s_as_module) {
auto script_or_error = JS::Script::parse(source, realm, source_name);
if (script_or_error.is_error()) {
auto utf16_source = Utf16String::from_utf8(source);
auto error = script_or_error.error()[0];
auto hint = error.source_location_hint(source);
auto hint = error.source_location_hint(utf16_source);
if (!hint.is_empty())
outln("{}", hint);
@ -213,8 +215,10 @@ static ErrorOr<bool> parse_and_run(JS::Realm& realm, StringView source, StringVi
} else {
auto module_or_error = JS::SourceTextModule::parse(source, realm, source_name);
if (module_or_error.is_error()) {
auto utf16_source = Utf16String::from_utf8(source);
auto error = module_or_error.error()[0];
auto hint = error.source_location_hint(source);
auto hint = error.source_location_hint(utf16_source);
if (!hint.is_empty())
outln("{}", hint);
@ -619,7 +623,7 @@ static ErrorOr<int> run_repl(bool gc_on_every_allocation, bool syntax_highlight)
JS::Lexer lexer(line);
bool indenters_starting_line = true;
for (JS::Token token = lexer.next(); token.type() != JS::TokenType::Eof; token = lexer.next()) {
auto length = Utf8View { token.value() }.length();
auto length = token.value().length_in_code_units();
auto start = token.offset();
auto end = start + length;
if (indenters_starting_line) {
@ -680,8 +684,8 @@ static ErrorOr<int> run_repl(bool gc_on_every_allocation, bool syntax_highlight)
CompleteProperty,
} mode { Initial };
FlyString variable_name;
FlyString property_name;
Utf16FlyString variable_name;
Utf16FlyString property_name;
// we're only going to complete either
// - <N>
@ -727,11 +731,11 @@ static ErrorOr<int> run_repl(bool gc_on_every_allocation, bool syntax_highlight)
}
}
bool last_token_has_trivia = js_token.trivia().length() > 0;
bool last_token_has_trivia = !js_token.trivia().is_empty();
if (mode == CompleteNullProperty) {
mode = CompleteProperty;
property_name = ""_fly_string;
property_name = Utf16FlyString {};
last_token_has_trivia = false; // <name> <dot> [tab] is sensible to complete.
}
@ -740,17 +744,18 @@ static ErrorOr<int> run_repl(bool gc_on_every_allocation, bool syntax_highlight)
Vector<Line::CompletionSuggestion> results;
Function<void(JS::Shape const&, StringView)> list_all_properties = [&results, &list_all_properties](JS::Shape const& shape, auto property_pattern) {
Function<void(JS::Shape const&, Utf16FlyString const&)> list_all_properties = [&results, &list_all_properties](JS::Shape const& shape, Utf16FlyString const& property_pattern) {
for (auto const& descriptor : shape.property_table()) {
if (!descriptor.key.is_string())
continue;
auto key = descriptor.key.as_string().to_utf16_string().to_utf8_but_should_be_ported_to_utf16();
if (key.bytes_as_string_view().starts_with(property_pattern)) {
Line::CompletionSuggestion completion { key, Line::CompletionSuggestion::ForSearch };
auto key = descriptor.key.as_string().to_utf16_string();
if (key.starts_with(property_pattern.view())) {
Line::CompletionSuggestion completion { key.to_utf8_but_should_be_ported_to_utf16(), Line::CompletionSuggestion::ForSearch };
if (!results.contains_slow(completion)) { // hide duplicates
results.append(key.to_byte_string());
results.last().invariant_offset = property_pattern.length();
results.last().invariant_offset = property_pattern.length_in_code_units();
}
}
}
@ -759,11 +764,9 @@ static ErrorOr<int> run_repl(bool gc_on_every_allocation, bool syntax_highlight)
}
};
auto variable_name_utf16 = Utf16FlyString::from_utf8(variable_name);
switch (mode) {
case CompleteProperty: {
auto reference_or_error = g_vm->resolve_binding(variable_name_utf16, &global_environment);
auto reference_or_error = g_vm->resolve_binding(variable_name, &global_environment);
if (reference_or_error.is_error())
return {};
auto value_or_error = reference_or_error.value().get_value(*g_vm);
@ -785,9 +788,9 @@ static ErrorOr<int> run_repl(bool gc_on_every_allocation, bool syntax_highlight)
list_all_properties(variable.shape(), variable_name);
for (auto const& name : global_environment.declarative_record().bindings()) {
if (name.view().starts_with(variable_name_utf16.view())) {
results.empend(name.view().to_utf8_but_should_be_ported_to_utf16().to_byte_string());
results.last().invariant_offset = variable_name.bytes().size();
if (name.view().starts_with(variable_name.view())) {
results.empend(MUST(name.view().to_byte_string()));
results.last().invariant_offset = variable_name.length_in_code_units();
}
}