ladybird/Libraries/LibJS/SourceCode.h
Timothy Flynn 00182a2405 LibJS: Port the JS lexer and parser to UTF-16
This ports the lexer to UTF-16 and deals with the immediate fallout up
to the AST. The AST will be dealt with in upcoming commits.

The lexer will still accept UTF-8 strings as input, and will transcode
them to UTF-16 for lexing. This doesn't actually incur a new allocation,
as we were already converting the input StringView to a ByteString for
each lexer.

One immediate logical benefit here is that we do not need to know off-
hand how many UTF-8 bytes some special code points occupy. They all
happen to be a single UTF-16 code unit. So instead of advancing the
lexer by 3 positions in some cases, we can just always advance by 1.
2025-08-13 09:56:13 -04:00

40 lines
1 KiB
C++

/*
* Copyright (c) 2022-2023, Andreas Kling <andreas@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/String.h>
#include <AK/Utf16String.h>
#include <AK/Vector.h>
#include <LibJS/Export.h>
#include <LibJS/Forward.h>
#include <LibJS/Position.h>
namespace JS {
// A reference-counted holder for a piece of source text (stored as a
// Utf16String) together with the filename it is associated with. It also
// converts pairs of offsets into a SourceRange.
class JS_API SourceCode : public RefCounted<SourceCode> {
public:
// Factory taking a filename and the source text. The constructor is private,
// so this is the public way to obtain an instance; the result is handed out
// as a pointer-to-const.
static NonnullRefPtr<SourceCode const> create(String filename, Utf16String code);
// Returns the stored filename.
String const& filename() const { return m_filename; }
// Returns the stored source text.
Utf16String const& code() const { return m_code; }
// Produces the SourceRange corresponding to the given start and end offsets.
// NOTE(review): the offsets are presumably UTF-16 code unit indices into
// code() — confirm against the implementation.
SourceRange range_from_offsets(u32 start_offset, u32 end_offset) const;
private:
SourceCode(String filename, Utf16String code);
String m_filename;
Utf16String m_code;
// For fast mapping of offsets to line/column numbers, we build a list of
// starting points (with offsets into the source string) and which
// line:column they map to. This can then be binary-searched.
// NOTE(review): the source string is a Utf16String, so these offsets are
// presumably UTF-16 code unit offsets rather than byte offsets — confirm
// against the implementation.
void fill_position_cache() const;
// Declared mutable so that the const fill_position_cache() can populate it.
Vector<Position> mutable m_cached_positions;
};
}