mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-10-16 04:59:23 +00:00
This ports the lexer to UTF-16 and deals with the immediate fallout up to the AST. The AST will be dealt with in upcoming commits. The lexer will still accept UTF-8 strings as input, and will transcode them to UTF-16 for lexing. This doesn't actually incur a new allocation, as we were already converting the input StringView to a ByteString for each lexer. One immediate logical benefit here is that we do not need to know offhand how many UTF-8 bytes some special code points occupy. They all happen to be a single UTF-16 code unit. So instead of advancing the lexer by 3 positions in some cases, we can just always advance by 1.
40 lines
1 KiB
C++
40 lines
1 KiB
C++
/*
|
|
* Copyright (c) 2022-2023, Andreas Kling <andreas@ladybird.org>
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include <AK/String.h>
|
|
#include <AK/Utf16String.h>
|
|
#include <AK/Vector.h>
|
|
#include <LibJS/Export.h>
|
|
#include <LibJS/Forward.h>
|
|
#include <LibJS/Position.h>
|
|
|
|
namespace JS {
|
|
|
|
class JS_API SourceCode : public RefCounted<SourceCode> {
|
|
public:
|
|
static NonnullRefPtr<SourceCode const> create(String filename, Utf16String code);
|
|
|
|
String const& filename() const { return m_filename; }
|
|
Utf16String const& code() const { return m_code; }
|
|
|
|
SourceRange range_from_offsets(u32 start_offset, u32 end_offset) const;
|
|
|
|
private:
|
|
SourceCode(String filename, Utf16String code);
|
|
|
|
String m_filename;
|
|
Utf16String m_code;
|
|
|
|
// For fast mapping of offsets to line/column numbers, we build a list of
|
|
// starting points (with byte offsets into the source string) and which
|
|
// line:column they map to. This can then be binary-searched.
|
|
void fill_position_cache() const;
|
|
Vector<Position> mutable m_cached_positions;
|
|
};
|
|
|
|
}
|