/* * Copyright (c) 2020, Benoit Lormeau * * SPDX-License-Identifier: BSD-2-Clause */ #pragma once #include #include #include #include #include #include #include #include namespace AK { constexpr auto is_any_of(StringView values) { return [values](auto c) { return values.contains(c); }; } constexpr auto is_not_any_of(StringView values) { return [values](auto c) { return !values.contains(c); }; } constexpr auto is_path_separator = is_any_of("/\\"sv); constexpr auto is_quote = is_any_of("'\""sv); enum class UnicodeEscapeError { MalformedUnicodeEscape, UnicodeEscapeOverflow, }; namespace Detail { template class GenericLexer { static_assert(IsOneOf); public: using ViewType = Detail::Conditional, StringView, Utf16View>; constexpr explicit GenericLexer(ViewType input) : m_input(input) { } constexpr size_t tell() const { return m_index; } constexpr size_t tell_remaining() const { return input_length() - m_index; } constexpr ViewType remaining() const { return m_input.substring_view(m_index); } constexpr ViewType input() const { return m_input; } constexpr bool is_eof() const { return m_index >= input_length(); } constexpr CharType peek(size_t offset = 0) const { return (m_index + offset < input_length()) ? 
// NOTE(review): this text arrived with its line breaks collapsed, and the contents of most `<...>` spans appear
// to be absent (bare `#include`, `template class`, `static_assert(IsOneOf)`, `static_cast(next)`). The code is
// left exactly as found and only comment lines like these are added. On every long line, whatever follows a
// `//` marker is lexed as comment, and everything after `#pragma once` sits on that directive line, so the
// original layout has to be recovered before this can build.
//
// Line above, in order:
// - is_any_of(values) and is_not_any_of(values) return a lambda that captures `values` and reports whether
//   values.contains(c) is true (respectively false) for the character it is given.
// - is_path_separator and is_quote are is_any_of() over forward slash plus backslash, and over single quote
//   plus double quote.
// - UnicodeEscapeError lists the two failures that the escape decoders further down report.
// - Detail::GenericLexer keeps a view (m_input) and a cursor (m_index). ViewType is StringView or Utf16View;
//   the condition that picks between them is not visible in this text.
//   tell() returns the cursor, tell_remaining() returns input_length() - m_index, remaining() returns the view
//   from the cursor onward, input() returns the whole view, is_eof() is m_index >= input_length().
// - peek(offset) starts on the line above and ends on the line below. It returns the code unit at
//   m_index + offset when that position is inside the input and a NUL code unit otherwise. It does not move
//   the cursor.
//
// Line below, in order:
// - peek_string(length, offset) returns an empty Optional when m_index + offset + length exceeds
//   input_length(), otherwise the view of `length` units starting at m_index + offset.
// - next_is(expected) overloads compare peek() or peek_string() with `expected`; nothing is consumed.
// - retreat() and retreat(count) move the cursor back and VERIFY that it does not pass the start.
// - consume() VERIFYs !is_eof(), then returns the current code unit and advances by one.
// - consume_specific(next) overloads advance past `next` and return true only when next_is(next) holds.
// - consume_escaped_character(escape_char, escape_map): without a leading escape_char it behaves like
//   consume(). Otherwise it consumes the following unit c, walks escape_map two entries at a time and returns
//   escape_map[i + 1] for the first i with escape_map[i] == c, falling back to c itself.
//   NOTE(review): escape_map is indexed at i + 1, so an even length appears to be assumed - confirm. An input
//   that ends right after escape_char reaches the VERIFY inside consume().
// - consume(count) advances by min(count, remaining units) and returns the view it skipped. Its return
//   statement is completed by the first tokens of the physical line after the one below.
code_unit_at(m_index + offset) : '\0'; } constexpr Optional peek_string(size_t length, size_t offset = 0) const { if (m_index + offset + length > input_length()) return {}; return m_input.substring_view(m_index + offset, length); } constexpr bool next_is(CharType expected) const { return peek() == expected; } constexpr bool next_is(char expected) const requires(IsSame) { return peek() == expected; } constexpr bool next_is(ViewType expected) const { size_t length = 0; if constexpr (IsSame) length = expected.length_in_code_units(); else length = expected.length(); return peek_string(length) == expected; } constexpr bool next_is(StringView expected) const requires(IsSame) { return peek_string(expected.length()) == expected; } constexpr void retreat() { VERIFY(m_index > 0); --m_index; } constexpr void retreat(size_t count) { VERIFY(m_index >= count); m_index -= count; } constexpr CharType consume() { VERIFY(!is_eof()); return code_unit_at(m_index++); } constexpr bool consume_specific(CharType next) { if (!next_is(next)) return false; ignore(); return true; } constexpr bool consume_specific(char next) requires(IsSame) { return consume_specific(static_cast(next)); } constexpr bool consume_specific(ViewType next) { if (!next_is(next)) return false; if constexpr (IsSame) ignore(next.length_in_code_units()); else ignore(next.length()); return true; } constexpr bool consume_specific(StringView next) requires(IsSame) { if (!next_is(next)) return false; ignore(next.length()); return true; } constexpr CharType consume_escaped_character(CharType escape_char = '\\', StringView escape_map = "n\nr\rt\tb\bf\f"sv) { if (!consume_specific(escape_char)) return consume(); auto c = consume(); for (size_t i = 0; i < escape_map.length(); i += 2) { if (c == escape_map[i]) return escape_map[i + 1]; } return c; } // Consume a number of characters constexpr ViewType consume(size_t count) { auto start = m_index; auto length = min(count, input_length() - m_index); m_index += length; return 
// Line below, in order:
// - the tail of consume(count).
// - consume_all() returns everything from the cursor to the end and moves the cursor to input_length().
// - consume_line() advances to the first carriage return or line feed (or to the end), then skips one
//   carriage return and one line feed when present. The returned view excludes those terminators.
// - consume_until(stop) for a code unit, a char, or a view: advances until `stop` is next (or the end is
//   reached) and returns the skipped view. `stop` itself is not consumed.
m_input.substring_view(start, length); } // Consume the rest of the input constexpr ViewType consume_all() { auto rest = m_input.substring_view(m_index, input_length() - m_index); m_index = input_length(); return rest; } // Consume until a new line is found constexpr ViewType consume_line() { auto start = m_index; while (!is_eof() && peek() != '\r' && peek() != '\n') m_index++; auto length = m_index - start; consume_specific('\r'); consume_specific('\n'); return m_input.substring_view(start, length); } // Consume and return characters until `stop` is peeked constexpr ViewType consume_until(CharType stop) { auto start = m_index; while (!is_eof() && peek() != stop) m_index++; auto length = m_index - start; return m_input.substring_view(start, length); } constexpr ViewType consume_until(char stop) requires(IsSame) { return consume_until(static_cast(stop)); } // Consume and return characters until the string `stop` is found constexpr ViewType consume_until(ViewType stop) { auto start = m_index; while (!is_eof() && !next_is(stop)) m_index++; auto length = m_index - start; return m_input.substring_view(start, length); } // Consume a string surrounded by single or double quotes. The returned ViewType does not include the quotes. An // escape character can be provided to capture the enclosing quotes. Please note that the escape character will // still be in the resulting ViewType. 
// Line below, in order:
// - consume_quoted_string(escape_char): returns a default-constructed view unless is_quote matches the next
//   unit. It consumes that opening quote, then scans forward; a unit equal to escape_char makes the scan skip
//   one extra unit, and a unit equal to the opening quote ends the scan. When the unit at the cursor is then
//   not the opening quote character, the cursor is reset to start - 1 (the opening quote) and a
//   default-constructed view is returned. Otherwise the closing quote is skipped and the text between the
//   quotes is returned.
//   NOTE(review): escape_char defaults to 0, so with the default a NUL unit inside the input is treated as
//   the escape unit - confirm that this is intended.
// - consume_decimal_integer(): the rollback guard sets m_index back to its entry value (presumably when the
//   scope exits) on every path that does not call rollback.disarm(). One optional plus or minus sign is
//   accepted, then consume_while(is_ascii_digit). No digits gives Error::from_errno(EINVAL); a failed
//   to_number() gives Error::from_errno(ERANGE). The range checks continue on the physical line after the
//   one below.
constexpr ViewType consume_quoted_string(CharType escape_char = 0) { if (!next_is(is_quote)) return {}; auto quote_char = consume(); auto start = m_index; while (!is_eof()) { if (next_is(escape_char)) m_index++; else if (next_is(quote_char)) break; m_index++; } auto length = m_index - start; if (peek() != quote_char) { // Restore the index in case the string is unterminated m_index = start - 1; return {}; } // Ignore closing quote ignore(); return m_input.substring_view(start, length); } template ErrorOr consume_decimal_integer() { using UnsignedT = MakeUnsigned; ArmedScopeGuard rollback { [&, rollback_position = m_index]() { m_index = rollback_position; } }; bool has_minus_sign = false; if (next_is('+') || next_is('-')) if (consume() == '-') has_minus_sign = true; auto number_view = consume_while(is_ascii_digit); if (number_view.is_empty()) return Error::from_errno(EINVAL); auto maybe_number = number_view.template to_number(TrimWhitespace::No); if (!maybe_number.has_value()) return Error::from_errno(ERANGE); auto number = maybe_number.value(); if (!has_minus_sign) { if (NumericLimits::max() < number) // This is only possible in a signed case. 
// Line below, in order:
// - the rest of consume_decimal_integer(): without a minus sign the value must not exceed
//   NumericLimits::max(). With a minus sign, the IsUnsigned branch accepts only zero, and the other branch
//   accepts magnitudes up to max() + 1 and returns the negated value. Each rejection is
//   Error::from_errno(ERANGE); each success disarms the rollback first.
// - consume_escaped_code_point(combine_surrogate_pairs): requires a backslash followed by u, otherwise it
//   returns MalformedUnicodeEscape. An opening brace next selects decode_code_point(); anything else selects
//   decode_single_or_paired_surrogate(combine_surrogate_pairs).
// - ignore(count) advances by min(count, remaining units); ignore_until(stop) advances until `stop` is next
//   or the end is reached.
return Error::from_errno(ERANGE); rollback.disarm(); return number; } if constexpr (IsUnsigned) { if (number != 0) return Error::from_errno(ERANGE); rollback.disarm(); return 0; } else { static constexpr UnsignedT max_value = static_cast(NumericLimits::max()) + 1; if (number > max_value) return Error::from_errno(ERANGE); rollback.disarm(); return -number; } } Result consume_escaped_code_point(bool combine_surrogate_pairs = true) { if (!consume_specific("\\u"sv)) return UnicodeEscapeError::MalformedUnicodeEscape; if (next_is('{')) return decode_code_point(); return decode_single_or_paired_surrogate(combine_surrogate_pairs); } constexpr void ignore(size_t count = 1) { count = min(count, input_length() - m_index); m_index += count; } constexpr void ignore_until(CharType stop) { while (!is_eof() && peek() != stop) ++m_index; } constexpr void ignore_until(char stop) requires(IsSame) { return ignore_until(static_cast(stop)); } // Conditions are used to match arbitrary characters. You can use lambdas, ctype functions, or is_any_of() and its // derivatives (see below). 
// Line below, in order (all of it follows a `//` marker in this collapsed text):
// - next_is(pred) returns pred(peek()).
// - consume_while(pred) and consume_until(pred) advance while pred holds, respectively until pred holds, and
//   return the skipped view.
// - consume_specific_with_predicate(pred) skips one unit and returns true when not at the end and pred holds
//   for the next unit; otherwise it returns false and leaves the cursor alone.
// - ignore_while(pred) and ignore_until(pred) are the same scans without a returned view.
// - decode_code_point(): VERIFYs an opening brace, then folds hex digits into a u32 four bits at a time until
//   a closing brace is consumed. A non-hex unit gives MalformedUnicodeEscape; a folded value smaller than the
//   previous one, or a final value rejected by is_unicode(), gives UnicodeEscapeOverflow.
// - the opening of decode_single_or_paired_surrogate(), which continues on the physical line after that.
// // A few examples: // - `if (lexer.next_is(isdigit))` // - `auto name = lexer.consume_while([](char c) { return isalnum(c) || c == '_'; });` // - `lexer.ignore_until(is_any_of("<^>"));` // Test the next character against a Condition template constexpr bool next_is(TPredicate pred) const { return pred(peek()); } // Consume and return characters while `pred` returns true template constexpr ViewType consume_while(TPredicate pred) { auto start = m_index; while (!is_eof() && pred(peek())) ++m_index; auto length = m_index - start; return m_input.substring_view(start, length); } // Consume and return characters until `pred` return true template constexpr ViewType consume_until(TPredicate pred) { auto start = m_index; while (!is_eof() && !pred(peek())) ++m_index; auto length = m_index - start; return m_input.substring_view(start, length); } template constexpr bool consume_specific_with_predicate(TPredicate pred) { if (is_eof() || !pred(peek())) return false; ignore(); return true; } // Ignore characters while `pred` returns true template constexpr void ignore_while(TPredicate pred) { while (!is_eof() && pred(peek())) ++m_index; } // Ignore characters until `pred` returns true template constexpr void ignore_until(TPredicate pred) { while (!is_eof() && !pred(peek())) ++m_index; } protected: Result decode_code_point() { bool starts_with_open_bracket = consume_specific('{'); VERIFY(starts_with_open_bracket); u32 code_point = 0; while (true) { if (!next_is(is_ascii_hex_digit)) return UnicodeEscapeError::MalformedUnicodeEscape; auto new_code_point = (code_point << 4u) | parse_ascii_hex_digit(consume()); if (new_code_point < code_point) return UnicodeEscapeError::UnicodeEscapeOverflow; code_point = new_code_point; if (consume_specific('}')) break; } if (is_unicode(code_point)) return code_point; return UnicodeEscapeError::UnicodeEscapeOverflow; } Result decode_single_or_paired_surrogate(bool combine_surrogate_pairs = true) { constexpr size_t surrogate_length = 4; auto 
// Line below, in order:
// - decode_one_surrogate (a local lambda) reads exactly surrogate_length (4) hex digits into a u16 and yields
//   an empty Optional at the first non-hex unit.
// - the rest of decode_single_or_paired_surrogate(): a failed first read gives MalformedUnicodeEscape. A value
//   that is not a UTF-16 high surrogate is returned as is, and so is a high surrogate when combining is off or
//   no backslash-u follows. A failed second read gives MalformedUnicodeEscape. A low surrogate is combined
//   with the high one; any other second value is un-read with retreat(6) - two units of backslash-u plus four
//   hex digits - and the high surrogate is returned alone.
// - input_length() and code_unit_at(index) call either the code-unit accessors or length() and operator[] on
//   m_input, chosen by an `if constexpr` whose condition is not visible in this text.
// - LineTrackingLexer: Position holds offset, line and column. The two-argument constructor records offset 0
//   with line index 0, and the offset just past the first line feed (or input.length() when there is none)
//   with line index 1; that second offset also becomes m_largest_known_line_start_position. The one-argument
//   constructor delegates with the position { 0, 1, 1 }. position_for() is only declared here;
//   current_position() is position_for(m_index).
decode_one_surrogate = [&]() -> Optional { u16 surrogate = 0; for (size_t i = 0; i < surrogate_length; ++i) { if (!next_is(is_ascii_hex_digit)) return {}; surrogate = (surrogate << 4u) | parse_ascii_hex_digit(consume()); } return surrogate; }; auto high_surrogate = decode_one_surrogate(); if (!high_surrogate.has_value()) return UnicodeEscapeError::MalformedUnicodeEscape; if (!UnicodeUtils::is_utf16_high_surrogate(*high_surrogate)) return *high_surrogate; if (!combine_surrogate_pairs || !consume_specific("\\u"sv)) return *high_surrogate; auto low_surrogate = decode_one_surrogate(); if (!low_surrogate.has_value()) return UnicodeEscapeError::MalformedUnicodeEscape; if (UnicodeUtils::is_utf16_low_surrogate(*low_surrogate)) return UnicodeUtils::decode_utf16_surrogate_pair(*high_surrogate, *low_surrogate); retreat(6); return *high_surrogate; } constexpr size_t input_length() const { if constexpr (IsSame) return m_input.length_in_code_units(); else return m_input.length(); } constexpr CharType code_unit_at(size_t index) const { if constexpr (IsSame) return m_input.code_unit_at(index); else return m_input[index]; } ViewType m_input; size_t m_index { 0 }; }; } class LineTrackingLexer : public GenericLexer { public: struct Position { size_t offset { 0 }; size_t line { 0 }; size_t column { 0 }; }; LineTrackingLexer(StringView input, Position start_position) : GenericLexer(input) , m_first_line_start_position(start_position) , m_line_start_positions(make>()) { m_line_start_positions->insert(0, 0); auto first_newline = input.find('\n').map([](auto x) { return x + 1; }).value_or(input.length()); m_line_start_positions->insert(first_newline, 1); m_largest_known_line_start_position = first_newline; } LineTrackingLexer(StringView input) : LineTrackingLexer(input, { 0, 1, 1 }) { } Position position_for(size_t) const; Position current_position() const { return position_for(m_index); } protected: Position m_first_line_start_position; mutable NonnullOwnPtr> m_line_start_positions; // 
// Line below: the words `offset -> line index` read as the continuation of the comment that ends the line
// above (they describe m_line_start_positions), but in this collapsed text they sit on their own line as
// code. They are followed by m_largest_known_line_start_position, the closing braces, and the
// USING_AK_GLOBALLY block of using-declarations for GenericLexer, is_any_of, is_path_separator, is_quote and
// LineTrackingLexer.
offset -> line index mutable size_t m_largest_known_line_start_position { 0 }; }; } #if USING_AK_GLOBALLY using AK::GenericLexer; using AK::is_any_of; using AK::is_path_separator; using AK::is_quote; using AK::LineTrackingLexer; #endif