JSSpecCompiler: Add functions for splitting node contents into tokens

This commit is contained in:
Dan Klishch 2023-08-17 22:29:05 -04:00 committed by Andrew Kaster
parent 8342361481
commit 9f29e04897
Notes: sideshowbarker 2024-07-17 00:57:24 +09:00
6 changed files with 378 additions and 0 deletions

View file

@ -0,0 +1,157 @@
/*
* Copyright (c) 2023, Dan Klishch <danilklishch@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/GenericLexer.h>
#include <AK/NonnullOwnPtr.h>
#include "Parser/Lexer.h"
#include "Parser/XMLUtils.h"
namespace JSSpecCompiler {
namespace {
// Tries to read a number at the lexer's current position.
//
// Accepted shape: an optional leading '-', one or more ASCII digits and,
// optionally, a '.' that is followed by at least one more ASCII digit.
// On success the lexer is left right after the number and a Number token
// spanning the consumed text is returned. If no digit follows the optional
// sign, the lexer is rewound to where it started and an empty Optional is
// returned.
Optional<Token> consume_number(GenericLexer& lexer, XML::Node const* node)
{
    u64 initial_position = lexer.tell();

    // Optional sign.
    if (lexer.next_is('-'))
        lexer.consume(1);

    // At least one digit is mandatory; otherwise undo the sign (if any) and bail out.
    if (!lexer.next_is(is_ascii_digit)) {
        lexer.retreat(lexer.tell() - initial_position);
        return {};
    }
    lexer.consume_while(is_ascii_digit);

    // Optional fractional part. A '.' that is not followed by a digit does not
    // belong to the number, so it is handed back to the lexer.
    if (lexer.next_is('.')) {
        lexer.consume(1);
        auto fraction_digits = lexer.consume_while(is_ascii_digit);
        if (fraction_digits.length() == 0)
            lexer.retreat(1);
    }

    // Rewind and consume the whole number in one go to obtain a view of its text.
    auto token_length = lexer.tell() - initial_position;
    lexer.retreat(token_length);
    auto token_text = lexer.consume(token_length);
    return { Token { TokenType::Number, token_text, node } };
}
// Returns true for characters that terminate a Word token:
// ASCII whitespace, '.' and ','.
bool can_end_word_token(char c)
{
    if (is_ascii_space(c))
        return true;
    return c == '.' || c == ',';
}
}
// Splits the text in `view` into tokens and appends them to `tokens`.
//
// Every produced token records `node` as its originating XML node. On each
// iteration leading ASCII whitespace is skipped, then the following is tried
// in order: a number, the fixed literals listed below (first match wins, so
// longer literals must precede their prefixes), and finally a Word that runs
// until the next whitespace, '.' or ','.
//
// No code path in this function produces an error; it always returns success.
ParseErrorOr<void> tokenize_string(XML::Node const* node, StringView view, Vector<Token>& tokens)
{
// If the remaining input starts with `literal`, consumes it, appends a token of
// the given type and restarts the main loop. `literal` must never be empty: an
// empty literal matches everywhere without consuming input and would make the
// loop spin forever.
#define CONSUME_IF_NEXT(literal, type)                                   \
    if (lexer.next_is(literal##sv)) {                                    \
        size_t length = __builtin_strlen(literal);                       \
        tokens.append({ TokenType::type, lexer.consume(length), node }); \
        continue;                                                        \
    }

    GenericLexer lexer(view);
    while (!lexer.is_eof()) {
        lexer.ignore_while(is_ascii_space);

        if (auto result = consume_number(lexer, node); result.has_value()) {
            tokens.append(result.release_value());
            continue;
        }

        CONSUME_IF_NEXT("(", ParenOpen);
        CONSUME_IF_NEXT(")", ParenClose);
        CONSUME_IF_NEXT("{", BraceOpen);
        CONSUME_IF_NEXT("}", BraceClose);
        CONSUME_IF_NEXT(",", Comma);
        // A period followed by a space or a newline is a Dot (the whitespace
        // character becomes part of the token text); any other period is a
        // MemberAccess.
        // NOTE(review): a period that is the very last character of `view`
        // therefore becomes MemberAccess — confirm this is intended.
        CONSUME_IF_NEXT(". ", Dot);
        CONSUME_IF_NEXT(".\n", Dot);
        CONSUME_IF_NEXT(":", Colon);
        CONSUME_IF_NEXT(".", MemberAccess);
        CONSUME_IF_NEXT("<", Less);
        CONSUME_IF_NEXT(">", Greater);
        // "is not equal to" and "is equal to" must be tried before the bare "is".
        CONSUME_IF_NEXT("is not equal to", NotEquals);
        // U+2260 NOT EQUAL TO (three bytes in UTF-8).
        CONSUME_IF_NEXT("≠", NotEquals);
        CONSUME_IF_NEXT("is equal to", Equals);
        CONSUME_IF_NEXT("=", Equals);
        CONSUME_IF_NEXT("+", Plus);
        CONSUME_IF_NEXT("-", AmbiguousMinus);
        CONSUME_IF_NEXT("×", Multiplication);
        CONSUME_IF_NEXT("/", Division);
        CONSUME_IF_NEXT("!", ExclamationMark);
        // NOTE(review): this is a plain prefix match, so a word that merely
        // starts with "is" is split into an Is token plus the rest — confirm
        // whether a word-boundary check is wanted here.
        CONSUME_IF_NEXT("is", Is);

        // Anything else is a free-form word.
        StringView word = lexer.consume_until(can_end_word_token);
        if (word.length())
            tokens.append({ TokenType::Word, word, node });
    }
    return {};

#undef CONSUME_IF_NEXT
}
// Tokenizes the children of the element `node`.
//
// Children are processed in document order:
//   - <var>                  -> Identifier token with the element's text
//   - <span class="secnum">  -> SectionNumber token (any other class is an error)
//   - <emu-val>              -> String token (surrounding double quotes stripped),
//                               Undefined token for the text "undefined",
//                               Identifier token otherwise
//   - <emu-xref>             -> Identifier token with the text of its only <a> child
//   - <ol>                   -> remembered as the substeps list; an error unless
//                               `allow_substeps` is set
//   - any other element      -> error
//   - text                   -> tokenized with tokenize_string()
//   - comments               -> ignored
//
// Once a substeps list has been seen, any further element or any text that is
// not whitespace-only is an error.
ParseErrorOr<TokenizeTreeResult> tokenize_tree(XML::Node const* node, bool allow_substeps)
{
    TokenizeTreeResult result;
    auto& tokens = result.tokens;

    for (auto const& child : node->as_element().children) {
        TRY(child->content.visit(
            [&](XML::Node::Element const& element) -> ParseErrorOr<void> {
                // Nothing but blank text or comments may follow the substeps list.
                if (result.substeps != nullptr)
                    return ParseError::create("Substeps list must be the last non-empty child"sv, child);

                if (element.name == tag_var) {
                    auto variable_name = TRY(get_text_contents(child));
                    tokens.append({ TokenType::Identifier, variable_name, child });
                    return {};
                }

                if (element.name == tag_span) {
                    auto element_class = TRY(get_attribute_by_name(child, attribute_class));
                    if (element_class != class_secnum)
                        return ParseError::create(String::formatted("Expected 'secnum' as a class name of <span>, but found '{}'", element_class), child);
                    auto section_number = TRY(get_text_contents(child));
                    tokens.append({ TokenType::SectionNumber, section_number, child });
                    return {};
                }

                if (element.name == tag_emu_val) {
                    auto value_text = TRY(get_text_contents(child));
                    bool is_quoted = value_text.length() >= 2 && value_text.starts_with('"') && value_text.ends_with('"');
                    if (is_quoted) {
                        // Drop the surrounding quotes.
                        tokens.append({ TokenType::String, value_text.substring_view(1, value_text.length() - 2), child });
                    } else if (value_text == "undefined") {
                        tokens.append({ TokenType::Undefined, value_text, child });
                    } else {
                        tokens.append({ TokenType::Identifier, value_text, child });
                    }
                    return {};
                }

                if (element.name == tag_emu_xref) {
                    auto link = TRY(get_only_child(child, "a"sv));
                    auto link_text = TRY(get_text_contents(link));
                    tokens.append({ TokenType::Identifier, link_text, child });
                    return {};
                }

                if (element.name == tag_ol) {
                    if (!allow_substeps)
                        return ParseError::create("Found nested list but substeps are not allowed"sv, child);
                    result.substeps = child;
                    return {};
                }

                return ParseError::create(String::formatted("Unexpected child element with tag {}", element.name), child);
            },
            [&](XML::Node::Text const& text) -> ParseErrorOr<void> {
                // Only whitespace-only text is tolerated after the substeps list.
                if (result.substeps && !contains_empty_text(child))
                    return ParseError::create("Substeps list must be the last non-empty child"sv, child);

                auto text_view = text.builder.string_view();
                return tokenize_string(child, text_view, tokens);
            },
            move(ignore_comments)));
    }

    return result;
}
}

View file

@ -0,0 +1,40 @@
/*
* Copyright (c) 2023, Dan Klishch <danilklishch@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include "Parser/ParseError.h"
#include "Parser/Token.h"
namespace JSSpecCompiler {
// Names of XML elements.
inline constexpr StringView tag_emu_alg = "emu-alg"sv;
inline constexpr StringView tag_emu_clause = "emu-clause"sv;
inline constexpr StringView tag_emu_val = "emu-val"sv;
inline constexpr StringView tag_emu_xref = "emu-xref"sv;
inline constexpr StringView tag_h1 = "h1"sv;
inline constexpr StringView tag_li = "li"sv;
inline constexpr StringView tag_ol = "ol"sv;
inline constexpr StringView tag_p = "p"sv;
inline constexpr StringView tag_span = "span"sv;
inline constexpr StringView tag_var = "var"sv;
// Names of XML attributes.
inline constexpr StringView attribute_aoid = "aoid"sv;
inline constexpr StringView attribute_class = "class"sv;
inline constexpr StringView attribute_id = "id"sv;
// Value of the "class" attribute that tokenize_tree() requires on <span> elements.
inline constexpr StringView class_secnum = "secnum"sv;
// Splits the text in `view` into tokens and appends them to `tokens`.
// Every appended token records `node` as the XML node it came from.
ParseErrorOr<void> tokenize_string(XML::Node const* node, StringView view, Vector<Token>& tokens);
// Result of tokenize_tree().
struct TokenizeTreeResult {
// Tokens collected from the children of the tokenized node, in document order.
Vector<Token> tokens;
// The nested <ol> child, if one was encountered; nullptr otherwise.
XML::Node const* substeps = nullptr;
};
// Tokenizes the children of the element `node`. A nested <ol> child is
// reported through TokenizeTreeResult::substeps when `allow_substeps` is true
// and is an error otherwise.
ParseErrorOr<TokenizeTreeResult> tokenize_tree(XML::Node const* node, bool allow_substeps = false);
}

View file

@ -0,0 +1,51 @@
/*
* Copyright (c) 2023, Dan Klishch <danilklishch@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "Parser/ParseError.h"
namespace JSSpecCompiler {
// Creates a ref-counted ParseError that takes ownership of `message` and
// refers to `node`.
NonnullRefPtr<ParseError> ParseError::create(String message, XML::Node const* node)
{
    auto error = make_ref_counted<ParseError>(move(message), node);
    return error;
}
// Convenience overload: copies `message` into a String first. The conversion
// is wrapped in MUST, so a failure there is treated as fatal rather than
// reported to the caller.
NonnullRefPtr<ParseError> ParseError::create(StringView message, XML::Node const* node)
{
    auto owned_message = MUST(String::from_utf8(message));
    return create(move(owned_message), node);
}
// Convenience overload that lets callers pass the result of String::formatted()
// directly. The ErrorOr is unwrapped with MUST, so a formatting failure is
// treated as fatal.
// FIXME: Remove once String::formatted becomes infallible.
NonnullRefPtr<ParseError> ParseError::create(ErrorOr<String> message, XML::Node const* node)
{
    auto unwrapped_message = MUST(message);
    return create(move(unwrapped_message), node);
}
// Renders the error as human-readable text.
//
// The first line is "error: <message>". It is followed by one line per node,
// starting at the node the error refers to and walking up through its parents.
// Each of those lines shows the 1-based line:column of the node and either the
// opening tag with its attributes, the whitespace-trimmed text, or the word
// "comment".
String ParseError::to_string() const
{
    StringBuilder builder;
    builder.appendff("error: {}\n", m_message);

    for (XML::Node const* current = m_node; current != nullptr; current = current->parent) {
        // Offsets are stored 0-based; print them 1-based.
        builder.appendff(" at {}:{} ", current->offset.line + 1, current->offset.column + 1);

        if (current->is_element()) {
            auto const& element = current->as_element();
            builder.append("<"sv);
            builder.append(element.name);
            // Bind by reference to avoid copying every attribute.
            for (auto const& [key, value] : element.attributes)
                builder.appendff(" {}=\"{}\"", key, value);
            builder.append(">\n"sv);
        } else if (current->is_text()) {
            builder.appendff("text \"{}\"\n", current->as_text().builder.string_view().trim_whitespace());
        } else {
            // Terminate the line like the other branches do, so that the
            // parent's entry starts on a line of its own.
            builder.append("comment\n"sv);
        }
    }

    return MUST(builder.to_string());
}
}

View file

@ -0,0 +1,37 @@
/*
* Copyright (c) 2023, Dan Klishch <danilklishch@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/String.h>
#include <LibXML/DOM/Node.h>
namespace JSSpecCompiler {
// Ref-counted parse error: a message plus the XML node the error refers to.
// Instances are normally obtained through the create() factories.
class ParseError : public RefCounted<ParseError> {
public:
// Takes ownership of `message` and stores the `node` pointer as given.
ParseError(String&& message, XML::Node const* node)
: m_message(move(message))
, m_node(node)
{
}
// Factories accepting the message as a String, a StringView, or the
// ErrorOr<String> produced by String::formatted().
static NonnullRefPtr<ParseError> create(String message, XML::Node const* node);
static NonnullRefPtr<ParseError> create(StringView message, XML::Node const* node);
static NonnullRefPtr<ParseError> create(ErrorOr<String> message, XML::Node const* node);
// Renders the message followed by the location of the node and of each of its parents.
String to_string() const;
private:
String m_message;
// Raw pointer to the offending node.
// NOTE(review): presumably the XML document must outlive this error — confirm.
XML::Node const* m_node;
// TODO: Support chained parse errors
};
// ErrorOr whose error alternative is a NonnullRefPtr<ParseError>.
template<typename T>
using ParseErrorOr = ErrorOr<T, NonnullRefPtr<ParseError>>;
}

View file

@ -0,0 +1,64 @@
/*
* Copyright (c) 2023, Dan Klishch <danilklishch@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/NonnullOwnPtr.h>
#include <LibXML/DOM/Node.h>
#include "Parser/XMLUtils.h"
namespace JSSpecCompiler {
// Returns true if the text node `node` is empty or consists solely of
// whitespace. `node` is accessed through as_text(), so it has to be a text node.
bool contains_empty_text(XML::Node const* node)
{
    auto text = node->as_text().builder.string_view();
    auto trimmed_text = text.trim_whitespace();
    return trimmed_text.is_empty();
}
// Looks up the attribute `attribute_name` on the element `node` and returns
// its value, or a ParseError pointing at `node` if the attribute is absent.
ParseErrorOr<StringView> get_attribute_by_name(XML::Node const* node, StringView attribute_name)
{
    auto const& attributes = node->as_element().attributes;
    auto const& found_value = attributes.get(attribute_name);
    if (found_value.has_value())
        return found_value.value();
    return ParseError::create(String::formatted("Attribute {} is not present", attribute_name), node);
}
// Returns the text of the element `node`, which must have exactly one child
// and that child must be a text node; anything else yields a ParseError.
ParseErrorOr<StringView> get_text_contents(XML::Node const* node)
{
    auto const& child_nodes = node->as_element().children;
    bool has_single_text_child = child_nodes.size() == 1 && child_nodes[0]->is_text();
    if (!has_single_text_child)
        return ParseError::create("Expected single text node in a child list of the node"sv, node);
    return child_nodes[0]->as_text().builder.string_view();
}
// Returns the single element child of `element`.
//
// That child must have the tag `tag_name`. Whitespace-only text nodes and
// comments among the children are skipped. A ParseError is returned if an
// element child has a different tag, if there is more than one element child,
// if there is a text child that is not whitespace-only, or if there is no
// element child at all.
ParseErrorOr<XML::Node const*> get_only_child(XML::Node const* element, StringView tag_name)
{
    XML::Node const* only_child = nullptr;

    for (auto const& child : element->as_element().children) {
        TRY(child->content.visit(
            [&](XML::Node::Element const& child_element) -> ParseErrorOr<void> {
                if (child_element.name != tag_name)
                    return ParseError::create(String::formatted("Expected child with the tag name {} but found {}", tag_name, child_element.name), child);
                if (only_child != nullptr)
                    return ParseError::create("Element must have only one child"sv, child);
                only_child = child;
                return {};
            },
            [&](XML::Node::Text const&) -> ParseErrorOr<void> {
                if (contains_empty_text(child))
                    return {};
                return ParseError::create("Element should not have non-empty child text nodes"sv, element);
            },
            move(ignore_comments)));
    }

    if (only_child == nullptr)
        return ParseError::create("Element must have only one child"sv, element);
    return only_child;
}
}

View file

@ -0,0 +1,29 @@
/*
* Copyright (c) 2023, Dan Klishch <danilklishch@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <LibXML/Forward.h>
#include "Parser/ParseError.h"
namespace JSSpecCompiler {
// Visitor alternative for XML comment nodes: accepts the comment and reports
// success without doing anything.
struct IgnoreComments {
ParseErrorOr<void> operator()(XML::Node::Comment const&) { return {}; }
};
// Shared instance; callers pass it as `move(ignore_comments)` as the last
// argument to `content.visit(...)`.
inline constexpr IgnoreComments ignore_comments {};
// Returns true if the text node `node` is empty or whitespace-only.
bool contains_empty_text(XML::Node const* node);
// Returns the value of attribute `attribute_name` of the element `node`, or an
// error if the attribute is not present.
ParseErrorOr<StringView> get_attribute_by_name(XML::Node const* node, StringView attribute_name);
// Returns the text of the element `node`, which must have exactly one child
// that is a text node.
ParseErrorOr<StringView> get_text_contents(XML::Node const* node);
// Returns the single element child of `element`, which must have the tag
// `tag_name`; whitespace-only text and comments are skipped.
ParseErrorOr<XML::Node const*> get_only_child(XML::Node const* element, StringView tag_name);
}