Everywhere: Replace ctype.h to avoid narrowing conversions

This replaces ctype.h with AK/CharacterTypes.h everywhere I could find issues with narrowing conversions. While using it will probably make sense almost everywhere in the future, the most critical places should have been addressed.
Author: https://github.com/MaxWipfli Commit: bc8d16ad28 Pull-request: https://github.com/SerenityOS/serenity/pull/7684 Reviewed-by: https://github.com/awesomekling Reviewed-by: https://github.com/bgianfo ✅
2025-07-02 23:21:56 +00:00 · 2021-06-01 21:18:08 +02:00 · 2021-06-01 21:18:08 +02:00 · bc8d16ad28 · 2024-07-18 16:57:29 +09:00
commit bc8d16ad28
parent 1c9d87c455
16 changed files with 153 additions and 266 deletions
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
@ -4,13 +4,13 @@
 * SPDX-License-Identifier: BSD-2-Clause
 */

+#include <AK/CharacterTypes.h>
 #include <AK/Debug.h>
 #include <AK/SourceLocation.h>
 #include <LibTextCodec/Decoder.h>
 #include <LibWeb/HTML/Parser/Entities.h>
 #include <LibWeb/HTML/Parser/HTMLToken.h>
 #include <LibWeb/HTML/Parser/HTMLTokenizer.h>
-#include <ctype.h>
 #include <string.h>

 namespace Web::HTML {
@ -93,25 +93,25 @@ namespace Web::HTML {
    if (!current_input_character.has_value())

 #define ON_ASCII_ALPHA \
-    if (current_input_character.has_value() && isalpha(current_input_character.value()))
+    if (current_input_character.has_value() && is_ascii_alpha(current_input_character.value()))

 #define ON_ASCII_ALPHANUMERIC \
-    if (current_input_character.has_value() && isalnum(current_input_character.value()))
+    if (current_input_character.has_value() && is_ascii_alphanumeric(current_input_character.value()))

 #define ON_ASCII_UPPER_ALPHA \
-    if (current_input_character.has_value() && current_input_character.value() >= 'A' && current_input_character.value() <= 'Z')
+    if (current_input_character.has_value() && is_ascii_upper_alpha(current_input_character.value()))

 #define ON_ASCII_LOWER_ALPHA \
-    if (current_input_character.has_value() && current_input_character.value() >= 'a' && current_input_character.value() <= 'z')
+    if (current_input_character.has_value() && is_ascii_lower_alpha(current_input_character.value()))

 #define ON_ASCII_DIGIT \
-    if (current_input_character.has_value() && isdigit(current_input_character.value()))
+    if (current_input_character.has_value() && is_ascii_digit(current_input_character.value()))

 #define ON_ASCII_HEX_DIGIT \
-    if (current_input_character.has_value() && isxdigit(current_input_character.value()))
+    if (current_input_character.has_value() && is_ascii_hex_digit(current_input_character.value()))

 #define ON_WHITESPACE \
-    if (current_input_character.has_value() && strchr("\t\n\f ", current_input_character.value()))
+    if (current_input_character.has_value() && is_ascii(current_input_character.value()) && "\t\n\f "sv.contains(current_input_character.value()))

 #define ANYTHING_ELSE if (1)

@ -172,26 +172,6 @@ static inline void log_parse_error(const SourceLocation& location = SourceLocati
    dbgln_if(TOKENIZER_TRACE_DEBUG, "Parse error (tokenization) {}", location);
 }

-static inline bool is_surrogate(u32 code_point)
-{
-    return (code_point & 0xfffff800) == 0xd800;
-}
-
-static inline bool is_noncharacter(u32 code_point)
-{
-    return code_point >= 0xfdd0 && (code_point <= 0xfdef || (code_point & 0xfffe) == 0xfffe) && code_point <= 0x10ffff;
-}
-
-static inline bool is_c0_control(u32 code_point)
-{
-    return code_point <= 0x1f;
-}
-
-static inline bool is_control(u32 code_point)
-{
-    return is_c0_control(code_point) || (code_point >= 0x7f && code_point <= 0x9f);
-}
-
 Optional<u32> HTMLTokenizer::next_code_point()
 {
    if (m_utf8_iterator == m_utf8_view.end())
@ -322,7 +302,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+                    m_current_token.m_tag.tag_name.append(to_ascii_lowercase(current_input_character.value()));
                    m_current_token.m_end_position = nth_last_position(0);
                    continue;
                }
@ -458,7 +438,7 @@ _StartOfFunction:
                ON_ASCII_UPPER_ALPHA
                {
                    create_new_token(HTMLToken::Type::DOCTYPE);
-                    m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
+                    m_current_token.m_doctype.name.append(to_ascii_lowercase(current_input_character.value()));
                    m_current_token.m_doctype.missing_name = false;
                    SWITCH_TO(DOCTYPEName);
                }
@ -507,7 +487,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
+                    m_current_token.m_doctype.name.append(to_ascii_lowercase(current_input_character.value()));
                    continue;
                }
                ON(0)
@ -550,10 +530,10 @@ _StartOfFunction:
                }
                ANYTHING_ELSE
                {
-                    if (toupper(current_input_character.value()) == 'P' && consume_next_if_match("UBLIC", CaseSensitivity::CaseInsensitive)) {
+                    if (to_ascii_uppercase(current_input_character.value()) == 'P' && consume_next_if_match("UBLIC", CaseSensitivity::CaseInsensitive)) {
                        SWITCH_TO(AfterDOCTYPEPublicKeyword);
                    }
-                    if (toupper(current_input_character.value()) == 'S' && consume_next_if_match("YSTEM", CaseSensitivity::CaseInsensitive)) {
+                    if (to_ascii_uppercase(current_input_character.value()) == 'S' && consume_next_if_match("YSTEM", CaseSensitivity::CaseInsensitive)) {
                        SWITCH_TO(AfterDOCTYPESystemKeyword);
                    }
                    log_parse_error();
@ -1068,7 +1048,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_tag.attributes.last().local_name_builder.append_code_point(tolower(current_input_character.value()));
+                    m_current_token.m_tag.attributes.last().local_name_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
                    continue;
                }
                ON(0)
@ -1558,7 +1538,7 @@ _StartOfFunction:

                    if (consumed_as_part_of_an_attribute() && !match.value().entity.ends_with(';')) {
                        auto next_code_point = peek_code_point(0);
-                        if (next_code_point.has_value() && (next_code_point.value() == '=' || isalnum(next_code_point.value()))) {
+                        if (next_code_point.has_value() && (next_code_point.value() == '=' || is_ascii_alphanumeric(next_code_point.value()))) {
                            FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
                            SWITCH_TO_RETURN_STATE;
                        }
@ -1720,14 +1700,14 @@ _StartOfFunction:
                    log_parse_error();
                    m_character_reference_code = 0xFFFD;
                }
-                if (is_surrogate(m_character_reference_code)) {
+                if (is_unicode_surrogate(m_character_reference_code)) {
                    log_parse_error();
                    m_character_reference_code = 0xFFFD;
                }
-                if (is_noncharacter(m_character_reference_code)) {
+                if (is_unicode_noncharacter(m_character_reference_code)) {
                    log_parse_error();
                }
-                if (m_character_reference_code == 0xd || (is_control(m_character_reference_code) && !isspace(m_character_reference_code))) {
+                if (m_character_reference_code == 0xd || (is_unicode_control(m_character_reference_code) && !is_ascii_space(m_character_reference_code))) {
                    log_parse_error();
                    constexpr struct {
                        u32 number;
@ -1870,7 +1850,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+                    m_current_token.m_tag.tag_name.append(to_ascii_lowercase(current_input_character.value()));
                    m_temporary_buffer.append(current_input_character.value());
                    continue;
                }
@ -1980,7 +1960,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+                    m_current_token.m_tag.tag_name.append(to_ascii_lowercase(current_input_character.value()));
                    m_temporary_buffer.append(current_input_character.value());
                    continue;
                }
@ -2193,7 +2173,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+                    m_current_token.m_tag.tag_name.append(to_ascii_lowercase(current_input_character.value()));
                    m_temporary_buffer.append(current_input_character.value());
                    continue;
                }
@ -2247,7 +2227,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_temporary_buffer.append(tolower(current_input_character.value()));
+                    m_temporary_buffer.append(to_ascii_lowercase(current_input_character.value()));
                    EMIT_CURRENT_CHARACTER;
                }
                ON_ASCII_LOWER_ALPHA
@ -2393,7 +2373,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_temporary_buffer.append(tolower(current_input_character.value()));
+                    m_temporary_buffer.append(to_ascii_lowercase(current_input_character.value()));
                    EMIT_CURRENT_CHARACTER;
                }
                ON_ASCII_LOWER_ALPHA
@ -2512,7 +2492,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+                    m_current_token.m_tag.tag_name.append(to_ascii_lowercase(current_input_character.value()));
                    m_temporary_buffer.append(current_input_character.value());
                    continue;
                }
@ -2598,7 +2578,7 @@ bool HTMLTokenizer::consume_next_if_match(const StringView& string, CaseSensitiv
        // FIXME: This should be more Unicode-aware.
        if (case_sensitivity == CaseSensitivity::CaseInsensitive) {
            if (code_point.value() < 0x80) {
-                if (tolower(code_point.value()) != tolower(string[i]))
+                if (to_ascii_lowercase(code_point.value()) != to_ascii_lowercase(string[i]))
                    return false;
                continue;
            }