mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-04-22 04:25:13 +00:00
AK: Invalidate UTF-8 encoded code points larger than U+10ffff
On oss-fuzz, the LibJS REPL was given a file encoded in Windows-1252 with the following contents: /ô¡°½/ The REPL assumes the input file is UTF-8. In Windows-1252, the above is represented as the bytes [0x2f 0xf4 0xa1 0xb0 0xbd 0x2f]. The inner 4 bytes look like a valid UTF-8 sequence if we only inspect the most significant bits to classify leading/continuation bytes. However, they decode to U+121c3d, which is not a valid code point because it is larger than U+10ffff. This commit adds additional validation to ensure the decoded code point itself is also valid.
This commit is contained in:
parent
119873b822
commit
9e5abec6f1
Notes:
sideshowbarker
2024-07-17 14:25:51 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/9e5abec6f1 Pull-request: https://github.com/SerenityOS/serenity/pull/13468 Reviewed-by: https://github.com/davidot Reviewed-by: https://github.com/linusg
2 changed files with 20 additions and 3 deletions
|
@ -6,6 +6,7 @@
|
|||
*/
|
||||
|
||||
#include <AK/Assertions.h>
|
||||
#include <AK/CharacterTypes.h>
|
||||
#include <AK/Format.h>
|
||||
#include <AK/Utf8View.h>
|
||||
|
||||
|
@ -100,9 +101,9 @@ bool Utf8View::validate(size_t& valid_bytes) const
|
|||
{
|
||||
valid_bytes = 0;
|
||||
for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) {
|
||||
size_t code_point_length_in_bytes;
|
||||
u32 value;
|
||||
bool first_byte_makes_sense = decode_first_byte(*ptr, code_point_length_in_bytes, value);
|
||||
size_t code_point_length_in_bytes = 0;
|
||||
u32 code_point = 0;
|
||||
bool first_byte_makes_sense = decode_first_byte(*ptr, code_point_length_in_bytes, code_point);
|
||||
if (!first_byte_makes_sense)
|
||||
return false;
|
||||
|
||||
|
@ -112,8 +113,14 @@ bool Utf8View::validate(size_t& valid_bytes) const
|
|||
return false;
|
||||
if (*ptr >> 6 != 2)
|
||||
return false;
|
||||
|
||||
code_point <<= 6;
|
||||
code_point |= *ptr & 63;
|
||||
}
|
||||
|
||||
if (!is_unicode(code_point))
|
||||
return false;
|
||||
|
||||
valid_bytes += code_point_length_in_bytes;
|
||||
}
|
||||
|
||||
|
|
|
@ -70,6 +70,16 @@ TEST_CASE(validate_invalid_ut8)
|
|||
Utf8View utf8_4 { StringView { invalid_utf8_4 } };
|
||||
EXPECT(!utf8_4.validate(valid_bytes));
|
||||
EXPECT(valid_bytes == 0);
|
||||
|
||||
char invalid_utf8_5[] = { (char)0xf4, (char)0x8f, (char)0xbf, (char)0xc0, 0 }; // U+110000
|
||||
Utf8View utf8_5 { StringView { invalid_utf8_5 } };
|
||||
EXPECT(!utf8_5.validate(valid_bytes));
|
||||
EXPECT(valid_bytes == 0);
|
||||
|
||||
char invalid_utf8_6[] = { (char)0xf4, (char)0xa1, (char)0xb0, (char)0xbd, 0 }; // U+121c3d
|
||||
Utf8View utf8_6 { StringView { invalid_utf8_6 } };
|
||||
EXPECT(!utf8_6.validate(valid_bytes));
|
||||
EXPECT(valid_bytes == 0);
|
||||
}
|
||||
|
||||
TEST_CASE(iterate_utf8)
|
||||
|
|
Loading…
Add table
Reference in a new issue