diff --git a/Libraries/LibJS/Lexer.cpp b/Libraries/LibJS/Lexer.cpp index d1dc500d6b1..681b86747ba 100644 --- a/Libraries/LibJS/Lexer.cpp +++ b/Libraries/LibJS/Lexer.cpp @@ -192,9 +192,9 @@ static constexpr TokenType parse_three_char_token(Utf16View const& view) } } -static consteval Array make_single_char_tokens_array() +static consteval AK::Array make_single_char_tokens_array() { - Array array; + AK::Array array; array.fill(TokenType::Invalid); array['&'] = TokenType::Ampersand; array['*'] = TokenType::Asterisk; @@ -225,33 +225,9 @@ static consteval Array make_single_char_tokens_array() static constexpr auto s_single_char_tokens = make_single_char_tokens_array(); -static Utf16String create_utf16_string_from_possibly_invalid_utf8_string(StringView source) -{ - Utf8View utf8_source { source }; - if (utf8_source.validate()) [[likely]] - return Utf16String::from_utf8_without_validation(source); - - StringBuilder builder(StringBuilder::Mode::UTF16); - - for (auto code_point : utf8_source) { - builder.append_code_point(code_point); - if (code_point == AK::UnicodeUtils::REPLACEMENT_CODE_POINT) - break; - } - - return builder.to_utf16_string(); -} - -Lexer::Lexer(StringView source, StringView filename, size_t line_number, size_t line_column) - : Lexer(create_utf16_string_from_possibly_invalid_utf8_string(source), filename, line_number, line_column) -{ - // FIXME: Remove this API once all callers are ported to UTF-16. -} - -Lexer::Lexer(Utf16String source, StringView filename, size_t line_number, size_t line_column) +Lexer::Lexer(NonnullRefPtr source, size_t line_number, size_t line_column) : m_source(move(source)) , m_current_token(TokenType::Eof, {}, {}, {}, 0, 0, 0) - , m_filename(String::from_utf8(filename).release_value_but_fixme_should_propagate_errors()) , m_line_number(line_number) , m_line_column(line_column) { @@ -304,16 +280,16 @@ Lexer::Lexer(Utf16String source, StringView filename, size_t line_number, size_t void Lexer::consume() { auto did_reach_eof = [this] { - if (m_position < m_source.length_in_code_units()) + if (m_position < m_source->code().length_in_code_units()) return false; m_eof = true; m_current_code_unit = '\0'; - m_position = m_source.length_in_code_units() + 1; + m_position = m_source->code().length_in_code_units() + 1; m_line_column++; return true; }; - if (m_position > m_source.length_in_code_units()) + if (m_position > m_source->code().length_in_code_units()) return; if (did_reach_eof()) @@ -339,7 +315,7 @@ void Lexer::consume() // and column - don't do it again. From https://tc39.es/ecma262/#sec-line-terminators: // The sequence is commonly used as a line terminator. // It should be considered a single SourceCharacter for the purpose of reporting line numbers. - auto second_char_of_crlf = m_position > 1 && m_source.code_unit_at(m_position - 2) == '\r' && m_current_code_unit == '\n'; + auto second_char_of_crlf = m_position > 1 && m_source->code().code_unit_at(m_position - 2) == '\r' && m_current_code_unit == '\n'; if (!second_char_of_crlf) { m_line_number++; @@ -349,8 +325,8 @@ void Lexer::consume() dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again."); } } else { - if (AK::UnicodeUtils::is_utf16_high_surrogate(m_current_code_unit) && m_position < m_source.length_in_code_units()) { - if (AK::UnicodeUtils::is_utf16_low_surrogate(m_source.code_unit_at(m_position))) { + if (AK::UnicodeUtils::is_utf16_high_surrogate(m_current_code_unit) && m_position < m_source->code().length_in_code_units()) { + if (AK::UnicodeUtils::is_utf16_low_surrogate(m_source->code().code_unit_at(m_position))) { ++m_position; if (did_reach_eof()) @@ -361,7 +337,7 @@ void Lexer::consume() ++m_line_column; } - m_current_code_unit = m_source.code_unit_at(m_position++); + m_current_code_unit = m_source->code().code_unit_at(m_position++); } bool Lexer::consume_decimal_number() @@ -436,40 +412,40 @@ bool Lexer::consume_binary_number() template bool Lexer::match_numeric_literal_separator_followed_by(Callback callback) const { - if (m_position >= m_source.length_in_code_units()) + if (m_position >= m_source->code().length_in_code_units()) return false; return m_current_code_unit == '_' - && callback(m_source.code_unit_at(m_position)); + && callback(m_source->code().code_unit_at(m_position)); } bool Lexer::match(char16_t a, char16_t b) const { - if (m_position >= m_source.length_in_code_units()) + if (m_position >= m_source->code().length_in_code_units()) return false; return m_current_code_unit == a - && m_source.code_unit_at(m_position) == b; + && m_source->code().code_unit_at(m_position) == b; } bool Lexer::match(char16_t a, char16_t b, char16_t c) const { - if (m_position + 1 >= m_source.length_in_code_units()) + if (m_position + 1 >= m_source->code().length_in_code_units()) return false; return m_current_code_unit == a - && m_source.code_unit_at(m_position) == b - && m_source.code_unit_at(m_position + 1) == c; + && m_source->code().code_unit_at(m_position) == b + && m_source->code().code_unit_at(m_position + 1) == c; } bool Lexer::match(char16_t a, char16_t b, char16_t c, char16_t d) const { - if (m_position + 2 >= m_source.length_in_code_units()) + if (m_position + 2 >= m_source->code().length_in_code_units()) return false; return m_current_code_unit == a - && m_source.code_unit_at(m_position) == b - && m_source.code_unit_at(m_position + 1) == c - && m_source.code_unit_at(m_position + 2) == d; + && m_source->code().code_unit_at(m_position) == b + && m_source->code().code_unit_at(m_position + 1) == c + && m_source->code().code_unit_at(m_position + 2) == d; } bool Lexer::is_eof() const @@ -493,7 +469,7 @@ ALWAYS_INLINE u32 Lexer::current_code_point() const if (m_position == 0) return AK::UnicodeUtils::REPLACEMENT_CODE_POINT; - auto substring = m_source.substring_view(m_position - 1); + auto substring = m_source->code().substring_view(m_position - 1); if (substring.is_empty()) return AK::UnicodeUtils::REPLACEMENT_CODE_POINT; @@ -615,7 +591,7 @@ bool Lexer::is_block_comment_end() const bool Lexer::is_numeric_literal_start() const { - return is_ascii_digit(m_current_code_unit) || (m_current_code_unit == '.' && m_position < m_source.length_in_code_units() && is_ascii_digit(m_source.code_unit_at(m_position))); + return is_ascii_digit(m_current_code_unit) || (m_current_code_unit == '.' && m_position < m_source->code().length_in_code_units() && is_ascii_digit(m_source->code().code_unit_at(m_position))); } bool Lexer::slash_means_division() const @@ -861,7 +837,7 @@ Token const& Lexer::next() while (m_current_code_unit != stop_char && m_current_code_unit != '\r' && m_current_code_unit != '\n' && !is_eof()) { if (m_current_code_unit == '\\') { consume(); - if (m_current_code_unit == '\r' && m_position < m_source.length_in_code_units() && m_source.code_unit_at(m_position) == '\n') { + if (m_current_code_unit == '\r' && m_position < m_source->code().length_in_code_units() && m_source->code().code_unit_at(m_position) == '\n') { consume(); } } @@ -896,8 +872,8 @@ Token const& Lexer::next() consume(); } - if (!found_token && m_position + 1 < m_source.length_in_code_units()) { - auto three_chars_view = m_source.substring_view(m_position - 1, 3); + if (!found_token && m_position + 1 < m_source->code().length_in_code_units()) { + auto three_chars_view = m_source->code().substring_view(m_position - 1, 3); if (auto type = parse_three_char_token(three_chars_view); type != TokenType::Invalid) { found_token = true; token_type = type; @@ -907,11 +883,11 @@ Token const& Lexer::next() } } - if (!found_token && m_position < m_source.length_in_code_units()) { - auto two_chars_view = m_source.substring_view(m_position - 1, 2); + if (!found_token && m_position < m_source->code().length_in_code_units()) { + auto two_chars_view = m_source->code().substring_view(m_position - 1, 2); if (auto type = parse_two_char_token(two_chars_view); type != TokenType::Invalid) { // OptionalChainingPunctuator :: ?. [lookahead ∉ DecimalDigit] - if (!(type == TokenType::QuestionMarkPeriod && m_position + 1 < m_source.length_in_code_units() && is_ascii_digit(m_source.code_unit_at(m_position + 1)))) { + if (!(type == TokenType::QuestionMarkPeriod && m_position + 1 < m_source->code().length_in_code_units() && is_ascii_digit(m_source->code().code_unit_at(m_position + 1)))) { found_token = true; token_type = type; consume(); @@ -945,8 +921,8 @@ Token const& Lexer::next() m_current_token = Token( token_type, token_message, - m_source.substring_view(trivia_start - 1, value_start - trivia_start), - m_source.substring_view(value_start - 1, m_position - value_start), + m_source->code().substring_view(trivia_start - 1, value_start - trivia_start), + m_source->code().substring_view(value_start - 1, m_position - value_start), value_start_line_number, value_start_column_number, value_start - 1); @@ -976,7 +952,7 @@ Token const& Lexer::force_slash_as_regex() size_t value_start = m_position - 1; if (has_equals) { - VERIFY(m_source.code_unit_at(value_start - 1) == '='); + VERIFY(m_source->code().code_unit_at(value_start - 1) == '='); --value_start; --m_position; m_current_code_unit = '='; @@ -988,7 +964,7 @@ Token const& Lexer::force_slash_as_regex() token_type, Token::Message::None, m_current_token.trivia(), - m_source.substring_view(value_start - 1, m_position - value_start), + m_source->code().substring_view(value_start - 1, m_position - value_start), m_current_token.line_number(), m_current_token.line_column(), value_start - 1); diff --git a/Libraries/LibJS/Lexer.h b/Libraries/LibJS/Lexer.h index 4973733d155..fd5487ddb72 100644 --- a/Libraries/LibJS/Lexer.h +++ b/Libraries/LibJS/Lexer.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2020, Stephan Unverwerth + * Copyright (c) 2020-2025, Andreas Kling * * SPDX-License-Identifier: BSD-2-Clause */ @@ -7,17 +8,16 @@ #pragma once #include -#include #include #include +#include #include namespace JS { class JS_API Lexer { public: - explicit Lexer(StringView source, StringView filename = "(unknown)"sv, size_t line_number = 1, size_t line_column = 0); - explicit Lexer(Utf16String source, StringView filename = "(unknown)"sv, size_t line_number = 1, size_t line_column = 0); + explicit Lexer(NonnullRefPtr, size_t line_number = 1, size_t line_column = 0); // These both advance the lexer and return a reference to the current token. Token const& next(); @@ -25,8 +25,9 @@ public: [[nodiscard]] Token const& current_token() const { return m_current_token; } - Utf16String const& source() const { return m_source; } - String const& filename() const { return m_filename; } + SourceCode const& source_code() const { return m_source; } + Utf16String const& source() const { return m_source->code(); } + String const& filename() const { return m_source->filename(); } void disallow_html_comments() { m_allow_html_comments = false; } @@ -59,7 +60,7 @@ private: TokenType consume_regex_literal(); - Utf16String m_source; + NonnullRefPtr m_source; size_t m_position { 0 }; Token m_current_token; char16_t m_current_code_unit { 0 }; @@ -67,7 +68,6 @@ private: bool m_regex_is_in_character_class { false }; bool m_allow_html_comments { true }; - String m_filename; size_t m_line_number { 1 }; size_t m_line_column { 0 }; diff --git a/Libraries/LibJS/Parser.cpp b/Libraries/LibJS/Parser.cpp index f6dfa21263c..6c577de1b16 100644 --- a/Libraries/LibJS/Parser.cpp +++ b/Libraries/LibJS/Parser.cpp @@ -688,7 +688,7 @@ Parser::ParserState::ParserState(Lexer l, Program::Type program_type) } Parser::Parser(Lexer lexer, Program::Type program_type, Optional initial_state_for_eval) - : m_source_code(SourceCode::create(lexer.filename(), lexer.source())) + : m_source_code(lexer.source_code()) , m_state(move(lexer), program_type) , m_program_type(program_type) { @@ -2596,7 +2596,7 @@ RefPtr Parser::synthesize_binding_pattern(Expression const auto source_end_offset = expression.source_range().end.offset; auto source = m_state.lexer.source().substring_view(source_start_offset, source_end_offset - source_start_offset); - Lexer lexer { Utf16String::from_utf16(source), m_state.lexer.filename(), expression.source_range().start.line, expression.source_range().start.column }; + Lexer lexer(SourceCode::create(m_state.lexer.filename(), Utf16String::from_utf16(source)), expression.source_range().start.line, expression.source_range().start.column); Parser parser { lexer }; parser.m_state.current_scope_pusher = m_state.current_scope_pusher; @@ -5233,7 +5233,7 @@ Parser Parser::parse_function_body_from_string(ByteString const& body_string, u1 { RefPtr function_body; - auto body_parser = Parser { Lexer { body_string } }; + auto body_parser = Parser(Lexer(SourceCode::create({}, Utf16String::from_utf8(body_string)))); { // Set up some parser state to accept things like return await, and yield in the plain function body. body_parser.m_state.in_function_context = true; diff --git a/Libraries/LibJS/Runtime/AbstractOperations.cpp b/Libraries/LibJS/Runtime/AbstractOperations.cpp index 2d81305ca1e..df4c451955a 100644 --- a/Libraries/LibJS/Runtime/AbstractOperations.cpp +++ b/Libraries/LibJS/Runtime/AbstractOperations.cpp @@ -623,7 +623,7 @@ ThrowCompletionOr perform_eval(VM& vm, Value x, CallerMode strict_caller, .in_class_field_initializer = in_class_field_initializer, }; - Parser parser { Lexer { code_string->utf8_string_view() }, Program::Type::Script, move(initial_state) }; + Parser parser(Lexer(SourceCode::create({}, code_string->utf16_string())), Program::Type::Script, move(initial_state)); auto program = parser.parse_program(strict_caller == CallerMode::Strict); // b. If script is a List of errors, throw a SyntaxError exception. diff --git a/Libraries/LibJS/Runtime/FunctionConstructor.cpp b/Libraries/LibJS/Runtime/FunctionConstructor.cpp index cc15696cb47..cb2ee58d361 100644 --- a/Libraries/LibJS/Runtime/FunctionConstructor.cpp +++ b/Libraries/LibJS/Runtime/FunctionConstructor.cpp @@ -156,7 +156,7 @@ ThrowCompletionOr> FunctionConstructor::create // 17. Let parameters be ParseText(P, parameterSym). i32 function_length = 0; - auto parameters_parser = Parser { Lexer { parameters_string } }; + auto parameters_parser = Parser(Lexer(SourceCode::create({}, Utf16String::from_utf8(parameters_string)))); auto parameters = parameters_parser.parse_formal_parameters(function_length, parse_options); // 18. If parameters is a List of errors, throw a SyntaxError exception. @@ -179,7 +179,7 @@ ThrowCompletionOr> FunctionConstructor::create // 22. NOTE: If this step is reached, sourceText must have the syntax of exprSym (although the reverse implication does not hold). The purpose of the next two steps is to enforce any Early Error rules which apply to exprSym directly. // 23. Let expr be ParseText(sourceText, exprSym). - auto source_parser = Parser { Lexer { source_text } }; + auto source_parser = Parser(Lexer(SourceCode::create({}, Utf16String::from_utf8(source_text)))); // This doesn't need any parse_options, it determines those & the function type based on the tokens that were found. auto expr = source_parser.parse_function_node(); diff --git a/Libraries/LibJS/Runtime/ShadowRealm.cpp b/Libraries/LibJS/Runtime/ShadowRealm.cpp index f7ce36bed24..f016236d4b1 100644 --- a/Libraries/LibJS/Runtime/ShadowRealm.cpp +++ b/Libraries/LibJS/Runtime/ShadowRealm.cpp @@ -123,7 +123,7 @@ ThrowCompletionOr perform_shadow_realm_eval(VM& vm, Value source, Realm& // 2. Perform the following substeps in an implementation-defined order, possibly interleaving parsing and error detection: // a. Let script be ParseText(StringToCodePoints(sourceText), Script). - auto parser = Parser(Lexer(source_text->utf8_string_view()), Program::Type::Script, Parser::EvalInitialState {}); + auto parser = Parser(Lexer(SourceCode::create({}, source_text->utf16_string())), Program::Type::Script, Parser::EvalInitialState {}); auto program = parser.parse_program(); // b. If script is a List of errors, throw a SyntaxError exception. diff --git a/Libraries/LibJS/Script.cpp b/Libraries/LibJS/Script.cpp index 99a80d43528..858f4b36c03 100644 --- a/Libraries/LibJS/Script.cpp +++ b/Libraries/LibJS/Script.cpp @@ -18,7 +18,7 @@ GC_DEFINE_ALLOCATOR(Script); Result, Vector> Script::parse(StringView source_text, Realm& realm, StringView filename, HostDefined* host_defined, size_t line_number_offset) { // 1. Let script be ParseText(sourceText, Script). - auto parser = Parser(Lexer(source_text, filename, line_number_offset)); + auto parser = Parser(Lexer(SourceCode::create(String::from_utf8(filename).release_value_but_fixme_should_propagate_errors(), Utf16String::from_utf8(source_text)), line_number_offset)); auto script = parser.parse_program(); // 2. If script is a List of errors, return body. diff --git a/Libraries/LibJS/SourceTextModule.cpp b/Libraries/LibJS/SourceTextModule.cpp index e0452b0ed48..ae8a76810e0 100644 --- a/Libraries/LibJS/SourceTextModule.cpp +++ b/Libraries/LibJS/SourceTextModule.cpp @@ -132,7 +132,7 @@ void SourceTextModule::visit_edges(Cell::Visitor& visitor) Result, Vector> SourceTextModule::parse(StringView source_text, Realm& realm, StringView filename, Script::HostDefined* host_defined) { // 1. Let body be ParseText(sourceText, Module). - auto parser = Parser(Lexer(source_text, filename), Program::Type::Module); + auto parser = Parser(Lexer(SourceCode::create(String::from_utf8(filename).release_value_but_fixme_should_propagate_errors(), Utf16String::from_utf8(source_text))), Program::Type::Module); auto body = parser.parse_program(); // 2. If body is a List of errors, return body. diff --git a/Libraries/LibJS/SyntaxHighlighter.cpp b/Libraries/LibJS/SyntaxHighlighter.cpp index 60ccb9872c1..24c496a68d8 100644 --- a/Libraries/LibJS/SyntaxHighlighter.cpp +++ b/Libraries/LibJS/SyntaxHighlighter.cpp @@ -52,7 +52,7 @@ void SyntaxHighlighter::rehighlight(Palette const& palette) { auto text = m_client->get_text(); - Lexer lexer(text); + Lexer lexer(SourceCode::create({}, Utf16String::from_utf8(text))); Vector spans; Vector folding_regions; diff --git a/Libraries/LibWeb/DOM/EventTarget.cpp b/Libraries/LibWeb/DOM/EventTarget.cpp index b951a180508..036f2f6dd06 100644 --- a/Libraries/LibWeb/DOM/EventTarget.cpp +++ b/Libraries/LibWeb/DOM/EventTarget.cpp @@ -444,7 +444,7 @@ WebIDL::CallbackType* EventTarget::get_current_value_of_event_handler(FlyString auto source_text = builder.to_byte_string(); - auto parser = JS::Parser(JS::Lexer(source_text)); + auto parser = JS::Parser(JS::Lexer(JS::SourceCode::create({}, Utf16String::from_utf8(source_text)))); // FIXME: This should only be parsing the `body` instead of `source_text` and therefore use `JS::FunctionBody` instead of `JS::FunctionExpression`. // However, JS::ECMAScriptFunctionObject::create wants parameters and length and JS::FunctionBody does not inherit JS::FunctionNode. diff --git a/Libraries/LibWeb/WebDriver/ExecuteScript.cpp b/Libraries/LibWeb/WebDriver/ExecuteScript.cpp index edac5a08b40..5a90e5d4b9f 100644 --- a/Libraries/LibWeb/WebDriver/ExecuteScript.cpp +++ b/Libraries/LibWeb/WebDriver/ExecuteScript.cpp @@ -55,7 +55,8 @@ static JS::ThrowCompletionOr execute_a_function_body(HTML::BrowsingCo }})~~~", body); - auto parser = JS::Parser { JS::Lexer { source_text } }; + auto parser = JS::Parser(JS::Lexer(JS::SourceCode::create({}, Utf16String::from_utf8(source_text)))); + ; auto function_expression = parser.parse_function_node(); // 4. If body is not parsable as a FunctionBody or if parsing detects an early error, return Completion { [[Type]]: normal, [[Value]]: null, [[Target]]: empty }. diff --git a/Tests/LibJS/CMakeLists.txt b/Tests/LibJS/CMakeLists.txt index ed9cfe322b1..f8c04081800 100644 --- a/Tests/LibJS/CMakeLists.txt +++ b/Tests/LibJS/CMakeLists.txt @@ -1,4 +1,3 @@ -ladybird_test(test-invalid-unicode-js.cpp LibJS LIBS LibJS LibUnicode) ladybird_test(test-value-js.cpp LibJS LIBS LibJS LibUnicode) ladybird_testjs_test(test-js.cpp test-js LIBS LibGC) diff --git a/Tests/LibJS/test-invalid-unicode-js.cpp b/Tests/LibJS/test-invalid-unicode-js.cpp deleted file mode 100644 index ac66f98dc18..00000000000 --- a/Tests/LibJS/test-invalid-unicode-js.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2021, David Tuin - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include -#include - -static bool produces_eof_tokens(JS::Lexer& lexer) -{ - for (auto i = 0; i < 10; i++) { - auto eof_token = lexer.next(); - if (eof_token.type() != JS::TokenType::Eof) - return false; - } - return true; -} - -static bool triggers_immediate_unicode_fault(StringView code) -{ - auto lexer = JS::Lexer(code); - auto first_token = lexer.next(); - - if (first_token.type() != JS::TokenType::Invalid) - return false; - - return produces_eof_tokens(lexer); -} -// In the not leading character it must start with 0b10xxxxxx -// Thus all these options are invalid: -// \x0y = 0000 y (or \x1y, \x2y and \x3y) -// \x4y = 0100 y (or \x5y, \x6y and \x7y) -// \xCy = 1100 y (or \xDy, \xEy and \xFy) -// And the only valid option is: -// \x8y = 1000 y (or \x9y, \xAy - -TEST_CASE(no_input_only_gives_eof) -{ - auto code = ""sv; - auto lexer = JS::Lexer(code); - EXPECT(produces_eof_tokens(lexer)); -} - -TEST_CASE(invalid_start_code_point) -{ - EXPECT(triggers_immediate_unicode_fault("\x80"sv)); - EXPECT(triggers_immediate_unicode_fault("\x90"sv)); - EXPECT(triggers_immediate_unicode_fault("\xA0"sv)); - EXPECT(triggers_immediate_unicode_fault("\xB0"sv)); - EXPECT(triggers_immediate_unicode_fault("\xF8"sv)); - EXPECT(triggers_immediate_unicode_fault("\xFF"sv)); -} - -TEST_CASE(code_points_of_length_2) -{ - // Initial 110xxxxx -> \xCy or \xDy - EXPECT(triggers_immediate_unicode_fault("\xC5"sv)); - EXPECT(triggers_immediate_unicode_fault("\xC5\x02"sv)); - EXPECT(triggers_immediate_unicode_fault("\xC5\x52"sv)); - EXPECT(triggers_immediate_unicode_fault("\xC5\xD2"sv)); - - EXPECT(triggers_immediate_unicode_fault("\xD5"sv)); - EXPECT(triggers_immediate_unicode_fault("\xD5\x23"sv)); - EXPECT(triggers_immediate_unicode_fault("\xD5\x74"sv)); - EXPECT(triggers_immediate_unicode_fault("\xD5\xF5"sv)); -} - -TEST_CASE(code_points_of_length_3) -{ - // Initial 1110xxxx -> \xEy - EXPECT(triggers_immediate_unicode_fault("\xE5"sv)); - EXPECT(triggers_immediate_unicode_fault("\xE5\x02"sv)); - EXPECT(triggers_immediate_unicode_fault("\xE5\x52"sv)); - EXPECT(triggers_immediate_unicode_fault("\xE5\xD2"sv)); - - EXPECT(triggers_immediate_unicode_fault("\xEA\x80"sv)); - EXPECT(triggers_immediate_unicode_fault("\xEA\x81\x07"sv)); - EXPECT(triggers_immediate_unicode_fault("\xEA\x82\x57"sv)); - EXPECT(triggers_immediate_unicode_fault("\xEA\x83\xD7"sv)); -} - -TEST_CASE(code_points_of_length_4) -{ - // Initial 11110xxx -> \xF{0..7} - EXPECT(triggers_immediate_unicode_fault("\xF0"sv)); - EXPECT(triggers_immediate_unicode_fault("\xF1\x02"sv)); - EXPECT(triggers_immediate_unicode_fault("\xF2\x52"sv)); - EXPECT(triggers_immediate_unicode_fault("\xF3\xD2"sv)); - - EXPECT(triggers_immediate_unicode_fault("\xF4\x80"sv)); - EXPECT(triggers_immediate_unicode_fault("\xF5\x81\x07"sv)); - EXPECT(triggers_immediate_unicode_fault("\xF6\x82\x57"sv)); - EXPECT(triggers_immediate_unicode_fault("\xF7\x83\xD7"sv)); - - EXPECT(triggers_immediate_unicode_fault("\xF4\x80\x80"sv)); - EXPECT(triggers_immediate_unicode_fault("\xF5\x91\x80\x07"sv)); - EXPECT(triggers_immediate_unicode_fault("\xF6\xA2\x80\x57"sv)); - EXPECT(triggers_immediate_unicode_fault("\xF7\xB3\x80\xD7"sv)); -} - -TEST_CASE(gives_valid_part_until_fault) -{ - auto code = "abc\xF5\x81\x80\x07; abc\xF5\x81\x80\x07 += 4"sv; - JS::Lexer lexer(code); - auto first_token = lexer.next(); - EXPECT_EQ(first_token.type(), JS::TokenType::Identifier); - EXPECT_EQ(first_token.value(), "abc"sv); - auto second_token = lexer.next(); - EXPECT_EQ(second_token.type(), JS::TokenType::Invalid); - EXPECT(produces_eof_tokens(lexer)); -} - -TEST_CASE(gives_fully_parsed_tokens_even_if_invalid_unicode_follows) -{ - auto code = "let \xE5\xD2"sv; - JS::Lexer lexer(code); - auto first_token = lexer.next(); - EXPECT_EQ(first_token.type(), JS::TokenType::Let); - auto second_token = lexer.next(); - EXPECT_EQ(second_token.type(), JS::TokenType::Invalid); - EXPECT(produces_eof_tokens(lexer)); -} - -TEST_CASE(invalid_unicode_and_valid_code) -{ - EXPECT(triggers_immediate_unicode_fault("\xEA\xFDthrow 1;"sv)); -} - -TEST_CASE(long_invalid_unicode_and_valid_code) -{ - EXPECT(triggers_immediate_unicode_fault("\xF7throw 1;"sv)); -} - -TEST_CASE(invalid_unicode_after_valid_code_and_before_eof) -{ - auto code = "let \xEA\xFD;"sv; - auto lexer = JS::Lexer(code); - auto let_token = lexer.next(); - EXPECT_EQ(let_token.type(), JS::TokenType::Let); - auto invalid_token = lexer.next(); - EXPECT_EQ(invalid_token.type(), JS::TokenType::Invalid); - EXPECT(produces_eof_tokens(lexer)); -} diff --git a/Tests/LibJS/test-js.cpp b/Tests/LibJS/test-js.cpp index b8015b022ba..020103194a8 100644 --- a/Tests/LibJS/test-js.cpp +++ b/Tests/LibJS/test-js.cpp @@ -19,8 +19,8 @@ TESTJS_PROGRAM_FLAG(test262_parser_tests, "Run test262 parser tests", "test262-p TESTJS_GLOBAL_FUNCTION(can_parse_source, canParseSource) { - auto source = TRY(vm.argument(0).to_string(vm)); - auto parser = JS::Parser(JS::Lexer(source)); + auto source = TRY(vm.argument(0).to_utf16_string(vm)); + auto parser = JS::Parser(JS::Lexer(JS::SourceCode::create({}, source))); (void)parser.parse_program(); return JS::Value(!parser.has_errors()); } diff --git a/Utilities/js.cpp b/Utilities/js.cpp index 69606f1199a..49bbae29879 100644 --- a/Utilities/js.cpp +++ b/Utilities/js.cpp @@ -513,7 +513,7 @@ static ErrorOr read_next_piece() piece.append(line); piece.append('\n'); - auto lexer = JS::Lexer(line); + auto lexer = JS::Lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line))); enum { NotInLabelOrObjectKey, @@ -622,7 +622,7 @@ static ErrorOr run_repl(bool gc_on_every_allocation, bool syntax_highlight) size_t open_indents = s_repl_line_level; auto line = editor.line(); - JS::Lexer lexer(line); + JS::Lexer lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line))); bool indenters_starting_line = true; for (JS::Token token = lexer.next(); token.type() != JS::TokenType::Eof; token = lexer.next()) { auto length = token.value().length_in_code_units(); @@ -678,7 +678,7 @@ static ErrorOr run_repl(bool gc_on_every_allocation, bool syntax_highlight) auto complete = [&realm, &global_environment](Line::Editor const& editor) -> Vector { auto line = editor.line(editor.cursor()); - JS::Lexer lexer { line }; + JS::Lexer lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line))); enum { Initial, CompleteVariable, diff --git a/Utilities/test262-runner.cpp b/Utilities/test262-runner.cpp index 3df3c194a50..f9ffe7f4260 100644 --- a/Utilities/test262-runner.cpp +++ b/Utilities/test262-runner.cpp @@ -194,7 +194,7 @@ static ErrorOr run_test(StringView source, StringView filepath, // We can also skip if we know the test is supposed to fail during parse // time. Unfortunately the phases of modules are not as clear and thus we // only do this for scripts. See also the comment at the end of verify_test. - auto parser = JS::Parser(JS::Lexer(source, filepath), metadata.program_type); + auto parser = JS::Parser(JS::Lexer(JS::SourceCode::create(String::from_utf8(filepath).release_value_but_fixme_should_propagate_errors(), Utf16String::from_utf8(source))), metadata.program_type); auto program_or_error = parser.parse_program(); if (parser.has_errors()) { return TestError {