LibJS: Have JS::Lexer take a JS::SourceCode as input

This moves the responsibility of setting up a SourceCode object to the users of JS::Lexer. This means Lexer and Parser are free to use string views into the SourceCode internally while working. It also means Lexer no longer has to think about anything other than UTF-16 (or ASCII) inputs, so the unit test for lexing various invalid UTF-8 sequences is deleted here.
Author: https://github.com/awesomekling Commit: https://github.com/LadybirdBrowser/ladybird/commit/0dacc94edd6 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/6764
2025-12-05 01:10:24 +00:00 · 2025-11-08 21:21:52 +01:00 · 2025-11-09 11:15:22 +00:00
parent 9ca25e55d7
commit 0dacc94edd
16 changed files with 59 additions and 227 deletions
--- a/Libraries/LibJS/Lexer.cpp
+++ b/Libraries/LibJS/Lexer.cpp
@@ -192,9 +192,9 @@ static constexpr TokenType parse_three_char_token(Utf16View const& view)
    }
 }

-static consteval Array<TokenType, 256> make_single_char_tokens_array()
+static consteval AK::Array<TokenType, 256> make_single_char_tokens_array()
 {
-    Array<TokenType, 256> array;
+    AK::Array<TokenType, 256> array;
    array.fill(TokenType::Invalid);
    array['&'] = TokenType::Ampersand;
    array['*'] = TokenType::Asterisk;
@@ -225,33 +225,9 @@ static consteval Array<TokenType, 256> make_single_char_tokens_array()

 static constexpr auto s_single_char_tokens = make_single_char_tokens_array();

-static Utf16String create_utf16_string_from_possibly_invalid_utf8_string(StringView source)
-{
-    Utf8View utf8_source { source };
-    if (utf8_source.validate()) [[likely]]
-        return Utf16String::from_utf8_without_validation(source);
-
-    StringBuilder builder(StringBuilder::Mode::UTF16);
-
-    for (auto code_point : utf8_source) {
-        builder.append_code_point(code_point);
-        if (code_point == AK::UnicodeUtils::REPLACEMENT_CODE_POINT)
-            break;
-    }
-
-    return builder.to_utf16_string();
-}
-
-Lexer::Lexer(StringView source, StringView filename, size_t line_number, size_t line_column)
-    : Lexer(create_utf16_string_from_possibly_invalid_utf8_string(source), filename, line_number, line_column)
-{
-    // FIXME: Remove this API once all callers are ported to UTF-16.
-}
-
-Lexer::Lexer(Utf16String source, StringView filename, size_t line_number, size_t line_column)
+Lexer::Lexer(NonnullRefPtr<SourceCode const> source, size_t line_number, size_t line_column)
    : m_source(move(source))
    , m_current_token(TokenType::Eof, {}, {}, {}, 0, 0, 0)
-    , m_filename(String::from_utf8(filename).release_value_but_fixme_should_propagate_errors())
    , m_line_number(line_number)
    , m_line_column(line_column)
 {
@@ -304,16 +280,16 @@ Lexer::Lexer(Utf16String source, StringView filename, size_t line_number, size_t
 void Lexer::consume()
 {
    auto did_reach_eof = [this] {
-        if (m_position < m_source.length_in_code_units())
+        if (m_position < m_source->code().length_in_code_units())
            return false;
        m_eof = true;
        m_current_code_unit = '\0';
-        m_position = m_source.length_in_code_units() + 1;
+        m_position = m_source->code().length_in_code_units() + 1;
        m_line_column++;
        return true;
    };

-    if (m_position > m_source.length_in_code_units())
+    if (m_position > m_source->code().length_in_code_units())
        return;

    if (did_reach_eof())
@@ -339,7 +315,7 @@ void Lexer::consume()
        // and column - don't do it again. From https://tc39.es/ecma262/#sec-line-terminators:
        //   The sequence <CR><LF> is commonly used as a line terminator.
        //   It should be considered a single SourceCharacter for the purpose of reporting line numbers.
-        auto second_char_of_crlf = m_position > 1 && m_source.code_unit_at(m_position - 2) == '\r' && m_current_code_unit == '\n';
+        auto second_char_of_crlf = m_position > 1 && m_source->code().code_unit_at(m_position - 2) == '\r' && m_current_code_unit == '\n';

        if (!second_char_of_crlf) {
            m_line_number++;
@@ -349,8 +325,8 @@ void Lexer::consume()
            dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again.");
        }
    } else {
-        if (AK::UnicodeUtils::is_utf16_high_surrogate(m_current_code_unit) && m_position < m_source.length_in_code_units()) {
-            if (AK::UnicodeUtils::is_utf16_low_surrogate(m_source.code_unit_at(m_position))) {
+        if (AK::UnicodeUtils::is_utf16_high_surrogate(m_current_code_unit) && m_position < m_source->code().length_in_code_units()) {
+            if (AK::UnicodeUtils::is_utf16_low_surrogate(m_source->code().code_unit_at(m_position))) {
                ++m_position;

                if (did_reach_eof())
@@ -361,7 +337,7 @@ void Lexer::consume()
        ++m_line_column;
    }

-    m_current_code_unit = m_source.code_unit_at(m_position++);
+    m_current_code_unit = m_source->code().code_unit_at(m_position++);
 }

 bool Lexer::consume_decimal_number()
@@ -436,40 +412,40 @@ bool Lexer::consume_binary_number()
 template<typename Callback>
 bool Lexer::match_numeric_literal_separator_followed_by(Callback callback) const
 {
-    if (m_position >= m_source.length_in_code_units())
+    if (m_position >= m_source->code().length_in_code_units())
        return false;
    return m_current_code_unit == '_'
-        && callback(m_source.code_unit_at(m_position));
+        && callback(m_source->code().code_unit_at(m_position));
 }

 bool Lexer::match(char16_t a, char16_t b) const
 {
-    if (m_position >= m_source.length_in_code_units())
+    if (m_position >= m_source->code().length_in_code_units())
        return false;

    return m_current_code_unit == a
-        && m_source.code_unit_at(m_position) == b;
+        && m_source->code().code_unit_at(m_position) == b;
 }

 bool Lexer::match(char16_t a, char16_t b, char16_t c) const
 {
-    if (m_position + 1 >= m_source.length_in_code_units())
+    if (m_position + 1 >= m_source->code().length_in_code_units())
        return false;

    return m_current_code_unit == a
-        && m_source.code_unit_at(m_position) == b
-        && m_source.code_unit_at(m_position + 1) == c;
+        && m_source->code().code_unit_at(m_position) == b
+        && m_source->code().code_unit_at(m_position + 1) == c;
 }

 bool Lexer::match(char16_t a, char16_t b, char16_t c, char16_t d) const
 {
-    if (m_position + 2 >= m_source.length_in_code_units())
+    if (m_position + 2 >= m_source->code().length_in_code_units())
        return false;

    return m_current_code_unit == a
-        && m_source.code_unit_at(m_position) == b
-        && m_source.code_unit_at(m_position + 1) == c
-        && m_source.code_unit_at(m_position + 2) == d;
+        && m_source->code().code_unit_at(m_position) == b
+        && m_source->code().code_unit_at(m_position + 1) == c
+        && m_source->code().code_unit_at(m_position + 2) == d;
 }

 bool Lexer::is_eof() const
@@ -493,7 +469,7 @@ ALWAYS_INLINE u32 Lexer::current_code_point() const
    if (m_position == 0)
        return AK::UnicodeUtils::REPLACEMENT_CODE_POINT;

-    auto substring = m_source.substring_view(m_position - 1);
+    auto substring = m_source->code().substring_view(m_position - 1);
    if (substring.is_empty())
        return AK::UnicodeUtils::REPLACEMENT_CODE_POINT;

@@ -615,7 +591,7 @@ bool Lexer::is_block_comment_end() const

 bool Lexer::is_numeric_literal_start() const
 {
-    return is_ascii_digit(m_current_code_unit) || (m_current_code_unit == '.' && m_position < m_source.length_in_code_units() && is_ascii_digit(m_source.code_unit_at(m_position)));
+    return is_ascii_digit(m_current_code_unit) || (m_current_code_unit == '.' && m_position < m_source->code().length_in_code_units() && is_ascii_digit(m_source->code().code_unit_at(m_position)));
 }

 bool Lexer::slash_means_division() const
@@ -861,7 +837,7 @@ Token const& Lexer::next()
        while (m_current_code_unit != stop_char && m_current_code_unit != '\r' && m_current_code_unit != '\n' && !is_eof()) {
            if (m_current_code_unit == '\\') {
                consume();
-                if (m_current_code_unit == '\r' && m_position < m_source.length_in_code_units() && m_source.code_unit_at(m_position) == '\n') {
+                if (m_current_code_unit == '\r' && m_position < m_source->code().length_in_code_units() && m_source->code().code_unit_at(m_position) == '\n') {
                    consume();
                }
            }
@@ -896,8 +872,8 @@ Token const& Lexer::next()
            consume();
        }

-        if (!found_token && m_position + 1 < m_source.length_in_code_units()) {
-            auto three_chars_view = m_source.substring_view(m_position - 1, 3);
+        if (!found_token && m_position + 1 < m_source->code().length_in_code_units()) {
+            auto three_chars_view = m_source->code().substring_view(m_position - 1, 3);
            if (auto type = parse_three_char_token(three_chars_view); type != TokenType::Invalid) {
                found_token = true;
                token_type = type;
@@ -907,11 +883,11 @@ Token const& Lexer::next()
            }
        }

-        if (!found_token && m_position < m_source.length_in_code_units()) {
-            auto two_chars_view = m_source.substring_view(m_position - 1, 2);
+        if (!found_token && m_position < m_source->code().length_in_code_units()) {
+            auto two_chars_view = m_source->code().substring_view(m_position - 1, 2);
            if (auto type = parse_two_char_token(two_chars_view); type != TokenType::Invalid) {
                // OptionalChainingPunctuator :: ?. [lookahead ∉ DecimalDigit]
-                if (!(type == TokenType::QuestionMarkPeriod && m_position + 1 < m_source.length_in_code_units() && is_ascii_digit(m_source.code_unit_at(m_position + 1)))) {
+                if (!(type == TokenType::QuestionMarkPeriod && m_position + 1 < m_source->code().length_in_code_units() && is_ascii_digit(m_source->code().code_unit_at(m_position + 1)))) {
                    found_token = true;
                    token_type = type;
                    consume();
@@ -945,8 +921,8 @@ Token const& Lexer::next()
    m_current_token = Token(
        token_type,
        token_message,
-        m_source.substring_view(trivia_start - 1, value_start - trivia_start),
-        m_source.substring_view(value_start - 1, m_position - value_start),
+        m_source->code().substring_view(trivia_start - 1, value_start - trivia_start),
+        m_source->code().substring_view(value_start - 1, m_position - value_start),
        value_start_line_number,
        value_start_column_number,
        value_start - 1);
@@ -976,7 +952,7 @@ Token const& Lexer::force_slash_as_regex()
    size_t value_start = m_position - 1;

    if (has_equals) {
-        VERIFY(m_source.code_unit_at(value_start - 1) == '=');
+        VERIFY(m_source->code().code_unit_at(value_start - 1) == '=');
        --value_start;
        --m_position;
        m_current_code_unit = '=';
@@ -988,7 +964,7 @@ Token const& Lexer::force_slash_as_regex()
        token_type,
        Token::Message::None,
        m_current_token.trivia(),
-        m_source.substring_view(value_start - 1, m_position - value_start),
+        m_source->code().substring_view(value_start - 1, m_position - value_start),
        m_current_token.line_number(),
        m_current_token.line_column(),
        value_start - 1);
--- a/Libraries/LibJS/Lexer.h
+++ b/Libraries/LibJS/Lexer.h
@@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@serenityos.org>
+ * Copyright (c) 2020-2025, Andreas Kling <andreas@ladybird.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
@@ -7,17 +8,16 @@
 #pragma once

 #include <AK/HashMap.h>
-#include <AK/StringView.h>
 #include <AK/Utf16String.h>
 #include <LibJS/Export.h>
+#include <LibJS/SourceCode.h>
 #include <LibJS/Token.h>

 namespace JS {

 class JS_API Lexer {
 public:
-    explicit Lexer(StringView source, StringView filename = "(unknown)"sv, size_t line_number = 1, size_t line_column = 0);
-    explicit Lexer(Utf16String source, StringView filename = "(unknown)"sv, size_t line_number = 1, size_t line_column = 0);
+    explicit Lexer(NonnullRefPtr<SourceCode const>, size_t line_number = 1, size_t line_column = 0);

    // These both advance the lexer and return a reference to the current token.
    Token const& next();
@@ -25,8 +25,9 @@ public:

    [[nodiscard]] Token const& current_token() const { return m_current_token; }

-    Utf16String const& source() const { return m_source; }
-    String const& filename() const { return m_filename; }
+    SourceCode const& source_code() const { return m_source; }
+    Utf16String const& source() const { return m_source->code(); }
+    String const& filename() const { return m_source->filename(); }

    void disallow_html_comments() { m_allow_html_comments = false; }

@@ -59,7 +60,7 @@ private:

    TokenType consume_regex_literal();

-    Utf16String m_source;
+    NonnullRefPtr<SourceCode const> m_source;
    size_t m_position { 0 };
    Token m_current_token;
    char16_t m_current_code_unit { 0 };
@@ -67,7 +68,6 @@ private:
    bool m_regex_is_in_character_class { false };
    bool m_allow_html_comments { true };

-    String m_filename;
    size_t m_line_number { 1 };
    size_t m_line_column { 0 };

--- a/Libraries/LibJS/Parser.cpp
+++ b/Libraries/LibJS/Parser.cpp
@@ -688,7 +688,7 @@ Parser::ParserState::ParserState(Lexer l, Program::Type program_type)
 }

 Parser::Parser(Lexer lexer, Program::Type program_type, Optional<EvalInitialState> initial_state_for_eval)
-    : m_source_code(SourceCode::create(lexer.filename(), lexer.source()))
+    : m_source_code(lexer.source_code())
    , m_state(move(lexer), program_type)
    , m_program_type(program_type)
 {
@@ -2596,7 +2596,7 @@ RefPtr<BindingPattern const> Parser::synthesize_binding_pattern(Expression const
    auto source_end_offset = expression.source_range().end.offset;
    auto source = m_state.lexer.source().substring_view(source_start_offset, source_end_offset - source_start_offset);

-    Lexer lexer { Utf16String::from_utf16(source), m_state.lexer.filename(), expression.source_range().start.line, expression.source_range().start.column };
+    Lexer lexer(SourceCode::create(m_state.lexer.filename(), Utf16String::from_utf16(source)), expression.source_range().start.line, expression.source_range().start.column);
    Parser parser { lexer };

    parser.m_state.current_scope_pusher = m_state.current_scope_pusher;
@@ -5233,7 +5233,7 @@ Parser Parser::parse_function_body_from_string(ByteString const& body_string, u1
 {
    RefPtr<FunctionBody const> function_body;

-    auto body_parser = Parser { Lexer { body_string } };
+    auto body_parser = Parser(Lexer(SourceCode::create({}, Utf16String::from_utf8(body_string))));
    {
        // Set up some parser state to accept things like return await, and yield in the plain function body.
        body_parser.m_state.in_function_context = true;
--- a/Libraries/LibJS/Runtime/AbstractOperations.cpp
+++ b/Libraries/LibJS/Runtime/AbstractOperations.cpp
@@ -623,7 +623,7 @@ ThrowCompletionOr<Value> perform_eval(VM& vm, Value x, CallerMode strict_caller,
        .in_class_field_initializer = in_class_field_initializer,
    };

-    Parser parser { Lexer { code_string->utf8_string_view() }, Program::Type::Script, move(initial_state) };
+    Parser parser(Lexer(SourceCode::create({}, code_string->utf16_string())), Program::Type::Script, move(initial_state));
    auto program = parser.parse_program(strict_caller == CallerMode::Strict);

    //     b. If script is a List of errors, throw a SyntaxError exception.
--- a/Libraries/LibJS/Runtime/FunctionConstructor.cpp
+++ b/Libraries/LibJS/Runtime/FunctionConstructor.cpp
@@ -156,7 +156,7 @@ ThrowCompletionOr<GC::Ref<ECMAScriptFunctionObject>> FunctionConstructor::create

    // 17. Let parameters be ParseText(P, parameterSym).
    i32 function_length = 0;
-    auto parameters_parser = Parser { Lexer { parameters_string } };
+    auto parameters_parser = Parser(Lexer(SourceCode::create({}, Utf16String::from_utf8(parameters_string))));
    auto parameters = parameters_parser.parse_formal_parameters(function_length, parse_options);

    // 18. If parameters is a List of errors, throw a SyntaxError exception.
@@ -179,7 +179,7 @@ ThrowCompletionOr<GC::Ref<ECMAScriptFunctionObject>> FunctionConstructor::create
    // 22. NOTE: If this step is reached, sourceText must have the syntax of exprSym (although the reverse implication does not hold). The purpose of the next two steps is to enforce any Early Error rules which apply to exprSym directly.

    // 23. Let expr be ParseText(sourceText, exprSym).
-    auto source_parser = Parser { Lexer { source_text } };
+    auto source_parser = Parser(Lexer(SourceCode::create({}, Utf16String::from_utf8(source_text))));
    // This doesn't need any parse_options, it determines those & the function type based on the tokens that were found.
    auto expr = source_parser.parse_function_node<FunctionExpression>();

--- a/Libraries/LibJS/Runtime/ShadowRealm.cpp
+++ b/Libraries/LibJS/Runtime/ShadowRealm.cpp
@@ -123,7 +123,7 @@ ThrowCompletionOr<Value> perform_shadow_realm_eval(VM& vm, Value source, Realm&
    // 2. Perform the following substeps in an implementation-defined order, possibly interleaving parsing and error detection:

    // a. Let script be ParseText(StringToCodePoints(sourceText), Script).
-    auto parser = Parser(Lexer(source_text->utf8_string_view()), Program::Type::Script, Parser::EvalInitialState {});
+    auto parser = Parser(Lexer(SourceCode::create({}, source_text->utf16_string())), Program::Type::Script, Parser::EvalInitialState {});
    auto program = parser.parse_program();

    // b. If script is a List of errors, throw a SyntaxError exception.
--- a/Libraries/LibJS/Script.cpp
+++ b/Libraries/LibJS/Script.cpp
@@ -18,7 +18,7 @@ GC_DEFINE_ALLOCATOR(Script);
 Result<GC::Ref<Script>, Vector<ParserError>> Script::parse(StringView source_text, Realm& realm, StringView filename, HostDefined* host_defined, size_t line_number_offset)
 {
    // 1. Let script be ParseText(sourceText, Script).
-    auto parser = Parser(Lexer(source_text, filename, line_number_offset));
+    auto parser = Parser(Lexer(SourceCode::create(String::from_utf8(filename).release_value_but_fixme_should_propagate_errors(), Utf16String::from_utf8(source_text)), line_number_offset));
    auto script = parser.parse_program();

    // 2. If script is a List of errors, return body.
--- a/Libraries/LibJS/SourceTextModule.cpp
+++ b/Libraries/LibJS/SourceTextModule.cpp
@@ -132,7 +132,7 @@ void SourceTextModule::visit_edges(Cell::Visitor& visitor)
 Result<GC::Ref<SourceTextModule>, Vector<ParserError>> SourceTextModule::parse(StringView source_text, Realm& realm, StringView filename, Script::HostDefined* host_defined)
 {
    // 1. Let body be ParseText(sourceText, Module).
-    auto parser = Parser(Lexer(source_text, filename), Program::Type::Module);
+    auto parser = Parser(Lexer(SourceCode::create(String::from_utf8(filename).release_value_but_fixme_should_propagate_errors(), Utf16String::from_utf8(source_text))), Program::Type::Module);
    auto body = parser.parse_program();

    // 2. If body is a List of errors, return body.
--- a/Libraries/LibJS/SyntaxHighlighter.cpp
+++ b/Libraries/LibJS/SyntaxHighlighter.cpp
@@ -52,7 +52,7 @@ void SyntaxHighlighter::rehighlight(Palette const& palette)
 {
    auto text = m_client->get_text();

-    Lexer lexer(text);
+    Lexer lexer(SourceCode::create({}, Utf16String::from_utf8(text)));

    Vector<Syntax::TextDocumentSpan> spans;
    Vector<Syntax::TextDocumentFoldingRegion> folding_regions;
--- a/Libraries/LibWeb/DOM/EventTarget.cpp
+++ b/Libraries/LibWeb/DOM/EventTarget.cpp
@@ -444,7 +444,7 @@ WebIDL::CallbackType* EventTarget::get_current_value_of_event_handler(FlyString

        auto source_text = builder.to_byte_string();

-        auto parser = JS::Parser(JS::Lexer(source_text));
+        auto parser = JS::Parser(JS::Lexer(JS::SourceCode::create({}, Utf16String::from_utf8(source_text))));

        // FIXME: This should only be parsing the `body` instead of `source_text` and therefore use `JS::FunctionBody` instead of `JS::FunctionExpression`.
        //        However, JS::ECMAScriptFunctionObject::create wants parameters and length and JS::FunctionBody does not inherit JS::FunctionNode.
--- a/Libraries/LibWeb/WebDriver/ExecuteScript.cpp
+++ b/Libraries/LibWeb/WebDriver/ExecuteScript.cpp
@@ -55,7 +55,7 @@ static JS::ThrowCompletionOr<JS::Value> execute_a_function_body(HTML::BrowsingCo
        }})~~~",
        body);

-    auto parser = JS::Parser { JS::Lexer { source_text } };
+    auto parser = JS::Parser(JS::Lexer(JS::SourceCode::create({}, Utf16String::from_utf8(source_text))));
    auto function_expression = parser.parse_function_node<JS::FunctionExpression>();

    // 4. If body is not parsable as a FunctionBody or if parsing detects an early error, return Completion { [[Type]]: normal, [[Value]]: null, [[Target]]: empty }.
--- a/Tests/LibJS/CMakeLists.txt
+++ b/Tests/LibJS/CMakeLists.txt
@@ -1,4 +1,3 @@
-ladybird_test(test-invalid-unicode-js.cpp LibJS LIBS LibJS LibUnicode)
 ladybird_test(test-value-js.cpp LibJS LIBS LibJS LibUnicode)

 ladybird_testjs_test(test-js.cpp test-js LIBS LibGC)
--- a/Tests/LibJS/test-invalid-unicode-js.cpp
+++ b/Tests/LibJS/test-invalid-unicode-js.cpp
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2021, David Tuin <davidot@serenityos.org>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
-
-#include <LibJS/Lexer.h>
-#include <LibTest/TestCase.h>
-
-static bool produces_eof_tokens(JS::Lexer& lexer)
-{
-    for (auto i = 0; i < 10; i++) {
-        auto eof_token = lexer.next();
-        if (eof_token.type() != JS::TokenType::Eof)
-            return false;
-    }
-    return true;
-}
-
-static bool triggers_immediate_unicode_fault(StringView code)
-{
-    auto lexer = JS::Lexer(code);
-    auto first_token = lexer.next();
-
-    if (first_token.type() != JS::TokenType::Invalid)
-        return false;
-
-    return produces_eof_tokens(lexer);
-}
-// In the not leading character it must start with 0b10xxxxxx
-// Thus all these options are invalid:
-// \x0y = 0000 y (or \x1y, \x2y and \x3y)
-// \x4y = 0100 y (or \x5y, \x6y and \x7y)
-// \xCy = 1100 y (or \xDy, \xEy and \xFy)
-// And the only valid option is:
-// \x8y = 1000 y (or \x9y, \xAy
-
-TEST_CASE(no_input_only_gives_eof)
-{
-    auto code = ""sv;
-    auto lexer = JS::Lexer(code);
-    EXPECT(produces_eof_tokens(lexer));
-}
-
-TEST_CASE(invalid_start_code_point)
-{
-    EXPECT(triggers_immediate_unicode_fault("\x80"sv));
-    EXPECT(triggers_immediate_unicode_fault("\x90"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xA0"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xB0"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF8"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xFF"sv));
-}
-
-TEST_CASE(code_points_of_length_2)
-{
-    // Initial 110xxxxx -> \xCy or \xDy
-    EXPECT(triggers_immediate_unicode_fault("\xC5"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xC5\x02"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xC5\x52"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xC5\xD2"sv));
-
-    EXPECT(triggers_immediate_unicode_fault("\xD5"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xD5\x23"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xD5\x74"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xD5\xF5"sv));
-}
-
-TEST_CASE(code_points_of_length_3)
-{
-    // Initial 1110xxxx -> \xEy
-    EXPECT(triggers_immediate_unicode_fault("\xE5"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xE5\x02"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xE5\x52"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xE5\xD2"sv));
-
-    EXPECT(triggers_immediate_unicode_fault("\xEA\x80"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xEA\x81\x07"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xEA\x82\x57"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xEA\x83\xD7"sv));
-}
-
-TEST_CASE(code_points_of_length_4)
-{
-    // Initial 11110xxx -> \xF{0..7}
-    EXPECT(triggers_immediate_unicode_fault("\xF0"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF1\x02"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF2\x52"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF3\xD2"sv));
-
-    EXPECT(triggers_immediate_unicode_fault("\xF4\x80"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF5\x81\x07"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF6\x82\x57"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF7\x83\xD7"sv));
-
-    EXPECT(triggers_immediate_unicode_fault("\xF4\x80\x80"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF5\x91\x80\x07"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF6\xA2\x80\x57"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF7\xB3\x80\xD7"sv));
-}
-
-TEST_CASE(gives_valid_part_until_fault)
-{
-    auto code = "abc\xF5\x81\x80\x07; abc\xF5\x81\x80\x07 += 4"sv;
-    JS::Lexer lexer(code);
-    auto first_token = lexer.next();
-    EXPECT_EQ(first_token.type(), JS::TokenType::Identifier);
-    EXPECT_EQ(first_token.value(), "abc"sv);
-    auto second_token = lexer.next();
-    EXPECT_EQ(second_token.type(), JS::TokenType::Invalid);
-    EXPECT(produces_eof_tokens(lexer));
-}
-
-TEST_CASE(gives_fully_parsed_tokens_even_if_invalid_unicode_follows)
-{
-    auto code = "let \xE5\xD2"sv;
-    JS::Lexer lexer(code);
-    auto first_token = lexer.next();
-    EXPECT_EQ(first_token.type(), JS::TokenType::Let);
-    auto second_token = lexer.next();
-    EXPECT_EQ(second_token.type(), JS::TokenType::Invalid);
-    EXPECT(produces_eof_tokens(lexer));
-}
-
-TEST_CASE(invalid_unicode_and_valid_code)
-{
-    EXPECT(triggers_immediate_unicode_fault("\xEA\xFDthrow 1;"sv));
-}
-
-TEST_CASE(long_invalid_unicode_and_valid_code)
-{
-    EXPECT(triggers_immediate_unicode_fault("\xF7throw 1;"sv));
-}
-
-TEST_CASE(invalid_unicode_after_valid_code_and_before_eof)
-{
-    auto code = "let \xEA\xFD;"sv;
-    auto lexer = JS::Lexer(code);
-    auto let_token = lexer.next();
-    EXPECT_EQ(let_token.type(), JS::TokenType::Let);
-    auto invalid_token = lexer.next();
-    EXPECT_EQ(invalid_token.type(), JS::TokenType::Invalid);
-    EXPECT(produces_eof_tokens(lexer));
-}
--- a/Tests/LibJS/test-js.cpp
+++ b/Tests/LibJS/test-js.cpp
@@ -19,8 +19,8 @@ TESTJS_PROGRAM_FLAG(test262_parser_tests, "Run test262 parser tests", "test262-p

 TESTJS_GLOBAL_FUNCTION(can_parse_source, canParseSource)
 {
-    auto source = TRY(vm.argument(0).to_string(vm));
-    auto parser = JS::Parser(JS::Lexer(source));
+    auto source = TRY(vm.argument(0).to_utf16_string(vm));
+    auto parser = JS::Parser(JS::Lexer(JS::SourceCode::create({}, source)));
    (void)parser.parse_program();
    return JS::Value(!parser.has_errors());
 }
--- a/Utilities/js.cpp
+++ b/Utilities/js.cpp
@@ -513,7 +513,7 @@ static ErrorOr<String> read_next_piece()

        piece.append(line);
        piece.append('\n');
-        auto lexer = JS::Lexer(line);
+        auto lexer = JS::Lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line)));

        enum {
            NotInLabelOrObjectKey,
@@ -622,7 +622,7 @@ static ErrorOr<int> run_repl(bool gc_on_every_allocation, bool syntax_highlight)
        size_t open_indents = s_repl_line_level;

        auto line = editor.line();
-        JS::Lexer lexer(line);
+        JS::Lexer lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line)));
        bool indenters_starting_line = true;
        for (JS::Token token = lexer.next(); token.type() != JS::TokenType::Eof; token = lexer.next()) {
            auto length = token.value().length_in_code_units();
@@ -678,7 +678,7 @@ static ErrorOr<int> run_repl(bool gc_on_every_allocation, bool syntax_highlight)
    auto complete = [&realm, &global_environment](Line::Editor const& editor) -> Vector<Line::CompletionSuggestion> {
        auto line = editor.line(editor.cursor());

-        JS::Lexer lexer { line };
+        JS::Lexer lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line)));
        enum {
            Initial,
            CompleteVariable,
--- a/Utilities/test262-runner.cpp
+++ b/Utilities/test262-runner.cpp
@@ -194,7 +194,7 @@ static ErrorOr<void, TestError> run_test(StringView source, StringView filepath,
        // We can also skip if we know the test is supposed to fail during parse
        // time. Unfortunately the phases of modules are not as clear and thus we
        // only do this for scripts. See also the comment at the end of verify_test.
-        auto parser = JS::Parser(JS::Lexer(source, filepath), metadata.program_type);
+        auto parser = JS::Parser(JS::Lexer(JS::SourceCode::create(String::from_utf8(filepath).release_value_but_fixme_should_propagate_errors(), Utf16String::from_utf8(source))), metadata.program_type);
        auto program_or_error = parser.parse_program();
        if (parser.has_errors()) {
            return TestError {