LibJS: Have JS::Lexer take a JS::SourceCode as input

This moves the responsibility of setting up a SourceCode object to the
users of JS::Lexer.

This means Lexer and Parser are free to use string views into the
SourceCode internally while working.

It also means Lexer no longer has to think about anything other than
UTF-16 (or ASCII) inputs. So the unit test for parsing various invalid
UTF-8 sequences is deleted here.
Authored by Andreas Kling on 2025-11-08 21:21:52 +01:00; committed by Andreas Kling
parent 9ca25e55d7
commit 0dacc94edd
Notes: github-actions[bot] 2025-11-09 11:15:22 +00:00
16 changed files with 59 additions and 227 deletions
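
Before the per-file diffs, here is a minimal sketch of the caller-side pattern after this change, modelled on the call sites updated below. The make_parser helper and the empty filename passed to SourceCode::create are illustrative only, not part of the commit:

    #include <AK/StringView.h>
    #include <AK/Utf16String.h>
    #include <LibJS/Lexer.h>
    #include <LibJS/Parser.h>
    #include <LibJS/SourceCode.h>

    // Hypothetical helper: the caller builds the SourceCode and hands it to JS::Lexer,
    // which now holds a NonnullRefPtr<SourceCode const> instead of its own Utf16String copy.
    static JS::Parser make_parser(StringView source_text)
    {
        // UTF-8 is converted to UTF-16 here, at the boundary; the Lexer itself only
        // ever sees UTF-16 code units. An empty filename is fine, as several call
        // sites in this commit do.
        auto source_code = JS::SourceCode::create({}, Utf16String::from_utf8(source_text));
        return JS::Parser(JS::Lexer(move(source_code)));
    }

Callers that already hold UTF-16 text (like perform_eval below, which passes code_string->utf16_string()) can skip the from_utf8 conversion entirely.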


@@ -192,9 +192,9 @@ static constexpr TokenType parse_three_char_token(Utf16View const& view)
     }
 }
 
-static consteval Array<TokenType, 256> make_single_char_tokens_array()
+static consteval AK::Array<TokenType, 256> make_single_char_tokens_array()
 {
-    Array<TokenType, 256> array;
+    AK::Array<TokenType, 256> array;
     array.fill(TokenType::Invalid);
     array['&'] = TokenType::Ampersand;
     array['*'] = TokenType::Asterisk;
@@ -225,33 +225,9 @@ static consteval Array<TokenType, 256> make_single_char_tokens_array()
 static constexpr auto s_single_char_tokens = make_single_char_tokens_array();
 
-static Utf16String create_utf16_string_from_possibly_invalid_utf8_string(StringView source)
-{
-    Utf8View utf8_source { source };
-    if (utf8_source.validate()) [[likely]]
-        return Utf16String::from_utf8_without_validation(source);
-    StringBuilder builder(StringBuilder::Mode::UTF16);
-    for (auto code_point : utf8_source) {
-        builder.append_code_point(code_point);
-        if (code_point == AK::UnicodeUtils::REPLACEMENT_CODE_POINT)
-            break;
-    }
-    return builder.to_utf16_string();
-}
-
-Lexer::Lexer(StringView source, StringView filename, size_t line_number, size_t line_column)
-    : Lexer(create_utf16_string_from_possibly_invalid_utf8_string(source), filename, line_number, line_column)
-{
-    // FIXME: Remove this API once all callers are ported to UTF-16.
-}
-
-Lexer::Lexer(Utf16String source, StringView filename, size_t line_number, size_t line_column)
+Lexer::Lexer(NonnullRefPtr<SourceCode const> source, size_t line_number, size_t line_column)
     : m_source(move(source))
     , m_current_token(TokenType::Eof, {}, {}, {}, 0, 0, 0)
-    , m_filename(String::from_utf8(filename).release_value_but_fixme_should_propagate_errors())
     , m_line_number(line_number)
     , m_line_column(line_column)
 {
@@ -304,16 +280,16 @@ Lexer::Lexer(Utf16String source, StringView filename, size_t line_number, size_t
 void Lexer::consume()
 {
     auto did_reach_eof = [this] {
-        if (m_position < m_source.length_in_code_units())
+        if (m_position < m_source->code().length_in_code_units())
             return false;
         m_eof = true;
         m_current_code_unit = '\0';
-        m_position = m_source.length_in_code_units() + 1;
+        m_position = m_source->code().length_in_code_units() + 1;
         m_line_column++;
         return true;
     };
 
-    if (m_position > m_source.length_in_code_units())
+    if (m_position > m_source->code().length_in_code_units())
        return;
 
     if (did_reach_eof())
@@ -339,7 +315,7 @@ void Lexer::consume()
         // and column - don't do it again. From https://tc39.es/ecma262/#sec-line-terminators:
         // The sequence <CR><LF> is commonly used as a line terminator.
         // It should be considered a single SourceCharacter for the purpose of reporting line numbers.
-        auto second_char_of_crlf = m_position > 1 && m_source.code_unit_at(m_position - 2) == '\r' && m_current_code_unit == '\n';
+        auto second_char_of_crlf = m_position > 1 && m_source->code().code_unit_at(m_position - 2) == '\r' && m_current_code_unit == '\n';
 
         if (!second_char_of_crlf) {
             m_line_number++;
@@ -349,8 +325,8 @@ void Lexer::consume()
             dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again.");
         }
     } else {
-        if (AK::UnicodeUtils::is_utf16_high_surrogate(m_current_code_unit) && m_position < m_source.length_in_code_units()) {
-            if (AK::UnicodeUtils::is_utf16_low_surrogate(m_source.code_unit_at(m_position))) {
+        if (AK::UnicodeUtils::is_utf16_high_surrogate(m_current_code_unit) && m_position < m_source->code().length_in_code_units()) {
+            if (AK::UnicodeUtils::is_utf16_low_surrogate(m_source->code().code_unit_at(m_position))) {
                 ++m_position;
                 if (did_reach_eof())
@@ -361,7 +337,7 @@ void Lexer::consume()
         ++m_line_column;
     }
 
-    m_current_code_unit = m_source.code_unit_at(m_position++);
+    m_current_code_unit = m_source->code().code_unit_at(m_position++);
 }
 
 bool Lexer::consume_decimal_number()
@@ -436,40 +412,40 @@ bool Lexer::consume_binary_number()
 template<typename Callback>
 bool Lexer::match_numeric_literal_separator_followed_by(Callback callback) const
 {
-    if (m_position >= m_source.length_in_code_units())
+    if (m_position >= m_source->code().length_in_code_units())
         return false;
     return m_current_code_unit == '_'
-        && callback(m_source.code_unit_at(m_position));
+        && callback(m_source->code().code_unit_at(m_position));
 }
 
 bool Lexer::match(char16_t a, char16_t b) const
 {
-    if (m_position >= m_source.length_in_code_units())
+    if (m_position >= m_source->code().length_in_code_units())
         return false;
     return m_current_code_unit == a
-        && m_source.code_unit_at(m_position) == b;
+        && m_source->code().code_unit_at(m_position) == b;
 }
 
 bool Lexer::match(char16_t a, char16_t b, char16_t c) const
 {
-    if (m_position + 1 >= m_source.length_in_code_units())
+    if (m_position + 1 >= m_source->code().length_in_code_units())
         return false;
     return m_current_code_unit == a
-        && m_source.code_unit_at(m_position) == b
-        && m_source.code_unit_at(m_position + 1) == c;
+        && m_source->code().code_unit_at(m_position) == b
+        && m_source->code().code_unit_at(m_position + 1) == c;
 }
 
 bool Lexer::match(char16_t a, char16_t b, char16_t c, char16_t d) const
 {
-    if (m_position + 2 >= m_source.length_in_code_units())
+    if (m_position + 2 >= m_source->code().length_in_code_units())
         return false;
     return m_current_code_unit == a
-        && m_source.code_unit_at(m_position) == b
-        && m_source.code_unit_at(m_position + 1) == c
-        && m_source.code_unit_at(m_position + 2) == d;
+        && m_source->code().code_unit_at(m_position) == b
+        && m_source->code().code_unit_at(m_position + 1) == c
+        && m_source->code().code_unit_at(m_position + 2) == d;
 }
 
 bool Lexer::is_eof() const
@@ -493,7 +469,7 @@ ALWAYS_INLINE u32 Lexer::current_code_point() const
     if (m_position == 0)
        return AK::UnicodeUtils::REPLACEMENT_CODE_POINT;
 
-    auto substring = m_source.substring_view(m_position - 1);
+    auto substring = m_source->code().substring_view(m_position - 1);
     if (substring.is_empty())
        return AK::UnicodeUtils::REPLACEMENT_CODE_POINT;
@@ -615,7 +591,7 @@ bool Lexer::is_block_comment_end() const
 bool Lexer::is_numeric_literal_start() const
 {
-    return is_ascii_digit(m_current_code_unit) || (m_current_code_unit == '.' && m_position < m_source.length_in_code_units() && is_ascii_digit(m_source.code_unit_at(m_position)));
+    return is_ascii_digit(m_current_code_unit) || (m_current_code_unit == '.' && m_position < m_source->code().length_in_code_units() && is_ascii_digit(m_source->code().code_unit_at(m_position)));
 }
 
 bool Lexer::slash_means_division() const
@@ -861,7 +837,7 @@ Token const& Lexer::next()
         while (m_current_code_unit != stop_char && m_current_code_unit != '\r' && m_current_code_unit != '\n' && !is_eof()) {
             if (m_current_code_unit == '\\') {
                 consume();
-                if (m_current_code_unit == '\r' && m_position < m_source.length_in_code_units() && m_source.code_unit_at(m_position) == '\n') {
+                if (m_current_code_unit == '\r' && m_position < m_source->code().length_in_code_units() && m_source->code().code_unit_at(m_position) == '\n') {
                     consume();
                 }
             }
@@ -896,8 +872,8 @@ Token const& Lexer::next()
            consume();
         }
 
-        if (!found_token && m_position + 1 < m_source.length_in_code_units()) {
-            auto three_chars_view = m_source.substring_view(m_position - 1, 3);
+        if (!found_token && m_position + 1 < m_source->code().length_in_code_units()) {
+            auto three_chars_view = m_source->code().substring_view(m_position - 1, 3);
             if (auto type = parse_three_char_token(three_chars_view); type != TokenType::Invalid) {
                 found_token = true;
                 token_type = type;
@@ -907,11 +883,11 @@ Token const& Lexer::next()
             }
         }
 
-        if (!found_token && m_position < m_source.length_in_code_units()) {
-            auto two_chars_view = m_source.substring_view(m_position - 1, 2);
+        if (!found_token && m_position < m_source->code().length_in_code_units()) {
+            auto two_chars_view = m_source->code().substring_view(m_position - 1, 2);
             if (auto type = parse_two_char_token(two_chars_view); type != TokenType::Invalid) {
                 // OptionalChainingPunctuator :: ?. [lookahead ∉ DecimalDigit]
-                if (!(type == TokenType::QuestionMarkPeriod && m_position + 1 < m_source.length_in_code_units() && is_ascii_digit(m_source.code_unit_at(m_position + 1)))) {
+                if (!(type == TokenType::QuestionMarkPeriod && m_position + 1 < m_source->code().length_in_code_units() && is_ascii_digit(m_source->code().code_unit_at(m_position + 1)))) {
                     found_token = true;
                     token_type = type;
                     consume();
@@ -945,8 +921,8 @@ Token const& Lexer::next()
     m_current_token = Token(
         token_type,
         token_message,
-        m_source.substring_view(trivia_start - 1, value_start - trivia_start),
-        m_source.substring_view(value_start - 1, m_position - value_start),
+        m_source->code().substring_view(trivia_start - 1, value_start - trivia_start),
+        m_source->code().substring_view(value_start - 1, m_position - value_start),
         value_start_line_number,
         value_start_column_number,
         value_start - 1);
@@ -976,7 +952,7 @@ Token const& Lexer::force_slash_as_regex()
     size_t value_start = m_position - 1;
 
     if (has_equals) {
-        VERIFY(m_source.code_unit_at(value_start - 1) == '=');
+        VERIFY(m_source->code().code_unit_at(value_start - 1) == '=');
         --value_start;
         --m_position;
         m_current_code_unit = '=';
@@ -988,7 +964,7 @@ Token const& Lexer::force_slash_as_regex()
         token_type,
         Token::Message::None,
         m_current_token.trivia(),
-        m_source.substring_view(value_start - 1, m_position - value_start),
+        m_source->code().substring_view(value_start - 1, m_position - value_start),
         m_current_token.line_number(),
         m_current_token.line_column(),
         value_start - 1);


@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@serenityos.org>
+ * Copyright (c) 2020-2025, Andreas Kling <andreas@ladybird.org>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
@@ -7,17 +8,16 @@
 #pragma once
 
 #include <AK/HashMap.h>
-#include <AK/StringView.h>
 #include <AK/Utf16String.h>
 #include <LibJS/Export.h>
+#include <LibJS/SourceCode.h>
 #include <LibJS/Token.h>
 
 namespace JS {
 
 class JS_API Lexer {
 public:
-    explicit Lexer(StringView source, StringView filename = "(unknown)"sv, size_t line_number = 1, size_t line_column = 0);
-    explicit Lexer(Utf16String source, StringView filename = "(unknown)"sv, size_t line_number = 1, size_t line_column = 0);
+    explicit Lexer(NonnullRefPtr<SourceCode const>, size_t line_number = 1, size_t line_column = 0);
 
     // These both advance the lexer and return a reference to the current token.
     Token const& next();
@@ -25,8 +25,9 @@ public:
     [[nodiscard]] Token const& current_token() const { return m_current_token; }
 
-    Utf16String const& source() const { return m_source; }
-    String const& filename() const { return m_filename; }
+    SourceCode const& source_code() const { return m_source; }
+    Utf16String const& source() const { return m_source->code(); }
+    String const& filename() const { return m_source->filename(); }
 
     void disallow_html_comments() { m_allow_html_comments = false; }
@@ -59,7 +60,7 @@ private:
     TokenType consume_regex_literal();
 
-    Utf16String m_source;
+    NonnullRefPtr<SourceCode const> m_source;
     size_t m_position { 0 };
     Token m_current_token;
     char16_t m_current_code_unit { 0 };
@@ -67,7 +68,6 @@ private:
     bool m_regex_is_in_character_class { false };
     bool m_allow_html_comments { true };
-    String m_filename;
     size_t m_line_number { 1 };
     size_t m_line_column { 0 };


@@ -688,7 +688,7 @@ Parser::ParserState::ParserState(Lexer l, Program::Type program_type)
 }
 
 Parser::Parser(Lexer lexer, Program::Type program_type, Optional<EvalInitialState> initial_state_for_eval)
-    : m_source_code(SourceCode::create(lexer.filename(), lexer.source()))
+    : m_source_code(lexer.source_code())
     , m_state(move(lexer), program_type)
     , m_program_type(program_type)
 {
@@ -2596,7 +2596,7 @@ RefPtr<BindingPattern const> Parser::synthesize_binding_pattern(Expression const
     auto source_end_offset = expression.source_range().end.offset;
     auto source = m_state.lexer.source().substring_view(source_start_offset, source_end_offset - source_start_offset);
-    Lexer lexer { Utf16String::from_utf16(source), m_state.lexer.filename(), expression.source_range().start.line, expression.source_range().start.column };
+    Lexer lexer(SourceCode::create(m_state.lexer.filename(), Utf16String::from_utf16(source)), expression.source_range().start.line, expression.source_range().start.column);
     Parser parser { lexer };
     parser.m_state.current_scope_pusher = m_state.current_scope_pusher;
@@ -5233,7 +5233,7 @@ Parser Parser::parse_function_body_from_string(ByteString const& body_string, u1
 {
     RefPtr<FunctionBody const> function_body;
 
-    auto body_parser = Parser { Lexer { body_string } };
+    auto body_parser = Parser(Lexer(SourceCode::create({}, Utf16String::from_utf8(body_string))));
     {
         // Set up some parser state to accept things like return await, and yield in the plain function body.
         body_parser.m_state.in_function_context = true;


@@ -623,7 +623,7 @@ ThrowCompletionOr<Value> perform_eval(VM& vm, Value x, CallerMode strict_caller,
         .in_class_field_initializer = in_class_field_initializer,
     };
 
-    Parser parser { Lexer { code_string->utf8_string_view() }, Program::Type::Script, move(initial_state) };
+    Parser parser(Lexer(SourceCode::create({}, code_string->utf16_string())), Program::Type::Script, move(initial_state));
     auto program = parser.parse_program(strict_caller == CallerMode::Strict);
 
     // b. If script is a List of errors, throw a SyntaxError exception.


@@ -156,7 +156,7 @@ ThrowCompletionOr<GC::Ref<ECMAScriptFunctionObject>> FunctionConstructor::create
     // 17. Let parameters be ParseText(P, parameterSym).
     i32 function_length = 0;
-    auto parameters_parser = Parser { Lexer { parameters_string } };
+    auto parameters_parser = Parser(Lexer(SourceCode::create({}, Utf16String::from_utf8(parameters_string))));
     auto parameters = parameters_parser.parse_formal_parameters(function_length, parse_options);
 
     // 18. If parameters is a List of errors, throw a SyntaxError exception.
@@ -179,7 +179,7 @@ ThrowCompletionOr<GC::Ref<ECMAScriptFunctionObject>> FunctionConstructor::create
     // 22. NOTE: If this step is reached, sourceText must have the syntax of exprSym (although the reverse implication does not hold). The purpose of the next two steps is to enforce any Early Error rules which apply to exprSym directly.
     // 23. Let expr be ParseText(sourceText, exprSym).
-    auto source_parser = Parser { Lexer { source_text } };
+    auto source_parser = Parser(Lexer(SourceCode::create({}, Utf16String::from_utf8(source_text))));
     // This doesn't need any parse_options, it determines those & the function type based on the tokens that were found.
     auto expr = source_parser.parse_function_node<FunctionExpression>();


@@ -123,7 +123,7 @@ ThrowCompletionOr<Value> perform_shadow_realm_eval(VM& vm, Value source, Realm&
     // 2. Perform the following substeps in an implementation-defined order, possibly interleaving parsing and error detection:
     // a. Let script be ParseText(StringToCodePoints(sourceText), Script).
-    auto parser = Parser(Lexer(source_text->utf8_string_view()), Program::Type::Script, Parser::EvalInitialState {});
+    auto parser = Parser(Lexer(SourceCode::create({}, source_text->utf16_string())), Program::Type::Script, Parser::EvalInitialState {});
     auto program = parser.parse_program();
 
     // b. If script is a List of errors, throw a SyntaxError exception.


@@ -18,7 +18,7 @@ GC_DEFINE_ALLOCATOR(Script);
 Result<GC::Ref<Script>, Vector<ParserError>> Script::parse(StringView source_text, Realm& realm, StringView filename, HostDefined* host_defined, size_t line_number_offset)
 {
     // 1. Let script be ParseText(sourceText, Script).
-    auto parser = Parser(Lexer(source_text, filename, line_number_offset));
+    auto parser = Parser(Lexer(SourceCode::create(String::from_utf8(filename).release_value_but_fixme_should_propagate_errors(), Utf16String::from_utf8(source_text)), line_number_offset));
     auto script = parser.parse_program();
 
     // 2. If script is a List of errors, return body.


@@ -132,7 +132,7 @@ void SourceTextModule::visit_edges(Cell::Visitor& visitor)
 Result<GC::Ref<SourceTextModule>, Vector<ParserError>> SourceTextModule::parse(StringView source_text, Realm& realm, StringView filename, Script::HostDefined* host_defined)
 {
     // 1. Let body be ParseText(sourceText, Module).
-    auto parser = Parser(Lexer(source_text, filename), Program::Type::Module);
+    auto parser = Parser(Lexer(SourceCode::create(String::from_utf8(filename).release_value_but_fixme_should_propagate_errors(), Utf16String::from_utf8(source_text))), Program::Type::Module);
     auto body = parser.parse_program();
 
     // 2. If body is a List of errors, return body.


@@ -52,7 +52,7 @@ void SyntaxHighlighter::rehighlight(Palette const& palette)
 {
     auto text = m_client->get_text();
 
-    Lexer lexer(text);
+    Lexer lexer(SourceCode::create({}, Utf16String::from_utf8(text)));
 
     Vector<Syntax::TextDocumentSpan> spans;
     Vector<Syntax::TextDocumentFoldingRegion> folding_regions;


@@ -444,7 +444,7 @@ WebIDL::CallbackType* EventTarget::get_current_value_of_event_handler(FlyString
     auto source_text = builder.to_byte_string();
 
-    auto parser = JS::Parser(JS::Lexer(source_text));
+    auto parser = JS::Parser(JS::Lexer(JS::SourceCode::create({}, Utf16String::from_utf8(source_text))));
 
     // FIXME: This should only be parsing the `body` instead of `source_text` and therefore use `JS::FunctionBody` instead of `JS::FunctionExpression`.
     //        However, JS::ECMAScriptFunctionObject::create wants parameters and length and JS::FunctionBody does not inherit JS::FunctionNode.


@@ -55,7 +55,8 @@ static JS::ThrowCompletionOr<JS::Value> execute_a_function_body(HTML::BrowsingCo
 }})~~~",
         body);
 
-    auto parser = JS::Parser { JS::Lexer { source_text } };
+    auto parser = JS::Parser(JS::Lexer(JS::SourceCode::create({}, Utf16String::from_utf8(source_text))));
+    ;
     auto function_expression = parser.parse_function_node<JS::FunctionExpression>();
 
     // 4. If body is not parsable as a FunctionBody or if parsing detects an early error, return Completion { [[Type]]: normal, [[Value]]: null, [[Target]]: empty }.


@@ -1,4 +1,3 @@
-ladybird_test(test-invalid-unicode-js.cpp LibJS LIBS LibJS LibUnicode)
 ladybird_test(test-value-js.cpp LibJS LIBS LibJS LibUnicode)
 
 ladybird_testjs_test(test-js.cpp test-js LIBS LibGC)


@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2021, David Tuin <davidot@serenityos.org>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
-
-#include <LibJS/Lexer.h>
-#include <LibTest/TestCase.h>
-
-static bool produces_eof_tokens(JS::Lexer& lexer)
-{
-    for (auto i = 0; i < 10; i++) {
-        auto eof_token = lexer.next();
-        if (eof_token.type() != JS::TokenType::Eof)
-            return false;
-    }
-    return true;
-}
-
-static bool triggers_immediate_unicode_fault(StringView code)
-{
-    auto lexer = JS::Lexer(code);
-    auto first_token = lexer.next();
-    if (first_token.type() != JS::TokenType::Invalid)
-        return false;
-    return produces_eof_tokens(lexer);
-}
-
-// In the not leading character it must start with 0b10xxxxxx
-// Thus all these options are invalid:
-// \x0y = 0000 y (or \x1y, \x2y and \x3y)
-// \x4y = 0100 y (or \x5y, \x6y and \x7y)
-// \xCy = 1100 y (or \xDy, \xEy and \xFy)
-// And the only valid option is:
-// \x8y = 1000 y (or \x9y, \xAy
-
-TEST_CASE(no_input_only_gives_eof)
-{
-    auto code = ""sv;
-    auto lexer = JS::Lexer(code);
-    EXPECT(produces_eof_tokens(lexer));
-}
-
-TEST_CASE(invalid_start_code_point)
-{
-    EXPECT(triggers_immediate_unicode_fault("\x80"sv));
-    EXPECT(triggers_immediate_unicode_fault("\x90"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xA0"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xB0"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF8"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xFF"sv));
-}
-
-TEST_CASE(code_points_of_length_2)
-{
-    // Initial 110xxxxx -> \xCy or \xDy
-    EXPECT(triggers_immediate_unicode_fault("\xC5"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xC5\x02"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xC5\x52"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xC5\xD2"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xD5"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xD5\x23"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xD5\x74"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xD5\xF5"sv));
-}
-
-TEST_CASE(code_points_of_length_3)
-{
-    // Initial 1110xxxx -> \xEy
-    EXPECT(triggers_immediate_unicode_fault("\xE5"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xE5\x02"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xE5\x52"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xE5\xD2"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xEA\x80"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xEA\x81\x07"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xEA\x82\x57"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xEA\x83\xD7"sv));
-}
-
-TEST_CASE(code_points_of_length_4)
-{
-    // Initial 11110xxx -> \xF{0..7}
-    EXPECT(triggers_immediate_unicode_fault("\xF0"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF1\x02"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF2\x52"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF3\xD2"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF4\x80"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF5\x81\x07"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF6\x82\x57"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF7\x83\xD7"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF4\x80\x80"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF5\x91\x80\x07"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF6\xA2\x80\x57"sv));
-    EXPECT(triggers_immediate_unicode_fault("\xF7\xB3\x80\xD7"sv));
-}
-
-TEST_CASE(gives_valid_part_until_fault)
-{
-    auto code = "abc\xF5\x81\x80\x07; abc\xF5\x81\x80\x07 += 4"sv;
-    JS::Lexer lexer(code);
-    auto first_token = lexer.next();
-    EXPECT_EQ(first_token.type(), JS::TokenType::Identifier);
-    EXPECT_EQ(first_token.value(), "abc"sv);
-    auto second_token = lexer.next();
-    EXPECT_EQ(second_token.type(), JS::TokenType::Invalid);
-    EXPECT(produces_eof_tokens(lexer));
-}
-
-TEST_CASE(gives_fully_parsed_tokens_even_if_invalid_unicode_follows)
-{
-    auto code = "let \xE5\xD2"sv;
-    JS::Lexer lexer(code);
-    auto first_token = lexer.next();
-    EXPECT_EQ(first_token.type(), JS::TokenType::Let);
-    auto second_token = lexer.next();
-    EXPECT_EQ(second_token.type(), JS::TokenType::Invalid);
-    EXPECT(produces_eof_tokens(lexer));
-}
-
-TEST_CASE(invalid_unicode_and_valid_code)
-{
-    EXPECT(triggers_immediate_unicode_fault("\xEA\xFDthrow 1;"sv));
-}
-
-TEST_CASE(long_invalid_unicode_and_valid_code)
-{
-    EXPECT(triggers_immediate_unicode_fault("\xF7throw 1;"sv));
-}
-
-TEST_CASE(invalid_unicode_after_valid_code_and_before_eof)
-{
-    auto code = "let \xEA\xFD;"sv;
-    auto lexer = JS::Lexer(code);
-    auto let_token = lexer.next();
-    EXPECT_EQ(let_token.type(), JS::TokenType::Let);
-    auto invalid_token = lexer.next();
-    EXPECT_EQ(invalid_token.type(), JS::TokenType::Invalid);
-    EXPECT(produces_eof_tokens(lexer));
-}


@@ -19,8 +19,8 @@ TESTJS_PROGRAM_FLAG(test262_parser_tests, "Run test262 parser tests", "test262-p
 TESTJS_GLOBAL_FUNCTION(can_parse_source, canParseSource)
 {
-    auto source = TRY(vm.argument(0).to_string(vm));
-    auto parser = JS::Parser(JS::Lexer(source));
+    auto source = TRY(vm.argument(0).to_utf16_string(vm));
+    auto parser = JS::Parser(JS::Lexer(JS::SourceCode::create({}, source)));
     (void)parser.parse_program();
     return JS::Value(!parser.has_errors());
 }


@@ -513,7 +513,7 @@ static ErrorOr<String> read_next_piece()
         piece.append(line);
         piece.append('\n');
 
-        auto lexer = JS::Lexer(line);
+        auto lexer = JS::Lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line)));
 
         enum {
             NotInLabelOrObjectKey,
@@ -622,7 +622,7 @@ static ErrorOr<int> run_repl(bool gc_on_every_allocation, bool syntax_highlight)
         size_t open_indents = s_repl_line_level;
 
         auto line = editor.line();
-        JS::Lexer lexer(line);
+        JS::Lexer lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line)));
         bool indenters_starting_line = true;
         for (JS::Token token = lexer.next(); token.type() != JS::TokenType::Eof; token = lexer.next()) {
             auto length = token.value().length_in_code_units();
@@ -678,7 +678,7 @@ static ErrorOr<int> run_repl(bool gc_on_every_allocation, bool syntax_highlight)
     auto complete = [&realm, &global_environment](Line::Editor const& editor) -> Vector<Line::CompletionSuggestion> {
         auto line = editor.line(editor.cursor());
 
-        JS::Lexer lexer { line };
+        JS::Lexer lexer(JS::SourceCode::create({}, Utf16String::from_utf8(line)));
 
         enum {
             Initial,
             CompleteVariable,


@@ -194,7 +194,7 @@ static ErrorOr<void, TestError> run_test(StringView source, StringView filepath,
     // We can also skip if we know the test is supposed to fail during parse
     // time. Unfortunately the phases of modules are not as clear and thus we
     // only do this for scripts. See also the comment at the end of verify_test.
-    auto parser = JS::Parser(JS::Lexer(source, filepath), metadata.program_type);
+    auto parser = JS::Parser(JS::Lexer(JS::SourceCode::create(String::from_utf8(filepath).release_value_but_fixme_should_propagate_errors(), Utf16String::from_utf8(source))), metadata.program_type);
     auto program_or_error = parser.parse_program();
 
     if (parser.has_errors()) {
         return TestError {