AK: Templatize GenericLexer for UTF-16 strings

We now define GenericLexer as a template so that it can also be used with UTF-16 strings. To keep existing users happy, the template itself is defined in the Detail namespace. AK::GenericLexer is then an alias for the char instantiation (which lexes a StringView), and AK::Utf16GenericLexer is an alias for the char16_t instantiation (which lexes a Utf16View).
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/99d7e08dff0 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5762
2025-12-05 01:10:24 +00:00 · 2025-08-05 15:33:31 -04:00 · 2025-08-13 13:57:56 +00:00
parent 28d9d3a2c7
commit 99d7e08dff
5 changed files with 620 additions and 419 deletions
--- a/AK/Forward.h
+++ b/AK/Forward.h
@@ -18,6 +18,9 @@ namespace Detail {
 template<size_t inline_capacity>
 class ByteBuffer;

+template<typename CharType>
+class GenericLexer;
+
 class StringData;
 class Utf16StringData;

@@ -36,7 +39,6 @@ class CountingStream;
 class Duration;
 class Error;
 class FlyString;
-class GenericLexer;
 class IPv4Address;
 class IPv6Address;
 class JsonArray;
@@ -63,6 +65,9 @@ class Utf8View;

 using ByteBuffer = Detail::ByteBuffer<32>;

+using GenericLexer = Detail::GenericLexer<char>;
+using Utf16GenericLexer = Detail::GenericLexer<char16_t>;
+
 template<typename T>
 class Span;

@@ -207,6 +212,7 @@ using AK::TrailingCodePointTransformation;
 using AK::Traits;
 using AK::UnixDateTime;
 using AK::Utf16FlyString;
+using AK::Utf16GenericLexer;
 using AK::Utf16String;
 using AK::Utf16View;
 using AK::Utf32CodePointIterator;
--- a/AK/GenericLexer.cpp
+++ b/AK/GenericLexer.cpp
@@ -4,151 +4,10 @@
 * SPDX-License-Identifier: BSD-2-Clause
 */

-#include <AK/Assertions.h>
-#include <AK/ByteString.h>
-#include <AK/CharacterTypes.h>
 #include <AK/GenericLexer.h>
-#include <AK/ScopeGuard.h>
-#include <AK/StringBuilder.h>
-#include <AK/UnicodeUtils.h>

 namespace AK {

-// Consume a number of characters
-StringView GenericLexer::consume(size_t count)
-{
-    size_t start = m_index;
-    size_t length = min(count, m_input.length() - m_index);
-    m_index += length;
-
-    return m_input.substring_view(start, length);
-}
-
-// Consume the rest of the input
-StringView GenericLexer::consume_all()
-{
-    auto rest = m_input.substring_view(m_index, m_input.length() - m_index);
-    m_index = m_input.length();
-    return rest;
-}
-
-// Consume until a new line is found
-StringView GenericLexer::consume_line()
-{
-    size_t start = m_index;
-    while (!is_eof() && peek() != '\r' && peek() != '\n')
-        m_index++;
-    size_t length = m_index - start;
-
-    consume_specific('\r');
-    consume_specific('\n');
-
-    return m_input.substring_view(start, length);
-}
-
-// Consume and return characters until `stop` is peek'd
-StringView GenericLexer::consume_until(char stop)
-{
-    size_t start = m_index;
-    while (!is_eof() && peek() != stop)
-        m_index++;
-    size_t length = m_index - start;
-
-    return m_input.substring_view(start, length);
-}
-
-// Consume and return characters until the string `stop` is found
-StringView GenericLexer::consume_until(StringView stop)
-{
-    size_t start = m_index;
-    while (!is_eof() && !next_is(stop))
-        m_index++;
-    size_t length = m_index - start;
-
-    return m_input.substring_view(start, length);
-}
-
-/*
- * Consume a string surrounded by single or double quotes. The returned
- * StringView does not include the quotes. An escape character can be provided
- * to capture the enclosing quotes. Please note that the escape character will
- * still be in the resulting StringView
- */
-StringView GenericLexer::consume_quoted_string(char escape_char)
-{
-    if (!next_is(is_quote))
-        return {};
-
-    char quote_char = consume();
-    size_t start = m_index;
-    while (!is_eof()) {
-        if (next_is(escape_char))
-            m_index++;
-        else if (next_is(quote_char))
-            break;
-        m_index++;
-    }
-    size_t length = m_index - start;
-
-    if (peek() != quote_char) {
-        // Restore the index in case the string is unterminated
-        m_index = start - 1;
-        return {};
-    }
-
-    // Ignore closing quote
-    ignore();
-
-    return m_input.substring_view(start, length);
-}
-
-template<Integral T>
-ErrorOr<T> GenericLexer::consume_decimal_integer()
-{
-    using UnsignedT = MakeUnsigned<T>;
-
-    ArmedScopeGuard rollback { [&, rollback_position = m_index] {
-        m_index = rollback_position;
-    } };
-
-    bool has_minus_sign = false;
-
-    if (next_is('+') || next_is('-'))
-        if (consume() == '-')
-            has_minus_sign = true;
-
-    StringView number_view = consume_while(is_ascii_digit);
-    if (number_view.is_empty())
-        return Error::from_errno(EINVAL);
-
-    auto maybe_number = number_view.to_number<UnsignedT>(TrimWhitespace::No);
-    if (!maybe_number.has_value())
-        return Error::from_errno(ERANGE);
-    auto number = maybe_number.value();
-
-    if (!has_minus_sign) {
-        if (NumericLimits<T>::max() < number) // This is only possible in a signed case.
-            return Error::from_errno(ERANGE);
-
-        rollback.disarm();
-        return number;
-    } else {
-        if constexpr (IsUnsigned<T>) {
-            if (number == 0) {
-                rollback.disarm();
-                return 0;
-            }
-            return Error::from_errno(ERANGE);
-        } else {
-            static constexpr UnsignedT max_value = static_cast<UnsignedT>(NumericLimits<T>::max()) + 1;
-            if (number > max_value)
-                return Error::from_errno(ERANGE);
-            rollback.disarm();
-            return -number;
-        }
-    }
-}
-
 LineTrackingLexer::Position LineTrackingLexer::position_for(size_t index) const
 {
    // Sad case: we have no idea where the nearest newline is, so we have to
@@ -157,8 +16,8 @@ LineTrackingLexer::Position LineTrackingLexer::position_for(size_t index) const
        auto next_newline = m_input.find('\n', m_largest_known_line_start_position);
        if (!next_newline.has_value()) {
            // No more newlines, add the end of the input as a line start to avoid searching again.
-            m_line_start_positions->insert(m_input.length(), m_line_start_positions->size());
-            m_largest_known_line_start_position = m_input.length();
+            m_line_start_positions->insert(input_length(), m_line_start_positions->size());
+            m_largest_known_line_start_position = input_length();
            break;
        }
        m_line_start_positions->insert(next_newline.value() + 1, m_line_start_positions->size());
@@ -179,83 +38,4 @@ LineTrackingLexer::Position LineTrackingLexer::position_for(size_t index) const
    return { index, line, column };
 }

-template ErrorOr<u8> GenericLexer::consume_decimal_integer<u8>();
-template ErrorOr<i8> GenericLexer::consume_decimal_integer<i8>();
-template ErrorOr<u16> GenericLexer::consume_decimal_integer<u16>();
-template ErrorOr<i16> GenericLexer::consume_decimal_integer<i16>();
-template ErrorOr<u32> GenericLexer::consume_decimal_integer<u32>();
-template ErrorOr<i32> GenericLexer::consume_decimal_integer<i32>();
-template ErrorOr<u64> GenericLexer::consume_decimal_integer<u64>();
-template ErrorOr<i64> GenericLexer::consume_decimal_integer<i64>();
-
-auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
-{
-    if (!consume_specific("\\u"sv))
-        return UnicodeEscapeError::MalformedUnicodeEscape;
-
-    if (next_is('{'))
-        return decode_code_point();
-    return decode_single_or_paired_surrogate(combine_surrogate_pairs);
-}
-
-auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError>
-{
-    bool starts_with_open_bracket = consume_specific('{');
-    VERIFY(starts_with_open_bracket);
-
-    u32 code_point = 0;
-
-    while (true) {
-        if (!next_is(is_ascii_hex_digit))
-            return UnicodeEscapeError::MalformedUnicodeEscape;
-
-        auto new_code_point = (code_point << 4u) | parse_ascii_hex_digit(consume());
-        if (new_code_point < code_point)
-            return UnicodeEscapeError::UnicodeEscapeOverflow;
-
-        code_point = new_code_point;
-        if (consume_specific('}'))
-            break;
-    }
-
-    if (is_unicode(code_point))
-        return code_point;
-    return UnicodeEscapeError::UnicodeEscapeOverflow;
-}
-
-auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
-{
-    constexpr size_t surrogate_length = 4;
-
-    auto decode_one_surrogate = [&]() -> Optional<u16> {
-        u16 surrogate = 0;
-
-        for (size_t i = 0; i < surrogate_length; ++i) {
-            if (!next_is(is_ascii_hex_digit))
-                return {};
-
-            surrogate = (surrogate << 4u) | parse_ascii_hex_digit(consume());
-        }
-
-        return surrogate;
-    };
-
-    auto high_surrogate = decode_one_surrogate();
-    if (!high_surrogate.has_value())
-        return UnicodeEscapeError::MalformedUnicodeEscape;
-    if (!UnicodeUtils::is_utf16_high_surrogate(*high_surrogate))
-        return *high_surrogate;
-    if (!combine_surrogate_pairs || !consume_specific("\\u"sv))
-        return *high_surrogate;
-
-    auto low_surrogate = decode_one_surrogate();
-    if (!low_surrogate.has_value())
-        return UnicodeEscapeError::MalformedUnicodeEscape;
-    if (UnicodeUtils::is_utf16_low_surrogate(*low_surrogate))
-        return UnicodeUtils::decode_utf16_surrogate_pair(*high_surrogate, *low_surrogate);
-
-    retreat(6);
-    return *high_surrogate;
-}
-
 }
--- a/AK/GenericLexer.h
+++ b/AK/GenericLexer.h
@@ -6,51 +6,96 @@

 #pragma once

+#include <AK/Assertions.h>
+#include <AK/Forward.h>
 #include <AK/NonnullOwnPtr.h>
 #include <AK/RedBlackTree.h>
 #include <AK/Result.h>
+#include <AK/ScopeGuard.h>
 #include <AK/StringView.h>
+#include <AK/Utf16View.h>

 namespace AK {

+constexpr auto is_any_of(StringView values)
+{
+    return [values](auto c) { return values.contains(c); };
+}
+
+constexpr auto is_not_any_of(StringView values)
+{
+    return [values](auto c) { return !values.contains(c); };
+}
+
+constexpr auto is_path_separator = is_any_of("/\\"sv);
+constexpr auto is_quote = is_any_of("'\""sv);
+
+enum class UnicodeEscapeError {
+    MalformedUnicodeEscape,
+    UnicodeEscapeOverflow,
+};
+
+namespace Detail {
+
+template<typename CharType>
 class GenericLexer {
+    static_assert(IsOneOf<CharType, char, char16_t>);
+
 public:
-    constexpr explicit GenericLexer(StringView input)
+    using ViewType = Detail::Conditional<IsSame<CharType, char>, StringView, Utf16View>;
+
+    constexpr explicit GenericLexer(ViewType input)
        : m_input(input)
    {
    }

    constexpr size_t tell() const { return m_index; }
-    constexpr size_t tell_remaining() const { return m_input.length() - m_index; }
+    constexpr size_t tell_remaining() const { return input_length() - m_index; }

-    StringView remaining() const { return m_input.substring_view(m_index); }
-    StringView input() const { return m_input; }
+    constexpr ViewType remaining() const { return m_input.substring_view(m_index); }
+    constexpr ViewType input() const { return m_input; }

-    constexpr bool is_eof() const { return m_index >= m_input.length(); }
+    constexpr bool is_eof() const { return m_index >= input_length(); }

-    constexpr char peek(size_t offset = 0) const
+    constexpr CharType peek(size_t offset = 0) const
    {
-        return (m_index + offset < m_input.length()) ? m_input[m_index + offset] : '\0';
+        return (m_index + offset < input_length()) ? code_unit_at(m_index + offset) : '\0';
    }

-    Optional<StringView> peek_string(size_t length, size_t offset = 0) const
+    constexpr Optional<ViewType> peek_string(size_t length, size_t offset = 0) const
    {
-        if (m_index + offset + length > m_input.length())
+        if (m_index + offset + length > input_length())
            return {};
        return m_input.substring_view(m_index + offset, length);
    }

-    constexpr bool next_is(char expected) const
+    constexpr bool next_is(CharType expected) const
    {
        return peek() == expected;
    }

-    constexpr bool next_is(StringView expected) const
+    constexpr bool next_is(char expected) const
+    requires(IsSame<CharType, char16_t>)
    {
-        for (size_t i = 0; i < expected.length(); ++i)
-            if (peek(i) != expected[i])
-                return false;
-        return true;
+        return peek() == expected;
+    }
+
+    constexpr bool next_is(ViewType expected) const
+    {
+        size_t length = 0;
+
+        if constexpr (IsSame<CharType, char16_t>)
+            length = expected.length_in_code_units();
+        else
+            length = expected.length();
+
+        return peek_string(length) == expected;
+    }
+
+    constexpr bool next_is(StringView expected) const
+    requires(IsSame<CharType, char16_t>)
+    {
+        return peek_string(expected.length()) == expected;
    }

    constexpr void retreat()
@@ -65,13 +110,42 @@ public:
        m_index -= count;
    }

-    constexpr char consume()
+    constexpr CharType consume()
    {
        VERIFY(!is_eof());
-        return m_input[m_index++];
+        return code_unit_at(m_index++);
+    }
+
+    constexpr bool consume_specific(CharType next)
+    {
+        if (!next_is(next))
+            return false;
+
+        ignore();
+        return true;
+    }
+
+    constexpr bool consume_specific(char next)
+    requires(IsSame<CharType, char16_t>)
+    {
+        return consume_specific(static_cast<char16_t>(next));
+    }
+
+    constexpr bool consume_specific(ViewType next)
+    {
+        if (!next_is(next))
+            return false;
+
+        if constexpr (IsSame<CharType, char16_t>)
+            ignore(next.length_in_code_units());
+        else
+            ignore(next.length());
+
+        return true;
    }

    constexpr bool consume_specific(StringView next)
+    requires(IsSame<CharType, char16_t>)
    {
        if (!next_is(next))
            return false;
@@ -80,16 +154,7 @@ public:
        return true;
    }

-    constexpr bool consume_specific(char next)
-    {
-        if (!next_is(next))
-            return false;
-
-        ignore(sizeof(next));
-        return true;
-    }
-
-    constexpr char consume_escaped_character(char escape_char = '\\', StringView escape_map = "n\nr\rt\tb\bf\f"sv)
+    constexpr CharType consume_escaped_character(CharType escape_char = '\\', StringView escape_map = "n\nr\rt\tb\bf\f"sv)
    {
        if (!consume_specific(escape_char))
            return consume();
@@ -104,44 +169,180 @@ public:
        return c;
    }

-    StringView consume(size_t count);
-    StringView consume_all();
-    StringView consume_line();
-    StringView consume_until(char);
-    StringView consume_until(StringView);
-    StringView consume_quoted_string(char escape_char = 0);
-
-    template<Integral T>
-    ErrorOr<T> consume_decimal_integer();
-
-    enum class UnicodeEscapeError {
-        MalformedUnicodeEscape,
-        UnicodeEscapeOverflow,
-    };
-
-    Result<u32, UnicodeEscapeError> consume_escaped_code_point(bool combine_surrogate_pairs = true);
-
-    constexpr void ignore(size_t count = 1)
+    // Consume a number of characters
+    constexpr ViewType consume(size_t count)
    {
-        count = min(count, m_input.length() - m_index);
-        m_index += count;
+        auto start = m_index;
+        auto length = min(count, input_length() - m_index);
+        m_index += length;
+
+        return m_input.substring_view(start, length);
    }

-    constexpr void ignore_until(char stop)
+    // Consume the rest of the input
+    constexpr ViewType consume_all()
    {
-        while (!is_eof() && peek() != stop) {
-            ++m_index;
+        auto rest = m_input.substring_view(m_index, input_length() - m_index);
+        m_index = input_length();
+        return rest;
+    }
+
+    // Consume until a new line is found
+    constexpr ViewType consume_line()
+    {
+        auto start = m_index;
+        while (!is_eof() && peek() != '\r' && peek() != '\n')
+            m_index++;
+
+        auto length = m_index - start;
+        consume_specific('\r');
+        consume_specific('\n');
+
+        return m_input.substring_view(start, length);
+    }
+
+    // Consume and return characters until `stop` is peeked
+    constexpr ViewType consume_until(CharType stop)
+    {
+        auto start = m_index;
+        while (!is_eof() && peek() != stop)
+            m_index++;
+
+        auto length = m_index - start;
+        return m_input.substring_view(start, length);
+    }
+
+    constexpr ViewType consume_until(char stop)
+    requires(IsSame<CharType, char16_t>)
+    {
+        return consume_until(static_cast<char16_t>(stop));
+    }
+
+    // Consume and return characters until the string `stop` is found
+    constexpr ViewType consume_until(ViewType stop)
+    {
+        auto start = m_index;
+        while (!is_eof() && !next_is(stop))
+            m_index++;
+
+        auto length = m_index - start;
+        return m_input.substring_view(start, length);
+    }
+
+    // Consume a string surrounded by single or double quotes. The returned ViewType does not include the quotes. An
+    // escape character can be provided to capture the enclosing quotes. Please note that the escape character will
+    // still be in the resulting ViewType.
+    constexpr ViewType consume_quoted_string(CharType escape_char = 0)
+    {
+        if (!next_is(is_quote))
+            return {};
+
+        auto quote_char = consume();
+        auto start = m_index;
+        while (!is_eof()) {
+            if (next_is(escape_char))
+                m_index++;
+            else if (next_is(quote_char))
+                break;
+            m_index++;
+        }
+        auto length = m_index - start;
+
+        if (peek() != quote_char) {
+            // Restore the index in case the string is unterminated
+            m_index = start - 1;
+            return {};
+        }
+
+        // Ignore closing quote
+        ignore();
+
+        return m_input.substring_view(start, length);
+    }
+
+    template<Integral T>
+    ErrorOr<T> consume_decimal_integer()
+    {
+        using UnsignedT = MakeUnsigned<T>;
+
+        ArmedScopeGuard rollback { [&, rollback_position = m_index]() {
+            m_index = rollback_position;
+        } };
+
+        bool has_minus_sign = false;
+
+        if (next_is('+') || next_is('-'))
+            if (consume() == '-')
+                has_minus_sign = true;
+
+        auto number_view = consume_while(is_ascii_digit);
+        if (number_view.is_empty())
+            return Error::from_errno(EINVAL);
+
+        auto maybe_number = number_view.template to_number<UnsignedT>(TrimWhitespace::No);
+        if (!maybe_number.has_value())
+            return Error::from_errno(ERANGE);
+        auto number = maybe_number.value();
+
+        if (!has_minus_sign) {
+            if (NumericLimits<T>::max() < number) // This is only possible in a signed case.
+                return Error::from_errno(ERANGE);
+
+            rollback.disarm();
+            return number;
+        }
+
+        if constexpr (IsUnsigned<T>) {
+            if (number != 0)
+                return Error::from_errno(ERANGE);
+
+            rollback.disarm();
+            return 0;
+        } else {
+            static constexpr UnsignedT max_value = static_cast<UnsignedT>(NumericLimits<T>::max()) + 1;
+            if (number > max_value)
+                return Error::from_errno(ERANGE);
+
+            rollback.disarm();
+            return -number;
        }
    }

-    /*
-     * Conditions are used to match arbitrary characters. You can use lambdas,
-     * ctype functions, or is_any_of() and its derivatives (see below).
-     * A few examples:
-     *   - `if (lexer.next_is(isdigit))`
-     *   - `auto name = lexer.consume_while([](char c) { return isalnum(c) || c == '_'; });`
-     *   - `lexer.ignore_until(is_any_of("<^>"));`
-     */
+    Result<u32, UnicodeEscapeError> consume_escaped_code_point(bool combine_surrogate_pairs = true)
+    {
+        if (!consume_specific("\\u"sv))
+            return UnicodeEscapeError::MalformedUnicodeEscape;
+
+        if (next_is('{'))
+            return decode_code_point();
+        return decode_single_or_paired_surrogate(combine_surrogate_pairs);
+    }
+
+    constexpr void ignore(size_t count = 1)
+    {
+        count = min(count, input_length() - m_index);
+        m_index += count;
+    }
+
+    constexpr void ignore_until(CharType stop)
+    {
+        while (!is_eof() && peek() != stop)
+            ++m_index;
+    }
+
+    constexpr void ignore_until(char stop)
+    requires(IsSame<CharType, char16_t>)
+    {
+        return ignore_until(static_cast<char16_t>(stop));
+    }
+
+    // Conditions are used to match arbitrary characters. You can use lambdas, ctype functions, or is_any_of() and its
+    // derivatives (defined earlier in this header).
+    //
+    // A few examples:
+    //   - `if (lexer.next_is(isdigit))`
+    //   - `auto name = lexer.consume_while([](char c) { return isalnum(c) || c == '_'; });`
+    //   - `lexer.ignore_until(is_any_of("<^>"));`

    // Test the next character against a Condition
    template<typename TPredicate>
@@ -152,25 +353,25 @@ public:

    // Consume and return characters while `pred` returns true
    template<typename TPredicate>
-    StringView consume_while(TPredicate pred)
+    constexpr ViewType consume_while(TPredicate pred)
    {
-        size_t start = m_index;
+        auto start = m_index;
        while (!is_eof() && pred(peek()))
            ++m_index;
-        size_t length = m_index - start;

+        auto length = m_index - start;
        return m_input.substring_view(start, length);
    }

    // Consume and return characters until `pred` return true
    template<typename TPredicate>
-    StringView consume_until(TPredicate pred)
+    constexpr ViewType consume_until(TPredicate pred)
    {
-        size_t start = m_index;
+        auto start = m_index;
        while (!is_eof() && !pred(peek()))
            ++m_index;
-        size_t length = m_index - start;

+        auto length = m_index - start;
        return m_input.substring_view(start, length);
    }

@@ -201,13 +402,88 @@ public:
    }

 protected:
-    Result<u32, UnicodeEscapeError> decode_code_point();
-    Result<u32, UnicodeEscapeError> decode_single_or_paired_surrogate(bool combine_surrogate_pairs = true);
+    Result<u32, UnicodeEscapeError> decode_code_point()
+    {
+        bool starts_with_open_bracket = consume_specific('{');
+        VERIFY(starts_with_open_bracket);

-    StringView m_input;
+        u32 code_point = 0;
+
+        while (true) {
+            if (!next_is(is_ascii_hex_digit))
+                return UnicodeEscapeError::MalformedUnicodeEscape;
+
+            auto new_code_point = (code_point << 4u) | parse_ascii_hex_digit(consume());
+            if (new_code_point < code_point)
+                return UnicodeEscapeError::UnicodeEscapeOverflow;
+
+            code_point = new_code_point;
+            if (consume_specific('}'))
+                break;
+        }
+
+        if (is_unicode(code_point))
+            return code_point;
+        return UnicodeEscapeError::UnicodeEscapeOverflow;
+    }
+
+    Result<u32, UnicodeEscapeError> decode_single_or_paired_surrogate(bool combine_surrogate_pairs = true)
+    {
+        constexpr size_t surrogate_length = 4;
+
+        auto decode_one_surrogate = [&]() -> Optional<u16> {
+            u16 surrogate = 0;
+
+            for (size_t i = 0; i < surrogate_length; ++i) {
+                if (!next_is(is_ascii_hex_digit))
+                    return {};
+
+                surrogate = (surrogate << 4u) | parse_ascii_hex_digit(consume());
+            }
+
+            return surrogate;
+        };
+
+        auto high_surrogate = decode_one_surrogate();
+        if (!high_surrogate.has_value())
+            return UnicodeEscapeError::MalformedUnicodeEscape;
+        if (!UnicodeUtils::is_utf16_high_surrogate(*high_surrogate))
+            return *high_surrogate;
+        if (!combine_surrogate_pairs || !consume_specific("\\u"sv))
+            return *high_surrogate;
+
+        auto low_surrogate = decode_one_surrogate();
+        if (!low_surrogate.has_value())
+            return UnicodeEscapeError::MalformedUnicodeEscape;
+        if (UnicodeUtils::is_utf16_low_surrogate(*low_surrogate))
+            return UnicodeUtils::decode_utf16_surrogate_pair(*high_surrogate, *low_surrogate);
+
+        retreat(6);
+        return *high_surrogate;
+    }
+
+    constexpr size_t input_length() const
+    {
+        if constexpr (IsSame<CharType, char16_t>)
+            return m_input.length_in_code_units();
+        else
+            return m_input.length();
+    }
+
+    constexpr CharType code_unit_at(size_t index) const
+    {
+        if constexpr (IsSame<CharType, char16_t>)
+            return m_input.code_unit_at(index);
+        else
+            return m_input[index];
+    }
+
+    ViewType m_input;
    size_t m_index { 0 };
 };

+}
+
 class LineTrackingLexer : public GenericLexer {
 public:
    struct Position {
@@ -241,19 +517,6 @@ protected:
    mutable size_t m_largest_known_line_start_position { 0 };
 };

-constexpr auto is_any_of(StringView values)
-{
-    return [values](auto c) { return values.contains(c); };
-}
-
-constexpr auto is_not_any_of(StringView values)
-{
-    return [values](auto c) { return !values.contains(c); };
-}
-
-constexpr auto is_path_separator = is_any_of("/\\"sv);
-constexpr auto is_quote = is_any_of("'\""sv);
-
 }

 #if USING_AK_GLOBALLY
--- a/Libraries/LibJS/Token.cpp
+++ b/Libraries/LibJS/Token.cpp
@@ -123,9 +123,9 @@ ByteString Token::string_value(StringValueStatus& status) const

            if (code_point_or_error.is_error()) {
                switch (code_point_or_error.error()) {
-                case GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape:
+                case AK::UnicodeEscapeError::MalformedUnicodeEscape:
                    return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
-                case GenericLexer::UnicodeEscapeError::UnicodeEscapeOverflow:
+                case AK::UnicodeEscapeError::UnicodeEscapeOverflow:
                    return encoding_failure(StringValueStatus::UnicodeEscapeOverflow);
                }
            }
--- a/Tests/AK/TestGenericLexer.cpp
+++ b/Tests/AK/TestGenericLexer.cpp
@@ -7,120 +7,214 @@
 #include <LibTest/TestCase.h>

 #include <AK/GenericLexer.h>
-#include <AK/StringView.h>

 TEST_CASE(should_constexpr_construct_from_empty_string_view)
 {
-    constexpr GenericLexer sut(StringView {});
-    static_assert(sut.is_eof());
+    {
+        constexpr GenericLexer sut(StringView {});
+        static_assert(sut.is_eof());
+    }
+    {
+        constexpr Utf16GenericLexer sut(Utf16View {});
+        static_assert(sut.is_eof());
+    }
 }

 TEST_CASE(should_construct_from_string_view)
 {
-    constexpr GenericLexer sut("abcdef"sv);
-    static_assert(!sut.is_eof());
+    {
+        constexpr GenericLexer sut("abcdef"sv);
+        static_assert(!sut.is_eof());
+    }
+    {
+        constexpr Utf16GenericLexer sut("abcdef"sv);
+        static_assert(!sut.is_eof());
+    }
 }

 TEST_CASE(should_constexpr_tell)
 {
-    constexpr GenericLexer sut("abcdef"sv);
-    static_assert(sut.tell() == 0);
+    {
+        constexpr GenericLexer sut("abcdef"sv);
+        static_assert(sut.tell() == 0);
+    }
+    {
+        constexpr Utf16GenericLexer sut("abcdef"sv);
+        static_assert(sut.tell() == 0);
+    }
 }

 TEST_CASE(should_constexpr_tell_remaining)
 {
-    constexpr GenericLexer sut("abcdef"sv);
-    static_assert(sut.tell_remaining() == 6);
+    {
+        constexpr GenericLexer sut("abcdef"sv);
+        static_assert(sut.tell_remaining() == 6);
+    }
+    {
+        constexpr Utf16GenericLexer sut("abcdef"sv);
+        static_assert(sut.tell_remaining() == 6);
+    }
 }

 TEST_CASE(should_constexpr_peek)
 {
-    constexpr GenericLexer sut("abcdef"sv);
-    static_assert(sut.peek() == 'a');
-    static_assert(sut.peek(2) == 'c');
-    static_assert(sut.peek(100) == '\0');
+    {
+        constexpr GenericLexer sut("abcdef"sv);
+        static_assert(sut.peek() == 'a');
+        static_assert(sut.peek(2) == 'c');
+        static_assert(sut.peek(100) == '\0');
+    }
+    {
+        constexpr Utf16GenericLexer sut("abcdef"sv);
+        static_assert(sut.peek() == 'a');
+        static_assert(sut.peek(2) == 'c');
+        static_assert(sut.peek(100) == '\0');
+    }
 }

 TEST_CASE(should_constexpr_next_is)
 {
-    constexpr GenericLexer sut("abcdef"sv);
-    static_assert(sut.next_is('a'));
-    static_assert(sut.next_is("abc"sv));
+    {
+        constexpr GenericLexer sut("abcdef"sv);
+        static_assert(sut.next_is('a'));
+        static_assert(sut.next_is("abc"sv));
+    }
+    {
+        constexpr Utf16GenericLexer sut("abcdef"sv);
+        static_assert(sut.next_is('a'));
+        static_assert(sut.next_is("abc"sv));
+    }
 }

 TEST_CASE(should_constexpr_retreat)
 {
-    constexpr auto sut = [] {
-        GenericLexer sut("abcdef"sv);
-        sut.consume();
-        sut.retreat();
-        return sut;
-    }();
-    static_assert(sut.peek() == 'a');
+    {
+        constexpr auto sut = [] {
+            GenericLexer sut("abcdef"sv);
+            sut.consume();
+            sut.retreat();
+            return sut;
+        }();
+        static_assert(sut.peek() == 'a');
+    }
+    {
+        constexpr auto sut = [] {
+            Utf16GenericLexer sut("abcdef"sv);
+            sut.consume();
+            sut.retreat();
+            return sut;
+        }();
+        static_assert(sut.peek() == 'a');
+    }
 }

 TEST_CASE(should_constexpr_consume_1)
 {
-    constexpr auto sut = [] {
-        GenericLexer sut("abcdef"sv);
-        sut.consume();
-        return sut;
-    }();
-    static_assert(sut.peek() == 'b');
+    {
+        constexpr auto sut = [] {
+            GenericLexer sut("abcdef"sv);
+            sut.consume();
+            return sut;
+        }();
+        static_assert(sut.peek() == 'b');
+    }
+    {
+        constexpr auto sut = [] {
+            Utf16GenericLexer sut("abcdef"sv);
+            sut.consume();
+            return sut;
+        }();
+        static_assert(sut.peek() == 'b');
+    }
 }

 TEST_CASE(should_constexpr_consume_specific_char)
 {
-    constexpr auto sut = [] {
-        GenericLexer sut("abcdef"sv);
-        sut.consume_specific('a');
-        return sut;
-    }();
-    static_assert(sut.peek() == 'b');
+    {
+        constexpr auto sut = [] {
+            GenericLexer sut("abcdef"sv);
+            sut.consume_specific('a');
+            return sut;
+        }();
+        static_assert(sut.peek() == 'b');
+    }
+    {
+        constexpr auto sut = [] {
+            Utf16GenericLexer sut("abcdef"sv);
+            sut.consume_specific('a');
+            return sut;
+        }();
+        static_assert(sut.peek() == 'b');
+    }
 }

 TEST_CASE(should_constexpr_consume_specific_string_view)
 {
-    constexpr auto sut = [] {
-        GenericLexer sut("abcdef"sv);
-        sut.consume_specific("ab"sv);
-        return sut;
-    }();
-    static_assert(sut.peek() == 'c');
-}
+    {
+        constexpr auto sut = [] {
+            GenericLexer sut("abcdef"sv);
+            VERIFY(sut.consume_specific("ab"sv));
+            return sut;
+        }();
+        static_assert(sut.peek() == 'c');
+    }
+    {
+        constexpr auto sut = [] {
+            Utf16GenericLexer sut("abcdef"sv);
+            VERIFY(sut.consume_specific("abcd"sv));
+            return sut;
+        }();

-TEST_CASE(should_constexpr_consume_specific_cstring)
-{
-    constexpr auto sut = [] {
-        GenericLexer sut("abcdef"sv);
-        sut.consume_specific("abcd"sv);
-        return sut;
-    }();
-    static_assert(sut.peek() == 'e');
+        static_assert(sut.peek() == 'e');
+    }
 }

 TEST_CASE(should_constexpr_consume_specific_with_predicate)
 {
-    constexpr auto sut = [] {
-        GenericLexer sut("h e l l o !"sv);
-        for (size_t i = 0; i < 100; ++i) {
-            sut.consume_specific_with_predicate([](auto c) {
-                return is_ascii_alpha(c) || is_ascii_space(c);
-            });
-        }
-        return sut;
-    }();
-    static_assert(sut.peek() == '!');
+    {
+        constexpr auto sut = [] {
+            GenericLexer sut("h e l l o !"sv);
+            for (size_t i = 0; i < 100; ++i) {
+                sut.consume_specific_with_predicate([](auto c) {
+                    return is_ascii_alpha(c) || is_ascii_space(c);
+                });
+            }
+            return sut;
+        }();
+        static_assert(sut.peek() == '!');
+    }
+    {
+        constexpr auto sut = [] {
+            Utf16GenericLexer sut("h e l l o !"sv);
+            for (size_t i = 0; i < 100; ++i) {
+                sut.consume_specific_with_predicate([](auto c) {
+                    return is_ascii_alpha(c) || is_ascii_space(c);
+                });
+            }
+            return sut;
+        }();
+        static_assert(sut.peek() == '!');
+    }
 }

 TEST_CASE(should_constexpr_ignore_until)
 {
-    constexpr auto sut = [] {
-        GenericLexer sut("abcdef"sv);
-        sut.ignore_until('d');
-        return sut;
-    }();
-    static_assert(sut.peek() == 'd');
+    {
+        constexpr auto sut = [] {
+            GenericLexer sut("abcdef"sv);
+            sut.ignore_until('d');
+            return sut;
+        }();
+        static_assert(sut.peek() == 'd');
+    }
+    {
+        constexpr auto sut = [] {
+            Utf16GenericLexer sut("abcdef"sv);
+            sut.ignore_until('d');
+            return sut;
+        }();
+        static_assert(sut.peek() == 'd');
+    }
 }

 TEST_CASE(should_constexpr_next_is_pred)
@@ -128,41 +222,68 @@ TEST_CASE(should_constexpr_next_is_pred)
    constexpr auto pred = [](auto c) {
        return c == 'a';
    };
-    constexpr GenericLexer sut("abcdef"sv);
-    static_assert(sut.next_is(pred));
+
+    {
+        constexpr GenericLexer sut("abcdef"sv);
+        static_assert(sut.next_is(pred));
+    }
+    {
+        constexpr Utf16GenericLexer sut("abcdef"sv);
+        static_assert(sut.next_is(pred));
+    }
 }

 TEST_CASE(should_constexpr_ignore_while_pred)
 {
-    constexpr auto sut = [] {
-        constexpr auto pred = [](auto c) {
-            return c == 'a';
-        };
+    constexpr auto pred = [](auto c) {
+        return c == 'a';
+    };

-        GenericLexer sut("abcdef"sv);
-        sut.ignore_while(pred);
-        return sut;
-    }();
-    static_assert(sut.peek() == 'b');
+    {
+        constexpr auto sut = [&] {
+            GenericLexer sut("abcdef"sv);
+            sut.ignore_while(pred);
+            return sut;
+        }();
+        static_assert(sut.peek() == 'b');
+    }
+    {
+        constexpr auto sut = [&] {
+            Utf16GenericLexer sut("abcdef"sv);
+            sut.ignore_while(pred);
+            return sut;
+        }();
+        static_assert(sut.peek() == 'b');
+    }
 }

 TEST_CASE(should_constexpr_ignore_until_pred)
 {
-    constexpr auto sut = [] {
-        constexpr auto pred = [](auto c) {
-            return c == 'c';
-        };
+    constexpr auto pred = [](auto c) {
+        return c == 'c';
+    };

-        GenericLexer sut("abcdef"sv);
-        sut.ignore_until(pred);
-        return sut;
-    }();
-    static_assert(sut.peek() == 'c');
+    {
+        constexpr auto sut = [&] {
+            GenericLexer sut("abcdef"sv);
+            sut.ignore_until(pred);
+            return sut;
+        }();
+        static_assert(sut.peek() == 'c');
+    }
+    {
+        constexpr auto sut = [&] {
+            Utf16GenericLexer sut("abcdef"sv);
+            sut.ignore_until(pred);
+            return sut;
+        }();
+        static_assert(sut.peek() == 'c');
+    }
 }

 TEST_CASE(consume_escaped_code_point)
 {
-    auto test = [](StringView test, Result<u32, GenericLexer::UnicodeEscapeError> expected, bool combine_surrogate_pairs = true) {
+    auto test = [](StringView test, Result<u32, AK::UnicodeEscapeError> expected, bool combine_surrogate_pairs = true) {
        GenericLexer lexer(test);

        auto actual = lexer.consume_escaped_code_point(combine_surrogate_pairs);
@@ -173,39 +294,56 @@ TEST_CASE(consume_escaped_code_point)
        else
            EXPECT_EQ(actual.value(), expected.value());
    };
+    auto test_utf16 = [](Utf16View test, Result<u32, AK::UnicodeEscapeError> expected, bool combine_surrogate_pairs = true) {
+        Utf16GenericLexer lexer(test);

-    test("\\u"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
-    test("\\u{"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
-    test("\\u{1"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
-    test("\\u{}"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
-    test("\\u{x}"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+        auto actual = lexer.consume_escaped_code_point(combine_surrogate_pairs);
+        EXPECT_EQ(actual.is_error(), expected.is_error());

-    test("\\u{110000}"sv, GenericLexer::UnicodeEscapeError::UnicodeEscapeOverflow);
-    test("\\u{f00000000}"sv, GenericLexer::UnicodeEscapeError::UnicodeEscapeOverflow);
+        if (actual.is_error() && expected.is_error())
+            EXPECT_EQ(actual.error(), expected.error());
+        else
+            EXPECT_EQ(actual.value(), expected.value());
+    };

-    test("\\u{0}"sv, 0);
-    test("\\u{41}"sv, 0x41);
-    test("\\u{ffff}"sv, 0xffff);
-    test("\\u{10ffff}"sv, 0x10ffff);
+#define CHECK(input, expected, ...)                             \
+    do {                                                        \
+        test(input, expected __VA_OPT__(, ) __VA_ARGS__);       \
+        test_utf16(input, expected __VA_OPT__(, ) __VA_ARGS__); \
+    } while (false)

-    test("\\u1"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
-    test("\\u11"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
-    test("\\u111"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
-    test("\\u111x"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
-    test("\\ud800\\u"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
-    test("\\ud800\\u1"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
-    test("\\ud800\\u11"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
-    test("\\ud800\\u111"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
-    test("\\ud800\\u111x"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+    CHECK("\\u"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);
+    CHECK("\\u{"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);
+    CHECK("\\u{1"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);
+    CHECK("\\u{}"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);
+    CHECK("\\u{x}"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);

-    test("\\u0000"sv, 0x0);
-    test("\\u0041"sv, 0x41);
-    test("\\uffff"sv, 0xffff);
+    CHECK("\\u{110000}"sv, AK::UnicodeEscapeError::UnicodeEscapeOverflow);
+    CHECK("\\u{f00000000}"sv, AK::UnicodeEscapeError::UnicodeEscapeOverflow);

-    test("\\ud83d"sv, 0xd83d);
-    test("\\ud83d\\u1111"sv, 0xd83d);
-    test("\\ud83d\\ude00"sv, 0x1f600);
-    test("\\ud83d\\ude00"sv, 0xd83d, false);
+    CHECK("\\u{0}"sv, 0);
+    CHECK("\\u{41}"sv, 0x41);
+    CHECK("\\u{ffff}"sv, 0xffff);
+    CHECK("\\u{10ffff}"sv, 0x10ffff);
+
+    CHECK("\\u1"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);
+    CHECK("\\u11"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);
+    CHECK("\\u111"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);
+    CHECK("\\u111x"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);
+    CHECK("\\ud800\\u"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);
+    CHECK("\\ud800\\u1"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);
+    CHECK("\\ud800\\u11"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);
+    CHECK("\\ud800\\u111"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);
+    CHECK("\\ud800\\u111x"sv, AK::UnicodeEscapeError::MalformedUnicodeEscape);
+
+    CHECK("\\u0000"sv, 0x0);
+    CHECK("\\u0041"sv, 0x41);
+    CHECK("\\uffff"sv, 0xffff);
+
+    CHECK("\\ud83d"sv, 0xd83d);
+    CHECK("\\ud83d\\u1111"sv, 0xd83d);
+    CHECK("\\ud83d\\ude00"sv, 0x1f600);
+    CHECK("\\ud83d\\ude00"sv, 0xd83d, false);
 }

 TEST_CASE(consume_decimal_integer_correctly_parses)
@@ -217,7 +355,14 @@ TEST_CASE(consume_decimal_integer_correctly_parses)
        VERIFY(!actual.is_error());                             \
        EXPECT_EQ(actual.value(), static_cast<type>(expected)); \
        EXPECT_EQ(lexer.tell(), test##sv.length());             \
+                                                                \
+        Utf16GenericLexer utf16_lexer(test##sv);                \
+        actual = utf16_lexer.consume_decimal_integer<type>();   \
+        VERIFY(!actual.is_error());                             \
+        EXPECT_EQ(actual.value(), static_cast<type>(expected)); \
+        EXPECT_EQ(utf16_lexer.tell(), test##sv.length());       \
    } while (false)
+
    CHECK_PARSES_INTEGER("0", 0, u8);
    CHECK_PARSES_INTEGER("-0", -0, u8);
    CHECK_PARSES_INTEGER("10", 10, u8);
@@ -270,7 +415,14 @@ TEST_CASE(consume_decimal_integer_fails_with_correct_error)
        VERIFY(actual.is_error() && actual.error().is_errno()); \
        EXPECT_EQ(actual.error().code(), err);                  \
        EXPECT_EQ(lexer.tell(), static_cast<size_t>(0));        \
+                                                                \
+        Utf16GenericLexer utf16_lexer(test##sv);                \
+        actual = utf16_lexer.consume_decimal_integer<type>();   \
+        VERIFY(actual.is_error() && actual.error().is_errno()); \
+        EXPECT_EQ(actual.error().code(), err);                  \
+        EXPECT_EQ(utf16_lexer.tell(), static_cast<size_t>(0));  \
    } while (false)
+
    CHECK_FAILS_WITH_ERROR("Well hello GenericLexer!", u64, EINVAL);
    CHECK_FAILS_WITH_ERROR("+", u64, EINVAL);
    CHECK_FAILS_WITH_ERROR("+WHF", u64, EINVAL);