LibRegex: Support matching unicode multi-character sequences

Author: https://github.com/aplefull Commit: https://github.com/LadybirdBrowser/ladybird/commit/a49c39de32f Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/6867 Reviewed-by: https://github.com/alimpfard ✅
2025-12-05 01:10:24 +00:00 · 2025-11-09 13:35:16 +01:00 · 2025-11-26 10:35:48 +00:00
parent 5b7c9af340
commit a49c39de32
7 changed files with 462 additions and 34 deletions
--- a/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js
+++ b/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js
@@ -95,7 +95,7 @@ test("Unicode non-ASCII matching", () => {
    }
 });

-// Test from https://github.com/tc39/test262/blob/main/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-property-of-strings-escape.js
+// https://github.com/tc39/test262/tree/main/test/built-ins/RegExp/unicodeSets/generated
 test("Unicode properties of strings", () => {
    const regexes = [
        /\p{Basic_Emoji}/v,
@@ -113,26 +113,119 @@ test("Unicode properties of strings", () => {
        }).not.toThrow();
    }

-    const matchStrings = ["0", "1", "2", "3", "4", "5", "8", "A", "B", "D", "E", "F", "a", "b", "c", "d", "e", "f"];
-
-    const nonMatchStrings = [
-        "6\uFE0F\u20E3",
-        "7\uFE0F\u20E3",
-        "9\uFE0F\u20E3",
-        "\u2603",
-        "\u{1D306}",
-        "\u{1F1E7}\u{1F1EA}",
-    ];
-
-    const re = /^[\p{ASCII_Hex_Digit}--\p{Emoji_Keycap_Sequence}]+$/v;
-
-    for (const str of matchStrings) {
-        expect(re.test(str)).toBeTrue();
+    // Checks one `v`-flag character class: `regExp` must accept every entry of `matchStrings`
+    // and reject every entry of `nonMatchStrings`.
+    function testExtendedCharacterClass({ regExp, matchStrings, nonMatchStrings }) {
+        matchStrings.forEach(str => expect(regExp.test(str)).toBeTrue());
+        nonMatchStrings.forEach(str => expect(regExp.test(str)).toBeFalse());
    }

-    for (const str of nonMatchStrings) {
-        expect(re.test(str)).toBeFalse();
-    }
+    testExtendedCharacterClass({
+        regExp: /^[\p{ASCII_Hex_Digit}--\p{Emoji_Keycap_Sequence}]+$/v,
+        matchStrings: ["0", "1", "2", "3", "4", "5", "8", "A", "B", "D", "E", "F", "a", "b", "c", "d", "e", "f"],
+        nonMatchStrings: [
+            "6\uFE0F\u20E3",
+            "7\uFE0F\u20E3",
+            "9\uFE0F\u20E3",
+            "\u2603",
+            "\u{1D306}",
+            "\u{1F1E7}\u{1F1EA}",
+        ],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[\d\p{Emoji_Keycap_Sequence}]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3"],
+        nonMatchStrings: ["C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[[0-9]\p{Emoji_Keycap_Sequence}]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3"],
+        nonMatchStrings: ["C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[_--[0-9]]+$/v,
+        matchStrings: ["_"],
+        nonMatchStrings: ["6\uFE0F\u20E3", "7", "9\uFE0F\u20E3", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[\p{ASCII_Hex_Digit}--[0-9]]+$/v,
+        matchStrings: ["a", "b"],
+        nonMatchStrings: ["0", "9", "9\uFE0F\u20E3", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[\p{ASCII_Hex_Digit}\p{Emoji_Keycap_Sequence}]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "A", "B", "a", "b"],
+        nonMatchStrings: ["\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[_\p{Emoji_Keycap_Sequence}]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3", "_"],
+        nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[\p{Emoji_Keycap_Sequence}--\d]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
+        nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[\p{Emoji_Keycap_Sequence}--[0-9]]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
+        nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[\p{Emoji_Keycap_Sequence}--\p{ASCII_Hex_Digit}]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
+        nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[\p{Emoji_Keycap_Sequence}--_]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
+        nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[\p{Emoji_Keycap_Sequence}&&\p{Emoji_Keycap_Sequence}]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
+        nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[\p{Emoji_Keycap_Sequence}\d]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3"],
+        nonMatchStrings: ["C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[\p{Emoji_Keycap_Sequence}[0-9]]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3"],
+        nonMatchStrings: ["C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[\p{Emoji_Keycap_Sequence}\p{ASCII_Hex_Digit}]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3", "A", "a"],
+        nonMatchStrings: ["\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[\p{Emoji_Keycap_Sequence}_]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3", "_"],
+        nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
+
+    testExtendedCharacterClass({
+        regExp: /^[\p{Emoji_Keycap_Sequence}\p{Emoji_Keycap_Sequence}]+$/v,
+        matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
+        nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
+    });
 });

 test("Unicode matching with u and v flags", () => {
--- a/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Libraries/LibRegex/RegexByteCode.cpp
@@ -175,6 +175,41 @@ StringTable::~StringTable()
        --s_next_string_table_serial; // We didn't use this serial, put it back.
 }

+// Source of the serial handed to each newly constructed StringSetTable.
+static u32 s_next_string_set_table_serial { 0 };
+
+// Inserts a deep copy of every trie in `source` into `destination`, under the key it has in `source`.
+static void deep_copy_tries_into(HashMap<ByteCodeValueType, StringSetTrie>& destination, HashMap<ByteCodeValueType, StringSetTrie> const& source)
+{
+    for (auto const& [key, trie] : source)
+        destination.set(key, MUST(const_cast<StringSetTrie&>(trie).deep_copy()));
+}
+
+StringSetTable::StringSetTable()
+    : m_serial(s_next_string_set_table_serial++)
+{
+}
+
+StringSetTable::~StringSetTable()
+{
+    // Hand the serial back only when this table took the most recent one and never stored a trie.
+    bool const took_latest_serial = m_serial == s_next_string_set_table_serial - 1;
+    if (!took_latest_serial || !m_u8_tries.is_empty())
+        return;
+
+    --s_next_string_set_table_serial;
+}
+
+// A copy takes a fresh serial of its own; the copied tries keep the keys they had in `other`.
+StringSetTable::StringSetTable(StringSetTable const& other)
+    : m_serial(s_next_string_set_table_serial++)
+{
+    deep_copy_tries_into(m_u8_tries, other.m_u8_tries);
+    deep_copy_tries_into(m_u16_tries, other.m_u16_tries);
+}
+
+// Replaces this table's tries with deep copies of `other`'s; m_serial is left untouched.
+StringSetTable& StringSetTable::operator=(StringSetTable const& other)
+{
+    if (this == &other)
+        return *this;
+
+    m_u8_tries.clear();
+    m_u16_tries.clear();
+    deep_copy_tries_into(m_u8_tries, other.m_u8_tries);
+    deep_copy_tries_into(m_u16_tries, other.m_u16_tries);
+    return *this;
+}
+
 void ByteCode::ensure_opcodes_initialized()
 {
    if (s_opcodes_initialized)
@@ -450,8 +485,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
    struct DisjunctionState {
        bool active { false };
        bool is_conjunction { false };
+        bool is_subtraction { false };
        bool fail { false };
        bool inverse_matched { false };
+        size_t subtraction_operand_index { 0 };
        size_t initial_position;
        size_t initial_code_unit_position;
        Optional<size_t> last_accepted_position {};
@@ -471,19 +508,35 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M

    state.string_position_before_match = state.string_position;

+    bool has_string_set = false;
+    bool string_set_matched = false;
+    size_t best_match_position = state.string_position;
+    size_t best_match_position_in_code_units = state.string_position_in_code_units;
+
    size_t offset { state.instruction_position + 3 };
+    CharacterCompareType last_compare_type = CharacterCompareType::Undefined;
+
    for (size_t i = 0; i < argument_count; ++i) {
        if (state.string_position > string_position)
            break;

+        if (has_string_set) {
+            state.string_position = string_position;
+            state.string_position_in_code_units = current_disjunction_state().initial_code_unit_position;
+        }
+
+        auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
+
        if (reset_temp_inverse) {
            reset_temp_inverse = false;
-            temporary_inverse = false;
+            if (compare_type != CharacterCompareType::Property || last_compare_type != CharacterCompareType::StringSet) {
+                temporary_inverse = false;
+            }
        } else {
            reset_temp_inverse = true;
        }

-        auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
+        last_compare_type = compare_type;

        switch (compare_type) {
        case CharacterCompareType::Inverse:
@@ -710,6 +763,111 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
            compare_script_extension(input, state, script, current_inversion_state(), inverse_matched);
            break;
        }
+        case CharacterCompareType::StringSet: {
+            has_string_set = true;
+            auto string_set_index = m_bytecode->at(offset++);
+
+            bool matched = false;
+            size_t longest_match_length = 0;
+
+            auto find_longest_match = [&](auto const& view, auto const& trie) {
+                auto const* current = &trie;
+                size_t current_code_unit_offset = state.string_position_in_code_units;
+
+                while (true) {
+                    u32 value;
+
+                    if constexpr (IsSame<decltype(view), Utf16View const&>) {
+                        if (current_code_unit_offset >= view.length_in_code_units())
+                            break;
+                        value = view.code_unit_at(current_code_unit_offset);
+                    } else {
+                        if (current_code_unit_offset >= input.view.length_in_code_units())
+                            break;
+                        value = input.view.code_point_at(current_code_unit_offset);
+                    }
+
+                    if (input.regex_options & AllFlags::Insensitive) {
+                        bool found_child = false;
+                        for (auto const& [key, child] : current->children()) {
+                            if (to_ascii_lowercase(key) == to_ascii_lowercase(value)) {
+                                current = static_cast<StringSetTrie const*>(child.ptr());
+                                current_code_unit_offset++;
+                                found_child = true;
+                                break;
+                            }
+                        }
+                        if (!found_child)
+                            break;
+                    } else {
+                        auto it = current->children().find(value);
+                        if (it == current->children().end())
+                            break;
+
+                        current = static_cast<StringSetTrie const*>(it->value.ptr());
+                        current_code_unit_offset++;
+                    }
+
+                    auto is_terminal = current->has_metadata() && current->metadata_value();
+                    if (is_terminal) {
+                        size_t match_length_in_code_points;
+                        if constexpr (IsSame<decltype(view), Utf16View const&>) {
+                            size_t code_points = 0;
+                            for (size_t i = state.string_position_in_code_units; i < current_code_unit_offset;) {
+                                auto code_point = view.code_point_at(i);
+                                i += code_point >= 0x10000 ? 2 : 1;
+                                code_points++;
+                            }
+                            match_length_in_code_points = code_points;
+                        } else {
+                            size_t code_points = 0;
+                            for (size_t i = state.string_position_in_code_units; i < current_code_unit_offset;) {
+                                auto code_point = input.view.code_point_at(i);
+                                if (code_point <= 0x7F)
+                                    i += 1;
+                                else if (code_point <= 0x7FF)
+                                    i += 2;
+                                else if (code_point <= 0xFFFF)
+                                    i += 3;
+                                else
+                                    i += 4;
+                                code_points++;
+                            }
+                            match_length_in_code_points = code_points;
+                        }
+
+                        if (match_length_in_code_points > longest_match_length) {
+                            matched = true;
+                            longest_match_length = match_length_in_code_points;
+                        }
+                    }
+                }
+            };
+
+            if (input.view.u16_view().is_null()) {
+                auto const& trie = m_bytecode->string_set_table().get_u8_trie(string_set_index);
+                StringView view;
+                find_longest_match(view, trie);
+            } else {
+                auto const& view = input.view.u16_view();
+                auto const& trie = m_bytecode->string_set_table().get_u16_trie(string_set_index);
+                find_longest_match(view, trie);
+            }
+
+            if (matched) {
+                if (current_inversion_state()) {
+                    inverse_matched = true;
+                } else {
+                    state.string_position += longest_match_length;
+                    if (input.view.unicode()) {
+                        state.string_position_in_code_units = input.view.code_unit_offset_of(state.string_position);
+                    } else {
+                        state.string_position_in_code_units = state.string_position;
+                    }
+                }
+            }
+            break;
+        }
        case CharacterCompareType::And:
            disjunction_states.append({
                .active = true,
@@ -720,6 +878,17 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
                .initial_code_unit_position = state.string_position_in_code_units,
            });
            continue;
+        case CharacterCompareType::Subtract:
+            disjunction_states.append({
+                .active = true,
+                .is_conjunction = true,
+                .is_subtraction = true,
+                .fail = true,
+                .inverse_matched = false,
+                .initial_position = state.string_position,
+                .initial_code_unit_position = state.string_position_in_code_units,
+            });
+            continue;
        case CharacterCompareType::Or:
            disjunction_states.append({
                .active = true,
@@ -735,6 +904,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
            if (!disjunction_state.fail) {
                state.string_position = disjunction_state.last_accepted_position.value_or(disjunction_state.initial_position);
                state.string_position_in_code_units = disjunction_state.last_accepted_code_unit_position.value_or(disjunction_state.initial_code_unit_position);
+            } else if (has_string_set) {
+                string_set_matched = false;
+                best_match_position = disjunction_state.initial_position;
+                best_match_position_in_code_units = disjunction_state.initial_code_unit_position;
            }
            inverse_matched = disjunction_state.inverse_matched || disjunction_state.fail;
            break;
@@ -751,6 +924,12 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
            inverse_matched = true;
        }

+        if (has_string_set && state.string_position > best_match_position) {
+            best_match_position = state.string_position;
+            best_match_position_in_code_units = state.string_position_in_code_units;
+            string_set_matched = true;
+        }
+
        if (!has_single_argument && new_disjunction_state.active) {
            auto failed = (!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length();

@@ -760,10 +939,18 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
                new_disjunction_state.inverse_matched |= inverse_matched;
            }

-            if (new_disjunction_state.is_conjunction)
+            if (new_disjunction_state.is_subtraction) {
+                if (new_disjunction_state.subtraction_operand_index == 0) {
+                    new_disjunction_state.fail = failed && new_disjunction_state.fail;
+                } else if (!failed && (!has_string_set || state.string_position >= best_match_position)) {
+                    new_disjunction_state.fail = true;
+                }
+                new_disjunction_state.subtraction_operand_index++;
+            } else if (new_disjunction_state.is_conjunction) {
                new_disjunction_state.fail = failed && new_disjunction_state.fail;
-            else
+            } else {
                new_disjunction_state.fail = failed || new_disjunction_state.fail;
+            }

            state.string_position = new_disjunction_state.initial_position;
            state.string_position_in_code_units = new_disjunction_state.initial_code_unit_position;
@@ -773,11 +960,16 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M

    if (!has_single_argument) {
        auto& new_disjunction_state = current_disjunction_state();
-        if (new_disjunction_state.active) {
-            if (!new_disjunction_state.fail) {
-                state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
-                state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
-            }
+        if (new_disjunction_state.active && !new_disjunction_state.fail) {
+            state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
+            state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
+        }
+    }
+
+    if (has_string_set && string_set_matched) {
+        if (has_single_argument || best_match_position > string_position) {
+            state.string_position = best_match_position;
+            state.string_position_in_code_units = best_match_position_in_code_units;
        }
    }

--- a/Libraries/LibRegex/RegexByteCode.h
+++ b/Libraries/LibRegex/RegexByteCode.h
@@ -14,6 +14,7 @@
 #include <AK/Forward.h>
 #include <AK/HashMap.h>
 #include <AK/OwnPtr.h>
+#include <AK/Trie.h>
 #include <AK/TypeCasts.h>
 #include <AK/Types.h>
 #include <AK/Vector.h>
@@ -78,7 +79,9 @@ enum class OpCodeId : ByteCodeValueType {
    __ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable)          \
    __ENUMERATE_CHARACTER_COMPARE_TYPE(And)                  \
    __ENUMERATE_CHARACTER_COMPARE_TYPE(Or)                   \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(EndAndOr)
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(EndAndOr)             \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Subtract)             \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(StringSet)

 enum class CharacterCompareType : ByteCodeValueType {
 #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
@@ -177,6 +180,62 @@ struct REGEX_API StringTable {
    HashMap<ByteCodeValueType, FlyString> m_inverse_table;
 };

+// Trie over u32 elements; a node's bool metadata is true when a stored string ends at that node.
+using StringSetTrie = Trie<u32, bool>;
+
+// Holds the string sets registered through set(). Each set is stored twice under the same key:
+// once as a trie over code points and once as a trie over UTF-16 code units.
+struct REGEX_API StringSetTable {
+    StringSetTable();
+    ~StringSetTable();
+    StringSetTable(StringSetTable const& other);
+    StringSetTable(StringSetTable&&) = default;
+    StringSetTable& operator=(StringSetTable const& other);
+    StringSetTable& operator=(StringSetTable&&) = default;
+
+    // Registers `strings` as a new set and returns its key: the table serial in the upper
+    // 32 bits, and the number of sets already stored in the lower 32 bits.
+    ByteCodeValueType set(Vector<String> const& strings)
+    {
+        auto const slot = static_cast<u32>(m_u8_tries.size());
+        auto const key = (static_cast<ByteCodeValueType>(m_serial) << 32) | static_cast<ByteCodeValueType>(slot);
+
+        // Nodes created on the way to the end of a string are not terminal.
+        auto const non_terminal = [](auto&, auto) { return false; };
+
+        StringSetTrie code_point_trie { 0, false };
+        StringSetTrie code_unit_trie { 0, false };
+
+        for (auto const& string : strings) {
+            Vector<u32> code_points;
+            Utf8View decoded { string.bytes_as_string_view() };
+            for (auto code_point : decoded)
+                code_points.append(code_point);
+
+            (void)code_point_trie.insert(code_points.begin(), code_points.end(), true, non_terminal);
+
+            // Re-encode as UTF-16 so the second trie can be walked one code unit at a time.
+            auto utf16_string = Utf16String::from_utf32({ code_points.data(), code_points.size() });
+            auto utf16_view = utf16_string.utf16_view();
+            Vector<u32> code_units;
+            for (size_t index = 0; index < utf16_view.length_in_code_units(); ++index) {
+                u32 const code_unit = utf16_view.code_unit_at(index);
+                code_units.append(code_unit);
+            }
+
+            (void)code_unit_trie.insert(code_units.begin(), code_units.end(), true, non_terminal);
+        }
+
+        m_u8_tries.set(key, move(code_point_trie));
+        m_u16_tries.set(key, move(code_unit_trie));
+        return key;
+    }
+
+    // Code-point trie for a key previously returned by set(); the key must be present.
+    StringSetTrie const& get_u8_trie(ByteCodeValueType index) const
+    {
+        return m_u8_tries.get(index).value();
+    }
+
+    // UTF-16 code-unit trie for a key previously returned by set(); the key must be present.
+    StringSetTrie const& get_u16_trie(ByteCodeValueType index) const
+    {
+        return m_u16_tries.get(index).value();
+    }
+
+    u32 m_serial { 0 };
+    HashMap<ByteCodeValueType, StringSetTrie> m_u8_tries;
+    HashMap<ByteCodeValueType, StringSetTrie> m_u16_tries;
+};
+
 class REGEX_API ByteCode : public DisjointChunks<ByteCodeValueType> {
    using Base = DisjointChunks<ByteCodeValueType>;

@@ -262,6 +321,9 @@ public:
    FlyString get_string(size_t index) const { return m_string_table.get(index); }
    auto const& string_table() const { return m_string_table; }

+    // Read-only and mutable access to the table of string sets that StringSet compare arguments index into.
+    auto const& string_set_table() const { return m_string_set_table; }
+    auto& string_set_table() { return m_string_set_table; }
+
    Optional<size_t> get_group_name_index(size_t group_index) const
    {
        return m_group_name_mappings.get(group_index);
@@ -286,6 +348,13 @@ public:
            }
            m_string_table.m_inverse_table.update(other.m_string_table.m_inverse_table);

+            for (auto const& entry : other.m_string_set_table.m_u8_tries) {
+                m_string_set_table.m_u8_tries.set(entry.key, MUST(const_cast<StringSetTrie&>(entry.value).deep_copy()));
+            }
+            for (auto const& entry : other.m_string_set_table.m_u16_tries) {
+                m_string_set_table.m_u16_tries.set(entry.key, MUST(const_cast<StringSetTrie&>(entry.value).deep_copy()));
+            }
+
            for (auto const& mapping : other.m_group_name_mappings) {
                m_group_name_mappings.set(mapping.key, mapping.value);
            }
@@ -631,6 +700,7 @@ private:
    static bool s_opcodes_initialized;
    static size_t s_next_checkpoint_serial_id;
    StringTable m_string_table;
+    StringSetTable m_string_set_table;
    HashMap<size_t, size_t> m_group_name_mappings;
 };

--- a/Libraries/LibRegex/RegexOptimizer.cpp
+++ b/Libraries/LibRegex/RegexOptimizer.cpp
@@ -113,6 +113,8 @@ static bool interpret_compares(Vector<CompareTypeAndValuePair> const& lhs, Stati
            // FIXME: We just need to look at the last character of this string, but we only have the first character here.
            //        Just bail out to avoid false positives.
            return false;
+        case CharacterCompareType::StringSet:
+            return false;
        case CharacterCompareType::CharClass:
            if (!current_lhs_inversion_state())
                lhs_char_classes.set(static_cast<CharClass>(pair.value));
@@ -167,6 +169,7 @@ static bool interpret_compares(Vector<CompareTypeAndValuePair> const& lhs, Stati
            // These are the default behaviour for [...], so we don't need to do anything (unless we add support for 'And' below).
            break;
        case CharacterCompareType::And:
+        case CharacterCompareType::Subtract:
            // FIXME: These are too difficult to handle, so bail out.
            return false;
        case CharacterCompareType::Undefined:
@@ -495,6 +498,8 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
            // FIXME: We just need to look at the last character of this string, but we only have the first character here.
            //        Just bail out to avoid false positives.
            return true;
+        case CharacterCompareType::StringSet:
+            return true;
        case CharacterCompareType::CharClass: {
            auto contains = char_class_contains(static_cast<CharClass>(pair.value));
            if (!in_or() && (current_lhs_inversion_state() ^ contains))
@@ -613,6 +618,7 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
            break;
        }
        case CharacterCompareType::And:
+        case CharacterCompareType::Subtract:
            // FIXME: These are too difficult to handle, so bail out.
            return true;
        case CharacterCompareType::Undefined:
@@ -1838,6 +1844,7 @@ static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCod
    case CharacterCompareType::EndAndOr:
        return LookupTableInsertionOutcome::FinishFlushOnInsertion;
    case CharacterCompareType::And:
+    case CharacterCompareType::Subtract:
        return LookupTableInsertionOutcome::FlushOnInsertion;
    case CharacterCompareType::Reference:
    case CharacterCompareType::NamedReference:
@@ -1845,6 +1852,7 @@ static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCod
    case CharacterCompareType::GeneralCategory:
    case CharacterCompareType::Script:
    case CharacterCompareType::ScriptExtension:
+    case CharacterCompareType::StringSet:
    case CharacterCompareType::Or:
        return LookupTableInsertionOutcome::CannotPlaceInTable;
    case CharacterCompareType::Undefined:
@@ -1870,6 +1878,7 @@ void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndVa
                && pair.type != CharacterCompareType::Inverse
                && pair.type != CharacterCompareType::And
                && pair.type != CharacterCompareType::Or
+                && pair.type != CharacterCompareType::Subtract
                && pair.type != CharacterCompareType::EndAndOr)
                arguments.append(pair.value);
            ++argument_count;
@@ -1983,6 +1992,7 @@ void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndVa
                    && value.type != CharacterCompareType::Inverse
                    && value.type != CharacterCompareType::And
                    && value.type != CharacterCompareType::Or
+                    && value.type != CharacterCompareType::Subtract
                    && value.type != CharacterCompareType::EndAndOr)
                    arguments.append(value.value);
                ++argument_count;
--- a/Libraries/LibRegex/RegexParser.cpp
+++ b/Libraries/LibRegex/RegexParser.cpp
@@ -1684,7 +1684,15 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
                compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
            property.visit(
                [&](Unicode::Property property) {
-                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
+                    if (Unicode::is_ecma262_string_property(property) && !negated) {
+                        auto strings = Unicode::get_property_strings(property);
+                        if (!strings.is_empty()) {
+                            auto string_set_index = m_parser_state.bytecode.string_set_table().set(move(strings));
+                            compares.empend(CompareTypeAndValuePair { CharacterCompareType::StringSet, string_set_index });
+                        }
+                    } else {
+                        compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
+                    }
                },
                [&](Unicode::GeneralCategory general_category) {
                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() });
@@ -2165,6 +2173,11 @@ bool ECMA262Parser::parse_class_union(Vector<regex::CompareTypeAndValuePair>& co
        first = false;
    }

+    if (!first) {
+        compares.prepend({ CharacterCompareType::Or, 0 });
+        compares.append({ CharacterCompareType::EndAndOr, 0 });
+    }
+
    restore_position.disarm();
    return !has_error();
 }
@@ -2220,7 +2233,7 @@ bool ECMA262Parser::parse_class_subtraction(Vector<CompareTypeAndValuePair>& com
    if (!try_skip("--"sv))
        return false;

-    compares.append({ CharacterCompareType::And, 0 });
+    compares.append({ CharacterCompareType::Subtract, 0 });
    compares.extend(move(lhs));

    do {
@@ -2228,7 +2241,6 @@ bool ECMA262Parser::parse_class_subtraction(Vector<CompareTypeAndValuePair>& com
        if (!parse_class_set_operand(rhs))
            return false;

-        compares.append({ CharacterCompareType::TemporaryInverse, 0 });
        compares.extend(rhs);
    } while (!has_error() && try_skip("--"sv));

@@ -2376,7 +2388,15 @@ bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePai
            compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
        property.visit(
            [&](Unicode::Property property) {
-                compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
+                if (Unicode::is_ecma262_string_property(property) && !negated) {
+                    auto strings = Unicode::get_property_strings(property);
+                    if (!strings.is_empty()) {
+                        auto string_set_index = m_parser_state.bytecode.string_set_table().set(move(strings));
+                        compares.empend(CompareTypeAndValuePair { CharacterCompareType::StringSet, string_set_index });
+                    }
+                } else {
+                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
+                }
            },
            [&](Unicode::GeneralCategory general_category) {
                compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() });
@@ -2477,8 +2497,15 @@ bool ECMA262Parser::parse_nested_class(Vector<regex::CompareTypeAndValuePair>& c
                                return;
                            }
                        }
+
+                        auto strings = Unicode::get_property_strings(property);
+                        if (!strings.is_empty()) {
+                            auto string_set_index = m_parser_state.bytecode.string_set_table().set(move(strings));
+                            compares.empend(CompareTypeAndValuePair { CharacterCompareType::StringSet, string_set_index });
+                        }
+                    } else {
+                        compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
                    }
-                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
                },
                [&](Unicode::GeneralCategory general_category) {
                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() });
--- a/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Libraries/LibUnicode/CharacterTypes.cpp
@@ -12,7 +12,9 @@
 #include <LibUnicode/ICU.h>

 #include <unicode/uchar.h>
+#include <unicode/uniset.h>
 #include <unicode/uscript.h>
+#include <unicode/uset.h>

 namespace Unicode {

@@ -321,6 +323,39 @@ bool is_ecma262_string_property(Property property)
    }
 }

+// Lists every member of a string-valued ECMA-262 property: first one String per code point in
+// the ICU set's ranges, then the set's strings. Returns an empty vector when `property` is not
+// a string property or when ICU cannot provide its set.
+Vector<String> get_property_strings(Property property)
+{
+    Vector<String> result;
+
+    if (!is_ecma262_string_property(property))
+        return result;
+
+    UErrorCode status = U_ZERO_ERROR;
+    auto const* icu_set = u_getBinaryPropertySet(static_cast<UProperty>(property.value()), &status);
+    if (!icu_success(status) || !icu_set)
+        return result;
+
+    auto const* unicode_set = icu::UnicodeSet::fromUSet(icu_set);
+    if (!unicode_set)
+        return result;
+
+    // size() counts single code points and strings alike, so reserving it up front avoids
+    // regrowing the vector while the entries below are appended.
+    result.ensure_capacity(static_cast<size_t>(unicode_set->size()));
+
+    auto range_count = unicode_set->getRangeCount();
+    for (int32_t i = 0; i < range_count; ++i) {
+        auto start = unicode_set->getRangeStart(i);
+        auto end = unicode_set->getRangeEnd(i);
+
+        for (auto code_point = start; code_point <= end; ++code_point) {
+            result.append(String::from_code_point(code_point));
+        }
+    }
+
+    for (auto const& str : unicode_set->strings()) {
+        result.append(icu_string_to_string(str));
+    }
+
+    return result;
+}
+
 Optional<Script> script_from_string(StringView script)
 {
    static auto script_names = []() {
--- a/Libraries/LibUnicode/CharacterTypes.h
+++ b/Libraries/LibUnicode/CharacterTypes.h
@@ -40,6 +40,7 @@ bool code_point_has_white_space_property(u32 code_point);

 bool is_ecma262_property(Property);
 bool is_ecma262_string_property(Property);
+Vector<String> get_property_strings(Property);

 Optional<Script> script_from_string(StringView);
 bool code_point_has_script(u32 code_point, Script script);