LibRegex: Support matching unicode multi-character sequences

This commit is contained in:
aplefull
2025-11-09 13:35:16 +01:00
committed by Ali Mohammad Pur
parent 5b7c9af340
commit a49c39de32
Notes: github-actions[bot] 2025-11-26 10:35:48 +00:00
7 changed files with 462 additions and 34 deletions

View File

@@ -95,7 +95,7 @@ test("Unicode non-ASCII matching", () => {
}
});
// Test from https://github.com/tc39/test262/blob/main/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-property-of-strings-escape.js
// https://github.com/tc39/test262/tree/main/test/built-ins/RegExp/unicodeSets/generated
test("Unicode properties of strings", () => {
const regexes = [
/\p{Basic_Emoji}/v,
@@ -113,26 +113,119 @@ test("Unicode properties of strings", () => {
}).not.toThrow();
}
const matchStrings = ["0", "1", "2", "3", "4", "5", "8", "A", "B", "D", "E", "F", "a", "b", "c", "d", "e", "f"];
const nonMatchStrings = [
"6\uFE0F\u20E3",
"7\uFE0F\u20E3",
"9\uFE0F\u20E3",
"\u2603",
"\u{1D306}",
"\u{1F1E7}\u{1F1EA}",
];
const re = /^[\p{ASCII_Hex_Digit}--\p{Emoji_Keycap_Sequence}]+$/v;
for (const str of matchStrings) {
expect(re.test(str)).toBeTrue();
// Asserts that `regExp` accepts every entry of `matchStrings` and rejects
// every entry of `nonMatchStrings`.
function testExtendedCharacterClass({ regExp, matchStrings, nonMatchStrings }) {
    for (const candidate of matchStrings) {
        expect(regExp.test(candidate)).toBeTrue();
    }
    for (const candidate of nonMatchStrings) {
        expect(regExp.test(candidate)).toBeFalse();
    }
}
for (const str of nonMatchStrings) {
expect(re.test(str)).toBeFalse();
}
testExtendedCharacterClass({
regExp: /^[\p{ASCII_Hex_Digit}--\p{Emoji_Keycap_Sequence}]+$/v,
matchStrings: ["0", "1", "2", "3", "4", "5", "8", "A", "B", "D", "E", "F", "a", "b", "c", "d", "e", "f"],
nonMatchStrings: [
"6\uFE0F\u20E3",
"7\uFE0F\u20E3",
"9\uFE0F\u20E3",
"\u2603",
"\u{1D306}",
"\u{1F1E7}\u{1F1EA}",
],
});
testExtendedCharacterClass({
regExp: /^[\d\p{Emoji_Keycap_Sequence}]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3"],
nonMatchStrings: ["C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[[0-9]\p{Emoji_Keycap_Sequence}]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3"],
nonMatchStrings: ["C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[_--[0-9]]+$/v,
matchStrings: ["_"],
nonMatchStrings: ["6\uFE0F\u20E3", "7", "9\uFE0F\u20E3", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[\p{ASCII_Hex_Digit}--[0-9]]+$/v,
matchStrings: ["a", "b"],
nonMatchStrings: ["0", "9", "9\uFE0F\u20E3", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[\p{ASCII_Hex_Digit}\p{Emoji_Keycap_Sequence}]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "A", "B", "a", "b"],
nonMatchStrings: ["\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[_\p{Emoji_Keycap_Sequence}]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3", "_"],
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[\p{Emoji_Keycap_Sequence}--\d]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[\p{Emoji_Keycap_Sequence}--[0-9]]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[\p{Emoji_Keycap_Sequence}--\p{ASCII_Hex_Digit}]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[\p{Emoji_Keycap_Sequence}--_]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[\p{Emoji_Keycap_Sequence}&&\p{Emoji_Keycap_Sequence}]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[\p{Emoji_Keycap_Sequence}\d]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3"],
nonMatchStrings: ["C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[\p{Emoji_Keycap_Sequence}[0-9]]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3"],
nonMatchStrings: ["C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[\p{Emoji_Keycap_Sequence}\p{ASCII_Hex_Digit}]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3", "A", "a"],
nonMatchStrings: ["\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[\p{Emoji_Keycap_Sequence}_]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3", "_"],
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
testExtendedCharacterClass({
regExp: /^[\p{Emoji_Keycap_Sequence}\p{Emoji_Keycap_Sequence}]+$/v,
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
});
});
test("Unicode matching with u and v flags", () => {

View File

@@ -175,6 +175,41 @@ StringTable::~StringTable()
--s_next_string_table_serial; // We didn't use this serial, put it back.
}
static u32 s_next_string_set_table_serial { 0 };
StringSetTable::StringSetTable()
: m_serial(s_next_string_set_table_serial++)
{
}
// Gives the serial back to the counter, but only when this table holds the
// most recently issued serial and never stored any UTF-8 trie.
StringSetTable::~StringSetTable()
{
    if (!m_u8_tries.is_empty())
        return;
    if (m_serial != s_next_string_set_table_serial - 1)
        return;
    --s_next_string_set_table_serial;
}
// Copy-constructs a table: a new serial is reserved for the copy, and every
// trie of `other` is deep-copied under the same key it has in `other`.
StringSetTable::StringSetTable(StringSetTable const& other)
    : m_serial(s_next_string_set_table_serial++)
{
    // deep_copy() is reached through a const_cast, presumably because it is not
    // declared const; `other` is not expected to be modified by it.
    auto clone_tries = [](auto& destination, auto const& source) {
        for (auto const& [index, trie] : source)
            destination.set(index, MUST(const_cast<StringSetTrie&>(trie).deep_copy()));
    };

    clone_tries(m_u8_tries, other.m_u8_tries);
    clone_tries(m_u16_tries, other.m_u16_tries);
}
// Copy-assigns the tries of `other` into this table. The serial of this table
// is left untouched; existing tries are dropped and replaced by deep copies
// stored under the keys they have in `other`.
StringSetTable& StringSetTable::operator=(StringSetTable const& other)
{
    // Self-assignment must not clear the tries we are about to copy from.
    if (this == &other)
        return *this;

    m_u8_tries.clear();
    m_u16_tries.clear();

    auto clone_tries = [](auto& destination, auto const& source) {
        for (auto const& [index, trie] : source)
            destination.set(index, MUST(const_cast<StringSetTrie&>(trie).deep_copy()));
    };

    clone_tries(m_u8_tries, other.m_u8_tries);
    clone_tries(m_u16_tries, other.m_u16_tries);
    return *this;
}
void ByteCode::ensure_opcodes_initialized()
{
if (s_opcodes_initialized)
@@ -450,8 +485,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
struct DisjunctionState {
bool active { false };
bool is_conjunction { false };
bool is_subtraction { false };
bool fail { false };
bool inverse_matched { false };
size_t subtraction_operand_index { 0 };
size_t initial_position;
size_t initial_code_unit_position;
Optional<size_t> last_accepted_position {};
@@ -471,19 +508,35 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
state.string_position_before_match = state.string_position;
bool has_string_set = false;
bool string_set_matched = false;
size_t best_match_position = state.string_position;
size_t best_match_position_in_code_units = state.string_position_in_code_units;
size_t offset { state.instruction_position + 3 };
CharacterCompareType last_compare_type = CharacterCompareType::Undefined;
for (size_t i = 0; i < argument_count; ++i) {
if (state.string_position > string_position)
break;
if (has_string_set) {
state.string_position = string_position;
state.string_position_in_code_units = current_disjunction_state().initial_code_unit_position;
}
auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
if (reset_temp_inverse) {
reset_temp_inverse = false;
temporary_inverse = false;
if (compare_type != CharacterCompareType::Property || last_compare_type != CharacterCompareType::StringSet) {
temporary_inverse = false;
}
} else {
reset_temp_inverse = true;
}
auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
last_compare_type = compare_type;
switch (compare_type) {
case CharacterCompareType::Inverse:
@@ -710,6 +763,111 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
compare_script_extension(input, state, script, current_inversion_state(), inverse_matched);
break;
}
case CharacterCompareType::StringSet: {
has_string_set = true;
auto string_set_index = m_bytecode->at(offset++);
bool matched = false;
size_t longest_match_length = 0;
// Walks `trie` along the input, starting at state.string_position_in_code_units,
// and records the longest string of the set that matches there.
// Results are reported through the captured `matched` and `longest_match_length`
// (the latter measured in code points); the match state itself is not advanced here.
// `view` is only read in the Utf16View instantiation; the other instantiation
// reads from `input.view` directly.
auto find_longest_match = [&](auto const& view, auto const& trie) {
auto const* current = &trie;
size_t current_code_unit_offset = state.string_position_in_code_units;
// Each iteration consumes one input unit and follows one trie edge; the loop
// ends when the input is exhausted or no matching child exists.
while (true) {
u32 value;
if constexpr (IsSame<decltype(view), Utf16View const&>) {
// UTF-16 input: the trie is walked one code unit at a time.
if (current_code_unit_offset >= view.length_in_code_units())
break;
value = view.code_unit_at(current_code_unit_offset);
} else {
// Non-UTF-16 input: the key is whatever code_point_at() yields at this offset.
// NOTE(review): the offset advances by exactly one per trie edge below, while the
// length computation further down steps by the UTF-8 width of each code point --
// confirm these two agree for non-ASCII input.
if (current_code_unit_offset >= input.view.length_in_code_units())
break;
value = input.view.code_point_at(current_code_unit_offset);
}
if (input.regex_options & AllFlags::Insensitive) {
// Case-insensitive: scan all children and compare after ASCII lowercasing
// both the edge key and the input value; the first child that compares equal is taken.
bool found_child = false;
for (auto const& [key, child] : current->children()) {
if (to_ascii_lowercase(key) == to_ascii_lowercase(value)) {
current = static_cast<StringSetTrie const*>(child.ptr());
current_code_unit_offset++;
found_child = true;
break;
}
}
if (!found_child)
break;
} else {
// Case-sensitive: direct lookup of the child keyed by the input value.
auto it = current->children().find(value);
if (it == current->children().end())
break;
current = static_cast<StringSetTrie const*>(it->value.ptr());
current_code_unit_offset++;
}
// A node whose metadata is present and true marks the end of a complete string.
auto is_terminal = current->has_metadata() && current->metadata_value();
if (is_terminal) {
// Convert the consumed span [start, current offset) into a code point count.
size_t match_length_in_code_points;
if constexpr (IsSame<decltype(view), Utf16View const&>) {
size_t code_points = 0;
for (size_t i = state.string_position_in_code_units; i < current_code_unit_offset;) {
auto code_point = view.code_point_at(i);
// Code points at or above 0x10000 occupy two UTF-16 code units.
i += code_point >= 0x10000 ? 2 : 1;
code_points++;
}
match_length_in_code_points = code_points;
} else {
size_t code_points = 0;
for (size_t i = state.string_position_in_code_units; i < current_code_unit_offset;) {
auto code_point = input.view.code_point_at(i);
// Step by the number of UTF-8 bytes this code point would occupy.
if (code_point <= 0x7F)
i += 1;
else if (code_point <= 0x7FF)
i += 2;
else if (code_point <= 0xFFFF)
i += 3;
else
i += 4;
code_points++;
}
match_length_in_code_points = code_points;
}
// Keep walking after a hit: a longer string of the set may still match.
if (match_length_in_code_points > longest_match_length) {
matched = true;
longest_match_length = match_length_in_code_points;
}
}
}
};
if (input.view.u16_view().is_null()) {
auto const& trie = m_bytecode->string_set_table().get_u8_trie(string_set_index);
StringView view;
find_longest_match(view, trie);
} else {
auto const& view = input.view.u16_view();
auto const& trie = m_bytecode->string_set_table().get_u16_trie(string_set_index);
find_longest_match(view, trie);
}
if (matched) {
if (current_inversion_state()) {
inverse_matched = true;
} else {
state.string_position += longest_match_length;
if (input.view.unicode()) {
state.string_position_in_code_units = input.view.code_unit_offset_of(state.string_position);
} else {
state.string_position_in_code_units = state.string_position;
}
}
}
break;
}
case CharacterCompareType::And:
disjunction_states.append({
.active = true,
@@ -720,6 +878,17 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
.initial_code_unit_position = state.string_position_in_code_units,
});
continue;
case CharacterCompareType::Subtract:
disjunction_states.append({
.active = true,
.is_conjunction = true,
.is_subtraction = true,
.fail = true,
.inverse_matched = false,
.initial_position = state.string_position,
.initial_code_unit_position = state.string_position_in_code_units,
});
continue;
case CharacterCompareType::Or:
disjunction_states.append({
.active = true,
@@ -735,6 +904,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
if (!disjunction_state.fail) {
state.string_position = disjunction_state.last_accepted_position.value_or(disjunction_state.initial_position);
state.string_position_in_code_units = disjunction_state.last_accepted_code_unit_position.value_or(disjunction_state.initial_code_unit_position);
} else if (has_string_set) {
string_set_matched = false;
best_match_position = disjunction_state.initial_position;
best_match_position_in_code_units = disjunction_state.initial_code_unit_position;
}
inverse_matched = disjunction_state.inverse_matched || disjunction_state.fail;
break;
@@ -751,6 +924,12 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
inverse_matched = true;
}
if (has_string_set && state.string_position > best_match_position) {
best_match_position = state.string_position;
best_match_position_in_code_units = state.string_position_in_code_units;
string_set_matched = true;
}
if (!has_single_argument && new_disjunction_state.active) {
auto failed = (!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length();
@@ -760,10 +939,18 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
new_disjunction_state.inverse_matched |= inverse_matched;
}
if (new_disjunction_state.is_conjunction)
if (new_disjunction_state.is_subtraction) {
if (new_disjunction_state.subtraction_operand_index == 0) {
new_disjunction_state.fail = failed && new_disjunction_state.fail;
} else if (!failed && (!has_string_set || state.string_position >= best_match_position)) {
new_disjunction_state.fail = true;
}
new_disjunction_state.subtraction_operand_index++;
} else if (new_disjunction_state.is_conjunction) {
new_disjunction_state.fail = failed && new_disjunction_state.fail;
else
} else {
new_disjunction_state.fail = failed || new_disjunction_state.fail;
}
state.string_position = new_disjunction_state.initial_position;
state.string_position_in_code_units = new_disjunction_state.initial_code_unit_position;
@@ -773,11 +960,16 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
if (!has_single_argument) {
auto& new_disjunction_state = current_disjunction_state();
if (new_disjunction_state.active) {
if (!new_disjunction_state.fail) {
state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
}
if (new_disjunction_state.active && !new_disjunction_state.fail) {
state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
}
}
if (has_string_set && string_set_matched) {
if (has_single_argument || best_match_position > string_position) {
state.string_position = best_match_position;
state.string_position_in_code_units = best_match_position_in_code_units;
}
}

View File

@@ -14,6 +14,7 @@
#include <AK/Forward.h>
#include <AK/HashMap.h>
#include <AK/OwnPtr.h>
#include <AK/Trie.h>
#include <AK/TypeCasts.h>
#include <AK/Types.h>
#include <AK/Vector.h>
@@ -78,7 +79,9 @@ enum class OpCodeId : ByteCodeValueType {
__ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(And) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Or) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(EndAndOr)
__ENUMERATE_CHARACTER_COMPARE_TYPE(EndAndOr) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Subtract) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(StringSet)
enum class CharacterCompareType : ByteCodeValueType {
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
@@ -177,6 +180,62 @@ struct REGEX_API StringTable {
HashMap<ByteCodeValueType, FlyString> m_inverse_table;
};
// Trie over u32 keys with a boolean payload; a true payload marks the node
// that ends a complete string of the set.
using StringSetTrie = Trie<u32, bool>;

// Stores sets of strings as tries, in two encodings per set: one keyed by
// code points (m_u8_tries) and one keyed by UTF-16 code units (m_u16_tries).
struct REGEX_API StringSetTable {
    StringSetTable();
    ~StringSetTable();
    StringSetTable(StringSetTable const& other);
    StringSetTable(StringSetTable&&) = default;
    StringSetTable& operator=(StringSetTable const& other);
    StringSetTable& operator=(StringSetTable&&) = default;

    // Registers `strings` as a new set and returns the index under which both
    // of its tries are stored. The index carries this table's serial in its
    // upper 32 bits and the number of UTF-8 tries stored so far in the lower bits.
    ByteCodeValueType set(Vector<String> const& strings)
    {
        u32 slot = m_u8_tries.size();
        ByteCodeValueType key = static_cast<ByteCodeValueType>(m_serial) << 32 | static_cast<ByteCodeValueType>(slot);

        StringSetTrie code_point_trie { 0, false };
        StringSetTrie code_unit_trie { 0, false };

        for (auto const& string : strings) {
            // Code point path for the trie used with non-UTF-16 input.
            Vector<u32> path;
            Utf8View code_point_view { string.bytes_as_string_view() };
            for (auto code_point : code_point_view)
                path.append(code_point);

            (void)code_point_trie.insert(path.begin(), path.end(), true, [](auto&, auto) { return false; });

            // Same string re-encoded as UTF-16 code units for the UTF-16 trie.
            auto as_utf16 = Utf16String::from_utf32({ path.data(), path.size() });
            auto code_unit_view = as_utf16.utf16_view();
            Vector<u32> unit_path;
            for (size_t unit_index = 0; unit_index < code_unit_view.length_in_code_units(); unit_index++)
                unit_path.append(code_unit_view.code_unit_at(unit_index));

            (void)code_unit_trie.insert(unit_path.begin(), unit_path.end(), true, [](auto&, auto) { return false; });
        }

        m_u8_tries.set(key, move(code_point_trie));
        m_u16_tries.set(key, move(code_unit_trie));
        return key;
    }

    // Returns the code-point-keyed trie stored under `index`; the entry must exist.
    StringSetTrie const& get_u8_trie(ByteCodeValueType index) const
    {
        return m_u8_tries.get(index).value();
    }

    // Returns the UTF-16-code-unit-keyed trie stored under `index`; the entry must exist.
    StringSetTrie const& get_u16_trie(ByteCodeValueType index) const
    {
        return m_u16_tries.get(index).value();
    }

    u32 m_serial { 0 };
    HashMap<ByteCodeValueType, StringSetTrie> m_u8_tries;
    HashMap<ByteCodeValueType, StringSetTrie> m_u16_tries;
};
class REGEX_API ByteCode : public DisjointChunks<ByteCodeValueType> {
using Base = DisjointChunks<ByteCodeValueType>;
@@ -262,6 +321,9 @@ public:
FlyString get_string(size_t index) const { return m_string_table.get(index); }
auto const& string_table() const { return m_string_table; }
// Read-only access to the string-set table held by this bytecode.
auto const& string_set_table() const { return m_string_set_table; }
// Mutable access to the string-set table held by this bytecode, e.g. for registering new sets via set().
auto& string_set_table() { return m_string_set_table; }
Optional<size_t> get_group_name_index(size_t group_index) const
{
return m_group_name_mappings.get(group_index);
@@ -286,6 +348,13 @@ public:
}
m_string_table.m_inverse_table.update(other.m_string_table.m_inverse_table);
for (auto const& entry : other.m_string_set_table.m_u8_tries) {
m_string_set_table.m_u8_tries.set(entry.key, MUST(const_cast<StringSetTrie&>(entry.value).deep_copy()));
}
for (auto const& entry : other.m_string_set_table.m_u16_tries) {
m_string_set_table.m_u16_tries.set(entry.key, MUST(const_cast<StringSetTrie&>(entry.value).deep_copy()));
}
for (auto const& mapping : other.m_group_name_mappings) {
m_group_name_mappings.set(mapping.key, mapping.value);
}
@@ -631,6 +700,7 @@ private:
static bool s_opcodes_initialized;
static size_t s_next_checkpoint_serial_id;
StringTable m_string_table;
StringSetTable m_string_set_table;
HashMap<size_t, size_t> m_group_name_mappings;
};

View File

@@ -113,6 +113,8 @@ static bool interpret_compares(Vector<CompareTypeAndValuePair> const& lhs, Stati
// FIXME: We just need to look at the last character of this string, but we only have the first character here.
// Just bail out to avoid false positives.
return false;
case CharacterCompareType::StringSet:
return false;
case CharacterCompareType::CharClass:
if (!current_lhs_inversion_state())
lhs_char_classes.set(static_cast<CharClass>(pair.value));
@@ -167,6 +169,7 @@ static bool interpret_compares(Vector<CompareTypeAndValuePair> const& lhs, Stati
// These are the default behaviour for [...], so we don't need to do anything (unless we add support for 'And' below).
break;
case CharacterCompareType::And:
case CharacterCompareType::Subtract:
// FIXME: These are too difficult to handle, so bail out.
return false;
case CharacterCompareType::Undefined:
@@ -495,6 +498,8 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
// FIXME: We just need to look at the last character of this string, but we only have the first character here.
// Just bail out to avoid false positives.
return true;
case CharacterCompareType::StringSet:
return true;
case CharacterCompareType::CharClass: {
auto contains = char_class_contains(static_cast<CharClass>(pair.value));
if (!in_or() && (current_lhs_inversion_state() ^ contains))
@@ -613,6 +618,7 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
break;
}
case CharacterCompareType::And:
case CharacterCompareType::Subtract:
// FIXME: These are too difficult to handle, so bail out.
return true;
case CharacterCompareType::Undefined:
@@ -1838,6 +1844,7 @@ static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCod
case CharacterCompareType::EndAndOr:
return LookupTableInsertionOutcome::FinishFlushOnInsertion;
case CharacterCompareType::And:
case CharacterCompareType::Subtract:
return LookupTableInsertionOutcome::FlushOnInsertion;
case CharacterCompareType::Reference:
case CharacterCompareType::NamedReference:
@@ -1845,6 +1852,7 @@ static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCod
case CharacterCompareType::GeneralCategory:
case CharacterCompareType::Script:
case CharacterCompareType::ScriptExtension:
case CharacterCompareType::StringSet:
case CharacterCompareType::Or:
return LookupTableInsertionOutcome::CannotPlaceInTable;
case CharacterCompareType::Undefined:
@@ -1870,6 +1878,7 @@ void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndVa
&& pair.type != CharacterCompareType::Inverse
&& pair.type != CharacterCompareType::And
&& pair.type != CharacterCompareType::Or
&& pair.type != CharacterCompareType::Subtract
&& pair.type != CharacterCompareType::EndAndOr)
arguments.append(pair.value);
++argument_count;
@@ -1983,6 +1992,7 @@ void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndVa
&& value.type != CharacterCompareType::Inverse
&& value.type != CharacterCompareType::And
&& value.type != CharacterCompareType::Or
&& value.type != CharacterCompareType::Subtract
&& value.type != CharacterCompareType::EndAndOr)
arguments.append(value.value);
++argument_count;

View File

@@ -1684,7 +1684,15 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
property.visit(
[&](Unicode::Property property) {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
if (Unicode::is_ecma262_string_property(property) && !negated) {
auto strings = Unicode::get_property_strings(property);
if (!strings.is_empty()) {
auto string_set_index = m_parser_state.bytecode.string_set_table().set(move(strings));
compares.empend(CompareTypeAndValuePair { CharacterCompareType::StringSet, string_set_index });
}
} else {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
}
},
[&](Unicode::GeneralCategory general_category) {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() });
@@ -2165,6 +2173,11 @@ bool ECMA262Parser::parse_class_union(Vector<regex::CompareTypeAndValuePair>& co
first = false;
}
if (!first) {
compares.prepend({ CharacterCompareType::Or, 0 });
compares.append({ CharacterCompareType::EndAndOr, 0 });
}
restore_position.disarm();
return !has_error();
}
@@ -2220,7 +2233,7 @@ bool ECMA262Parser::parse_class_subtraction(Vector<CompareTypeAndValuePair>& com
if (!try_skip("--"sv))
return false;
compares.append({ CharacterCompareType::And, 0 });
compares.append({ CharacterCompareType::Subtract, 0 });
compares.extend(move(lhs));
do {
@@ -2228,7 +2241,6 @@ bool ECMA262Parser::parse_class_subtraction(Vector<CompareTypeAndValuePair>& com
if (!parse_class_set_operand(rhs))
return false;
compares.append({ CharacterCompareType::TemporaryInverse, 0 });
compares.extend(rhs);
} while (!has_error() && try_skip("--"sv));
@@ -2376,7 +2388,15 @@ bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePai
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
property.visit(
[&](Unicode::Property property) {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
if (Unicode::is_ecma262_string_property(property) && !negated) {
auto strings = Unicode::get_property_strings(property);
if (!strings.is_empty()) {
auto string_set_index = m_parser_state.bytecode.string_set_table().set(move(strings));
compares.empend(CompareTypeAndValuePair { CharacterCompareType::StringSet, string_set_index });
}
} else {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
}
},
[&](Unicode::GeneralCategory general_category) {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() });
@@ -2477,8 +2497,15 @@ bool ECMA262Parser::parse_nested_class(Vector<regex::CompareTypeAndValuePair>& c
return;
}
}
auto strings = Unicode::get_property_strings(property);
if (!strings.is_empty()) {
auto string_set_index = m_parser_state.bytecode.string_set_table().set(move(strings));
compares.empend(CompareTypeAndValuePair { CharacterCompareType::StringSet, string_set_index });
}
} else {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
}
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
},
[&](Unicode::GeneralCategory general_category) {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() });

View File

@@ -12,7 +12,9 @@
#include <LibUnicode/ICU.h>
#include <unicode/uchar.h>
#include <unicode/uniset.h>
#include <unicode/uscript.h>
#include <unicode/uset.h>
namespace Unicode {
@@ -321,6 +323,39 @@ bool is_ecma262_string_property(Property property)
}
}
// Returns every string belonging to `property`: one single-code-point string
// per code point in the ICU set's ranges, followed by the set's multi-character
// strings. Yields an empty vector when `property` is not an ECMA-262 string
// property or when ICU cannot provide the set.
Vector<String> get_property_strings(Property property)
{
    if (!is_ecma262_string_property(property))
        return {};

    UErrorCode status = U_ZERO_ERROR;
    auto const* raw_set = u_getBinaryPropertySet(static_cast<UProperty>(property.value()), &status);
    if (!icu_success(status) || !raw_set)
        return {};

    auto const* members = icu::UnicodeSet::fromUSet(raw_set);
    if (!members)
        return {};

    Vector<String> strings;

    // Single code points are stored by ICU as inclusive ranges.
    for (int32_t range = 0, range_count = members->getRangeCount(); range < range_count; ++range) {
        auto const last = members->getRangeEnd(range);
        for (auto code_point = members->getRangeStart(range); code_point <= last; ++code_point)
            strings.append(String::from_code_point(code_point));
    }

    // Multi-character members are kept separately from the ranges.
    for (auto const& sequence : members->strings())
        strings.append(icu_string_to_string(sequence));

    return strings;
}
Optional<Script> script_from_string(StringView script)
{
static auto script_names = []() {

View File

@@ -40,6 +40,7 @@ bool code_point_has_white_space_property(u32 code_point);
bool is_ecma262_property(Property);
bool is_ecma262_string_property(Property);
Vector<String> get_property_strings(Property);
Optional<Script> script_from_string(StringView);
bool code_point_has_script(u32 code_point, Script script);