mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-12-05 01:10:24 +00:00
LibRegex: Support matching unicode multi-character sequences
This commit is contained in:
committed by
Ali Mohammad Pur
parent
5b7c9af340
commit
a49c39de32
Notes:
github-actions[bot]
2025-11-26 10:35:48 +00:00
Author: https://github.com/aplefull Commit: https://github.com/LadybirdBrowser/ladybird/commit/a49c39de32f Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/6867 Reviewed-by: https://github.com/alimpfard ✅
@@ -95,7 +95,7 @@ test("Unicode non-ASCII matching", () => {
|
||||
}
|
||||
});
|
||||
|
||||
// Test from https://github.com/tc39/test262/blob/main/test/built-ins/RegExp/unicodeSets/generated/character-property-escape-difference-property-of-strings-escape.js
|
||||
// https://github.com/tc39/test262/tree/main/test/built-ins/RegExp/unicodeSets/generated
|
||||
test("Unicode properties of strings", () => {
|
||||
const regexes = [
|
||||
/\p{Basic_Emoji}/v,
|
||||
@@ -113,26 +113,119 @@ test("Unicode properties of strings", () => {
|
||||
}).not.toThrow();
|
||||
}
|
||||
|
||||
const matchStrings = ["0", "1", "2", "3", "4", "5", "8", "A", "B", "D", "E", "F", "a", "b", "c", "d", "e", "f"];
|
||||
|
||||
const nonMatchStrings = [
|
||||
"6\uFE0F\u20E3",
|
||||
"7\uFE0F\u20E3",
|
||||
"9\uFE0F\u20E3",
|
||||
"\u2603",
|
||||
"\u{1D306}",
|
||||
"\u{1F1E7}\u{1F1EA}",
|
||||
];
|
||||
|
||||
const re = /^[\p{ASCII_Hex_Digit}--\p{Emoji_Keycap_Sequence}]+$/v;
|
||||
|
||||
for (const str of matchStrings) {
|
||||
expect(re.test(str)).toBeTrue();
|
||||
// Checks one extended (v-flag) character class: every entry of `matchStrings`
// must satisfy `regExp`, and every entry of `nonMatchStrings` must not.
function testExtendedCharacterClass({ regExp, matchStrings, nonMatchStrings }) {
    for (const candidate of matchStrings) {
        expect(regExp.test(candidate)).toBeTrue();
    }
    for (const candidate of nonMatchStrings) {
        expect(regExp.test(candidate)).toBeFalse();
    }
}
|
||||
|
||||
for (const str of nonMatchStrings) {
|
||||
expect(re.test(str)).toBeFalse();
|
||||
}
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\p{ASCII_Hex_Digit}--\p{Emoji_Keycap_Sequence}]+$/v,
|
||||
matchStrings: ["0", "1", "2", "3", "4", "5", "8", "A", "B", "D", "E", "F", "a", "b", "c", "d", "e", "f"],
|
||||
nonMatchStrings: [
|
||||
"6\uFE0F\u20E3",
|
||||
"7\uFE0F\u20E3",
|
||||
"9\uFE0F\u20E3",
|
||||
"\u2603",
|
||||
"\u{1D306}",
|
||||
"\u{1F1E7}\u{1F1EA}",
|
||||
],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\d\p{Emoji_Keycap_Sequence}]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3"],
|
||||
nonMatchStrings: ["C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[[0-9]\p{Emoji_Keycap_Sequence}]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3"],
|
||||
nonMatchStrings: ["C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[_--[0-9]]+$/v,
|
||||
matchStrings: ["_"],
|
||||
nonMatchStrings: ["6\uFE0F\u20E3", "7", "9\uFE0F\u20E3", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\p{ASCII_Hex_Digit}--[0-9]]+$/v,
|
||||
matchStrings: ["a", "b"],
|
||||
nonMatchStrings: ["0", "9", "9\uFE0F\u20E3", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\p{ASCII_Hex_Digit}\p{Emoji_Keycap_Sequence}]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "A", "B", "a", "b"],
|
||||
nonMatchStrings: ["\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[_\p{Emoji_Keycap_Sequence}]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3", "_"],
|
||||
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\p{Emoji_Keycap_Sequence}--\d]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
|
||||
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\p{Emoji_Keycap_Sequence}--[0-9]]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
|
||||
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\p{Emoji_Keycap_Sequence}--\p{ASCII_Hex_Digit}]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
|
||||
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\p{Emoji_Keycap_Sequence}--_]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
|
||||
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\p{Emoji_Keycap_Sequence}&&\p{Emoji_Keycap_Sequence}]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
|
||||
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\p{Emoji_Keycap_Sequence}\d]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3"],
|
||||
nonMatchStrings: ["C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\p{Emoji_Keycap_Sequence}[0-9]]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3"],
|
||||
nonMatchStrings: ["C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\p{Emoji_Keycap_Sequence}\p{ASCII_Hex_Digit}]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0", "0\uFE0F\u20E3", "9", "9\uFE0F\u20E3", "A", "a"],
|
||||
nonMatchStrings: ["\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\p{Emoji_Keycap_Sequence}_]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3", "_"],
|
||||
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
|
||||
testExtendedCharacterClass({
|
||||
regExp: /^[\p{Emoji_Keycap_Sequence}\p{Emoji_Keycap_Sequence}]+$/v,
|
||||
matchStrings: ["#\uFE0F\u20E3", "*\uFE0F\u20E3", "0\uFE0F\u20E3"],
|
||||
nonMatchStrings: ["7", "C", "\u2603", "\u{1D306}", "\u{1F1E7}\u{1F1EA}"],
|
||||
});
|
||||
});
|
||||
|
||||
test("Unicode matching with u and v flags", () => {
|
||||
|
||||
@@ -175,6 +175,41 @@ StringTable::~StringTable()
|
||||
--s_next_string_table_serial; // We didn't use this serial, put it back.
|
||||
}
|
||||
|
||||
// Serial that the next constructed StringSetTable will receive. The constructors
// below take the current value and post-increment it.
static u32 s_next_string_set_table_serial { 0 };
|
||||
|
||||
// Creates an empty table and claims the next free serial for it.
StringSetTable::StringSetTable()
|
||||
: m_serial(s_next_string_set_table_serial++)
|
||||
{
|
||||
}
|
||||
|
||||
// Hands the serial back to the global counter when this table holds the most
// recently issued serial and its u8 trie map is still empty.
StringSetTable::~StringSetTable()
{
    auto const owns_latest_serial = m_serial == s_next_string_set_table_serial - 1;
    if (owns_latest_serial && m_u8_tries.is_empty()) {
        --s_next_string_set_table_serial;
    }
}
|
||||
|
||||
// Copy constructor: the copy gets a fresh serial of its own, while every trie of
// `other` is deep-copied and stored under the same key it had in `other`.
StringSetTable::StringSetTable(StringSetTable const& other)
    : m_serial(s_next_string_set_table_serial++)
{
    // NOTE(review): the const_cast is presumably needed because deep_copy() is not
    // const-qualified; confirm against AK::Trie.
    auto const clone_tries = [](auto const& source, auto& destination) {
        for (auto const& entry : source)
            destination.set(entry.key, MUST(const_cast<StringSetTrie&>(entry.value).deep_copy()));
    };

    clone_tries(other.m_u8_tries, m_u8_tries);
    clone_tries(other.m_u16_tries, m_u16_tries);
}
|
||||
|
||||
// Copy assignment: replaces both trie maps with deep copies of the tries in
// `other`, keeping their keys. This table's own m_serial is left untouched.
// Self-assignment is a no-op.
StringSetTable& StringSetTable::operator=(StringSetTable const& other)
{
    if (this == &other)
        return *this;

    m_u8_tries.clear();
    m_u16_tries.clear();

    // NOTE(review): the const_cast is presumably needed because deep_copy() is not
    // const-qualified; confirm against AK::Trie.
    auto const clone_tries = [](auto const& source, auto& destination) {
        for (auto const& entry : source)
            destination.set(entry.key, MUST(const_cast<StringSetTrie&>(entry.value).deep_copy()));
    };

    clone_tries(other.m_u8_tries, m_u8_tries);
    clone_tries(other.m_u16_tries, m_u16_tries);

    return *this;
}
|
||||
|
||||
void ByteCode::ensure_opcodes_initialized()
|
||||
{
|
||||
if (s_opcodes_initialized)
|
||||
@@ -450,8 +485,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||
struct DisjunctionState {
|
||||
bool active { false };
|
||||
bool is_conjunction { false };
|
||||
bool is_subtraction { false };
|
||||
bool fail { false };
|
||||
bool inverse_matched { false };
|
||||
size_t subtraction_operand_index { 0 };
|
||||
size_t initial_position;
|
||||
size_t initial_code_unit_position;
|
||||
Optional<size_t> last_accepted_position {};
|
||||
@@ -471,19 +508,35 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||
|
||||
state.string_position_before_match = state.string_position;
|
||||
|
||||
bool has_string_set = false;
|
||||
bool string_set_matched = false;
|
||||
size_t best_match_position = state.string_position;
|
||||
size_t best_match_position_in_code_units = state.string_position_in_code_units;
|
||||
|
||||
size_t offset { state.instruction_position + 3 };
|
||||
CharacterCompareType last_compare_type = CharacterCompareType::Undefined;
|
||||
|
||||
for (size_t i = 0; i < argument_count; ++i) {
|
||||
if (state.string_position > string_position)
|
||||
break;
|
||||
|
||||
if (has_string_set) {
|
||||
state.string_position = string_position;
|
||||
state.string_position_in_code_units = current_disjunction_state().initial_code_unit_position;
|
||||
}
|
||||
|
||||
auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
|
||||
|
||||
if (reset_temp_inverse) {
|
||||
reset_temp_inverse = false;
|
||||
temporary_inverse = false;
|
||||
if (compare_type != CharacterCompareType::Property || last_compare_type != CharacterCompareType::StringSet) {
|
||||
temporary_inverse = false;
|
||||
}
|
||||
} else {
|
||||
reset_temp_inverse = true;
|
||||
}
|
||||
|
||||
auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
|
||||
last_compare_type = compare_type;
|
||||
|
||||
switch (compare_type) {
|
||||
case CharacterCompareType::Inverse:
|
||||
@@ -710,6 +763,111 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||
compare_script_extension(input, state, script, current_inversion_state(), inverse_matched);
|
||||
break;
|
||||
}
|
||||
case CharacterCompareType::StringSet: {
|
||||
has_string_set = true;
|
||||
auto string_set_index = m_bytecode->at(offset++);
|
||||
|
||||
bool matched = false;
|
||||
size_t longest_match_length = 0;
|
||||
|
||||
auto find_longest_match = [&](auto const& view, auto const& trie) {
|
||||
auto const* current = &trie;
|
||||
size_t current_code_unit_offset = state.string_position_in_code_units;
|
||||
|
||||
while (true) {
|
||||
u32 value;
|
||||
|
||||
if constexpr (IsSame<decltype(view), Utf16View const&>) {
|
||||
if (current_code_unit_offset >= view.length_in_code_units())
|
||||
break;
|
||||
value = view.code_unit_at(current_code_unit_offset);
|
||||
} else {
|
||||
if (current_code_unit_offset >= input.view.length_in_code_units())
|
||||
break;
|
||||
value = input.view.code_point_at(current_code_unit_offset);
|
||||
}
|
||||
|
||||
if (input.regex_options & AllFlags::Insensitive) {
|
||||
bool found_child = false;
|
||||
for (auto const& [key, child] : current->children()) {
|
||||
if (to_ascii_lowercase(key) == to_ascii_lowercase(value)) {
|
||||
current = static_cast<StringSetTrie const*>(child.ptr());
|
||||
current_code_unit_offset++;
|
||||
found_child = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found_child)
|
||||
break;
|
||||
} else {
|
||||
auto it = current->children().find(value);
|
||||
if (it == current->children().end())
|
||||
break;
|
||||
|
||||
current = static_cast<StringSetTrie const*>(it->value.ptr());
|
||||
current_code_unit_offset++;
|
||||
}
|
||||
|
||||
auto is_terminal = current->has_metadata() && current->metadata_value();
|
||||
if (is_terminal) {
|
||||
size_t match_length_in_code_points;
|
||||
if constexpr (IsSame<decltype(view), Utf16View const&>) {
|
||||
size_t code_points = 0;
|
||||
for (size_t i = state.string_position_in_code_units; i < current_code_unit_offset;) {
|
||||
auto code_point = view.code_point_at(i);
|
||||
i += code_point >= 0x10000 ? 2 : 1;
|
||||
code_points++;
|
||||
}
|
||||
match_length_in_code_points = code_points;
|
||||
} else {
|
||||
size_t code_points = 0;
|
||||
for (size_t i = state.string_position_in_code_units; i < current_code_unit_offset;) {
|
||||
auto code_point = input.view.code_point_at(i);
|
||||
if (code_point <= 0x7F)
|
||||
i += 1;
|
||||
else if (code_point <= 0x7FF)
|
||||
i += 2;
|
||||
else if (code_point <= 0xFFFF)
|
||||
i += 3;
|
||||
else
|
||||
i += 4;
|
||||
code_points++;
|
||||
}
|
||||
match_length_in_code_points = code_points;
|
||||
}
|
||||
|
||||
if (match_length_in_code_points > longest_match_length) {
|
||||
matched = true;
|
||||
longest_match_length = match_length_in_code_points;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if (input.view.u16_view().is_null()) {
|
||||
auto const& trie = m_bytecode->string_set_table().get_u8_trie(string_set_index);
|
||||
StringView view;
|
||||
find_longest_match(view, trie);
|
||||
} else {
|
||||
auto const& view = input.view.u16_view();
|
||||
auto const& trie = m_bytecode->string_set_table().get_u16_trie(string_set_index);
|
||||
find_longest_match(view, trie);
|
||||
}
|
||||
|
||||
if (matched) {
|
||||
if (current_inversion_state()) {
|
||||
inverse_matched = true;
|
||||
} else {
|
||||
state.string_position += longest_match_length;
|
||||
if (input.view.unicode()) {
|
||||
state.string_position_in_code_units = input.view.code_unit_offset_of(state.string_position);
|
||||
} else {
|
||||
state.string_position_in_code_units = state.string_position;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
case CharacterCompareType::And:
|
||||
disjunction_states.append({
|
||||
.active = true,
|
||||
@@ -720,6 +878,17 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||
.initial_code_unit_position = state.string_position_in_code_units,
|
||||
});
|
||||
continue;
|
||||
case CharacterCompareType::Subtract:
|
||||
disjunction_states.append({
|
||||
.active = true,
|
||||
.is_conjunction = true,
|
||||
.is_subtraction = true,
|
||||
.fail = true,
|
||||
.inverse_matched = false,
|
||||
.initial_position = state.string_position,
|
||||
.initial_code_unit_position = state.string_position_in_code_units,
|
||||
});
|
||||
continue;
|
||||
case CharacterCompareType::Or:
|
||||
disjunction_states.append({
|
||||
.active = true,
|
||||
@@ -735,6 +904,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||
if (!disjunction_state.fail) {
|
||||
state.string_position = disjunction_state.last_accepted_position.value_or(disjunction_state.initial_position);
|
||||
state.string_position_in_code_units = disjunction_state.last_accepted_code_unit_position.value_or(disjunction_state.initial_code_unit_position);
|
||||
} else if (has_string_set) {
|
||||
string_set_matched = false;
|
||||
best_match_position = disjunction_state.initial_position;
|
||||
best_match_position_in_code_units = disjunction_state.initial_code_unit_position;
|
||||
}
|
||||
inverse_matched = disjunction_state.inverse_matched || disjunction_state.fail;
|
||||
break;
|
||||
@@ -751,6 +924,12 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||
inverse_matched = true;
|
||||
}
|
||||
|
||||
if (has_string_set && state.string_position > best_match_position) {
|
||||
best_match_position = state.string_position;
|
||||
best_match_position_in_code_units = state.string_position_in_code_units;
|
||||
string_set_matched = true;
|
||||
}
|
||||
|
||||
if (!has_single_argument && new_disjunction_state.active) {
|
||||
auto failed = (!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length();
|
||||
|
||||
@@ -760,10 +939,18 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||
new_disjunction_state.inverse_matched |= inverse_matched;
|
||||
}
|
||||
|
||||
if (new_disjunction_state.is_conjunction)
|
||||
if (new_disjunction_state.is_subtraction) {
|
||||
if (new_disjunction_state.subtraction_operand_index == 0) {
|
||||
new_disjunction_state.fail = failed && new_disjunction_state.fail;
|
||||
} else if (!failed && (!has_string_set || state.string_position >= best_match_position)) {
|
||||
new_disjunction_state.fail = true;
|
||||
}
|
||||
new_disjunction_state.subtraction_operand_index++;
|
||||
} else if (new_disjunction_state.is_conjunction) {
|
||||
new_disjunction_state.fail = failed && new_disjunction_state.fail;
|
||||
else
|
||||
} else {
|
||||
new_disjunction_state.fail = failed || new_disjunction_state.fail;
|
||||
}
|
||||
|
||||
state.string_position = new_disjunction_state.initial_position;
|
||||
state.string_position_in_code_units = new_disjunction_state.initial_code_unit_position;
|
||||
@@ -773,11 +960,16 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||
|
||||
if (!has_single_argument) {
|
||||
auto& new_disjunction_state = current_disjunction_state();
|
||||
if (new_disjunction_state.active) {
|
||||
if (!new_disjunction_state.fail) {
|
||||
state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
|
||||
state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
|
||||
}
|
||||
if (new_disjunction_state.active && !new_disjunction_state.fail) {
|
||||
state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
|
||||
state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
|
||||
}
|
||||
}
|
||||
|
||||
if (has_string_set && string_set_matched) {
|
||||
if (has_single_argument || best_match_position > string_position) {
|
||||
state.string_position = best_match_position;
|
||||
state.string_position_in_code_units = best_match_position_in_code_units;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/HashMap.h>
|
||||
#include <AK/OwnPtr.h>
|
||||
#include <AK/Trie.h>
|
||||
#include <AK/TypeCasts.h>
|
||||
#include <AK/Types.h>
|
||||
#include <AK/Vector.h>
|
||||
@@ -78,7 +79,9 @@ enum class OpCodeId : ByteCodeValueType {
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(And) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Or) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(EndAndOr)
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(EndAndOr) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Subtract) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(StringSet)
|
||||
|
||||
enum class CharacterCompareType : ByteCodeValueType {
|
||||
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
|
||||
@@ -177,6 +180,62 @@ struct REGEX_API StringTable {
|
||||
HashMap<ByteCodeValueType, FlyString> m_inverse_table;
|
||||
};
|
||||
|
||||
// Trie used to store a set of strings. Edges are u32 values (code points in the
// u8 tries, UTF-16 code units in the u16 tries built by StringSetTable::set()).
// The bool metadata is inserted as `true` for the node that ends a stored string.
using StringSetTrie = Trie<u32, bool>;
|
||||
|
||||
struct REGEX_API StringSetTable {
|
||||
StringSetTable();
|
||||
~StringSetTable();
|
||||
StringSetTable(StringSetTable const& other);
|
||||
StringSetTable(StringSetTable&&) = default;
|
||||
StringSetTable& operator=(StringSetTable const& other);
|
||||
StringSetTable& operator=(StringSetTable&&) = default;
|
||||
|
||||
ByteCodeValueType set(Vector<String> const& strings)
|
||||
{
|
||||
u32 local_index = m_u8_tries.size();
|
||||
ByteCodeValueType global_index = static_cast<ByteCodeValueType>(m_serial) << 32 | static_cast<ByteCodeValueType>(local_index);
|
||||
|
||||
StringSetTrie u8_trie { 0, false };
|
||||
StringSetTrie u16_trie { 0, false };
|
||||
|
||||
for (auto const& str : strings) {
|
||||
Vector<u32> code_points;
|
||||
Utf8View utf8_view { str.bytes_as_string_view() };
|
||||
for (auto code_point : utf8_view)
|
||||
code_points.append(code_point);
|
||||
|
||||
(void)u8_trie.insert(code_points.begin(), code_points.end(), true, [](auto&, auto) { return false; });
|
||||
|
||||
auto utf16_string = Utf16String::from_utf32({ code_points.data(), code_points.size() });
|
||||
Vector<u32> u16_code_units;
|
||||
auto utf16_view = utf16_string.utf16_view();
|
||||
for (size_t i = 0; i < utf16_view.length_in_code_units(); i++) {
|
||||
auto code_unit = utf16_view.code_unit_at(i);
|
||||
u16_code_units.append(code_unit);
|
||||
}
|
||||
(void)u16_trie.insert(u16_code_units.begin(), u16_code_units.end(), true, [](auto&, auto) { return false; });
|
||||
}
|
||||
|
||||
m_u8_tries.set(global_index, move(u8_trie));
|
||||
m_u16_tries.set(global_index, move(u16_trie));
|
||||
return global_index;
|
||||
}
|
||||
|
||||
StringSetTrie const& get_u8_trie(ByteCodeValueType index) const
|
||||
{
|
||||
return m_u8_tries.get(index).value();
|
||||
}
|
||||
|
||||
StringSetTrie const& get_u16_trie(ByteCodeValueType index) const
|
||||
{
|
||||
return m_u16_tries.get(index).value();
|
||||
}
|
||||
|
||||
u32 m_serial { 0 };
|
||||
HashMap<ByteCodeValueType, StringSetTrie> m_u8_tries;
|
||||
HashMap<ByteCodeValueType, StringSetTrie> m_u16_tries;
|
||||
};
|
||||
|
||||
class REGEX_API ByteCode : public DisjointChunks<ByteCodeValueType> {
|
||||
using Base = DisjointChunks<ByteCodeValueType>;
|
||||
|
||||
@@ -262,6 +321,9 @@ public:
|
||||
FlyString get_string(size_t index) const { return m_string_table.get(index); }
|
||||
auto const& string_table() const { return m_string_table; }
|
||||
|
||||
auto const& string_set_table() const { return m_string_set_table; }
|
||||
auto& string_set_table() { return m_string_set_table; }
|
||||
|
||||
Optional<size_t> get_group_name_index(size_t group_index) const
|
||||
{
|
||||
return m_group_name_mappings.get(group_index);
|
||||
@@ -286,6 +348,13 @@ public:
|
||||
}
|
||||
m_string_table.m_inverse_table.update(other.m_string_table.m_inverse_table);
|
||||
|
||||
for (auto const& entry : other.m_string_set_table.m_u8_tries) {
|
||||
m_string_set_table.m_u8_tries.set(entry.key, MUST(const_cast<StringSetTrie&>(entry.value).deep_copy()));
|
||||
}
|
||||
for (auto const& entry : other.m_string_set_table.m_u16_tries) {
|
||||
m_string_set_table.m_u16_tries.set(entry.key, MUST(const_cast<StringSetTrie&>(entry.value).deep_copy()));
|
||||
}
|
||||
|
||||
for (auto const& mapping : other.m_group_name_mappings) {
|
||||
m_group_name_mappings.set(mapping.key, mapping.value);
|
||||
}
|
||||
@@ -631,6 +700,7 @@ private:
|
||||
static bool s_opcodes_initialized;
|
||||
static size_t s_next_checkpoint_serial_id;
|
||||
StringTable m_string_table;
|
||||
StringSetTable m_string_set_table;
|
||||
HashMap<size_t, size_t> m_group_name_mappings;
|
||||
};
|
||||
|
||||
|
||||
@@ -113,6 +113,8 @@ static bool interpret_compares(Vector<CompareTypeAndValuePair> const& lhs, Stati
|
||||
// FIXME: We just need to look at the last character of this string, but we only have the first character here.
|
||||
// Just bail out to avoid false positives.
|
||||
return false;
|
||||
case CharacterCompareType::StringSet:
|
||||
return false;
|
||||
case CharacterCompareType::CharClass:
|
||||
if (!current_lhs_inversion_state())
|
||||
lhs_char_classes.set(static_cast<CharClass>(pair.value));
|
||||
@@ -167,6 +169,7 @@ static bool interpret_compares(Vector<CompareTypeAndValuePair> const& lhs, Stati
|
||||
// These are the default behaviour for [...], so we don't need to do anything (unless we add support for 'And' below).
|
||||
break;
|
||||
case CharacterCompareType::And:
|
||||
case CharacterCompareType::Subtract:
|
||||
// FIXME: These are too difficult to handle, so bail out.
|
||||
return false;
|
||||
case CharacterCompareType::Undefined:
|
||||
@@ -495,6 +498,8 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
|
||||
// FIXME: We just need to look at the last character of this string, but we only have the first character here.
|
||||
// Just bail out to avoid false positives.
|
||||
return true;
|
||||
case CharacterCompareType::StringSet:
|
||||
return true;
|
||||
case CharacterCompareType::CharClass: {
|
||||
auto contains = char_class_contains(static_cast<CharClass>(pair.value));
|
||||
if (!in_or() && (current_lhs_inversion_state() ^ contains))
|
||||
@@ -613,6 +618,7 @@ static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<Compa
|
||||
break;
|
||||
}
|
||||
case CharacterCompareType::And:
|
||||
case CharacterCompareType::Subtract:
|
||||
// FIXME: These are too difficult to handle, so bail out.
|
||||
return true;
|
||||
case CharacterCompareType::Undefined:
|
||||
@@ -1838,6 +1844,7 @@ static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCod
|
||||
case CharacterCompareType::EndAndOr:
|
||||
return LookupTableInsertionOutcome::FinishFlushOnInsertion;
|
||||
case CharacterCompareType::And:
|
||||
case CharacterCompareType::Subtract:
|
||||
return LookupTableInsertionOutcome::FlushOnInsertion;
|
||||
case CharacterCompareType::Reference:
|
||||
case CharacterCompareType::NamedReference:
|
||||
@@ -1845,6 +1852,7 @@ static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCod
|
||||
case CharacterCompareType::GeneralCategory:
|
||||
case CharacterCompareType::Script:
|
||||
case CharacterCompareType::ScriptExtension:
|
||||
case CharacterCompareType::StringSet:
|
||||
case CharacterCompareType::Or:
|
||||
return LookupTableInsertionOutcome::CannotPlaceInTable;
|
||||
case CharacterCompareType::Undefined:
|
||||
@@ -1870,6 +1878,7 @@ void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndVa
|
||||
&& pair.type != CharacterCompareType::Inverse
|
||||
&& pair.type != CharacterCompareType::And
|
||||
&& pair.type != CharacterCompareType::Or
|
||||
&& pair.type != CharacterCompareType::Subtract
|
||||
&& pair.type != CharacterCompareType::EndAndOr)
|
||||
arguments.append(pair.value);
|
||||
++argument_count;
|
||||
@@ -1983,6 +1992,7 @@ void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndVa
|
||||
&& value.type != CharacterCompareType::Inverse
|
||||
&& value.type != CharacterCompareType::And
|
||||
&& value.type != CharacterCompareType::Or
|
||||
&& value.type != CharacterCompareType::Subtract
|
||||
&& value.type != CharacterCompareType::EndAndOr)
|
||||
arguments.append(value.value);
|
||||
++argument_count;
|
||||
|
||||
@@ -1684,7 +1684,15 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
|
||||
property.visit(
|
||||
[&](Unicode::Property property) {
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
|
||||
if (Unicode::is_ecma262_string_property(property) && !negated) {
|
||||
auto strings = Unicode::get_property_strings(property);
|
||||
if (!strings.is_empty()) {
|
||||
auto string_set_index = m_parser_state.bytecode.string_set_table().set(move(strings));
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::StringSet, string_set_index });
|
||||
}
|
||||
} else {
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
|
||||
}
|
||||
},
|
||||
[&](Unicode::GeneralCategory general_category) {
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() });
|
||||
@@ -2165,6 +2173,11 @@ bool ECMA262Parser::parse_class_union(Vector<regex::CompareTypeAndValuePair>& co
|
||||
first = false;
|
||||
}
|
||||
|
||||
if (!first) {
|
||||
compares.prepend({ CharacterCompareType::Or, 0 });
|
||||
compares.append({ CharacterCompareType::EndAndOr, 0 });
|
||||
}
|
||||
|
||||
restore_position.disarm();
|
||||
return !has_error();
|
||||
}
|
||||
@@ -2220,7 +2233,7 @@ bool ECMA262Parser::parse_class_subtraction(Vector<CompareTypeAndValuePair>& com
|
||||
if (!try_skip("--"sv))
|
||||
return false;
|
||||
|
||||
compares.append({ CharacterCompareType::And, 0 });
|
||||
compares.append({ CharacterCompareType::Subtract, 0 });
|
||||
compares.extend(move(lhs));
|
||||
|
||||
do {
|
||||
@@ -2228,7 +2241,6 @@ bool ECMA262Parser::parse_class_subtraction(Vector<CompareTypeAndValuePair>& com
|
||||
if (!parse_class_set_operand(rhs))
|
||||
return false;
|
||||
|
||||
compares.append({ CharacterCompareType::TemporaryInverse, 0 });
|
||||
compares.extend(rhs);
|
||||
} while (!has_error() && try_skip("--"sv));
|
||||
|
||||
@@ -2376,7 +2388,15 @@ bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePai
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
|
||||
property.visit(
|
||||
[&](Unicode::Property property) {
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
|
||||
if (Unicode::is_ecma262_string_property(property) && !negated) {
|
||||
auto strings = Unicode::get_property_strings(property);
|
||||
if (!strings.is_empty()) {
|
||||
auto string_set_index = m_parser_state.bytecode.string_set_table().set(move(strings));
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::StringSet, string_set_index });
|
||||
}
|
||||
} else {
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
|
||||
}
|
||||
},
|
||||
[&](Unicode::GeneralCategory general_category) {
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() });
|
||||
@@ -2477,8 +2497,15 @@ bool ECMA262Parser::parse_nested_class(Vector<regex::CompareTypeAndValuePair>& c
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
auto strings = Unicode::get_property_strings(property);
|
||||
if (!strings.is_empty()) {
|
||||
auto string_set_index = m_parser_state.bytecode.string_set_table().set(move(strings));
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::StringSet, string_set_index });
|
||||
}
|
||||
} else {
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
|
||||
}
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() });
|
||||
},
|
||||
[&](Unicode::GeneralCategory general_category) {
|
||||
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category.value() });
|
||||
|
||||
@@ -12,7 +12,9 @@
|
||||
#include <LibUnicode/ICU.h>
|
||||
|
||||
#include <unicode/uchar.h>
|
||||
#include <unicode/uniset.h>
|
||||
#include <unicode/uscript.h>
|
||||
#include <unicode/uset.h>
|
||||
|
||||
namespace Unicode {
|
||||
|
||||
@@ -321,6 +323,39 @@ bool is_ecma262_string_property(Property property)
|
||||
}
|
||||
}
|
||||
|
||||
// Collects the contents of ICU's binary-property set for `property` as strings:
// first one string per code point in each of the set's ranges, then every entry
// of the set's strings() collection. Returns an empty vector when `property` is
// not an ECMA-262 string property or when ICU cannot provide the set.
Vector<String> get_property_strings(Property property)
{
    Vector<String> strings;

    if (!is_ecma262_string_property(property))
        return strings;

    UErrorCode status = U_ZERO_ERROR;
    auto const* property_set = u_getBinaryPropertySet(static_cast<UProperty>(property.value()), &status);
    if (!(icu_success(status) && property_set))
        return strings;

    auto const* unicode_set = icu::UnicodeSet::fromUSet(property_set);
    if (!unicode_set)
        return strings;

    // Expand every [first, last] range into individual single-code-point strings.
    auto const range_count = unicode_set->getRangeCount();
    for (int32_t range_index = 0; range_index < range_count; ++range_index) {
        auto const last = unicode_set->getRangeEnd(range_index);
        auto current = unicode_set->getRangeStart(range_index);
        while (current <= last) {
            strings.append(String::from_code_point(current));
            ++current;
        }
    }

    // Then append the strings the set stores in addition to its ranges.
    for (auto const& icu_string : unicode_set->strings())
        strings.append(icu_string_to_string(icu_string));

    return strings;
}
|
||||
|
||||
Optional<Script> script_from_string(StringView script)
|
||||
{
|
||||
static auto script_names = []() {
|
||||
|
||||
@@ -40,6 +40,7 @@ bool code_point_has_white_space_property(u32 code_point);
|
||||
|
||||
bool is_ecma262_property(Property);
|
||||
bool is_ecma262_string_property(Property);
|
||||
Vector<String> get_property_strings(Property);
|
||||
|
||||
Optional<Script> script_from_string(StringView);
|
||||
bool code_point_has_script(u32 code_point, Script script);
|
||||
|
||||
Reference in New Issue
Block a user