mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-12-05 01:10:24 +00:00
AK: Use simdutf for searching strings for a single code unit
In the following synthetic benchmark, the simdutf version is 4x faster:
// Synthetic micro-benchmark quoted by the commit message: searches a short
// char16_t string literal for the code unit 'a' one hundred million times.
BENCHMARK_CASE(find)
{
auto string = u"😀Foo😀Bar"sv;
for (size_t i = 0; i < 100'000'000; ++i)
// The (void) cast discards the returned offset.
(void)string.find_code_unit_offset('a');
}
This commit is contained in:
committed by
Andreas Kling
parent
bd7599ccfc
commit
f03c432b52
Notes:
github-actions[bot]
2025-08-11 16:12:30 +00:00
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/f03c432b526 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5818 Reviewed-by: https://github.com/gmta ✅
@@ -16,6 +16,8 @@
|
||||
#include <AK/Vector.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <simdutf.h>
|
||||
|
||||
namespace AK {
|
||||
|
||||
namespace StringUtils {
|
||||
@@ -211,15 +213,19 @@ StringView trim_whitespace(StringView str, TrimMode mode)
|
||||
return trim(str, " \n\t\v\f\r"sv, mode);
|
||||
}
|
||||
|
||||
// Diff hunk for StringUtils::find(StringView, char, size_t). The rendering has
// lost its +/- markers, so removed and added lines are interleaved here:
//   - lines that use `start` as the size_t parameter are the REMOVED version;
//   - lines that use `start_offset` are the ADDED version (which re-uses the
//     name `start` for a local pointer further down).
//
// Both versions return the index of the first `needle` found at or after the
// given offset, or an empty Optional when the offset is at/past the end of
// `haystack` or no match exists.
// Removed signature:
Optional<size_t> find(StringView haystack, char needle, size_t start)
|
||||
// Added signature (parameter renamed to start_offset):
Optional<size_t> find(StringView haystack, char needle, size_t start_offset)
|
||||
{
|
||||
// Removed bounds guard:
if (start >= haystack.length())
|
||||
// Added bounds guard (same check against the renamed parameter):
if (start_offset >= haystack.length())
|
||||
return {};
|
||||
// Removed: linear scan comparing one character per iteration.
for (size_t i = start; i < haystack.length(); ++i) {
|
||||
if (haystack[i] == needle)
|
||||
return i;
|
||||
}
|
||||
return {};
|
||||
|
||||
// Added: build the pointer range [start, end) covering the part of the
// haystack from start_offset onward and hand it to simdutf::find.
auto const* start = haystack.characters_without_null_termination() + start_offset;
|
||||
auto const* end = haystack.characters_without_null_termination() + haystack.length();
|
||||
|
||||
auto const* result = simdutf::find(start, end, needle);
|
||||
// A result equal to `end` is treated as "not found".
if (result == end)
|
||||
return {};
|
||||
|
||||
// `result - start` is relative to the searched range; adding start_offset
// turns it back into an index measured from the beginning of haystack.
return result - start + start_offset;
|
||||
}
|
||||
|
||||
Optional<size_t> find(StringView haystack, StringView needle, size_t start)
|
||||
|
||||
@@ -254,6 +254,27 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
|
||||
VERIFY_NOT_REACHED();
|
||||
}
|
||||
|
||||
// Finds the first occurrence of the code unit `needle` at or after
// `start_offset` (counted in code units). Returns the offset of the match
// measured from the beginning of the view, or an empty Optional when
// start_offset is at/past the end of the view or no match is found.
Optional<size_t> Utf16View::find_code_unit_offset(char16_t needle, size_t start_offset) const
|
||||
{
|
||||
if (start_offset >= length_in_code_units())
|
||||
return {};
|
||||
|
||||
// ASCII-backed storage: a non-ASCII needle is reported as not found without
// searching; otherwise the needle is narrowed to char and the search is
// delegated to StringUtils::find over bytes().
if (has_ascii_storage()) {
|
||||
if (!AK::is_ascii(needle))
|
||||
return {};
|
||||
return StringUtils::find(bytes(), static_cast<char>(needle), start_offset);
|
||||
}
|
||||
|
||||
// UTF-16 storage: search the char16_t range [start, end) with simdutf::find.
auto const* start = m_string.utf16 + start_offset;
|
||||
auto const* end = m_string.utf16 + length_in_code_units();
|
||||
|
||||
auto const* result = simdutf::find(start, end, needle);
|
||||
// A result equal to `end` is treated as "not found".
if (result == end)
|
||||
return {};
|
||||
|
||||
// Convert the match pointer into a code unit offset from the start of the view.
return result - start + start_offset;
|
||||
}
|
||||
|
||||
Vector<Utf16View> Utf16View::split_view(char16_t separator, SplitBehavior split_behavior) const
|
||||
{
|
||||
Utf16View seperator_view { &separator, 1 };
|
||||
|
||||
@@ -488,21 +488,7 @@ public:
|
||||
return trim(" \n\t\v\f\r"sv, mode);
|
||||
}
|
||||
|
||||
// Diff hunk in the Utf16View class body. The rendering has lost its +/-
// markers: the inline constexpr definition below is the version REMOVED by
// this commit, and the non-constexpr declaration that follows it is the line
// ADDED in its place.
//
// Removed inline version: returns an empty Optional when start_offset is
// at/past the end, or when the storage is ASCII and the needle is not;
// otherwise it searches with AK::memmem_optional.
// NOTE(review): the value from AK::memmem_optional is returned unchanged even
// though the searched buffer begins at start_offset and, in the UTF-16 branch,
// its length is passed in bytes — confirm how that helper's result relates to
// a code unit offset in the whole view.
constexpr Optional<size_t> find_code_unit_offset(char16_t needle, size_t start_offset = 0) const
|
||||
{
|
||||
if (start_offset >= length_in_code_units())
|
||||
return {};
|
||||
|
||||
if (has_ascii_storage()) {
|
||||
if (!AK::is_ascii(needle))
|
||||
return {};
|
||||
|
||||
// Narrow the needle to a single byte so it can be searched for in the ASCII buffer.
auto byte = static_cast<char>(needle);
|
||||
return AK::memmem_optional(m_string.ascii + start_offset, length_in_code_units() - start_offset, &byte, sizeof(byte));
|
||||
}
|
||||
|
||||
// UTF-16 storage: haystack length and needle size are both given in bytes.
return AK::memmem_optional(m_string.utf16 + start_offset, (length_in_code_units() - start_offset) * sizeof(char16_t), &needle, sizeof(needle));
|
||||
}
|
||||
// Added declaration replacing the inline definition above; same parameters and
// default argument, but no longer constexpr.
Optional<size_t> find_code_unit_offset(char16_t needle, size_t start_offset = 0) const;
|
||||
|
||||
constexpr Optional<size_t> find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user