AK: Use simdutf for searching strings for a single code unit

In the following synthetic benchmark, the simdutf version is 4x faster than the previous memmem-based implementation:

    BENCHMARK_CASE(find)
    {
        auto string = u"😀Foo😀Bar"sv;

        for (size_t i = 0; i < 100'000'000; ++i)
            (void)string.find_code_unit_offset('a');
    }
This commit is contained in:
Timothy Flynn
2025-08-11 07:57:58 -04:00
committed by Andreas Kling
parent bd7599ccfc
commit f03c432b52
Notes: github-actions[bot] 2025-08-11 16:12:30 +00:00
3 changed files with 35 additions and 22 deletions

View File

@@ -16,6 +16,8 @@
#include <AK/Vector.h>
#include <string.h>
#include <simdutf.h>
namespace AK {
namespace StringUtils {
@@ -211,15 +213,19 @@ StringView trim_whitespace(StringView str, TrimMode mode)
return trim(str, " \n\t\v\f\r"sv, mode);
}
// Searches `haystack` for the single byte `needle`, beginning at the given start index, and
// returns the index of the first match measured from the beginning of `haystack`. An empty
// Optional is returned when the start index is at or past the end of `haystack`, or when no
// byte matches.
//
// NOTE(review): this hunk appears to be rendered without +/- markers, so the pre-change and
// post-change lines are interleaved. The lines that use `start` as a size_t index (the first
// signature, the first bounds check and the byte-by-byte loop with its trailing `return {}`)
// look like the removed implementation; the lines that use `start_offset` and
// simdutf::find look like the added one — confirm against the real diff.
Optional<size_t> find(StringView haystack, char needle, size_t start)
Optional<size_t> find(StringView haystack, char needle, size_t start_offset)
{
if (start >= haystack.length())
if (start_offset >= haystack.length())
return {};
for (size_t i = start; i < haystack.length(); ++i) {
if (haystack[i] == needle)
return i;
}
return {};
// Search only the [start_offset, length) tail of the haystack.
auto const* start = haystack.characters_without_null_termination() + start_offset;
auto const* end = haystack.characters_without_null_termination() + haystack.length();
auto const* result = simdutf::find(start, end, needle);
// A result equal to `end` is treated as "no match".
if (result == end)
return {};
// `result - start` is relative to the search start; add `start_offset` back so the returned
// index is relative to the beginning of `haystack`.
return result - start + start_offset;
}
Optional<size_t> find(StringView haystack, StringView needle, size_t start)

View File

@@ -254,6 +254,27 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
VERIFY_NOT_REACHED();
}
// Locates the first code unit equal to `needle` at or after `start_offset`.
// Yields the index of that code unit measured from the beginning of the view, or an empty
// Optional when `start_offset` is not inside the view or no code unit matches.
Optional<size_t> Utf16View::find_code_unit_offset(char16_t needle, size_t start_offset) const
{
    auto const code_unit_count = length_in_code_units();
    if (start_offset >= code_unit_count)
        return {};

    if (!has_ascii_storage()) {
        // UTF-16 storage: scan the [start_offset, code_unit_count) tail for the code unit.
        auto const* search_begin = m_string.utf16 + start_offset;
        auto const* search_end = m_string.utf16 + code_unit_count;

        auto const* match = simdutf::find(search_begin, search_end, needle);
        if (match != search_end)
            return match - search_begin + start_offset; // Re-base onto the start of the view.
        return {};
    }

    // ASCII storage: a non-ASCII needle is reported as not found without searching;
    // otherwise defer to the byte-oriented search.
    if (AK::is_ascii(needle))
        return StringUtils::find(bytes(), static_cast<char>(needle), start_offset);
    return {};
}
Vector<Utf16View> Utf16View::split_view(char16_t separator, SplitBehavior split_behavior) const
{
Utf16View seperator_view { &separator, 1 };

View File

@@ -488,21 +488,7 @@ public:
return trim(" \n\t\v\f\r"sv, mode);
}
// Finds a single code unit `needle` in this view, starting the search at `start_offset`
// (defaults to 0). Returns an empty Optional when `start_offset` is at or past the end of the
// view, or when the view has ASCII storage and `needle` is not ASCII.
//
// NOTE(review): this hunk appears to be rendered without +/- markers. The inline constexpr
// definition below looks like the removed version (built on AK::memmem_optional), and the
// non-constexpr declaration that follows it looks like its replacement, whose definition is
// out of line — confirm against the real diff.
constexpr Optional<size_t> find_code_unit_offset(char16_t needle, size_t start_offset = 0) const
{
if (start_offset >= length_in_code_units())
return {};
if (has_ascii_storage()) {
if (!AK::is_ascii(needle))
return {};
// Narrow the needle to one byte so it can be compared against the ASCII storage.
auto byte = static_cast<char>(needle);
return AK::memmem_optional(m_string.ascii + start_offset, length_in_code_units() - start_offset, &byte, sizeof(byte));
}
// The haystack length is passed in bytes here, hence the multiplication by sizeof(char16_t).
return AK::memmem_optional(m_string.utf16 + start_offset, (length_in_code_units() - start_offset) * sizeof(char16_t), &needle, sizeof(needle));
}
Optional<size_t> find_code_unit_offset(char16_t needle, size_t start_offset = 0) const;
constexpr Optional<size_t> find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const
{