AK: Make hashing of UTF-16 strings cheaper

There is no need to iterate over every byte of the string; we can iterate
over the code units instead.

We must also actually record that we have cached the hash :^)
This commit is contained in:
Timothy Flynn
2025-08-06 12:50:38 -04:00
committed by Jelle Raaijmakers
parent 73154defa8
commit 274f8ee462
Notes: github-actions[bot] 2025-08-07 00:07:21 +00:00
5 changed files with 11 additions and 13 deletions

View File

@@ -219,7 +219,7 @@ inline u32 StringBase::hash() const
return string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
}
if (!m_impl.data)
return string_hash(nullptr, 0);
return string_hash<char>(nullptr, 0);
return data_without_union_member_assertion()->hash();
}

View File

@@ -6,6 +6,7 @@
#pragma once
#include <AK/Concepts.h>
#include <AK/Types.h>
namespace AK {
@@ -14,7 +15,8 @@ namespace AK {
// We can't use SipHash since that depends on runtime parameters,
// but some string hashes like IPC endpoint magic numbers need to be deterministic.
// Maybe use a SipHash with a statically-known key?
constexpr u32 string_hash(char const* characters, size_t length, u32 seed = 0)
template<OneOf<char, char16_t> T>
constexpr u32 string_hash(T const* characters, size_t length, u32 seed = 0)
{
u32 hash = seed;
for (size_t i = 0; i < length; ++i) {

View File

@@ -176,7 +176,7 @@ public:
if (auto const* data = data_without_union_member_assertion())
return data->hash();
return string_hash(nullptr, 0);
return string_hash<char16_t>(nullptr, 0);
}
[[nodiscard]] ALWAYS_INLINE bool is_empty() const { return length_in_code_units() == 0uz; }

View File

@@ -78,8 +78,11 @@ public:
ALWAYS_INLINE u32 hash() const
{
if (!m_has_hash)
m_hash = calculate_hash();
if (!m_has_hash) {
m_hash = utf16_view().hash();
m_has_hash = true;
}
return m_hash;
}
@@ -128,13 +131,6 @@ private:
[[nodiscard]] size_t calculate_code_point_length() const;
[[nodiscard]] ALWAYS_INLINE u32 calculate_hash() const
{
if (has_ascii_storage())
return ascii_view().hash();
return utf16_view().hash();
}
// We store whether this string has ASCII or UTF-16 storage by setting the most significant bit of m_length_in_code_units
// to 1 for UTF-16 storage. This shrinks the size of most UTF-16 string related classes, at the cost of not being
// allowed to create a string larger than 2**63 - 1.

View File

@@ -324,7 +324,7 @@ public:
return 0;
if (has_ascii_storage())
return string_hash(m_string.ascii, length_in_code_units());
return string_hash(reinterpret_cast<char const*>(m_string.utf16), length_in_code_units() * sizeof(char16_t));
return string_hash(m_string.utf16, length_in_code_units());
}
[[nodiscard]] constexpr bool is_null() const