mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-12-05 01:10:24 +00:00
AK: Make hashing of UTF-16 strings cheaper
There is no need to iterate over every byte of the string; we can iterate over the code units instead. We must also actually record that we have cached the hash :^)
This commit is contained in:
committed by
Jelle Raaijmakers
parent
73154defa8
commit
274f8ee462
Notes:
github-actions[bot]
2025-08-07 00:07:21 +00:00
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/274f8ee462f Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5746 Reviewed-by: https://github.com/gmta ✅
@@ -219,7 +219,7 @@ inline u32 StringBase::hash() const
|
||||
return string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
|
||||
}
|
||||
if (!m_impl.data)
|
||||
return string_hash(nullptr, 0);
|
||||
return string_hash<char>(nullptr, 0);
|
||||
return data_without_union_member_assertion()->hash();
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Concepts.h>
|
||||
#include <AK/Types.h>
|
||||
|
||||
namespace AK {
|
||||
@@ -14,7 +15,8 @@ namespace AK {
|
||||
// We can't use SipHash since that depends on runtime parameters,
|
||||
// but some string hashes like IPC endpoint magic numbers need to be deterministic.
|
||||
// Maybe use a SipHash with a statically-known key?
|
||||
constexpr u32 string_hash(char const* characters, size_t length, u32 seed = 0)
|
||||
template<OneOf<char, char16_t> T>
|
||||
constexpr u32 string_hash(T const* characters, size_t length, u32 seed = 0)
|
||||
{
|
||||
u32 hash = seed;
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
|
||||
@@ -176,7 +176,7 @@ public:
|
||||
|
||||
if (auto const* data = data_without_union_member_assertion())
|
||||
return data->hash();
|
||||
return string_hash(nullptr, 0);
|
||||
return string_hash<char16_t>(nullptr, 0);
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE bool is_empty() const { return length_in_code_units() == 0uz; }
|
||||
|
||||
@@ -78,8 +78,11 @@ public:
|
||||
|
||||
ALWAYS_INLINE u32 hash() const
|
||||
{
|
||||
if (!m_has_hash)
|
||||
m_hash = calculate_hash();
|
||||
if (!m_has_hash) {
|
||||
m_hash = utf16_view().hash();
|
||||
m_has_hash = true;
|
||||
}
|
||||
|
||||
return m_hash;
|
||||
}
|
||||
|
||||
@@ -128,13 +131,6 @@ private:
|
||||
|
||||
[[nodiscard]] size_t calculate_code_point_length() const;
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE u32 calculate_hash() const
|
||||
{
|
||||
if (has_ascii_storage())
|
||||
return ascii_view().hash();
|
||||
return utf16_view().hash();
|
||||
}
|
||||
|
||||
// We store whether this string has ASCII or UTF-16 storage by setting the most significant bit of m_length_in_code_units
|
||||
// to 1 for UTF-16 storage. This shrinks the size of most UTF-16 string related classes, at the cost of not being
|
||||
// allowed to create a string larger than 2**63 - 1.
|
||||
|
||||
@@ -324,7 +324,7 @@ public:
|
||||
return 0;
|
||||
if (has_ascii_storage())
|
||||
return string_hash(m_string.ascii, length_in_code_units());
|
||||
return string_hash(reinterpret_cast<char const*>(m_string.utf16), length_in_code_units() * sizeof(char16_t));
|
||||
return string_hash(m_string.utf16, length_in_code_units());
|
||||
}
|
||||
|
||||
[[nodiscard]] constexpr bool is_null() const
|
||||
|
||||
Reference in New Issue
Block a user