AK: Make hashing of UTF-16 strings cheaper

There is no need to iterate over every byte of the string; we can iterate over the code units instead. We must also actually record that the hash has been cached, since otherwise it is recomputed on every call :^)
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/274f8ee462f Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5746 Reviewed-by: https://github.com/gmta ✅
2025-12-05 01:10:24 +00:00 · 2025-08-06 12:50:38 -04:00 · 2025-08-07 00:07:21 +00:00
parent 73154defa8
commit 274f8ee462
5 changed files with 11 additions and 13 deletions
--- a/AK/StringBase.h
+++ b/AK/StringBase.h
@@ -219,7 +219,7 @@ inline u32 StringBase::hash() const
        return string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
    }
    if (!m_impl.data)
-        return string_hash(nullptr, 0);
+        return string_hash<char>(nullptr, 0);
    return data_without_union_member_assertion()->hash();
 }

--- a/AK/StringHash.h
+++ b/AK/StringHash.h
@@ -6,6 +6,7 @@

 #pragma once

+#include <AK/Concepts.h>
 #include <AK/Types.h>

 namespace AK {
@@ -14,7 +15,8 @@ namespace AK {
 //        We can't use SipHash since that depends on runtime parameters,
 //        but some string hashes like IPC endpoint magic numbers need to be deterministic.
 //        Maybe use a SipHash with a statically-known key?
-constexpr u32 string_hash(char const* characters, size_t length, u32 seed = 0)
+template<OneOf<char, char16_t> T>
+constexpr u32 string_hash(T const* characters, size_t length, u32 seed = 0)
 {
    u32 hash = seed;
    for (size_t i = 0; i < length; ++i) {
--- a/AK/Utf16StringBase.h
+++ b/AK/Utf16StringBase.h
@@ -176,7 +176,7 @@ public:

        if (auto const* data = data_without_union_member_assertion())
            return data->hash();
-        return string_hash(nullptr, 0);
+        return string_hash<char16_t>(nullptr, 0);
    }

    [[nodiscard]] ALWAYS_INLINE bool is_empty() const { return length_in_code_units() == 0uz; }
--- a/AK/Utf16StringData.h
+++ b/AK/Utf16StringData.h
@@ -78,8 +78,11 @@ public:

    ALWAYS_INLINE u32 hash() const
    {
-        if (!m_has_hash)
-            m_hash = calculate_hash();
+        if (!m_has_hash) {
+            m_hash = utf16_view().hash();
+            m_has_hash = true;
+        }
+
        return m_hash;
    }

@@ -128,13 +131,6 @@ private:

    [[nodiscard]] size_t calculate_code_point_length() const;

-    [[nodiscard]] ALWAYS_INLINE u32 calculate_hash() const
-    {
-        if (has_ascii_storage())
-            return ascii_view().hash();
-        return utf16_view().hash();
-    }
-
    // We store whether this string has ASCII or UTF-16 storage by setting the most significant bit of m_length_in_code_units
    // to 1 for UTF-16 storage. This shrinks the size of most UTF-16 string related classes, at the cost of not being
    // allowed to create a string larger than 2**63 - 1.
--- a/AK/Utf16View.h
+++ b/AK/Utf16View.h
@@ -324,7 +324,7 @@ public:
            return 0;
        if (has_ascii_storage())
            return string_hash(m_string.ascii, length_in_code_units());
-        return string_hash(reinterpret_cast<char const*>(m_string.utf16), length_in_code_units() * sizeof(char16_t));
+        return string_hash(m_string.utf16, length_in_code_units());
    }

    [[nodiscard]] constexpr bool is_null() const