Fix DeviceRTL TeamAllocator.

Make the Allocator use dynamic sizes for teams/workgroups/thread blocks and warps/wavefronts. Needed for proper AMD GPU support.
2024-08-30 10:49:42 -07:00
parent 45da2df28b
commit c51af81745
1 changed files with 48 additions and 34 deletions
--- a/offload/DeviceRTL/src/TeamAllocator.cpp
+++ b/offload/DeviceRTL/src/TeamAllocator.cpp
@@ -18,6 +18,7 @@
 #include "Synchronization.h"
 #include "Types.h"
 #include "Utils.h"
+#include "Interface.h"

 using namespace ompx;

@@ -31,15 +32,17 @@ using namespace ompx;

 namespace {
 constexpr const size_t Alignment = 16;
-constexpr const size_t FirstThreadRatio = 40;
+constexpr const size_t FirstThreadTeamRatio = 40;
+constexpr const size_t FirstThreadWarpRatio = 40;
 constexpr const size_t SplitThreadhold = Alignment * 4;

 template <typename T> T abs(T V) { return V > 0 ? V : -V; }

-template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator;
+//template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator;
+template <uint32_t MAX_TEAM_SIZE> struct WarpAllocator;

 class WarpAllocatorEntry {
-  template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> friend struct WarpAllocator;
+  template <uint32_t MAX_TEAM_SIZE> friend struct WarpAllocator;

  /// If Size is less than 0, the entry is allocated (in use).
  int64_t Size = 0;
@@ -88,7 +91,8 @@ public:

 static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");

-template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
+//template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
+template <uint32_t MAX_TEAM_SIZE> struct WarpAllocator {
  void init() {
    if (mapping::isSPMDMode() &&
        (mapping::getThreadIdInBlock() || mapping::getBlockIdInKernel()))
@@ -96,25 +100,30 @@ template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {

    size_t HeapSize = __omp_rtl_device_memory_pool.Size;

-    FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
+    FirstThreadHeapSize = HeapSize * FirstThreadWarpRatio / 100;
    FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
    size_t OtherThreadHeapSize =
-        (HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
+        (HeapSize - FirstThreadHeapSize) / (mapping::getWarpSize() - 1);
    OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);

-    size_t TeamHeapSize = FirstThreadHeapSize / TEAM_SIZE;
+    size_t TeamHeapSize = FirstThreadHeapSize / mapping::getMaxTeamWarps();
    TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
    FirstTeamSize = TeamHeapSize;

+    printf("Team Size: %d, WarpSize: %d, ThreadinBlock: %d\n", mapping::getMaxTeamWarps(), mapping::getWarpSize(), mapping::getMaxTeamWarps() * mapping::getWarpSize());
+    printf("TeamAllocator Init: Total Team Memory Size (%ldMB), 1st Thread in Warp (%ldMB), Any thread in warp(%ldMB)\n",
+        HeapSize / (1024 * 1024), TeamHeapSize / (1024 * 1024), OtherThreadHeapSize / (1024 * 1024));
+
    char *LastLimit = reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr);
-    for (int I = 0; I < WARP_SIZE; ++I) {
-      for (int J = 0; J < TEAM_SIZE; ++J) {
-        Entries[I][J] = nullptr;
-        Limits[I][J] = LastLimit + TeamHeapSize * (J + 1);
+    for (int I = 0; I < mapping::getWarpSize(); ++I) {
+      for (int J = 0; J < mapping::getMaxTeamWarps(); ++J) {
+        Entries[I * mapping::getMaxTeamWarps() + J] = nullptr;
+        Limits[I * mapping::getMaxTeamWarps() + J] = LastLimit + TeamHeapSize * (J + 1);
      }
      LastLimit += I ? OtherThreadHeapSize : FirstThreadHeapSize;
-      Limits[I][TEAM_SIZE - 1] = LastLimit;
-      TeamHeapSize = OtherThreadHeapSize / TEAM_SIZE;
+      Limits[I * mapping::getMaxTeamWarps() + mapping::getMaxTeamWarps() - 1] =
+          LastLimit;
+      TeamHeapSize = OtherThreadHeapSize / mapping::getMaxTeamWarps();
      TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
    }
  }
@@ -131,13 +140,13 @@ template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {

    WarpAllocatorEntry *E = nullptr;
    {
-      mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
+      mutex::LockGuard LG(Locks[TIdInWarp * mapping::getMaxTeamWarps() + TeamSlot]);

-      auto *LastEntry = Entries[TIdInWarp][TeamSlot];
+      auto *LastEntry = Entries[TIdInWarp * mapping::getMaxTeamWarps() + TeamSlot];
      auto *NewWatermark = (LastEntry ? LastEntry->getEndPtr()
                                      : getBlockBegin(TIdInWarp, TeamSlot)) +
                           Size;
-      if (NewWatermark >= Limits[TIdInWarp][TeamSlot]) {
+      if (NewWatermark >= Limits[TIdInWarp * mapping::getMaxTeamWarps() + TeamSlot]) {
        E = findMemorySlow(Size, TIdInWarp, TeamSlot);
      } else {
        E = LastEntry ? LastEntry->getNext()
@@ -145,7 +154,7 @@ template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
                            getBlockBegin(TIdInWarp, TeamSlot));
        E->setSize(Size);
        E->setPrevSize(LastEntry);
-        Entries[TIdInWarp][TeamSlot] = E;
+        Entries[TIdInWarp * mapping::getMaxTeamWarps() + TeamSlot] = E;
      }

      if (!E)
@@ -163,14 +172,16 @@ template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
    auto TeamSlot = getTeamSlot();
    auto TIdInWarp = mapping::getThreadIdInWarp();

-    mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
+    mutex::LockGuard LG(Locks[TIdInWarp * mapping::getMaxTeamWarps() + TeamSlot]);
+    if (E->isUnused())
+      return;
    E->setUnused();
    // Is last entry?
-    if (E == Entries[TIdInWarp][TeamSlot]) {
+    if (E == Entries[TIdInWarp * mapping::getMaxTeamWarps() + TeamSlot]) {
      do {
        E = E->getPrev();
      } while (!E->isFirst() && !E->isUsed());
-      Entries[TIdInWarp][TeamSlot] = E;
+      Entries[TIdInWarp * mapping::getMaxTeamWarps() + TeamSlot] = E;
    }
  }

@@ -181,15 +192,15 @@ template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {

    auto TeamSlot = getTeamSlot();
    auto TIdInWarp = mapping::getThreadIdInWarp();
-    for (int I = TIdInWarp; I < TIdInWarp + WARP_SIZE; ++I) {
-      int TId = I % WARP_SIZE;
-      for (int J = TeamSlot; J < TeamSlot + TEAM_SIZE; ++J) {
-        int SId = J % TEAM_SIZE;
+    for (int I = TIdInWarp; I < TIdInWarp + mapping::getWarpSize(); ++I) {
+      int TId = I % mapping::getWarpSize();
+      for (int J = TeamSlot; J < TeamSlot + mapping::getMaxTeamWarps(); ++J) {
+        int SId = J % mapping::getMaxTeamWarps();
        if (P < getBlockBegin(TId, SId) || P >= getBlockEnd(TId, SId))
          continue;

-        mutex::LockGuard LG(Locks[I][SId]);
-        WarpAllocatorEntry *E = Entries[I][SId];
+        mutex::LockGuard LG(Locks[I * mapping::getMaxTeamWarps() + SId]);
+        WarpAllocatorEntry *E = Entries[I * mapping::getMaxTeamWarps() + SId];
        if (!E)
          return {};
        if (E->getEndPtr() <= P)
@@ -210,15 +221,16 @@ template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
  }

 private:
+
  char *getBlockBegin(int32_t TIdInWarp, int32_t TeamSlot) const {
    if (TeamSlot)
-      return Limits[TIdInWarp][TeamSlot - 1];
+      return Limits[TIdInWarp * mapping::getMaxTeamWarps() + TeamSlot - 1];
    if (TIdInWarp)
-      return Limits[TIdInWarp - 1][TEAM_SIZE - 1];
+      return Limits[(TIdInWarp - 1) * mapping::getMaxTeamWarps() + mapping::getMaxTeamWarps() - 1];
    return reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr);
  }
  char *getBlockEnd(int32_t TIdInWarp, int32_t TeamSlot) const {
-    return Limits[TIdInWarp][TeamSlot];
+    return Limits[TIdInWarp * mapping::getMaxTeamWarps() + TeamSlot];
  }

  size_t getBlockSize(int32_t TIdInWarp, int32_t TeamSlot) const {
@@ -226,7 +238,8 @@ private:
           getBlockBegin(TIdInWarp, TeamSlot);
  }

-  static int32_t getTeamSlot() { return mapping::getBlockIdInKernel() % TEAM_SIZE; }
+  int32_t getTeamSlot() { return mapping::getBlockIdInKernel() % 
+    mapping::getMaxTeamWarps(); }

  WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp,
                                     int32_t TeamSlot) {
@@ -255,14 +268,15 @@ private:
    return E;
  }

-  WarpAllocatorEntry *Entries[WARP_SIZE][TEAM_SIZE];
-  char *Limits[WARP_SIZE][TEAM_SIZE];
-  mutex::TicketLock Locks[WARP_SIZE][TEAM_SIZE];
+  WarpAllocatorEntry *Entries[MAX_TEAM_SIZE]; //[WARP_SIZE][TEAM_SIZE];
+  char *Limits[MAX_TEAM_SIZE]; //[WARP_SIZE][TEAM_SIZE];
+  mutex::TicketLock Locks[MAX_TEAM_SIZE]; //[WARP_SIZE][TEAM_SIZE];
  size_t FirstThreadHeapSize;
  size_t FirstTeamSize;
 };

-WarpAllocator<32, 16> Allocator;
+// Max team size (thread block size)
+WarpAllocator<1024> Allocator;

 } // namespace