Fix GPU First RPCs to work when given a pointer

- Fix GPUFirst Memory Allocator to work with the new offload plugin.
- Fix TeamAllocator to not ignore the first allocation.
This commit is contained in:
Nicolas Marie
2024-06-18 08:45:35 -07:00
parent 3b1aae9380
commit ad1e11e0d9
6 changed files with 47 additions and 47 deletions

View File

@@ -37,4 +37,8 @@ void *realloc(void *ptr, size_t new_size) {
}
}
extern "C" {
/// Allocator initialization entry point exported with C linkage.
/// In this implementation it is a deliberate no-op: the body performs no
/// work and simply returns to the caller.
void __kmpc_target_init_allocator() {}
}
#pragma omp end declare target

View File

@@ -18,11 +18,13 @@
using namespace ompx;
char *CONSTANT(omptarget_device_heap_buffer)
__attribute__((used, retain, weak, visibility("protected")));
size_t CONSTANT(omptarget_device_heap_size)
__attribute__((used, retain, weak, visibility("protected")));
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
__omp_rtl_device_memory_pool_tracker;
// TODO: implement Device Debug Allocation Tracker
namespace {
size_t HeapCurPos = 0;
@@ -250,8 +252,8 @@ void *malloc(size_t Size) {
{
mutex::LockGuard LG(HeapLock);
if (Size + HeapCurPos < omptarget_device_heap_size) {
void *R = omptarget_device_heap_buffer + HeapCurPos;
if (Size + HeapCurPos < __omp_rtl_device_memory_pool.Size) {
void *R = reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr) + HeapCurPos;
(void)atomic::add(&HeapCurPos, Size, atomic::acq_rel);
MD = AllocationMetadata::getFromAddr(R);
}

View File

@@ -21,11 +21,13 @@
using namespace ompx;
char *CONSTANT(omptarget_device_heap_buffer)
__attribute__((used, retain, weak, visibility("protected")));
size_t CONSTANT(omptarget_device_heap_size)
__attribute__((used, retain, weak, visibility("protected")));
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
__omp_rtl_device_memory_pool_tracker;
// TODO: implement Device Debug Allocation Tracker
namespace {
constexpr const size_t Alignment = 16;
@@ -92,7 +94,7 @@ template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
(mapping::getThreadIdInBlock() || mapping::getBlockIdInKernel()))
return;
size_t HeapSize = omptarget_device_heap_size;
size_t HeapSize = __omp_rtl_device_memory_pool.Size;
FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
@@ -104,7 +106,7 @@ template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
FirstTeamSize = TeamHeapSize;
char *LastLimit = omptarget_device_heap_buffer;
char *LastLimit = reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr);
for (int I = 0; I < WARP_SIZE; ++I) {
for (int J = 0; J < TEAM_SIZE; ++J) {
Entries[I][J] = nullptr;
@@ -173,8 +175,8 @@ template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
}
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
if (!utils::isInRange(P, omptarget_device_heap_buffer,
omptarget_device_heap_size))
if (!utils::isInRange(P, reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr),
__omp_rtl_device_memory_pool.Size))
return {};
auto TeamSlot = getTeamSlot();
@@ -192,14 +194,16 @@ template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
return {};
if (E->getEndPtr() <= P)
return {};
do {
bool isFirst = false;
while (!isFirst) {
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
if (!E->isUsed())
return {};
return {E->getUserPtr(), E->getUserSize()};
}
isFirst = E->isFirst();
E = E->getPrev();
} while (!E->isFirst());
}
}
}
return {};
@@ -211,7 +215,7 @@ private:
return Limits[TIdInWarp][TeamSlot - 1];
if (TIdInWarp)
return Limits[TIdInWarp - 1][TEAM_SIZE - 1];
return omptarget_device_heap_buffer;
return reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr);
}
char *getBlockEnd(int32_t TIdInWarp, int32_t TeamSlot) const {
return Limits[TIdInWarp][TeamSlot];

View File

@@ -21,11 +21,13 @@
using namespace ompx;
char *CONSTANT(omptarget_device_heap_buffer)
__attribute__((used, retain, weak, visibility("protected")));
size_t CONSTANT(omptarget_device_heap_size)
__attribute__((used, retain, weak, visibility("protected")));
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
__omp_rtl_device_memory_pool_tracker;
// TODO: implement Device Debug Allocation Tracker
namespace {
constexpr const size_t Alignment = 16;
@@ -92,7 +94,7 @@ template <int32_t WARP_SIZE> struct WarpAllocator {
(mapping::getThreadIdInBlock() || mapping::getBlockId()))
return;
size_t HeapSize = omptarget_device_heap_size;
size_t HeapSize = __omp_rtl_device_memory_pool.Size;
size_t FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
size_t OtherThreadHeapSize =
@@ -102,7 +104,7 @@ template <int32_t WARP_SIZE> struct WarpAllocator {
for (int I = 0; I < WARP_SIZE; ++I) {
Entries[I] = nullptr;
size_t PrivateOffset = OtherThreadHeapSize * I + FirstThreadHeapSize;
Limits[I] = omptarget_device_heap_buffer + PrivateOffset;
Limits[I] = reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr) + PrivateOffset;
}
}
@@ -160,8 +162,8 @@ template <int32_t WARP_SIZE> struct WarpAllocator {
}
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
if (!utils::isInRange(P, omptarget_device_heap_buffer,
omptarget_device_heap_size))
if (!utils::isInRange(P, reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr),
__omp_rtl_device_memory_pool.Size))
return {};
auto TIdInWarp = mapping::getThreadIdInWarp();
@@ -190,7 +192,7 @@ template <int32_t WARP_SIZE> struct WarpAllocator {
private:
char *getBlockBegin(int32_t TIdInWarp) const {
return TIdInWarp ? Limits[TIdInWarp - 1] : omptarget_device_heap_buffer;
return TIdInWarp ? Limits[TIdInWarp - 1] : reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr);
}
char *getBlockEnd(int32_t TIdInWarp) const { return Limits[TIdInWarp]; }

View File

@@ -426,8 +426,8 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
uint64_t &ReqPtrArgOffset);
// Host RPC support
int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNo);
void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal);
int64_t __kmpc_host_rpc_get_arg(void *descriptor, int32_t ArgNo);
void __kmpc_host_rpc_set_ret_val(void *descriptor, int64_t RetVal);
#ifdef __cplusplus
}

View File

@@ -514,31 +514,19 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
}
// Host RPC support functions.
EXTERN int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNum) {
auto *W = reinterpret_cast<hostrpc::DescriptorWrapper *>(Wrapper);
auto &SD = W->D;
EXTERN int64_t __kmpc_host_rpc_get_arg(void *descriptor, int32_t ArgNum) {
hostrpc::Descriptor &SD = *reinterpret_cast<hostrpc::Descriptor *>(descriptor);
assert(ArgNum < SD.NumArgs && "out-of-range argument");
int64_t ArgVal = SD.Args[ArgNum].Value;
void *ArgPtr = reinterpret_cast<void *>(ArgVal);
DP("[host-rpc] get argno=%d arg=%lx...\n", ArgNum, ArgVal);
if (W->StdIn && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
ArgPtr == W->StdIn)
return (int64_t)stdin;
if (W->StdOut && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
ArgPtr == W->StdOut)
return (int64_t)stdout;
if (W->StdErr && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
ArgPtr == W->StdErr)
return (int64_t)stderr;
return ArgVal;
}
EXTERN void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal) {
auto &SD = reinterpret_cast<hostrpc::DescriptorWrapper *>(Wrapper)->D;
EXTERN void __kmpc_host_rpc_set_ret_val(void *descriptor, int64_t RetVal) {
hostrpc::Descriptor &SD = *reinterpret_cast<hostrpc::Descriptor *>(descriptor);
SD.ReturnValue = RetVal;
}