Compare commits

...

1 Commits

Author SHA1 Message Date
Shilei Tian
555fb762be [OpenMP] Introduce dynamic memory allocator for OpenMP target offloading 2022-07-27 23:27:24 -04:00
9 changed files with 203 additions and 30 deletions

View File

@@ -91,6 +91,7 @@ set(include_files
${include_directory}/Debug.h
${include_directory}/Interface.h
${include_directory}/Mapping.h
${include_directory}/Memory.h
${include_directory}/State.h
${include_directory}/Synchronization.h
${include_directory}/Types.h
@@ -102,6 +103,7 @@ set(src_files
${source_directory}/Debug.cpp
${source_directory}/Kernel.cpp
${source_directory}/Mapping.cpp
${source_directory}/Memory.cpp
${source_directory}/Misc.cpp
${source_directory}/Parallelism.cpp
${source_directory}/Reduction.cpp

View File

@@ -0,0 +1,23 @@
//===--- Memory.h - OpenMP device runtime memory allocator -------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_MEMORY_H
#define OMPTARGET_MEMORY_H
#include "Types.h"
extern "C" {
/// Device-side allocation entry point. The definition added in Memory.cpp
/// returns nullptr when the request cannot be satisfied.
__attribute__((leaf)) void *malloc(size_t Size);
/// Device-side deallocation entry point. The definition added in Memory.cpp
/// has an empty body, i.e. memory is not handed back to the heap.
__attribute__((leaf)) void free(void *Ptr);
}
#endif

View File

@@ -62,6 +62,9 @@ namespace atomic {
/// Atomically load \p Addr with \p Ordering semantics.
uint32_t load(uint32_t *Addr, int Ordering);
/// Atomically load \p Addr with \p Ordering semantics.
uint64_t load(uint64_t *Addr, int Ordering);
/// Atomically store \p V to \p Addr with \p Ordering semantics.
void store(uint32_t *Addr, uint32_t V, int Ordering);
@@ -76,6 +79,35 @@ uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);
} // namespace atomic
namespace mutex {
/// Lock built from two 64-bit counters: one for the ticket currently being
/// served and one for the next ticket to hand out. Instances can be neither
/// copied nor moved.
class TicketLock {
public:
  TicketLock() = default;

  // The lock is identified by its address; forbid copying and moving.
  TicketLock(const TicketLock &) = delete;
  TicketLock(TicketLock &&) = delete;
  TicketLock &operator=(const TicketLock &) = delete;
  TicketLock &operator=(TicketLock &&) = delete;

  /// Acquire the lock (defined out of line).
  void lock();

  /// Release the lock (defined out of line).
  void unlock();

private:
  /// Ticket that is currently allowed to hold the lock.
  uint64_t NowServing{0};
  /// Ticket that will be handed to the next caller of lock().
  uint64_t NextTicket{0};
};
/// Scoped lock holder: calls \p T::lock() on construction and \p T::unlock()
/// on destruction, so the lock is released on every path out of the enclosing
/// scope.
///
/// The guard is neither copyable nor assignable. A copy would run a second
/// destructor and therefore call unlock() twice for a single lock().
template <typename T> class LockGaurd {
  T &Lock;

public:
  /// Acquire \p L; it stays held for the lifetime of this object.
  explicit LockGaurd(T &L) : Lock(L) { Lock.lock(); }

  /// Release the lock acquired by the constructor.
  ~LockGaurd() { Lock.unlock(); }

  LockGaurd(const LockGaurd &) = delete;
  LockGaurd &operator=(const LockGaurd &) = delete;
};
} // namespace mutex
} // namespace _OMP
#endif

View File

@@ -32,6 +32,7 @@ using int32_t = int;
using uint32_t = unsigned int;
using int64_t = long;
using uint64_t = unsigned long;
// Defined from the type of a sizeof expression because no standard headers
// are included here.
using size_t = decltype(sizeof(char));
static_assert(sizeof(int8_t) == 1, "type size mismatch");
static_assert(sizeof(uint8_t) == 1, "type size mismatch");

View File

@@ -3,6 +3,7 @@ target_sources(omptarget.devicertl PRIVATE
Debug.cpp
Kernel.cpp
Mapping.cpp
Memory.cpp
Misc.cpp
Parallelism.cpp
Reduction.cpp

View File

@@ -0,0 +1,47 @@
//===------- Memory.cpp - OpenMP device runtime memory allocator -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Memory.h"
#include "Synchronization.h"
using namespace _OMP;
/// Start of the memory region malloc carves allocations from. The CUDA plugin
/// looks this symbol up by name at image load and copies the address of a
/// device allocation into it.
char *CONSTANT(omptarget_device_heap_buffer)
__attribute__((used, retain, weak, visibility("protected")));
/// Size in bytes of the region starting at omptarget_device_heap_buffer. It is
/// filled in by the CUDA plugin next to the buffer pointer.
size_t CONSTANT(omptarget_device_heap_size)
__attribute__((used, retain, weak, visibility("protected")));
namespace {
/// Offset from the heap start of the next byte malloc will hand out.
size_t HeapCurPos = 0;
/// Lock taken by malloc while it inspects and advances HeapCurPos.
mutex::TicketLock HeapLock;
}
extern "C" {
/// Hand out \p Size bytes from the preallocated device heap.
///
/// Allocations are carved sequentially out of omptarget_device_heap_buffer by
/// advancing HeapCurPos while holding HeapLock.
///
/// \param Size number of bytes requested.
/// \returns the start of the allocation, or nullptr if fewer than \p Size
///          bytes remain in the heap.
void *malloc(size_t Size) {
  // The cursor always advances by a multiple of this value, so returned
  // offsets keep this alignment relative to the heap start.
  constexpr size_t HeapAlignment = 16;

  mutex::LockGaurd LG(HeapLock);

  // Compare against the remaining space rather than computing
  // `Size + HeapCurPos`, which wraps around for very large requests and would
  // let them pass the bounds check.
  if (HeapCurPos > omptarget_device_heap_size)
    return nullptr;
  const size_t Remaining = omptarget_device_heap_size - HeapCurPos;
  if (Size > Remaining)
    return nullptr;

  // Round the request up to the alignment. If the padding no longer fits, the
  // allocation takes the tail of the heap.
  const size_t Padding = (HeapAlignment - Size % HeapAlignment) % HeapAlignment;
  const size_t Advance = (Padding <= Remaining - Size) ? Size + Padding
                                                       : Remaining;

  void *R = omptarget_device_heap_buffer + HeapCurPos;
  atomic::add(&HeapCurPos, Advance, __ATOMIC_SEQ_CST);
  return R;
}
void free(void *) {}
}
#pragma omp end declare target

View File

@@ -12,6 +12,8 @@
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Types.h"
@@ -34,36 +36,6 @@ extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
namespace {
/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{
extern "C" {
__attribute__((leaf)) void *malloc(uint64_t Size);
__attribute__((leaf)) void free(void *Ptr);
}
///}
/// AMDGCN implementations of malloc and free.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
extern "C" {
/// AMDGCN placeholder: always fails by returning nullptr, whatever \p Size is.
void *malloc(uint64_t Size) {
// TODO: Use some preallocated space for dynamic malloc.
return nullptr;
}
/// AMDGCN placeholder: does nothing with \p Ptr.
void free(void *Ptr) {}
}
#pragma omp end declare variant
///}
/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.

View File

@@ -35,6 +35,10 @@ uint32_t atomicLoad(uint32_t *Address, int Ordering) {
return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
}
/// Atomically read the 64-bit value at \p Address.
///
/// The read is performed as a fetch-add of zero, which leaves the stored value
/// unchanged and yields the value that was observed. \p Ordering is not
/// consulted; the operation always uses sequentially consistent ordering.
uint64_t atomicLoad(uint64_t *Address, int Ordering) {
  const uint64_t Observed = __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
  return Observed;
}
void atomicStore(uint32_t *Address, uint32_t Val, int Ordering) {
__atomic_store_n(Address, Val, Ordering);
}
@@ -320,6 +324,10 @@ uint32_t atomic::load(uint32_t *Addr, int Ordering) {
return impl::atomicLoad(Addr, Ordering);
}
/// 64-bit overload of atomic::load; forwards to the implementation-level
/// atomicLoad and returns what it yields.
uint64_t atomic::load(uint64_t *Addr, int Ordering) {
  const uint64_t Loaded = impl::atomicLoad(Addr, Ordering);
  return Loaded;
}
void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
impl::atomicStore(Addr, V, Ordering);
}
@@ -336,6 +344,17 @@ uint64_t atomic::add(uint64_t *Addr, uint64_t V, int Ordering) {
return impl::atomicAdd(Addr, V, Ordering);
}
void mutex::TicketLock::lock() {
uint64_t MyTicket = atomic::add(&NextTicket, 1U, __ATOMIC_SEQ_CST);
while (atomic::load(&NowServing, __ATOMIC_SEQ_CST) != MyTicket)
;
}
/// Release the lock by advancing NowServing to the next ticket.
void mutex::TicketLock::unlock() {
  // The value returned by the add is not needed here.
  (void)atomic::add(&NowServing, 1U, __ATOMIC_SEQ_CST);
}
extern "C" {
void __kmpc_ordered(IdentTy *Loc, int32_t TId) { FunctionTracingRAII(); }

View File

@@ -968,6 +968,82 @@ public:
}
}
// Initialize the heap buffer used by the device runtime's malloc.
//
// The device image may expose two globals: a pointer to the heap buffer and
// the heap size in bytes. If both are present, a buffer is allocated on the
// device and its address and size are written into them. An image lacking
// either symbol is treated as not needing a heap, and loading continues.
{
  const char *BufferVarName = "omptarget_device_heap_buffer";
  const char *SizeVarName = "omptarget_device_heap_size";
  CUdeviceptr BufferVarPtr;
  CUdeviceptr SizeVarPtr;
  size_t BufferVarSize;
  size_t SizeVarSize;

  Err = cuModuleGetGlobal(&BufferVarPtr, &BufferVarSize, Module,
                          BufferVarName);
  if (Err == CUDA_SUCCESS) {
    // A layout mismatch is not a CUDA failure (Err is CUDA_SUCCESS here), so
    // no CUDA error string is reported for it.
    if (BufferVarSize != sizeof(uint64_t)) {
      REPORT("Global heap buffer pointer '%s' - size mismatch (%zu "
             "!= %zu)\n",
             BufferVarName, BufferVarSize, sizeof(uint64_t));
      return nullptr;
    }

    Err = cuModuleGetGlobal(&SizeVarPtr, &SizeVarSize, Module, SizeVarName);
    if (Err == CUDA_SUCCESS) {
      if (SizeVarSize != sizeof(uint64_t)) {
        REPORT("Global heap size variable '%s' - size mismatch (%zu "
               "!= %zu)\n",
               SizeVarName, SizeVarSize, sizeof(uint64_t));
        return nullptr;
      }

      CUdeviceptr BufferPtr;
      size_t HeapSize = size_t(2) * 1024 * 1024 * 1024; // 2 GiB
      Err = cuMemAlloc(&BufferPtr, HeapSize);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error when allocating heap buffer, size = %zu\n", HeapSize);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }
      // NOTE(review): nothing in this block releases BufferPtr after a
      // successful load - confirm that it is freed when the module is
      // unloaded.

      // Publish the buffer address to the device.
      Err = cuMemcpyHtoD(BufferVarPtr, &BufferPtr, BufferVarSize);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error when copying data from host to device. Pointers: "
               "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
               DPxPTR(&BufferPtr), DPxPTR(BufferVarPtr), BufferVarSize);
        CUDA_ERR_STRING(Err);
        // Best-effort cleanup so the buffer does not leak; Err keeps the
        // original failure.
        (void)cuMemFree(BufferPtr);
        return nullptr;
      }

      // Publish the heap size to the device.
      Err = cuMemcpyHtoD(SizeVarPtr, &HeapSize, SizeVarSize);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error when copying data from host to device. Pointers: "
               "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
               DPxPTR(&HeapSize), DPxPTR(SizeVarPtr), SizeVarSize);
        CUDA_ERR_STRING(Err);
        // Best-effort cleanup so the buffer does not leak; Err keeps the
        // original failure.
        (void)cuMemFree(BufferPtr);
        return nullptr;
      }

      DP("Successfully set heap buffer. omptarget_device_heap_buffer "
         "= " DPxMOD ", omptarget_device_heap_size = %zu\n",
         DPxPTR(BufferPtr), HeapSize);
    } else {
      DP("Finding global heap size variable '%s' - symbol missing.\n",
         SizeVarName);
      DP("Continue, considering this is an image does not require heap "
         "allocation.\n");
    }
  } else {
    DP("Finding global heap buffer pointer '%s' - symbol missing.\n",
       BufferVarName);
    DP("Continue, considering this is an image does not require heap "
       "allocation.\n");
  }
}
return getOffloadEntriesTable(DeviceId);
}