Use GPUFirst with libc rpc

- add missing headers in rpc.h.def
- add an opcode in libc rpc to handle gpu first host functions calls
- Fix pointer casting
- Fix generated functions to account for the AMD address space
- remove LibC duplicate FILE declarations
- remove global variables to allow asynchronous rpc calls
This commit is contained in:
Nicolas Marie
2024-06-17 17:50:31 -07:00
parent c59cbdebfd
commit 3b1aae9380
8 changed files with 301 additions and 112 deletions

View File

@@ -16,6 +16,9 @@
#include <llvm-libc-types/rpc_opcodes_t.h>
#include <llvm-libc-types/rpc_port_t.h>
#include <stddef.h>
#include <stdio.h>
%%public_api()
#endif // LLVM_LIBC_GPU_RPC_H

View File

@@ -34,6 +34,7 @@ typedef enum {
RPC_PRINTF_TO_STDOUT,
RPC_PRINTF_TO_STDERR,
RPC_PRINTF_TO_STREAM,
RPC_GPUFIRST,
RPC_LAST = 0xFFFF,
} rpc_opcode_t;

View File

@@ -12,6 +12,7 @@
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm/IR/BasicBlock.h"
@@ -30,7 +31,6 @@
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include <cstdint>
#define DEBUG_TYPE "host-rpc"
@@ -73,7 +73,9 @@ __OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper)
static constexpr const char *InternalPrefix[] = {
"__kmp", "llvm.", "nvm.",
"omp_", "vprintf", "malloc",
"free", "__keep_alive", "__llvm_omp_vprintf"};
"free", "__keep_alive", "__llvm_omp_vprintf",
"rpc_"
};
bool isInternalFunction(Function &F) {
auto Name = F.getName();
@@ -150,8 +152,7 @@ class HostRPC {
SmallVector<Function *> HostEntryTable;
EnumeratedArray<Function *, HostRPCRuntimeFunction,
HostRPCRuntimeFunction::OMPRTL___last>
RFIs;
HostRPCRuntimeFunction::OMPRTL___last> RFIs;
SmallVector<std::pair<CallInst *, CallInst *>> CallInstMap;
@@ -228,21 +229,31 @@ public:
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...) \
{ \
SmallVector<Type *> Params{__VA_ARGS__}; \
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
Function *F = (MOD).getFunction(#_ENUM); \
if (!F) \
if (!F) { \
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
F = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage, \
#_ENUM, (MOD)); \
} \
RFIs[OMPRTL_##_ENUM] = F; \
}
// device functions:
// get information about the functions that we are calling
__OMP_RTL(__kmpc_host_rpc_get_desc, M, false, Int8PtrTy, Int32Ty, Int32Ty,
Int8PtrTy)
// get information about one of the arguments
__OMP_RTL(__kmpc_host_rpc_add_arg, M, false, VoidTy, Int8PtrTy, Int64Ty,
Int32Ty)
// send the function to the host
__OMP_RTL(__kmpc_host_rpc_send_and_wait, M, false, Int64Ty, Int8PtrTy)
// host functions:
// get arguments (mirror of add arg)
__OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
// send the return value
__OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
Int64Ty)
// Invoke the function on the host
__OMP_RTL(__kmpc_host_rpc_invoke_host_wrapper, HM, false, VoidTy, Int32Ty,
Int8PtrTy)
#undef __OMP_RTL
@@ -298,6 +309,10 @@ Value *HostRPC::convertFromInt64TyTo(Value *V, Type *T) {
return Builder.CreateBitCast(V, T);
}
LLVM_DEBUG(dbgs() << "[HostRPC] unknown type " << *T
<< " for typeFromint64_t.\n";);
llvm_unreachable("unknown cast from int64_t");
}
@@ -310,39 +325,40 @@ Constant *HostRPC::convertToInt64Ty(Constant *C) {
if (T->isPointerTy())
return ConstantExpr::getPtrToInt(C, Int64Ty);
if (T->isIntegerTy())
llvm_unreachable("I don't know how to fixe this");
//return ConstantExpr::getIntegerCast(C, Int64Ty, /* isSigned */ true);
if (T->isIntegerTy()) {
return ConstantFoldIntegerCast(C, Int64Ty, true, DL);
}
if (T->isFloatingPointTy()) {
// TODO: FIXEME getIntegerCast is hard to implement with new version of ConstExpr
//C = ConstantExpr::getBitCast(
// C, Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()));
//return ConstantExpr::getIntegerCast(C, Int64Ty, /* isSigned */ true);
llvm_unreachable("unsuported cast from float to int64_t");
// cast to an int of the same size
C = ConstantExpr::getBitCast(C,
Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()));
// resize the integer to 64 bits (signed cast)
return ConstantFoldIntegerCast(C, Int64Ty, true, DL);
}
llvm_unreachable("unknown cast to int64_t");
}
Constant *HostRPC::convertFromInt64TyTo(Constant *C, Type *T) {
assert(C->getType() == Int64Ty);
if (T == Int64Ty)
return C;
if (T->isPointerTy())
return ConstantExpr::getIntToPtr(C, T);
if (T->isIntegerTy())
llvm_unreachable("I don't know how to fixe this");
//return ConstantExpr::getIntegerCast(C, T, /* isSigned */ true);
if (T->isIntegerTy()) {
return ConstantFoldIntegerCast(C, T, true, DL);
}
if (T->isFloatingPointTy()) {
// TODO: FIXEME getIntegerCast is hard to implement with new version of ConstExpr
//C = ConstantExpr::getIntegerCast(
// C, Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()),
// /* isSigned */ true);
//return ConstantExpr::getBitCast(C, T);
llvm_unreachable("unsuported cast from int64_t to float");
// change size to T size
C = ConstantFoldIntegerCast(C,
Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()), true, DL);
// from int to float
return ConstantExpr::getBitCast(C, T);
}
llvm_unreachable("unknown cast from int64_t");
@@ -382,6 +398,10 @@ bool HostRPC::recollectInformation() {
if (F.use_empty())
continue;
LLVM_DEBUG({
dbgs() << "[HostRPC] RPCing function: " << F.getName() << "\n"
<< F << "\n";
});
FunctionWorkList.insert(&F);
}
@@ -391,11 +411,14 @@ bool HostRPC::recollectInformation() {
bool HostRPC::run() {
bool Changed = false;
LLVM_DEBUG(dbgs() << "[HostRPC] Running Pass\n");
if (!recollectInformation())
return Changed;
Changed = true;
// We add a couple of assumptions to those RPC functions such that AAs will
// not error out because of unknown implementation of those functions.
for (Function &F : M) {
@@ -424,7 +447,7 @@ bool HostRPC::run() {
}
}
LLVM_DEBUG(M.dump());
//LLVM_DEBUG(M.dump());
registerAAs();
@@ -438,6 +461,7 @@ bool HostRPC::run() {
if (!Changed)
return Changed;
// Replace every call to the function with a call to the RPC wrapper that replaced it.
for (auto Itr = CallInstMap.rbegin(); Itr != CallInstMap.rend(); ++Itr) {
auto *CI = Itr->first;
auto *NewCI = Itr->second;
@@ -445,6 +469,7 @@ bool HostRPC::run() {
CI->eraseFromParent();
}
// Erase all traces of the function from the module.
for (Function *F : FunctionWorkList)
if (F->user_empty())
F->eraseFromParent();
@@ -556,16 +581,21 @@ bool HostRPC::rewriteWithHostRPC(Function *F) {
Value *Operand = CI->getArgOperand(I);
LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Argument: " << I << ": " << *Operand << "\n"; });
// Check if scalar type.
if (!Operand->getType()->isPointerTy()) {
AII.emplace_back();
HandleDirectUse(Operand, AII.back());
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Constant: " << *Operand << "\n"; });
continue;
}
if (CheckIfNullPtr(Operand))
if (CheckIfNullPtr(Operand)){
LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Null Ptr: " << *Operand << "\n"; });
continue;
}
auto Pred = [&](Value &Obj) {
if (CheckIfNullPtr(&Obj))
@@ -592,6 +622,7 @@ bool HostRPC::rewriteWithHostRPC(Function *F) {
: ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM);
} else if (CheckIfDynAlloc(&Obj)) {
// We will handle this case at runtime so here we don't do anything.
LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Dynamic Alloc: " << *Operand << "\n"; });
return true;
} else if (isa<AllocaInst>(&Obj)) {
llvm_unreachable("alloca instruction needs to be handled!");
@@ -607,9 +638,24 @@ bool HostRPC::rewriteWithHostRPC(Function *F) {
return true;
};
auto &AAUO = *A.getOrCreateAAFor<AAUnderlyingObjects>(
IRPosition::callsite_argument(*CI, I), nullptr, DepClassTy::NONE);
if (!AAUO.forallUnderlyingObjects(Pred))
LLVM_DEBUG({
dbgs() << "[HostRPC] function rewrite:\n"
<< "Function: " << *F << "\n"
<< "Call site: " << *CI << "\n "
<< "Operand: " << *Operand << "\n";
});
// TODO replace with LLVM functions to not use Attributors.
assert(!IRPosition::callsite_argument(*CI, I)
.getAnchorScope()->hasFnAttribute(Attribute::OptimizeNone)
&& "[HostRPC]: Optimize None is not supported");
const llvm::AAUnderlyingObjects* AAUO =
A.getOrCreateAAFor<AAUnderlyingObjects>(
IRPosition::callsite_argument(*CI, I));
LLVM_DEBUG({dbgs() << "[HostRPC] AAUO:" << AAUO << "\n";});
if (!AAUO->forallUnderlyingObjects(Pred))
llvm_unreachable("internal error");
}
@@ -625,22 +671,27 @@ bool HostRPC::rewriteWithHostRPC(Function *F) {
Value *Next = NullPtr;
for (auto &AI : AII) {
Value *AIV = Builder.CreateAlloca(ArgInfoTy);
Value *AIIArg =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(0)});
Builder.Insert(AIIArg);
Builder.CreateStore(convertToInt64Ty(AI.BasePtr), AIIArg);
Value *AIIType =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(1)});
Builder.Insert(AIIType);
Builder.CreateStore(AI.Type, AIIType);
Value *AIISize =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(2)});
Builder.Insert(AIISize);
Builder.CreateStore(AI.Size, AIISize);
Value *AIINext =
GetElementPtrInst::Create(Int8PtrTy, AIV, {getConstantInt64(3)});
Builder.Insert(AIINext);
Builder.CreateStore(Next, AIINext);
Next = AIV;
}
Value *AIIV = GetElementPtrInst::Create(Int8PtrTy, ArgInfoVal,
@@ -659,16 +710,25 @@ bool HostRPC::rewriteWithHostRPC(Function *F) {
cast<Constant>(AI.Size), Last});
auto *GV = new GlobalVariable(
M, ArgInfoTy, /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, CS);
GlobalValue::LinkageTypes::InternalLinkage, CS, "",
nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 0);
// force address space 0 on AMD GPUs
// instead of address space 1 for globals
Last = GV;
}
ArgInfoInitVar.push_back(Last);
LLVM_DEBUG({
dbgs() << "[HostRPC] ArgInfoInitVar:" << *Last << "\n";
});
ArgInfoInitVar.push_back(Last);
}
Constant *ArgInfoInit = ConstantArray::get(
ArrayType::get(Int8PtrTy, NumArgs), ArgInfoInitVar);
ArgInfoVal = new GlobalVariable(
M, ArrayType::get(Int8PtrTy, NumArgs), /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info");
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info",
nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 0);
}
SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
@@ -710,16 +770,24 @@ Function *HostRPC::getDeviceWrapperFunction(StringRef WrapperName, Function *F,
Value *Desc = nullptr;
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
LLVM_DEBUG({dbgs() << "[HostRPC] Building: rpc get desc: " << *Fn << "\n"; });
for (unsigned i = 0; i < 3; ++i)
LLVM_DEBUG({dbgs() << "ParamI: " << *(Fn->getFunctionType()->getParamType(i)) << "\n"; });
Desc = Builder.CreateCall(
Fn,
{WrapperFn->getArg(0),
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
WrapperFn->getArg(1)},
"desc");
Fn,
{
WrapperFn->getArg(0),
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
WrapperFn->getArg(1)
},
"desc"
);
}
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
LLVM_DEBUG({dbgs() << "[HostRPC] Building: rpc add arg\n"; });
for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
Value *V = convertToInt64Ty(WrapperFn->getArg(I));
Builder.CreateCall(
@@ -727,6 +795,7 @@ Function *HostRPC::getDeviceWrapperFunction(StringRef WrapperName, Function *F,
}
}
LLVM_DEBUG({dbgs() << "[HostRPC] Building: rpc send and wait\n"; });
Value *RetVal =
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});
@@ -740,6 +809,8 @@ Function *HostRPC::getDeviceWrapperFunction(StringRef WrapperName, Function *F,
Builder.CreateRet(RetVal);
LLVM_DEBUG({dbgs() << "[HostRPC] Device Wrapper Function:\n" << *WrapperFn; });
return WrapperFn;
}

View File

@@ -102,7 +102,11 @@ set(src_files
${source_directory}/Workshare.cpp
)
# WarpAllocator.cpp is missing from this list
if (LIBOMPTARGET_DEVICE_BUILTIN_ALLOCATOR)
# Use the allocator already built into DeviceRTL instead of the GPUFirst one.
# It does not support RPC calls whose arguments point to GPU memory,
# because allocation information is not saved.
list(APPEND src_files ${source_directory}/BuiltinAllocator.cpp)
elseif (LIBOMPTARGET_GENERIC_ALLOCATOR)
list(APPEND src_files ${source_directory}/GenericAllocator.cpp)
@@ -110,6 +114,7 @@ else()
list(APPEND src_files ${source_directory}/TeamAllocator.cpp)
endif()
# We disable the slp vectorizer during the runtime optimization to avoid
# vectorized accesses to the shared state. Generally, those are "good" but
# the optimizer pipeline (esp. Attributor) does not fully support vectorized
@@ -139,8 +144,10 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
-I${include_directory}
-I${devicertl_base_directory}/../include
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
-I${CMAKE_SOURCE_DIR}/../libc/include
)
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_DEBUG)
list(APPEND bc_flags "-DHOSTRPC_DEBUG")
endif()
@@ -316,6 +323,7 @@ set_target_properties(omptarget.devicertl PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY "${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}"
LINKER_LANGUAGE CXX
)
target_link_libraries(omptarget.devicertl PRIVATE omptarget.devicertl.all_objs)
install(TARGETS omptarget.devicertl ARCHIVE DESTINATION ${OFFLOAD_INSTALL_LIBDIR})

View File

@@ -14,10 +14,6 @@
#include "Types.h"
struct FILE;
extern FILE *stdin;
extern FILE *stdout;
extern FILE *stderr;
#ifndef _ASM_GENERIC_ERRNO_BASE_H
#define _ASM_GENERIC_ERRNO_BASE_H

View File

@@ -18,6 +18,8 @@
#include "Utils.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm-libc-types/rpc_opcodes_t.h"
#include "llvm-libc-types/rpc_port_t.h"
#ifdef HOSTRPC_DEBUG
#define DEBUG_PREFIX "host-rpc-device"
@@ -32,15 +34,6 @@ using namespace hostrpc;
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
Descriptor *omptarget_hostrpc_descriptor
__attribute__((used, retain, weak, visibility("protected")));
int32_t *omptarget_hostrpc_futex
__attribute__((used, retain, weak, visibility("protected")));
char *omptarget_hostrpc_memory_buffer
__attribute__((used, retain, weak, visibility("protected")));
size_t omptarget_hostrpc_memory_buffer_size
__attribute__((used, retain, weak, visibility("protected")));
#ifdef HOSTRPC_PROFILING
int32_t HostRPCId;
double GetDescStart;
@@ -53,33 +46,21 @@ double CopyBackStart;
double CopyBackEnd;
#endif
// Forward declarations of the libc RPC device functions used by this file.
// TODO: replace when a proper header exposing device functions is created
//
// Usage in this file: a port is opened for an opcode, buffers are exchanged
// with rpc_send_n / rpc_recv_n, and the port is closed with rpc_close_port.
// NOTE(review): the exact semantics (blocking behavior, who allocates `dst`,
// meaning of `*size` on return) are defined by libc, not here -- confirm
// against the libc RPC implementation.
extern "C" {
// Opens a port for the given opcode and returns its handle.
rpc_port_t rpc_open_port(rpc_opcode_t);
// Sends `size` bytes starting at `src` over the port.
void rpc_send_n(rpc_port_t *handle, const void *src, size_t size);
// Receives data from the port into `dst`; `size` is an out-parameter.
void rpc_recv_n(rpc_port_t *handle, void *dst, size_t *size);
// Closes a port previously returned by rpc_open_port.
void rpc_close_port(rpc_port_t *handle);
}
namespace {
size_t HostRPCMemoryBufferCurrentPosition = 0;
constexpr const size_t Alignment = 16;
// FIXME: For now we only allow one thread requesting host RPC.
mutex::TicketLock HostRPCLock;
void *HostRPCMemAlloc(size_t Size) {
Size = utils::align_up(Size, Alignment);
if (Size + HostRPCMemoryBufferCurrentPosition <
omptarget_hostrpc_memory_buffer_size) {
void *R =
omptarget_hostrpc_memory_buffer + HostRPCMemoryBufferCurrentPosition;
atomic::add(&HostRPCMemoryBufferCurrentPosition, Size, atomic::acq_rel);
return R;
}
printf("%s:%d\n", __FILE__, __LINE__);
__builtin_trap();
return nullptr;
}
// For now we just reset the buffer.
void HostRPCMemReset() { HostRPCMemoryBufferCurrentPosition = 0; }
static_assert(sizeof(intptr_t) == sizeof(int64_t), "pointer size not match");
struct HostRPCArgInfo {
@@ -108,7 +89,7 @@ void *getMappedPointer(Descriptor *D, void *BasePtr, int64_t Size,
return utils::advance(MapTable[I].MappedBasePtr, Offset);
MapTable[I].BasePtr = BasePtr;
MapTable[I].MappedBasePtr = HostRPCMemAlloc(Size);
MapTable[I].MappedBasePtr = malloc(Size);
MapTable[I].Size = Size;
MapTable[I].Kind = Kind;
@@ -140,10 +121,7 @@ void copybackIfNeeded(Descriptor *D) {
extern "C" {
__attribute__((noinline, used)) void *
__kmpc_host_rpc_get_desc(int32_t CallId, int32_t NumArgs, void *ArgInfo) {
assert(omptarget_hostrpc_descriptor && omptarget_hostrpc_futex &&
"no host rpc pointer");
DP("device: stdin=%p, stdout=%p, stderr=%p\n", stdin, stdout, stderr);
DP("get desc for request (id=%d), NumArgs=%d, ArgInfo=%p.\n", CallId, NumArgs,
ArgInfo);
#ifdef HOSTRPC_DEBUG
@@ -154,15 +132,12 @@ __kmpc_host_rpc_get_desc(int32_t CallId, int32_t NumArgs, void *ArgInfo) {
}
#endif
HostRPCLock.lock();
#ifdef HOSTRPC_PROFILING
HostRPCId = CallId;
GetDescStart = omp_get_wtime();
#endif
// TODO: change it after we support a queue-like data structure.
Descriptor *D = omptarget_hostrpc_descriptor;
Descriptor *D = (Descriptor *) malloc(sizeof(Descriptor));
D->Id = CallId;
D->ArgInfo = reinterpret_cast<void **>(ArgInfo);
@@ -170,8 +145,8 @@ __kmpc_host_rpc_get_desc(int32_t CallId, int32_t NumArgs, void *ArgInfo) {
D->Status = EXEC_STAT_CREATED;
D->ReturnValue = 0;
D->Args =
reinterpret_cast<Argument *>(HostRPCMemAlloc(sizeof(Argument) * NumArgs));
D->ArgMap = HostRPCMemAlloc(sizeof(HostRPCPointerMapEntry) * NumArgs);
reinterpret_cast<Argument *>(malloc(sizeof(Argument) * NumArgs));
D->ArgMap = malloc(sizeof(HostRPCPointerMapEntry) * NumArgs);
assert(!NumArgs || (D->Args && D->ArgMap) && "out of host rpc memory!");
@@ -209,15 +184,6 @@ __kmpc_host_rpc_add_arg(void *Desc, int64_t ArgVal, int32_t ArgNum) {
void *ArgPtr = reinterpret_cast<void *>(ArgVal);
if (ArgPtr == stdin || ArgPtr == stdout || ArgPtr == stderr) {
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is stdin/stdout/stderr, done.\n", ArgNum);
return;
}
const auto *AI = reinterpret_cast<HostRPCArgInfo *>(D->ArgInfo[ArgNum]);
DP("try to find arg (no=%d) from args AI=%p\n", ArgNum, AI);
@@ -299,7 +265,7 @@ __kmpc_host_rpc_add_arg(void *Desc, int64_t ArgVal, int32_t ArgNum) {
__attribute__((noinline, used)) int64_t
__kmpc_host_rpc_send_and_wait(void *Desc) {
auto *D = reinterpret_cast<Descriptor *>(Desc);
Descriptor *D = reinterpret_cast<Descriptor *>(Desc);
int32_t Id = D->Id;
#ifdef HOSTRPC_PROFILING
@@ -307,22 +273,61 @@ __kmpc_host_rpc_send_and_wait(void *Desc) {
IssueAndWaitStart = omp_get_wtime();
#endif
atomic::add(omptarget_hostrpc_futex, 1U, atomic::acq_rel);
// A system fence is required to make sure futex on the host is also
// updated if USM is supported.
fence::system(atomic::seq_cst);
DP("sent request (id=%d) to host. waiting for finish.\n", Id);
// // WORKING back & forth of an uint64_t
//
// printf("[HostRPC] [Device]: Start \n");
//
// rpc_port_t port = rpc_open_port(RPC_GPUFIRST);
//
// uint64_t size_send = sizeof(uint64_t);
// void *buf_send = malloc(size_send);
// *((uint64_t *) buf_send) = 123456789;
//
// printf("[Hostrpc] [Device] [SEND]: %lu\n", *((uint64_t *) buf_send));
// printf("[HostRPC] [Device] [SEND] Size: %lu\n", size_send);
//
// rpc_send_n(&port, buf_send, size_send);
//
//
// uint64_t size_recv = sizeof(uint64_t);
// void *buf_recv = malloc(size_recv);
//
// rpc_recv_n(&port, buf_recv, &size_recv);
//
// printf("[HostRPC] [Device] [RECV]: %lu\n", *((uint64_t *) buf_recv));
// printf("[HostRPC] [Device] [RECV] Size: %lu\n", size_recv);
//
// rpc_close_port(&port);
//
// assert(size_send == size_recv);
//
// printf("[HostRPC] [Device]: End \n");
//
// // END of working part
unsigned NS = 8;
while (atomic::addSys(omptarget_hostrpc_futex, 0)) {
asm volatile("nanosleep.u32 %0;" : : "r"(NS));
// if (NS < 64)
// NS *= 2;
// fence::system(atomic::seq_cst);
}
rpc_port_t port = rpc_open_port(RPC_GPUFIRST);
Argument *Args = D->Args;
rpc_send_n(&port, D, sizeof(Descriptor));
rpc_send_n(&port, Args, sizeof(Argument) * D->NumArgs);
// The host calls the function here, between the sends and the receives.
// size_recv is only an out-parameter; its value is unused.
uint64_t size_recv = 0;
rpc_recv_n(&port, D, &size_recv);
rpc_recv_n(&port, Args, &size_recv);
D->Args = Args;
(void) size_recv;
rpc_close_port(&port);
#ifdef HOSTRPC_PROFILING
IssueAndWaitEnd = omp_get_wtime();
@@ -348,11 +353,14 @@ __kmpc_host_rpc_send_and_wait(void *Desc) {
CopyBackEnd = omp_get_wtime();
#endif
HostRPCMemReset();
// We can unlock now as we already get all temporary part.
// TODO: If we have a queue, we don't need this step.
HostRPCLock.unlock();
// free memory allocated for the call
HostRPCPointerMapEntry *MapTable = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
for(int i = 0; i < D->NumArgs && MapTable[i].BasePtr; ++i){
free(MapTable[i].MappedBasePtr);
}
free(D->Args);
free(D->ArgMap);
free(D);
DP("request (id=%d) is done with return code=%lx.\n", Id, Ret);
@@ -399,7 +407,7 @@ __kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
ArgInfoArray[4].Size = sizeof(void *) * nargs;
void *Args = nullptr;
if (nargs) {
Args = HostRPCMemAlloc(ArgInfoArray[4].Size);
Args = malloc(ArgInfoArray[4].Size);
__builtin_memcpy(Args, args, ArgInfoArray[4].Size);
}
ArgInfoArray[4].BasePtr = Args;

View File

@@ -45,8 +45,8 @@ struct Descriptor {
int32_t Id;
struct Argument *Args;
int64_t NumArgs;
volatile int64_t Status;
volatile int64_t ReturnValue;
int64_t Status;
int64_t ReturnValue;
// The following members will only be used by device.
void **ArgInfo;

View File

@@ -15,12 +15,45 @@
#if defined(LIBOMPTARGET_RPC_SUPPORT)
#include "llvm-libc-types/rpc_opcodes_t.h"
#include "llvmlibc_rpc_server.h"
#include "HostRPC.h"
#include "llvm/Support/DynamicLibrary.h"
#endif
using namespace llvm;
using namespace omp;
using namespace target;
#ifdef LIBOMPTARGET_RPC_SUPPORT
// GPUFirst Host Function Wrapper Invoker
/// Resolves the host entry point `__kmpc_host_rpc_invoke_host_wrapper` on first
/// use and forwards GPUFirst RPC requests to it.
///
/// The symbol is looked up exactly once (guarded by a std::once_flag) through
/// sys::DynamicLibrary::getPermanentLibrary(nullptr).
/// NOTE(review): a null filename is expected to search the symbols of the
/// running program itself -- confirm against the DynamicLibrary documentation.
class HostRPCInvokerWrapper {
  /// Signature of `__kmpc_host_rpc_invoke_host_wrapper`: (call id, descriptor).
  using InvokerFnTy = void (*)(int32_t, void *);

  InvokerFnTy Invoker = nullptr;
  std::unique_ptr<sys::DynamicLibrary> DL;
  std::once_flag Flag;

  /// Loads the library handle and resolves the invoker symbol. Failures are
  /// only diagnosed through assertions.
  void initInvoker() {
    std::string ErrMsg;
    DL = std::make_unique<sys::DynamicLibrary>(
        sys::DynamicLibrary::getPermanentLibrary(nullptr, &ErrMsg));
    assert(DL->isValid() && "invalid DL");
    // Convert the returned address with an explicit cast rather than storing
    // through a `void **` alias of the function pointer, which would access the
    // function-pointer object through an unrelated lvalue type.
    Invoker = reinterpret_cast<InvokerFnTy>(
        DL->getAddressOfSymbol("__kmpc_host_rpc_invoke_host_wrapper"));
    assert(Invoker && "Invoker is nullptr");
  }

public:
  /// Calls the host wrapper dispatch function.
  ///
  /// \param CallNo identifier of the wrapper to run, forwarded unchanged.
  /// \param Desc   opaque pointer to the request descriptor, forwarded
  ///               unchanged.
  void invoke(int32_t CallNo, void *Desc) {
    std::call_once(Flag, &HostRPCInvokerWrapper::initInvoker, this);
    Invoker(CallNo, Desc);
  }
};
// Invoker shared by all RPC_GPUFIRST handler invocations. It starts out null
// and is allocated with `new` by the handler the first time it runs.
HostRPCInvokerWrapper *Invoker;
// GPUFirst END
#endif
RPCServerTy::RPCServerTy(plugin::GenericPluginTy &Plugin)
: Handles(Plugin.getNumDevices()) {}
@@ -89,6 +122,75 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
"Failed to register RPC free handler for device %d: %d\n",
Device.getDeviceId(), Err);
// GPUFirst
// Register custom opcode handler for gpu first
// Host-side handler for the RPC_GPUFIRST opcode. It receives a Descriptor and
// its Argument array from the port, runs the requested host wrapper, sends
// both buffers back, and frees the host copies. `Data` is not read here.
auto GPUFirstHandler = [](rpc_port_t port, void *Data) {
// Disabled scaffolding kept for reference: a minimal round trip of a single
// uint64_t over the port.
// printf("[HostRPC] [Host]: GPUFirstHandler\n");
// // WORKING back & forth of an uint64_t
//
// printf("[HostRPC] [Host]: Start \n");
//
// uint64_t size_recv = 0;
// void *buf_recv = nullptr;
//
// rpc_recv_n(port, &buf_recv, &size_recv,
// [](uint64_t size, void* data){ return malloc(size); }, nullptr);
//
// printf("[HostRPC] [Host] [RECV]: %lu\n", *((uint64_t *) buf_recv));
// printf("[HostRPC] [Host] [RECV] Size: %lu\n", size_recv);
//
// uint64_t size_send = sizeof(uint64_t);
// void *buf_send = malloc(size_send);
// *((uint64_t *) buf_send) = 987654321;
//
// printf("[Hostrpc] [Host] [SEND]: %lu\n", *((uint64_t *) buf_send));
// printf("[HostRPC] [Host] [SEND] Size: %lu\n", size_send);
//
// rpc_send_n(port, &buf_send, &size_send);
//
// printf("[HostRPC] [Host]: End \n");
//
// // END of working part
// Receive helper: the destination buffer is obtained from the malloc-based
// allocation callback, so the caller must release it with free().
auto _rpc_recv_n = [](rpc_port_t *handle, void **dst, size_t *size){
rpc_recv_n(*handle, dst, size,
[](uint64_t size, void* data){ return malloc(size); },
nullptr);
};
// Send helper: adapts a plain (pointer, size) pair to rpc_send_n, which takes
// the addresses of both.
auto _rpc_send_n = [](rpc_port_t *handle, void *src, size_t size){
rpc_send_n(*handle, &src, &size);
};
// Overwritten by each receive; its value is not read afterwards.
uint64_t size_recv = 0;
hostrpc::Descriptor *D = nullptr;
hostrpc::Argument *Args = nullptr;
// The descriptor arrives first, then the argument array.
_rpc_recv_n(&port, reinterpret_cast<void **>(&D), &size_recv);
_rpc_recv_n(&port, reinterpret_cast<void **>(&Args), &size_recv);
// Point the received descriptor at the host copy of the arguments.
// NOTE(review): the Args value received inside D presumably refers to device
// memory -- confirm.
D->Args = Args;
// Create the invoker on first use.
// NOTE(review): this check-then-allocate is not synchronized -- confirm the
// handler cannot run concurrently on several threads.
if(Invoker == nullptr)
Invoker = new HostRPCInvokerWrapper();
Invoker->invoke(D->Id, D);
// Return the (possibly updated) descriptor and arguments in the same order.
_rpc_send_n(&port, D, sizeof(hostrpc::Descriptor));
_rpc_send_n(&port, D->Args, sizeof(hostrpc::Argument) * D->NumArgs);
// Release the two buffers allocated by the receive helper.
free(D->Args);
free(D);
};
if (rpc_status_t Err =
rpc_register_callback(RPCDevice, RPC_GPUFIRST, GPUFirstHandler, &Invoker))
return plugin::Plugin::error(
"Failed to register RPC GPU First handler for device %d: %d\n", Device.getDeviceId(),
Err);
// GPUFirst END
// Get the address of the RPC client from the device.
void *ClientPtr;
plugin::GlobalTy ClientGlobal(rpc_client_symbol_name, sizeof(void *));