Use GPUFirst with libc rpc
- add missing headers in rpc.h.def - add an opcode in libc rpc to handle GPUFirst host function calls - Fix pointer casting - Fix generated function to account for AMD address space - remove LibC duplicate FILE declarations - remove global variable to allow asynchronous rpc calls
This commit is contained in:
@@ -16,6 +16,9 @@
|
||||
#include <llvm-libc-types/rpc_opcodes_t.h>
|
||||
#include <llvm-libc-types/rpc_port_t.h>
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
|
||||
%%public_api()
|
||||
|
||||
#endif // LLVM_LIBC_GPU_RPC_H
|
||||
|
||||
@@ -34,6 +34,7 @@ typedef enum {
|
||||
RPC_PRINTF_TO_STDOUT,
|
||||
RPC_PRINTF_TO_STDERR,
|
||||
RPC_PRINTF_TO_STREAM,
|
||||
RPC_GPUFIRST,
|
||||
RPC_LAST = 0xFFFF,
|
||||
} rpc_opcode_t;
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include "llvm/Transforms/IPO/HostRPC.h"
|
||||
|
||||
#include "llvm/ADT/EnumeratedArray.h"
|
||||
#include "llvm/Analysis/ConstantFolding.h"
|
||||
#include "llvm/CodeGen/CommandFlags.h"
|
||||
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
|
||||
#include "llvm/IR/BasicBlock.h"
|
||||
@@ -30,7 +31,6 @@
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/Transforms/IPO/Attributor.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#define DEBUG_TYPE "host-rpc"
|
||||
@@ -73,7 +73,9 @@ __OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper)
|
||||
static constexpr const char *InternalPrefix[] = {
|
||||
"__kmp", "llvm.", "nvm.",
|
||||
"omp_", "vprintf", "malloc",
|
||||
"free", "__keep_alive", "__llvm_omp_vprintf"};
|
||||
"free", "__keep_alive", "__llvm_omp_vprintf",
|
||||
"rpc_"
|
||||
};
|
||||
|
||||
bool isInternalFunction(Function &F) {
|
||||
auto Name = F.getName();
|
||||
@@ -150,8 +152,7 @@ class HostRPC {
|
||||
SmallVector<Function *> HostEntryTable;
|
||||
|
||||
EnumeratedArray<Function *, HostRPCRuntimeFunction,
|
||||
HostRPCRuntimeFunction::OMPRTL___last>
|
||||
RFIs;
|
||||
HostRPCRuntimeFunction::OMPRTL___last> RFIs;
|
||||
|
||||
SmallVector<std::pair<CallInst *, CallInst *>> CallInstMap;
|
||||
|
||||
@@ -228,21 +229,31 @@ public:
|
||||
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...) \
|
||||
{ \
|
||||
SmallVector<Type *> Params{__VA_ARGS__}; \
|
||||
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
|
||||
Function *F = (MOD).getFunction(#_ENUM); \
|
||||
if (!F) \
|
||||
if (!F) { \
|
||||
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
|
||||
F = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage, \
|
||||
#_ENUM, (MOD)); \
|
||||
} \
|
||||
RFIs[OMPRTL_##_ENUM] = F; \
|
||||
}
|
||||
// device functions:
|
||||
// get information about the functions that we are calling
|
||||
__OMP_RTL(__kmpc_host_rpc_get_desc, M, false, Int8PtrTy, Int32Ty, Int32Ty,
|
||||
Int8PtrTy)
|
||||
// add information about one of the arguments
|
||||
__OMP_RTL(__kmpc_host_rpc_add_arg, M, false, VoidTy, Int8PtrTy, Int64Ty,
|
||||
Int32Ty)
|
||||
// send the function call to the host and wait for its result
|
||||
__OMP_RTL(__kmpc_host_rpc_send_and_wait, M, false, Int64Ty, Int8PtrTy)
|
||||
|
||||
// host functions:
|
||||
// get arguments (mirror of add arg)
|
||||
__OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
|
||||
// set the return value
|
||||
__OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
|
||||
Int64Ty)
|
||||
// Invoke the function on the host
|
||||
__OMP_RTL(__kmpc_host_rpc_invoke_host_wrapper, HM, false, VoidTy, Int32Ty,
|
||||
Int8PtrTy)
|
||||
#undef __OMP_RTL
|
||||
@@ -298,6 +309,10 @@ Value *HostRPC::convertFromInt64TyTo(Value *V, Type *T) {
|
||||
return Builder.CreateBitCast(V, T);
|
||||
}
|
||||
|
||||
|
||||
LLVM_DEBUG(dbgs() << "[HostRPC] unknown type " << *T
|
||||
<< " for typeFromint64_t.\n";);
|
||||
|
||||
llvm_unreachable("unknown cast from int64_t");
|
||||
}
|
||||
|
||||
@@ -310,39 +325,40 @@ Constant *HostRPC::convertToInt64Ty(Constant *C) {
|
||||
if (T->isPointerTy())
|
||||
return ConstantExpr::getPtrToInt(C, Int64Ty);
|
||||
|
||||
if (T->isIntegerTy())
|
||||
llvm_unreachable("I don't know how to fixe this");
|
||||
//return ConstantExpr::getIntegerCast(C, Int64Ty, /* isSigned */ true);
|
||||
if (T->isIntegerTy()) {
|
||||
return ConstantFoldIntegerCast(C, Int64Ty, true, DL);
|
||||
}
|
||||
|
||||
if (T->isFloatingPointTy()) {
|
||||
// TODO: FIXEME getIntegerCast is hard to implement with new version of ConstExpr
|
||||
//C = ConstantExpr::getBitCast(
|
||||
// C, Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()));
|
||||
//return ConstantExpr::getIntegerCast(C, Int64Ty, /* isSigned */ true);
|
||||
llvm_unreachable("unsuported cast from float to int64_t");
|
||||
// cast to an int of the same size
|
||||
C = ConstantExpr::getBitCast(C,
|
||||
Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()));
|
||||
// set the int of size 64
|
||||
return ConstantFoldIntegerCast(C, Int64Ty, true, DL);
|
||||
}
|
||||
|
||||
llvm_unreachable("unknown cast to int64_t");
|
||||
}
|
||||
|
||||
Constant *HostRPC::convertFromInt64TyTo(Constant *C, Type *T) {
|
||||
assert(C->getType() == Int64Ty);
|
||||
|
||||
if (T == Int64Ty)
|
||||
return C;
|
||||
|
||||
if (T->isPointerTy())
|
||||
return ConstantExpr::getIntToPtr(C, T);
|
||||
|
||||
if (T->isIntegerTy())
|
||||
llvm_unreachable("I don't know how to fixe this");
|
||||
//return ConstantExpr::getIntegerCast(C, T, /* isSigned */ true);
|
||||
if (T->isIntegerTy()) {
|
||||
return ConstantFoldIntegerCast(C, T, true, DL);
|
||||
}
|
||||
|
||||
if (T->isFloatingPointTy()) {
|
||||
// TODO: FIXEME getIntegerCast is hard to implement with new version of ConstExpr
|
||||
//C = ConstantExpr::getIntegerCast(
|
||||
// C, Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()),
|
||||
// /* isSigned */ true);
|
||||
//return ConstantExpr::getBitCast(C, T);
|
||||
llvm_unreachable("unsuported cast from int64_t to float");
|
||||
// change size to T size
|
||||
C = ConstantFoldIntegerCast(C,
|
||||
Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()), true, DL);
|
||||
// from int to float
|
||||
return ConstantExpr::getBitCast(C, T);
|
||||
}
|
||||
|
||||
llvm_unreachable("unknown cast from int64_t");
|
||||
@@ -382,6 +398,10 @@ bool HostRPC::recollectInformation() {
|
||||
if (F.use_empty())
|
||||
continue;
|
||||
|
||||
LLVM_DEBUG({
|
||||
dbgs() << "[HostRPC] RPCing function: " << F.getName() << "\n"
|
||||
<< F << "\n";
|
||||
});
|
||||
FunctionWorkList.insert(&F);
|
||||
}
|
||||
|
||||
@@ -391,11 +411,14 @@ bool HostRPC::recollectInformation() {
|
||||
bool HostRPC::run() {
|
||||
bool Changed = false;
|
||||
|
||||
LLVM_DEBUG(dbgs() << "[HostRPC] Running Pass\n");
|
||||
|
||||
if (!recollectInformation())
|
||||
return Changed;
|
||||
|
||||
Changed = true;
|
||||
|
||||
|
||||
// We add a couple of assumptions to those RPC functions such that AAs will
|
||||
// not error out because of unknown implementation of those functions.
|
||||
for (Function &F : M) {
|
||||
@@ -424,7 +447,7 @@ bool HostRPC::run() {
|
||||
}
|
||||
}
|
||||
|
||||
LLVM_DEBUG(M.dump());
|
||||
//LLVM_DEBUG(M.dump());
|
||||
|
||||
registerAAs();
|
||||
|
||||
@@ -438,6 +461,7 @@ bool HostRPC::run() {
|
||||
if (!Changed)
|
||||
return Changed;
|
||||
|
||||
// Replace every call to the function with a call to the RPC wrapper that replaces it.
|
||||
for (auto Itr = CallInstMap.rbegin(); Itr != CallInstMap.rend(); ++Itr) {
|
||||
auto *CI = Itr->first;
|
||||
auto *NewCI = Itr->second;
|
||||
@@ -445,6 +469,7 @@ bool HostRPC::run() {
|
||||
CI->eraseFromParent();
|
||||
}
|
||||
|
||||
// erase all trace of the function in the Module
|
||||
for (Function *F : FunctionWorkList)
|
||||
if (F->user_empty())
|
||||
F->eraseFromParent();
|
||||
@@ -556,16 +581,21 @@ bool HostRPC::rewriteWithHostRPC(Function *F) {
|
||||
|
||||
Value *Operand = CI->getArgOperand(I);
|
||||
|
||||
LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Argument: " << I << ": " << *Operand << "\n"; });
|
||||
|
||||
// Check if scalar type.
|
||||
if (!Operand->getType()->isPointerTy()) {
|
||||
AII.emplace_back();
|
||||
HandleDirectUse(Operand, AII.back());
|
||||
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
|
||||
LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Constant: " << *Operand << "\n"; });
|
||||
continue;
|
||||
}
|
||||
|
||||
if (CheckIfNullPtr(Operand))
|
||||
if (CheckIfNullPtr(Operand)){
|
||||
LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Null Ptr: " << *Operand << "\n"; });
|
||||
continue;
|
||||
}
|
||||
|
||||
auto Pred = [&](Value &Obj) {
|
||||
if (CheckIfNullPtr(&Obj))
|
||||
@@ -592,6 +622,7 @@ bool HostRPC::rewriteWithHostRPC(Function *F) {
|
||||
: ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM);
|
||||
} else if (CheckIfDynAlloc(&Obj)) {
|
||||
// We will handle this case at runtime so here we don't do anything.
|
||||
LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Dynamic Alloc: " << *Operand << "\n"; });
|
||||
return true;
|
||||
} else if (isa<AllocaInst>(&Obj)) {
|
||||
llvm_unreachable("alloca instruction needs to be handled!");
|
||||
@@ -607,9 +638,24 @@ bool HostRPC::rewriteWithHostRPC(Function *F) {
|
||||
return true;
|
||||
};
|
||||
|
||||
auto &AAUO = *A.getOrCreateAAFor<AAUnderlyingObjects>(
|
||||
IRPosition::callsite_argument(*CI, I), nullptr, DepClassTy::NONE);
|
||||
if (!AAUO.forallUnderlyingObjects(Pred))
|
||||
LLVM_DEBUG({
|
||||
dbgs() << "[HostRPC] function rewrite:\n"
|
||||
<< "Function: " << *F << "\n"
|
||||
<< "Call site: " << *CI << "\n "
|
||||
<< "Operand: " << *Operand << "\n";
|
||||
});
|
||||
|
||||
// TODO replace with LLVM functions to not use Attributors.
|
||||
assert(!IRPosition::callsite_argument(*CI, I)
|
||||
.getAnchorScope()->hasFnAttribute(Attribute::OptimizeNone)
|
||||
&& "[HostRPC]: Optimize None is not supported");
|
||||
|
||||
const llvm::AAUnderlyingObjects* AAUO =
|
||||
A.getOrCreateAAFor<AAUnderlyingObjects>(
|
||||
IRPosition::callsite_argument(*CI, I));
|
||||
|
||||
LLVM_DEBUG({dbgs() << "[HostRPC] AAUO:" << AAUO << "\n";});
|
||||
if (!AAUO->forallUnderlyingObjects(Pred))
|
||||
llvm_unreachable("internal error");
|
||||
}
|
||||
|
||||
@@ -625,22 +671,27 @@ bool HostRPC::rewriteWithHostRPC(Function *F) {
|
||||
Value *Next = NullPtr;
|
||||
for (auto &AI : AII) {
|
||||
Value *AIV = Builder.CreateAlloca(ArgInfoTy);
|
||||
|
||||
Value *AIIArg =
|
||||
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(0)});
|
||||
Builder.Insert(AIIArg);
|
||||
Builder.CreateStore(convertToInt64Ty(AI.BasePtr), AIIArg);
|
||||
|
||||
Value *AIIType =
|
||||
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(1)});
|
||||
Builder.Insert(AIIType);
|
||||
Builder.CreateStore(AI.Type, AIIType);
|
||||
|
||||
Value *AIISize =
|
||||
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(2)});
|
||||
Builder.Insert(AIISize);
|
||||
Builder.CreateStore(AI.Size, AIISize);
|
||||
|
||||
Value *AIINext =
|
||||
GetElementPtrInst::Create(Int8PtrTy, AIV, {getConstantInt64(3)});
|
||||
Builder.Insert(AIINext);
|
||||
Builder.CreateStore(Next, AIINext);
|
||||
|
||||
Next = AIV;
|
||||
}
|
||||
Value *AIIV = GetElementPtrInst::Create(Int8PtrTy, ArgInfoVal,
|
||||
@@ -659,16 +710,25 @@ bool HostRPC::rewriteWithHostRPC(Function *F) {
|
||||
cast<Constant>(AI.Size), Last});
|
||||
auto *GV = new GlobalVariable(
|
||||
M, ArgInfoTy, /* isConstant */ true,
|
||||
GlobalValue::LinkageTypes::InternalLinkage, CS);
|
||||
GlobalValue::LinkageTypes::InternalLinkage, CS, "",
|
||||
nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 0);
|
||||
// force address space 0 on AMD GPU
|
||||
// instead of address space 1 for globals
|
||||
Last = GV;
|
||||
}
|
||||
ArgInfoInitVar.push_back(Last);
|
||||
LLVM_DEBUG({
|
||||
dbgs() << "[HostRPC] ArgInfoInitVar:" << *Last << "\n";
|
||||
});
|
||||
ArgInfoInitVar.push_back(Last);
|
||||
}
|
||||
|
||||
|
||||
Constant *ArgInfoInit = ConstantArray::get(
|
||||
ArrayType::get(Int8PtrTy, NumArgs), ArgInfoInitVar);
|
||||
ArgInfoVal = new GlobalVariable(
|
||||
M, ArrayType::get(Int8PtrTy, NumArgs), /* isConstant */ true,
|
||||
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info");
|
||||
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info",
|
||||
nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 0);
|
||||
}
|
||||
|
||||
SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
|
||||
@@ -710,16 +770,24 @@ Function *HostRPC::getDeviceWrapperFunction(StringRef WrapperName, Function *F,
|
||||
Value *Desc = nullptr;
|
||||
{
|
||||
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
|
||||
LLVM_DEBUG({dbgs() << "[HostRPC] Building: rpc get desc: " << *Fn << "\n"; });
|
||||
for (unsigned i = 0; i < 3; ++i)
|
||||
LLVM_DEBUG({dbgs() << "ParamI: " << *(Fn->getFunctionType()->getParamType(i)) << "\n"; });
|
||||
|
||||
Desc = Builder.CreateCall(
|
||||
Fn,
|
||||
{WrapperFn->getArg(0),
|
||||
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
|
||||
WrapperFn->getArg(1)},
|
||||
"desc");
|
||||
Fn,
|
||||
{
|
||||
WrapperFn->getArg(0),
|
||||
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
|
||||
WrapperFn->getArg(1)
|
||||
},
|
||||
"desc"
|
||||
);
|
||||
}
|
||||
|
||||
{
|
||||
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
|
||||
LLVM_DEBUG({dbgs() << "[HostRPC] Building: rpc add arg\n"; });
|
||||
for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
|
||||
Value *V = convertToInt64Ty(WrapperFn->getArg(I));
|
||||
Builder.CreateCall(
|
||||
@@ -727,6 +795,7 @@ Function *HostRPC::getDeviceWrapperFunction(StringRef WrapperName, Function *F,
|
||||
}
|
||||
}
|
||||
|
||||
LLVM_DEBUG({dbgs() << "[HostRPC] Building: rpc send and wait\n"; });
|
||||
Value *RetVal =
|
||||
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});
|
||||
|
||||
@@ -740,6 +809,8 @@ Function *HostRPC::getDeviceWrapperFunction(StringRef WrapperName, Function *F,
|
||||
|
||||
Builder.CreateRet(RetVal);
|
||||
|
||||
LLVM_DEBUG({dbgs() << "[HostRPC] Device Wrapper Function:\n" << *WrapperFn; });
|
||||
|
||||
return WrapperFn;
|
||||
}
|
||||
|
||||
|
||||
@@ -102,7 +102,11 @@ set(src_files
|
||||
${source_directory}/Workshare.cpp
|
||||
)
|
||||
|
||||
# WarpAllocator.cpp is missing from this list
|
||||
if (LIBOMPTARGET_DEVICE_BUILTIN_ALLOCATOR)
|
||||
# Use the allocator already built into DeviceRTL instead of the GPUFirst one,
|
||||
# which does not support RPC calls whose arguments point to GPU memory,
|
||||
# as allocation information is not saved.
|
||||
list(APPEND src_files ${source_directory}/BuiltinAllocator.cpp)
|
||||
elseif (LIBOMPTARGET_GENERIC_ALLOCATOR)
|
||||
list(APPEND src_files ${source_directory}/GenericAllocator.cpp)
|
||||
@@ -110,6 +114,7 @@ else()
|
||||
list(APPEND src_files ${source_directory}/TeamAllocator.cpp)
|
||||
endif()
|
||||
|
||||
|
||||
# We disable the slp vectorizer during the runtime optimization to avoid
|
||||
# vectorized accesses to the shared state. Generally, those are "good" but
|
||||
# the optimizer pipeline (esp. Attributor) does not fully support vectorized
|
||||
@@ -139,8 +144,10 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
|
||||
-I${include_directory}
|
||||
-I${devicertl_base_directory}/../include
|
||||
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
|
||||
-I${CMAKE_SOURCE_DIR}/../libc/include
|
||||
)
|
||||
|
||||
|
||||
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_DEBUG)
|
||||
list(APPEND bc_flags "-DHOSTRPC_DEBUG")
|
||||
endif()
|
||||
@@ -316,6 +323,7 @@ set_target_properties(omptarget.devicertl PROPERTIES
|
||||
ARCHIVE_OUTPUT_DIRECTORY "${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}"
|
||||
LINKER_LANGUAGE CXX
|
||||
)
|
||||
|
||||
target_link_libraries(omptarget.devicertl PRIVATE omptarget.devicertl.all_objs)
|
||||
|
||||
install(TARGETS omptarget.devicertl ARCHIVE DESTINATION ${OFFLOAD_INSTALL_LIBDIR})
|
||||
|
||||
@@ -14,10 +14,6 @@
|
||||
|
||||
#include "Types.h"
|
||||
|
||||
struct FILE;
|
||||
extern FILE *stdin;
|
||||
extern FILE *stdout;
|
||||
extern FILE *stderr;
|
||||
|
||||
#ifndef _ASM_GENERIC_ERRNO_BASE_H
|
||||
#define _ASM_GENERIC_ERRNO_BASE_H
|
||||
|
||||
@@ -18,6 +18,8 @@
|
||||
#include "Utils.h"
|
||||
|
||||
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
|
||||
#include "llvm-libc-types/rpc_opcodes_t.h"
|
||||
#include "llvm-libc-types/rpc_port_t.h"
|
||||
|
||||
#ifdef HOSTRPC_DEBUG
|
||||
#define DEBUG_PREFIX "host-rpc-device"
|
||||
@@ -32,15 +34,6 @@ using namespace hostrpc;
|
||||
|
||||
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
|
||||
|
||||
Descriptor *omptarget_hostrpc_descriptor
|
||||
__attribute__((used, retain, weak, visibility("protected")));
|
||||
int32_t *omptarget_hostrpc_futex
|
||||
__attribute__((used, retain, weak, visibility("protected")));
|
||||
char *omptarget_hostrpc_memory_buffer
|
||||
__attribute__((used, retain, weak, visibility("protected")));
|
||||
size_t omptarget_hostrpc_memory_buffer_size
|
||||
__attribute__((used, retain, weak, visibility("protected")));
|
||||
|
||||
#ifdef HOSTRPC_PROFILING
|
||||
int32_t HostRPCId;
|
||||
double GetDescStart;
|
||||
@@ -53,33 +46,21 @@ double CopyBackStart;
|
||||
double CopyBackEnd;
|
||||
#endif
|
||||
|
||||
|
||||
// libc rpc functions forward declare:
|
||||
// TODO: replace when a proper header exposing device functions is created
|
||||
extern "C" {
|
||||
rpc_port_t rpc_open_port(rpc_opcode_t);
|
||||
void rpc_send_n(rpc_port_t *handle, const void *src, size_t size);
|
||||
void rpc_recv_n(rpc_port_t *handle, void *dst, size_t *size);
|
||||
void rpc_close_port(rpc_port_t *handle);
|
||||
}
|
||||
|
||||
|
||||
namespace {
|
||||
size_t HostRPCMemoryBufferCurrentPosition = 0;
|
||||
constexpr const size_t Alignment = 16;
|
||||
|
||||
// FIXME: For now we only allow one thread requesting host RPC.
|
||||
mutex::TicketLock HostRPCLock;
|
||||
|
||||
void *HostRPCMemAlloc(size_t Size) {
|
||||
Size = utils::align_up(Size, Alignment);
|
||||
|
||||
if (Size + HostRPCMemoryBufferCurrentPosition <
|
||||
omptarget_hostrpc_memory_buffer_size) {
|
||||
void *R =
|
||||
omptarget_hostrpc_memory_buffer + HostRPCMemoryBufferCurrentPosition;
|
||||
atomic::add(&HostRPCMemoryBufferCurrentPosition, Size, atomic::acq_rel);
|
||||
return R;
|
||||
}
|
||||
|
||||
printf("%s:%d\n", __FILE__, __LINE__);
|
||||
__builtin_trap();
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// For now we just reset the buffer.
|
||||
void HostRPCMemReset() { HostRPCMemoryBufferCurrentPosition = 0; }
|
||||
|
||||
static_assert(sizeof(intptr_t) == sizeof(int64_t), "pointer size not match");
|
||||
|
||||
struct HostRPCArgInfo {
|
||||
@@ -108,7 +89,7 @@ void *getMappedPointer(Descriptor *D, void *BasePtr, int64_t Size,
|
||||
return utils::advance(MapTable[I].MappedBasePtr, Offset);
|
||||
|
||||
MapTable[I].BasePtr = BasePtr;
|
||||
MapTable[I].MappedBasePtr = HostRPCMemAlloc(Size);
|
||||
MapTable[I].MappedBasePtr = malloc(Size);
|
||||
MapTable[I].Size = Size;
|
||||
MapTable[I].Kind = Kind;
|
||||
|
||||
@@ -140,10 +121,7 @@ void copybackIfNeeded(Descriptor *D) {
|
||||
extern "C" {
|
||||
__attribute__((noinline, used)) void *
|
||||
__kmpc_host_rpc_get_desc(int32_t CallId, int32_t NumArgs, void *ArgInfo) {
|
||||
assert(omptarget_hostrpc_descriptor && omptarget_hostrpc_futex &&
|
||||
"no host rpc pointer");
|
||||
|
||||
DP("device: stdin=%p, stdout=%p, stderr=%p\n", stdin, stdout, stderr);
|
||||
DP("get desc for request (id=%d), NumArgs=%d, ArgInfo=%p.\n", CallId, NumArgs,
|
||||
ArgInfo);
|
||||
#ifdef HOSTRPC_DEBUG
|
||||
@@ -154,15 +132,12 @@ __kmpc_host_rpc_get_desc(int32_t CallId, int32_t NumArgs, void *ArgInfo) {
|
||||
}
|
||||
#endif
|
||||
|
||||
HostRPCLock.lock();
|
||||
|
||||
#ifdef HOSTRPC_PROFILING
|
||||
HostRPCId = CallId;
|
||||
GetDescStart = omp_get_wtime();
|
||||
#endif
|
||||
|
||||
// TODO: change it after we support a queue-like data structure.
|
||||
Descriptor *D = omptarget_hostrpc_descriptor;
|
||||
Descriptor *D = (Descriptor *) malloc(sizeof(Descriptor));
|
||||
|
||||
D->Id = CallId;
|
||||
D->ArgInfo = reinterpret_cast<void **>(ArgInfo);
|
||||
@@ -170,8 +145,8 @@ __kmpc_host_rpc_get_desc(int32_t CallId, int32_t NumArgs, void *ArgInfo) {
|
||||
D->Status = EXEC_STAT_CREATED;
|
||||
D->ReturnValue = 0;
|
||||
D->Args =
|
||||
reinterpret_cast<Argument *>(HostRPCMemAlloc(sizeof(Argument) * NumArgs));
|
||||
D->ArgMap = HostRPCMemAlloc(sizeof(HostRPCPointerMapEntry) * NumArgs);
|
||||
reinterpret_cast<Argument *>(malloc(sizeof(Argument) * NumArgs));
|
||||
D->ArgMap = malloc(sizeof(HostRPCPointerMapEntry) * NumArgs);
|
||||
|
||||
assert(!NumArgs || (D->Args && D->ArgMap) && "out of host rpc memory!");
|
||||
|
||||
@@ -209,15 +184,6 @@ __kmpc_host_rpc_add_arg(void *Desc, int64_t ArgVal, int32_t ArgNum) {
|
||||
|
||||
void *ArgPtr = reinterpret_cast<void *>(ArgVal);
|
||||
|
||||
if (ArgPtr == stdin || ArgPtr == stdout || ArgPtr == stderr) {
|
||||
ArgInDesc.Value = ArgVal;
|
||||
ArgInDesc.ArgType = Type::ARG_POINTER;
|
||||
|
||||
DP("arg (no=%d) is stdin/stdout/stderr, done.\n", ArgNum);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const auto *AI = reinterpret_cast<HostRPCArgInfo *>(D->ArgInfo[ArgNum]);
|
||||
|
||||
DP("try to find arg (no=%d) from args AI=%p\n", ArgNum, AI);
|
||||
@@ -299,7 +265,7 @@ __kmpc_host_rpc_add_arg(void *Desc, int64_t ArgVal, int32_t ArgNum) {
|
||||
|
||||
__attribute__((noinline, used)) int64_t
|
||||
__kmpc_host_rpc_send_and_wait(void *Desc) {
|
||||
auto *D = reinterpret_cast<Descriptor *>(Desc);
|
||||
Descriptor *D = reinterpret_cast<Descriptor *>(Desc);
|
||||
int32_t Id = D->Id;
|
||||
|
||||
#ifdef HOSTRPC_PROFILING
|
||||
@@ -307,22 +273,61 @@ __kmpc_host_rpc_send_and_wait(void *Desc) {
|
||||
IssueAndWaitStart = omp_get_wtime();
|
||||
#endif
|
||||
|
||||
atomic::add(omptarget_hostrpc_futex, 1U, atomic::acq_rel);
|
||||
|
||||
// A system fence is required to make sure futex on the host is also
|
||||
// updated if USM is supported.
|
||||
fence::system(atomic::seq_cst);
|
||||
|
||||
DP("sent request (id=%d) to host. waiting for finish.\n", Id);
|
||||
// // WORKING back & forth of an uint64_t
|
||||
//
|
||||
// printf("[HostRPC] [Device]: Start \n");
|
||||
//
|
||||
// rpc_port_t port = rpc_open_port(RPC_GPUFIRST);
|
||||
//
|
||||
// uint64_t size_send = sizeof(uint64_t);
|
||||
// void *buf_send = malloc(size_send);
|
||||
// *((uint64_t *) buf_send) = 123456789;
|
||||
//
|
||||
// printf("[Hostrpc] [Device] [SEND]: %lu\n", *((uint64_t *) buf_send));
|
||||
// printf("[HostRPC] [Device] [SEND] Size: %lu\n", size_send);
|
||||
//
|
||||
// rpc_send_n(&port, buf_send, size_send);
|
||||
//
|
||||
//
|
||||
// uint64_t size_recv = sizeof(uint64_t);
|
||||
// void *buf_recv = malloc(size_recv);
|
||||
//
|
||||
// rpc_recv_n(&port, buf_recv, &size_recv);
|
||||
//
|
||||
// printf("[HostRPC] [Device] [RECV]: %lu\n", *((uint64_t *) buf_recv));
|
||||
// printf("[HostRPC] [Device] [RECV] Size: %lu\n", size_recv);
|
||||
//
|
||||
// rpc_close_port(&port);
|
||||
//
|
||||
// assert(size_send == size_recv);
|
||||
//
|
||||
// printf("[HostRPC] [Device]: End \n");
|
||||
//
|
||||
// // END of working part
|
||||
|
||||
unsigned NS = 8;
|
||||
|
||||
while (atomic::addSys(omptarget_hostrpc_futex, 0)) {
|
||||
asm volatile("nanosleep.u32 %0;" : : "r"(NS));
|
||||
// if (NS < 64)
|
||||
// NS *= 2;
|
||||
// fence::system(atomic::seq_cst);
|
||||
}
|
||||
rpc_port_t port = rpc_open_port(RPC_GPUFIRST);
|
||||
|
||||
Argument *Args = D->Args;
|
||||
|
||||
rpc_send_n(&port, D, sizeof(Descriptor));
|
||||
rpc_send_n(&port, Args, sizeof(Argument) * D->NumArgs);
|
||||
|
||||
// CPU is calling the function here
|
||||
|
||||
// unused: the received sizes are not checked
|
||||
uint64_t size_recv = 0;
|
||||
|
||||
rpc_recv_n(&port, D, &size_recv);
|
||||
rpc_recv_n(&port, Args, &size_recv);
|
||||
|
||||
D->Args = Args;
|
||||
|
||||
(void) size_recv;
|
||||
rpc_close_port(&port);
|
||||
|
||||
|
||||
#ifdef HOSTRPC_PROFILING
|
||||
IssueAndWaitEnd = omp_get_wtime();
|
||||
@@ -348,11 +353,14 @@ __kmpc_host_rpc_send_and_wait(void *Desc) {
|
||||
CopyBackEnd = omp_get_wtime();
|
||||
#endif
|
||||
|
||||
HostRPCMemReset();
|
||||
|
||||
// We can unlock now as we already get all temporary part.
|
||||
// TODO: If we have a queue, we don't need this step.
|
||||
HostRPCLock.unlock();
|
||||
// free memory allocated for the call
|
||||
HostRPCPointerMapEntry *MapTable = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
|
||||
for(int i = 0; i < D->NumArgs && MapTable[i].BasePtr; ++i){
|
||||
free(MapTable[i].MappedBasePtr);
|
||||
}
|
||||
free(D->Args);
|
||||
free(D->ArgMap);
|
||||
free(D);
|
||||
|
||||
DP("request (id=%d) is done with return code=%lx.\n", Id, Ret);
|
||||
|
||||
@@ -399,7 +407,7 @@ __kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
|
||||
ArgInfoArray[4].Size = sizeof(void *) * nargs;
|
||||
void *Args = nullptr;
|
||||
if (nargs) {
|
||||
Args = HostRPCMemAlloc(ArgInfoArray[4].Size);
|
||||
Args = malloc(ArgInfoArray[4].Size);
|
||||
__builtin_memcpy(Args, args, ArgInfoArray[4].Size);
|
||||
}
|
||||
ArgInfoArray[4].BasePtr = Args;
|
||||
|
||||
@@ -45,8 +45,8 @@ struct Descriptor {
|
||||
int32_t Id;
|
||||
struct Argument *Args;
|
||||
int64_t NumArgs;
|
||||
volatile int64_t Status;
|
||||
volatile int64_t ReturnValue;
|
||||
int64_t Status;
|
||||
int64_t ReturnValue;
|
||||
|
||||
// The following members will only be used by device.
|
||||
void **ArgInfo;
|
||||
|
||||
@@ -15,12 +15,45 @@
|
||||
#if defined(LIBOMPTARGET_RPC_SUPPORT)
|
||||
#include "llvm-libc-types/rpc_opcodes_t.h"
|
||||
#include "llvmlibc_rpc_server.h"
|
||||
|
||||
#include "HostRPC.h"
|
||||
#include "llvm/Support/DynamicLibrary.h"
|
||||
#endif
|
||||
|
||||
using namespace llvm;
|
||||
using namespace omp;
|
||||
using namespace target;
|
||||
|
||||
#ifdef LIBOMPTARGET_RPC_SUPPORT
|
||||
// GPUFirst Host Function Wrapper Invoker
|
||||
class HostRPCInvokerWrapper {
|
||||
void (*Invoker)(int32_t, void *) = nullptr;
|
||||
std::unique_ptr<sys::DynamicLibrary> DL;
|
||||
std::once_flag Flag;
|
||||
|
||||
void initInvoker() {
|
||||
std::string ErrMsg;
|
||||
DL = std::make_unique<sys::DynamicLibrary>(
|
||||
sys::DynamicLibrary::getPermanentLibrary(nullptr, &ErrMsg));
|
||||
|
||||
assert(DL->isValid() && "invalid DL");
|
||||
*((void **)&Invoker) =
|
||||
DL->getAddressOfSymbol("__kmpc_host_rpc_invoke_host_wrapper");
|
||||
assert(Invoker && "Invoker is nullptr");
|
||||
}
|
||||
|
||||
public:
|
||||
void invoke(int32_t CallNo, void *Desc) {
|
||||
std::call_once(Flag, &HostRPCInvokerWrapper::initInvoker, this);
|
||||
Invoker(CallNo, Desc);
|
||||
}
|
||||
};
|
||||
|
||||
HostRPCInvokerWrapper *Invoker;
|
||||
// GPUFirst END
|
||||
#endif
|
||||
|
||||
|
||||
RPCServerTy::RPCServerTy(plugin::GenericPluginTy &Plugin)
|
||||
: Handles(Plugin.getNumDevices()) {}
|
||||
|
||||
@@ -89,6 +122,75 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
|
||||
"Failed to register RPC free handler for device %d: %d\n",
|
||||
Device.getDeviceId(), Err);
|
||||
|
||||
// GPUFirst
|
||||
// Register custom opcode handler for gpu first
|
||||
auto GPUFirstHandler = [](rpc_port_t port, void *Data) {
|
||||
|
||||
// printf("[HostRPC] [Host]: GPUFirstHandler\n");
|
||||
// // WORKING back & forth of an uint64_t
|
||||
//
|
||||
// printf("[HostRPC] [Host]: Start \n");
|
||||
//
|
||||
// uint64_t size_recv = 0;
|
||||
// void *buf_recv = nullptr;
|
||||
//
|
||||
// rpc_recv_n(port, &buf_recv, &size_recv,
|
||||
// [](uint64_t size, void* data){ return malloc(size); }, nullptr);
|
||||
//
|
||||
// printf("[HostRPC] [Host] [RECV]: %lu\n", *((uint64_t *) buf_recv));
|
||||
// printf("[HostRPC] [Host] [RECV] Size: %lu\n", size_recv);
|
||||
//
|
||||
// uint64_t size_send = sizeof(uint64_t);
|
||||
// void *buf_send = malloc(size_send);
|
||||
// *((uint64_t *) buf_send) = 987654321;
|
||||
//
|
||||
// printf("[Hostrpc] [Host] [SEND]: %lu\n", *((uint64_t *) buf_send));
|
||||
// printf("[HostRPC] [Host] [SEND] Size: %lu\n", size_send);
|
||||
//
|
||||
// rpc_send_n(port, &buf_send, &size_send);
|
||||
//
|
||||
// printf("[HostRPC] [Host]: End \n");
|
||||
//
|
||||
// // END of working part
|
||||
|
||||
auto _rpc_recv_n = [](rpc_port_t *handle, void **dst, size_t *size){
|
||||
rpc_recv_n(*handle, dst, size,
|
||||
[](uint64_t size, void* data){ return malloc(size); },
|
||||
nullptr);
|
||||
};
|
||||
auto _rpc_send_n = [](rpc_port_t *handle, void *src, size_t size){
|
||||
rpc_send_n(*handle, &src, &size);
|
||||
};
|
||||
|
||||
|
||||
uint64_t size_recv = 0;
|
||||
|
||||
hostrpc::Descriptor *D = nullptr;
|
||||
hostrpc::Argument *Args = nullptr;
|
||||
|
||||
_rpc_recv_n(&port, reinterpret_cast<void **>(&D), &size_recv);
|
||||
_rpc_recv_n(&port, reinterpret_cast<void **>(&Args), &size_recv);
|
||||
|
||||
D->Args = Args;
|
||||
|
||||
if(Invoker == nullptr)
|
||||
Invoker = new HostRPCInvokerWrapper();
|
||||
Invoker->invoke(D->Id, D);
|
||||
|
||||
_rpc_send_n(&port, D, sizeof(hostrpc::Descriptor));
|
||||
_rpc_send_n(&port, D->Args, sizeof(hostrpc::Argument) * D->NumArgs);
|
||||
|
||||
free(D->Args);
|
||||
free(D);
|
||||
|
||||
};
|
||||
if (rpc_status_t Err =
|
||||
rpc_register_callback(RPCDevice, RPC_GPUFIRST, GPUFirstHandler, &Invoker))
|
||||
return plugin::Plugin::error(
|
||||
"Failed to register RPC GPU First handler for device %d: %d\n", Device.getDeviceId(),
|
||||
Err);
|
||||
// GPUFirst END
|
||||
|
||||
// Get the address of the RPC client from the device.
|
||||
void *ClientPtr;
|
||||
plugin::GlobalTy ClientGlobal(rpc_client_symbol_name, sizeof(void *));
|
||||
|
||||
Reference in New Issue
Block a user