Compare commits

...

1 Commit

Author       SHA1        Message                                                       Date
Shilei Tian  92fc907ea7  [OpenMP] Add the initial support for direct gpu compilation  2023-06-08 22:48:46 -04:00
59 changed files with 5897 additions and 196 deletions

auto-host-rpc/device.ll (new file, 65 lines)
View File

@@ -0,0 +1,65 @@
; ModuleID = 'test-openmp-nvptx64-nvidia-cuda-sm_75.bc'
source_filename = "/home/shiltian/Documents/vscode/llvm-project/auto-host-rpc/test.c"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.ddd = type { i32, i32, float }
@__omp_rtl_debug_kind = weak_odr hidden local_unnamed_addr constant i32 0
@__omp_rtl_assume_teams_oversubscription = weak_odr hidden local_unnamed_addr constant i32 0
@__omp_rtl_assume_threads_oversubscription = weak_odr hidden local_unnamed_addr constant i32 0
@__omp_rtl_assume_no_thread_state = weak_odr hidden local_unnamed_addr constant i32 0
@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden local_unnamed_addr constant i32 0
@.str = private unnamed_addr constant [9 x i8] c"main.cpp\00", align 1
@.str1 = private unnamed_addr constant [2 x i8] c"r\00", align 1
@.str2 = private unnamed_addr constant [3 x i8] c"%d\00", align 1
@.str3 = private unnamed_addr constant [7 x i8] c"%f%d%s\00", align 1
@.str4 = private unnamed_addr constant [6 x i8] c"hello\00", align 1
; Function Attrs: convergent nounwind
define hidden void @foo() local_unnamed_addr #0 {
entry:
%d = tail call align 16 dereferenceable_or_null(12) ptr @__kmpc_alloc_shared(i64 12) #4
%call = tail call noalias ptr @fopen(ptr noundef nonnull @.str, ptr noundef nonnull @.str1) #5
%call1 = tail call i32 (ptr, ptr, ...) @fprintf(ptr noundef %call, ptr noundef nonnull @.str2, i32 noundef 6) #5
%call2 = tail call i32 (ptr, ptr, ...) @fprintf(ptr noundef %call, ptr noundef nonnull @.str3, double noundef 6.000000e+00, i32 noundef 1, ptr noundef nonnull @.str4) #5
%a = getelementptr inbounds %struct.ddd, ptr %d, i64 0, i32 1
%call3 = tail call i32 (ptr, ptr, ...) @fscanf(ptr noundef %call, ptr noundef nonnull @.str2, ptr noundef nonnull %a) #5
tail call void @__kmpc_free_shared(ptr %d, i64 12)
ret void
}
; Function Attrs: nofree nosync nounwind allocsize(0)
declare ptr @__kmpc_alloc_shared(i64) local_unnamed_addr #1
; Function Attrs: convergent
declare noalias ptr @fopen(ptr noundef, ptr noundef) local_unnamed_addr #2
; Function Attrs: convergent
declare i32 @fprintf(ptr noundef, ptr noundef, ...) local_unnamed_addr #2
; Function Attrs: convergent
declare i32 @fscanf(ptr noundef, ptr noundef, ...) local_unnamed_addr #2
; Function Attrs: nosync nounwind
declare void @__kmpc_free_shared(ptr allocptr nocapture, i64) local_unnamed_addr #3
attributes #0 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx77,+sm_75" }
attributes #1 = { nofree nosync nounwind allocsize(0) }
attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx77,+sm_75" }
attributes #3 = { nosync nounwind }
attributes #4 = { nounwind }
attributes #5 = { convergent nounwind }
!llvm.module.flags = !{!0, !1, !2, !3, !4, !5}
!llvm.ident = !{!6, !7}
!nvvm.annotations = !{}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 7]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 7, !"openmp", i32 50}
!3 = !{i32 7, !"openmp-device", i32 50}
!4 = !{i32 8, !"PIC Level", i32 2}
!5 = !{i32 7, !"frame-pointer", i32 2}
!6 = !{!"clang version 16.0.0"}
!7 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
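
Note: this is the pre-transformation device IR; fopen, fprintf, and fscanf are still bare declarations on an nvptx64 target that has no such library. For orientation, here is a hedged C++-level sketch of the device wrapper that the rewrite in main.cpp (next file) generates for the fopen call, following the device_wrapper protocol comment in AutoHostRPC. The __kmpc_host_rpc_* declarations are externs standing in for the runtime entry points this patch assumes:

#include <cstdint>

extern "C" {
// Stand-ins for the device runtime entry points assumed by this patch.
void *__kmpc_host_rpc_get_desc(int32_t call_no, int32_t num_args, void *arg_info);
void __kmpc_host_rpc_add_arg(void *desc, int64_t value, int64_t size);
void __kmpc_host_rpc_send_and_wait(void *desc);
int64_t __kmpc_host_rpc_get_ret_val(void *desc);
}

// Sketch of the generated device wrapper for fopen: every argument is packed
// into an i64, the descriptor is shipped to the host, and the i64 result is
// converted back to the callee's return type.
extern "C" void *__kmpc_host_rpc_wrapper_fopen(int32_t call_no, void *arg_info,
                                               const char *path,
                                               const char *mode) {
  void *desc = __kmpc_host_rpc_get_desc(call_no, /*num_args=*/2, arg_info);
  __kmpc_host_rpc_add_arg(desc, (int64_t)(uintptr_t)path, /*size=*/0);
  __kmpc_host_rpc_add_arg(desc, (int64_t)(uintptr_t)mode, /*size=*/0);
  __kmpc_host_rpc_send_and_wait(desc);
  return (void *)(uintptr_t)__kmpc_host_rpc_get_ret_val(desc);
}

Note that getDeviceWrapperFunction passes 0 as the size operand of every add_arg call; per-argument sizes travel separately in the arg_info table.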

auto-host-rpc/main.cpp (new file, 560 lines)
View File

@@ -0,0 +1,560 @@
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetOptions.h"
#include <string>
using namespace llvm;
namespace {
static LLVMContext Context;
static codegen::RegisterCodeGenFlags RCGF;
static constexpr const char *InternalPrefix[] = {"__kmp", "llvm.", "nvvm.",
"omp_"};
std::string typeToString(Type *T) {
if (T->is16bitFPTy())
return "f16";
if (T->isFloatTy())
return "f32";
if (T->isDoubleTy())
return "f64";
if (T->isPointerTy())
return "ptr";
if (T->isStructTy())
return std::string(T->getStructName());
if (T->isIntegerTy())
return "i" + std::to_string(T->getIntegerBitWidth());
llvm_unreachable("unknown type");
}
} // namespace
namespace llvm {
enum class HostRPCRuntimeFunction {
#define __OMPRTL_HOST_RPC(_ENUM) OMPRTL_##_ENUM
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_ret_val),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val),
__OMPRTL_HOST_RPC(__last),
#undef __OMPRTL_HOST_RPC
};
#define __OMPRTL_HOST_RPC(_ENUM) \
auto OMPRTL_##_ENUM = HostRPCRuntimeFunction::OMPRTL_##_ENUM;
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_ret_val)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val)
#undef __OMPRTL_HOST_RPC
enum OMPHostRPCArgType {
// No need to copy.
OMP_HOST_RPC_ARG_SCALAR = 0,
OMP_HOST_RPC_ARG_PTR = 1,
// Copy to host.
OMP_HOST_RPC_ARG_PTR_COPY_TO = 2,
// Copy to device
OMP_HOST_RPC_ARG_PTR_COPY_FROM = 3,
// TODO: Do we have a tofrom pointer?
OMP_HOST_RPC_ARG_PTR_COPY_TOFROM = 4,
};
// struct HostRPCArgInfo {
// // OMPHostRPCArgType
// int64_t Type;
// int64_t Size;
// };
class AutoHostRPC {
LLVMContext &Context;
// Device module
Module &DM;
// Host module
Module &HM;
// Types
Type *Int8PtrTy;
Type *VoidTy;
Type *Int32Ty;
Type *Int64Ty;
StructType *ArgInfoTy;
struct CallSiteInfo {
CallInst *CI = nullptr;
SmallVector<Type *> Params;
};
struct HostRPCArgInfo {
// OMPHostRPCArgType
Constant *Type;
Value *Size;
};
//
SmallVector<Function *> HostEntryTable;
EnumeratedArray<Function *, HostRPCRuntimeFunction,
HostRPCRuntimeFunction::OMPRTL___last>
RFIs;
static std::string getWrapperFunctionName(Function *F, CallSiteInfo &CSI) {
std::string Name = "__kmpc_host_rpc_wrapper_" + std::string(F->getName());
if (!F->isVarArg())
return Name;
for (unsigned I = F->getFunctionType()->getNumParams();
I < CSI.Params.size(); ++I) {
Name.push_back('_');
Name.append(typeToString(CSI.Params[I]));
}
return Name;
}
static bool isInternalFunction(Function &F) {
auto Name = F.getName();
for (auto P : InternalPrefix)
if (Name.startswith(P))
return true;
return false;
}
Value *convertToInt64Ty(IRBuilder<> &Builder, Value *V);
Value *convertFromInt64TyTo(IRBuilder<> &Builder, Value *V, Type *TargetTy);
// int device_wrapper(call_no, arg_info, ...) {
// void *desc = __kmpc_host_rpc_get_desc(call_no, num_args, arg_info);
// __kmpc_host_rpc_add_arg(desc, arg1, sizeof(arg1));
// __kmpc_host_rpc_add_arg(desc, arg2, sizeof(arg2));
// ...
// __kmpc_host_rpc_send_and_wait(desc);
// int r = (int)__kmpc_host_rpc_get_ret_val(desc);
// return r;
// }
Function *getDeviceWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
// void host_wrapper(desc) {
// int arg1 = (int)__kmpc_host_rpc_get_arg(desc, 0);
// float arg2 = (float)__kmpc_host_rpc_get_arg(desc, 1);
// char *arg3 = (char *)__kmpc_host_rpc_get_arg(desc, 2);
// ...
// int r = actual_call(arg1, arg2, arg3, ...);
// __kmpc_host_rpc_set_ret_val(desc, (int64_t)r);
// }
Function *getHostWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
bool rewriteWithHostRPC(Function *F);
public:
AutoHostRPC(Module &DeviceModule, Module &HostModule)
: Context(DeviceModule.getContext()), DM(DeviceModule), HM(HostModule) {
assert(&DeviceModule.getContext() == &HostModule.getContext() &&
"device and host modules have different contexts");
#define __OMP_TYPE(TYPE) TYPE = Type::get##TYPE(Context)
__OMP_TYPE(Int8PtrTy);
__OMP_TYPE(VoidTy);
__OMP_TYPE(Int32Ty);
__OMP_TYPE(Int64Ty);
#undef __OMP_TYPE
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...) \
{ \
SmallVector<Type *> Params{__VA_ARGS__}; \
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
RFIs[OMPRTL_##_ENUM] = Function::Create( \
FT, GlobalValue::LinkageTypes::InternalLinkage, #_ENUM, MOD); \
}
__OMP_RTL(__kmpc_host_rpc_get_desc, DM, false, Int8PtrTy, Int32Ty, Int32Ty,
Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_add_arg, DM, false, VoidTy, Int8PtrTy, Int64Ty,
Int64Ty)
__OMP_RTL(__kmpc_host_rpc_send_and_wait, DM, false, VoidTy, Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_get_ret_val, DM, false, Int64Ty, Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
__OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
Int64Ty)
#undef __OMP_RTL
ArgInfoTy = StructType::create({Int64Ty, Int64Ty}, "struct.arg_info_t");
}
bool run();
};
Value *AutoHostRPC::convertToInt64Ty(IRBuilder<> &Builder, Value *V) {
Type *T = V->getType();
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreatePtrToInt(V, Int64Ty);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, Int64Ty, false);
if (T->isFloatingPointTy()) {
if (T->isFloatTy()) {
// After FPToSI the value is an i32, so widen it with an integer cast;
// a second FPToSI on an integer operand would be invalid IR.
V = Builder.CreateFPToSI(V, Int32Ty);
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
}
return Builder.CreateFPToSI(V, Int64Ty);
}
llvm_unreachable("unknown cast to int64_t");
}
Value *AutoHostRPC::convertFromInt64TyTo(IRBuilder<> &Builder, Value *V,
Type *T) {
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreateIntToPtr(V, T);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, T, /* isSigned */ true);
if (T->isFloatingPointTy()) {
if (T->isFloatTy())
V = Builder.CreateIntCast(V, Int32Ty, /* isSigned */ true);
V = Builder.CreateSIToFP(V, T);
return V;
}
llvm_unreachable("unknown cast from int64_t");
}
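// Illustrative round trip under this encoding: a device-side float 3.5f is
// narrowed with fptosi to i32 3, widened to i64, and the host wrapper
// reconstructs it with sitofp as 3.0f, so the fractional part is dropped.
// A bitcast-based packing would be lossless; the symmetric FPToSI/SIToFP
// pair is simply what this prototype chose.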
bool AutoHostRPC::run() {
bool Changed = false;
SmallVector<Function *> WorkList;
for (Function &F : DM) {
// If the function is already defined, it definitely does not require RPC.
if (!F.isDeclaration())
continue;
// If it is an internal function, skip it as well.
if (isInternalFunction(F))
continue;
// If there is no use of the function, skip it.
if (F.use_empty())
continue;
WorkList.push_back(&F);
}
if (WorkList.empty())
return Changed;
for (Function *F : WorkList)
Changed |= rewriteWithHostRPC(F);
return Changed;
}
bool AutoHostRPC::rewriteWithHostRPC(Function *F) {
bool Changed = false;
SmallVector<CallInst *> WorkList;
for (User *U : F->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
WorkList.push_back(CI);
}
if (WorkList.empty())
return Changed;
for (CallInst *CI : WorkList) {
CallSiteInfo CSI;
CSI.CI = CI;
unsigned NumArgs = CI->arg_size();
for (unsigned I = 0; I < NumArgs; ++I)
CSI.Params.push_back(CI->getArgOperand(I)->getType());
std::string WrapperName = getWrapperFunctionName(F, CSI);
Function *DeviceWrapperFn = getDeviceWrapperFunction(WrapperName, F, CSI);
Function *HostWrapperFn = getHostWrapperFunction(WrapperName, F, CSI);
int32_t WrapperNumber = -1;
for (unsigned I = 0; I < HostEntryTable.size(); ++I) {
if (HostEntryTable[I] == HostWrapperFn) {
WrapperNumber = I;
break;
}
}
if (WrapperNumber == -1) {
WrapperNumber = HostEntryTable.size();
HostEntryTable.push_back(HostWrapperFn);
}
DataLayout DL = DM.getDataLayout();
IRBuilder<> Builder(CI);
auto CheckIfIdentifierPtr = [](const Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
// Indirect calls have no statically known callee.
Function *Callee = CI->getCalledFunction();
if (!Callee)
return false;
return Callee->getName().startswith("__kmpc_host_rpc_wrapper_");
};
auto CheckIfAlloca = [](const Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
Function *Callee = CI->getCalledFunction();
if (!Callee)
return false;
return Callee->getName() == "__kmpc_alloc_shared" ||
Callee->getName() == "malloc";
};
SmallVector<HostRPCArgInfo> ArgInfos;
bool IsConstantArgInfo = true;
for (Value *Op : CI->args()) {
if (!Op->getType()->isPointerTy()) {
HostRPCArgInfo AI{
ConstantInt::get(Int64Ty,
OMPHostRPCArgType::OMP_HOST_RPC_ARG_SCALAR),
ConstantInt::getNullValue(Int64Ty)};
ArgInfos.push_back(std::move(AI));
continue;
}
Value *SizeVal = nullptr;
OMPHostRPCArgType ArgType = OMP_HOST_RPC_ARG_PTR_COPY_TOFROM;
SmallVector<const Value *> Objects;
getUnderlyingObjects(Op, Objects);
// TODO: Handle phi node
if (Objects.size() != 1)
llvm_unreachable("we can't handle phi node yet");
auto *Obj = Objects.front();
if (CheckIfIdentifierPtr(Obj)) {
ArgType = OMP_HOST_RPC_ARG_SCALAR;
SizeVal = ConstantInt::getNullValue(Int64Ty);
} else if (CheckIfAlloca(Obj)) {
auto *CI = dyn_cast<CallInst>(Obj);
SizeVal = CI->getOperand(0);
if (!isa<Constant>(SizeVal))
IsConstantArgInfo = false;
} else {
if (auto *GV = dyn_cast<GlobalVariable>(Obj)) {
SizeVal = ConstantInt::get(Int64Ty,
DL.getTypeStoreSize(GV->getValueType()));
if (GV->isConstant())
ArgType = OMP_HOST_RPC_ARG_PTR_COPY_TO;
if (GV->isConstant() && GV->hasInitializer()) {
// TODO: If the global variable is constant, we can do some
// optimization.
}
} else {
// TODO: fix that when it occurs
llvm_unreachable("cannot handle unknown type");
}
}
HostRPCArgInfo AI{ConstantInt::get(Int64Ty, ArgType), SizeVal};
ArgInfos.push_back(std::move(AI));
}
Value *ArgInfo = nullptr;
if (!IsConstantArgInfo) {
ArgInfo = Builder.CreateAlloca(
ArgInfoTy, ConstantInt::get(Int64Ty, NumArgs), "arg_info");
for (unsigned I = 0; I < NumArgs; ++I) {
Value *AII = GetElementPtrInst::Create(
ArrayType::get(ArgInfoTy, NumArgs), ArgInfo,
{ConstantInt::getNullValue(Int64Ty), ConstantInt::get(Int64Ty, I)});
// Struct field addresses need a leading zero index plus an i32 field index.
Value *AIIType = GetElementPtrInst::Create(
ArgInfoTy, AII,
{ConstantInt::getNullValue(Int64Ty), ConstantInt::get(Int32Ty, 0)});
Value *AIISize = GetElementPtrInst::Create(
ArgInfoTy, AII,
{ConstantInt::getNullValue(Int64Ty), ConstantInt::get(Int32Ty, 1)});
Builder.Insert(AII);
Builder.Insert(AIIType);
Builder.Insert(AIISize);
Builder.CreateStore(ArgInfos[I].Type, AIIType);
Builder.CreateStore(ArgInfos[I].Size, AIISize);
}
} else {
SmallVector<Constant *> ArgInfoInitVar;
for (auto &AI : ArgInfos) {
auto *CS =
ConstantStruct::get(ArgInfoTy, {AI.Type, cast<Constant>(AI.Size)});
ArgInfoInitVar.push_back(CS);
}
Constant *ArgInfoInit = ConstantArray::get(
ArrayType::get(ArgInfoTy, NumArgs), ArgInfoInitVar);
ArgInfo = new GlobalVariable(
DM, ArrayType::get(ArgInfoTy, NumArgs), /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info");
}
SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
ArgInfo};
for (Value *Op : CI->args())
Args.push_back(Op);
CallInst *NewCall = Builder.CreateCall(DeviceWrapperFn, Args);
CI->replaceAllUsesWith(NewCall);
CI->eraseFromParent();
}
F->eraseFromParent();
return true;
}
Function *AutoHostRPC::getDeviceWrapperFunction(StringRef WrapperName,
Function *F,
CallSiteInfo &CSI) {
Function *WrapperFn = DM.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
// return_type device_wrapper(int32_t call_no, void *arg_info, ...)
SmallVector<Type *> Params{Int32Ty, Int8PtrTy};
Params.append(CSI.Params);
Type *RetTy = F->getReturnType();
FunctionType *FT = FunctionType::get(RetTy, Params, /*isVarArg*/ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::WeakODRLinkage,
WrapperName, DM);
// Emit the body of the device wrapper
IRBuilder<> Builder(Context);
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
// skip call_no and arg_info.
constexpr const unsigned NumArgSkipped = 2;
Value *Desc = nullptr;
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
Desc = Builder.CreateCall(
Fn,
{WrapperFn->getArg(0),
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
WrapperFn->getArg(1)},
"desc");
}
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
Value *V = convertToInt64Ty(Builder, WrapperFn->getArg(I));
Builder.CreateCall(Fn, {Desc, V, ConstantInt::getNullValue(Int64Ty)});
}
}
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});
if (RetTy->isVoidTy()) {
Builder.CreateRetVoid();
return WrapperFn;
}
Value *RetVal =
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_ret_val], {Desc});
if (RetTy != RetVal->getType())
RetVal = convertFromInt64TyTo(Builder, RetVal, RetTy);
Builder.CreateRet(RetVal);
return WrapperFn;
}
Function *AutoHostRPC::getHostWrapperFunction(StringRef WrapperName,
Function *F, CallSiteInfo &CSI) {
Function *WrapperFn = HM.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
SmallVector<Type *> Params{Int8PtrTy};
FunctionType *FT = FunctionType::get(VoidTy, Params, /* isVarArg */ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage,
WrapperName, HM);
Value *Desc = WrapperFn->getArg(0);
// Emit the body of the host wrapper
IRBuilder<> Builder(Context);
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
SmallVector<Value *> Args;
for (unsigned I = 0; I < CSI.CI->arg_size(); ++I) {
Value *V = Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_arg],
{Desc, ConstantInt::get(Int32Ty, I)});
Args.push_back(convertFromInt64TyTo(Builder, V, CSI.Params[I]));
}
// The host callee that will be called eventually by the host wrapper.
Function *HostCallee = HM.getFunction(F->getName());
if (!HostCallee)
HostCallee = Function::Create(F->getFunctionType(), F->getLinkage(),
F->getName(), HM);
Value *RetVal = Builder.CreateCall(HostCallee, Args);
// Guard against void callees; convertToInt64Ty cannot handle a void value.
if (!RetVal->getType()->isVoidTy()) {
RetVal = convertToInt64Ty(Builder, RetVal);
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_set_ret_val], {Desc, RetVal});
}
Builder.CreateRetVoid();
return WrapperFn;
}
} // namespace llvm
int main(int argc, char *argv[]) {
InitializeAllTargets();
InitializeAllTargetMCs();
InitializeAllAsmPrinters();
InitializeAllAsmParsers();
SMDiagnostic Err;
std::unique_ptr<Module> DM = parseIRFile("device.ll", Err, Context);
if (!DM)
return 1;
Module HM("host-rpc.bc", Context);
// get the right target triple
HM.setTargetTriple(Triple::normalize("x86_64"));
AutoHostRPC RPC(*DM, HM);
(void)RPC.run();
DM->dump();
// HM.dump();
return 0;
}
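
For symmetry, a hedged sketch of what getHostWrapperFunction emits into the host module for the same fopen call: each i64-encoded argument is unpacked, the real libc function is invoked, and the return value is stored back into the descriptor. Note the pointer arguments cross the device/host boundary by value, which presupposes unified or managed memory (or the copy handling described by arg_info). The runtime declarations are again stand-in externs:

#include <cstdint>
#include <cstdio>

extern "C" {
// Stand-ins for the host runtime entry points assumed by this patch.
int64_t __kmpc_host_rpc_get_arg(void *desc, int32_t index);
void __kmpc_host_rpc_set_ret_val(void *desc, int64_t value);
}

extern "C" void __kmpc_host_rpc_wrapper_fopen(void *desc) {
  // Unpack the i64-encoded arguments in call order.
  const char *path = (const char *)(uintptr_t)__kmpc_host_rpc_get_arg(desc, 0);
  const char *mode = (const char *)(uintptr_t)__kmpc_host_rpc_get_arg(desc, 1);
  // Perform the actual call on the host and hand the result back.
  FILE *f = fopen(path, mode);
  __kmpc_host_rpc_set_ret_val(desc, (int64_t)(uintptr_t)f);
}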

auto-host-rpc/test.c (new file, 19 lines)
View File

@@ -0,0 +1,19 @@
#include <stdio.h>
#pragma omp begin declare target device_type(nohost)
struct ddd {
int num;
int a;
float b;
};
void foo() {
FILE *fp = fopen("main.cpp", "r");
struct ddd d;
fprintf(fp, "%d", 6);
fprintf(fp, "%f%d%s", 6.0f, 1, "hello");
fscanf(fp, "%d", &d.a);
}
#pragma omp end declare target
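
Given typeToString and getWrapperFunctionName in main.cpp above, the two variadic fprintf calls get distinct wrappers, because the trailing argument types are mangled into the name (6.0f is promoted to double by default argument promotion). The expected names, derived from that code rather than quoted from the patch:

// fprintf(fp, "%d", 6)                    -> __kmpc_host_rpc_wrapper_fprintf_i32
// fprintf(fp, "%f%d%s", 6.0f, 1, "hello") -> __kmpc_host_rpc_wrapper_fprintf_f64_i32_ptr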

View File

@@ -253,6 +253,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Globalize to global space for the globalized variables")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")

View File

@@ -2744,6 +2744,10 @@ def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested
HelpText<"Assert no nested parallel regions in the GPU">,
MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
def fopenmp_globalize_to_global_space : Flag<["-"], "fopenmp-globalize-to-global-space">,
HelpText<"Globalize to global space for the globalized variables">,
MarshallingInfoFlag<LangOpts<"OpenMPGlobalizeToGlobalSpace">>;
} // let Group = f_Group
} // let Flags = [CC1Option, FC1Option, NoArgumentUnused, HelpHidden]

View File

@@ -2476,50 +2476,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
(IPD->getParameterKind() == ImplicitParamDecl::ThreadPrivateVar);
}
Address DeclPtr = Address::invalid();
Address AllocaPtr = Address::invalid();
bool DoStore = false;
bool IsScalar = hasScalarEvaluationKind(Ty);
bool UseIndirectDebugAddress = false;
// If we already have a pointer to the argument, reuse the input pointer.
if (Arg.isIndirect()) {
// If we have a prettier pointer type at this point, bitcast to that.
DeclPtr = Arg.getIndirectAddress();
DeclPtr = Builder.CreateElementBitCast(DeclPtr, ConvertTypeForMem(Ty),
D.getName());
// Indirect argument is in alloca address space, which may be different
// from the default address space.
auto AllocaAS = CGM.getASTAllocaAddressSpace();
auto *V = DeclPtr.getPointer();
AllocaPtr = DeclPtr;
// For truly ABI indirect arguments -- those that are not `byval` -- store
// the address of the argument on the stack to preserve debug information.
ABIArgInfo ArgInfo = CurFnInfo->arguments()[ArgNo - 1].info;
if (ArgInfo.isIndirect())
UseIndirectDebugAddress = !ArgInfo.getIndirectByVal();
if (UseIndirectDebugAddress) {
auto PtrTy = getContext().getPointerType(Ty);
AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy),
D.getName() + ".indirect_addr");
EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy);
}
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
auto DestLangAS =
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
if (SrcLangAS != DestLangAS) {
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
CGM.getDataLayout().getAllocaAddrSpace());
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
auto *T = DeclPtr.getElementType()->getPointerTo(DestAS);
DeclPtr =
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
*this, V, SrcLangAS, DestLangAS, T, true),
DeclPtr.isKnownNonNull());
}
auto PushCleanupIfNeeded = [this, Ty, &D](Address DeclPtr) {
// Push a destructor cleanup for this parameter if the ABI requires it.
// Don't push a cleanup in a thunk for a method that will also emit a
// cleanup.
@@ -2535,87 +2492,126 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
EHStack.stable_begin();
}
}
} else {
// Check if the parameter address is controlled by OpenMP runtime.
Address OpenMPLocalAddr =
getLangOpts().OpenMP
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
: Address::invalid();
if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
DeclPtr = OpenMPLocalAddr;
AllocaPtr = DeclPtr;
};
Address DeclPtr = Address::invalid();
Address AllocaPtr = Address::invalid();
Address OpenMPLocalAddr =
getLangOpts().OpenMP
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
: Address::invalid();
bool DoStore = false;
bool IsScalar = hasScalarEvaluationKind(Ty);
bool UseIndirectDebugAddress = false;
if (OpenMPLocalAddr.isValid()) {
DeclPtr = OpenMPLocalAddr;
AllocaPtr = DeclPtr;
LValue Dst = MakeAddrLValue(DeclPtr, Ty);
if (Arg.isIndirect()) {
LValue Src = MakeAddrLValue(Arg.getIndirectAddress(), Ty);
callCStructCopyConstructor(Dst, Src);
PushCleanupIfNeeded(Arg.getIndirectAddress());
} else {
// Otherwise, create a temporary to hold the value.
EmitStoreOfScalar(Arg.getDirectValue(), Dst, /* isInitialization */ true);
}
} else {
// If we already have a pointer to the argument, reuse the input pointer.
if (Arg.isIndirect()) {
// If we have a prettier pointer type at this point, bitcast to that.
DeclPtr = Arg.getIndirectAddress();
DeclPtr = Builder.CreateElementBitCast(DeclPtr, ConvertTypeForMem(Ty),
D.getName());
// Indirect argument is in alloca address space, which may be different
// from the default address space.
auto AllocaAS = CGM.getASTAllocaAddressSpace();
auto *V = DeclPtr.getPointer();
AllocaPtr = DeclPtr;
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
auto DestLangAS =
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
if (SrcLangAS != DestLangAS) {
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
CGM.getDataLayout().getAllocaAddrSpace());
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
auto *T = DeclPtr.getElementType()->getPointerTo(DestAS);
DeclPtr =
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
*this, V, SrcLangAS, DestLangAS, T, true),
DeclPtr.isKnownNonNull());
}
PushCleanupIfNeeded(DeclPtr);
} else {
// Create a temporary to hold the value.
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
D.getName() + ".addr", &AllocaPtr);
DoStore = true;
}
DoStore = true;
}
llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
LValue lv = MakeAddrLValue(DeclPtr, Ty);
if (IsScalar) {
Qualifiers qs = Ty.getQualifiers();
if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
// We honor __attribute__((ns_consumed)) for types with lifetime.
// For __strong, it's handled by just skipping the initial retain;
// otherwise we have to balance out the initial +1 with an extra
// cleanup to do the release at the end of the function.
bool isConsumed = D.hasAttr<NSConsumedAttr>();
LValue lv = MakeAddrLValue(DeclPtr, Ty);
if (IsScalar) {
Qualifiers qs = Ty.getQualifiers();
if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
// We honor __attribute__((ns_consumed)) for types with lifetime.
// For __strong, it's handled by just skipping the initial retain;
// otherwise we have to balance out the initial +1 with an extra
// cleanup to do the release at the end of the function.
bool isConsumed = D.hasAttr<NSConsumedAttr>();
// If a parameter is pseudo-strong then we can omit the implicit retain.
if (D.isARCPseudoStrong()) {
assert(lt == Qualifiers::OCL_Strong &&
"pseudo-strong variable isn't strong?");
assert(qs.hasConst() && "pseudo-strong variable should be const!");
lt = Qualifiers::OCL_ExplicitNone;
}
// If a parameter is pseudo-strong then we can omit the implicit retain.
if (D.isARCPseudoStrong()) {
assert(lt == Qualifiers::OCL_Strong &&
"pseudo-strong variable isn't strong?");
assert(qs.hasConst() && "pseudo-strong variable should be const!");
lt = Qualifiers::OCL_ExplicitNone;
}
// Load objects passed indirectly.
if (Arg.isIndirect() && !ArgVal)
ArgVal = Builder.CreateLoad(DeclPtr);
// Load objects passed indirectly.
if (Arg.isIndirect() && !ArgVal)
ArgVal = Builder.CreateLoad(DeclPtr);
if (lt == Qualifiers::OCL_Strong) {
if (!isConsumed) {
if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
// use objc_storeStrong(&dest, value) for retaining the
// object. But first, store a null into 'dest' because
// objc_storeStrong attempts to release its old value.
llvm::Value *Null = CGM.EmitNullConstant(D.getType());
EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
DoStore = false;
if (lt == Qualifiers::OCL_Strong) {
if (!isConsumed) {
if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
// use objc_storeStrong(&dest, value) for retaining the
// object. But first, store a null into 'dest' because
// objc_storeStrong attempts to release its old value.
llvm::Value *Null = CGM.EmitNullConstant(D.getType());
EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
DoStore = false;
} else
// Don't use objc_retainBlock for block pointers, because we
// don't want to Block_copy something just because we got it
// as a parameter.
ArgVal = EmitARCRetainNonBlock(ArgVal);
}
} else {
// Push the cleanup for a consumed parameter.
if (isConsumed) {
ARCPreciseLifetime_t precise =
(D.hasAttr<ObjCPreciseLifetimeAttr>() ? ARCPreciseLifetime
: ARCImpreciseLifetime);
EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(),
ArgVal, precise);
}
if (lt == Qualifiers::OCL_Weak) {
EmitARCInitWeak(DeclPtr, ArgVal);
DoStore = false; // The weak init is a store, no need to do two.
}
else
// Don't use objc_retainBlock for block pointers, because we
// don't want to Block_copy something just because we got it
// as a parameter.
ArgVal = EmitARCRetainNonBlock(ArgVal);
}
} else {
// Push the cleanup for a consumed parameter.
if (isConsumed) {
ARCPreciseLifetime_t precise = (D.hasAttr<ObjCPreciseLifetimeAttr>()
? ARCPreciseLifetime : ARCImpreciseLifetime);
EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(), ArgVal,
precise);
}
if (lt == Qualifiers::OCL_Weak) {
EmitARCInitWeak(DeclPtr, ArgVal);
DoStore = false; // The weak init is a store, no need to do two.
}
// Enter the cleanup scope.
EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
}
// Enter the cleanup scope.
EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
}
}
// Store the initial value into the alloca.
if (DoStore)
EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
// Store the initial value into the alloca.
if (DoStore)
EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
}
setAddrOfLocalVar(&D, DeclPtr);

View File

@@ -1059,10 +1059,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for the variable to be globalized
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
? OMPRTL_malloc
: OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
// FIXME: We should use the variables actual alignment as an argument.
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment,
@@ -1099,10 +1101,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for this VLA object to be globalized.
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
? OMPRTL_malloc
: OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
VoidPtr->addRetAttr(
llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
CGM.getContext().getTargetInfo().getNewAlign()));
@@ -1130,20 +1134,29 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
// Deallocate the memory for each globalized VLA object
for (const auto &AddrSizePair :
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{AddrSizePair.first});
else
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
}
// Deallocate the memory for each globalized value
for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
const auto *VD = cast<VarDecl>(Rec.first);
I->getSecond().MappedParams->restore(CGF);
llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
CGF.getTypeSize(VD->getType())};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
FreeArgs);
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{Rec.second.GlobalizedVal});
else
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
OMPRTL___kmpc_free_shared),
{Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())});
}
}
}

View File

@@ -6209,6 +6209,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism");
if (Args.hasArg(options::OPT_fopenmp_offload_mandatory))
CmdArgs.push_back("-fopenmp-offload-mandatory");
if (Args.hasArg(options::OPT_fopenmp_globalize_to_global_space))
CmdArgs.push_back("-fopenmp-globalize-to-global-space");
break;
default:
// By default, if Clang doesn't know how to generate useful OpenMP code

View File

@@ -0,0 +1,33 @@
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
// expected-no-diagnostics
extern int printf(const char *, ...);
struct S {
int a;
float b;
};
// CHECK: define{{.*}}void @test(%struct.S* noundef byval(%struct.S) align {{[0-9]+}} [[arg:%[0-9a-zA-Z]+]])
// CHECK: [[g:%[0-9a-zA-Z]+]] = call align {{[0-9]+}} i8* @__kmpc_alloc_shared
// CHECK: bitcast i8* [[g]] to %struct.S*
// CHECK: bitcast %struct.S* [[arg]] to i8**
// CHECK: call void [[cc:@__copy_constructor[_0-9a-zA-Z]+]]
// CHECK: void [[cc]]
void test(struct S s) {
#pragma omp parallel for
for (int i = 0; i < s.a; ++i) {
printf("%i : %i : %f\n", i, s.a, s.b);
}
}
void foo() {
#pragma omp target teams num_teams(1)
{
struct S s;
s.a = 7;
s.b = 11;
test(s);
}
}

View File

@@ -141,6 +141,13 @@ static constexpr OptTable::Info InfoTable[] = {
#undef OPTION
};
/// Host RPC module that will be shared with the corresponding pass.
Module *HostModule = nullptr;
/// We only need to generate the host RPC module once.
bool IsHostModuleGenerated = false;
/// Host RPC object file.
StringRef HostRPCObjFile;
class WrapperOptTable : public opt::GenericOptTable {
public:
WrapperOptTable() : opt::GenericOptTable(InfoTable) {}
@@ -498,10 +505,12 @@ std::vector<std::string> getTargetFeatures(ArrayRef<OffloadFile> InputFiles) {
return UnifiedFeatures;
}
template <typename ModuleHook = function_ref<bool(size_t, const Module &)>>
template <typename PreHookTy = function_ref<bool(size_t, const Module &)>,
typename PostHookTy = function_ref<bool(size_t, const Module &)>>
std::unique_ptr<lto::LTO> createLTO(
const ArgList &Args, const std::vector<std::string> &Features,
ModuleHook Hook = [](size_t, const Module &) { return true; }) {
PreHookTy PreHook = [](size_t, const Module &) { return true; },
PostHookTy PostHook = [](size_t, const Module &) { return true; }) {
const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
StringRef Arch = Args.getLastArgValue(OPT_arch_EQ);
lto::Config Conf;
@@ -555,9 +564,10 @@ std::unique_ptr<lto::LTO> createLTO(
return true;
};
}
Conf.PostOptModuleHook = Hook;
Conf.CGFileType =
(Triple.isNVPTX() || SaveTemps) ? CGFT_AssemblyFile : CGFT_ObjectFile;
Conf.PreOptModuleHook = PreHook;
Conf.PostOptModuleHook = PostHook;
Conf.CGFileType = Triple.isNVPTX() ? CGFT_AssemblyFile : CGFT_ObjectFile;
// TODO: Handle remark files
Conf.HasWholeProgramVisibility = Args.hasArg(OPT_whole_program);
@@ -573,6 +583,58 @@ bool isValidCIdentifier(StringRef S) {
[](char C) { return C == '_' || isAlnum(C); });
}
bool writeHostModule(std::string &FileName) {
if (!HostModule)
return false;
if (HostModule->getFunctionList().empty())
return false;
auto HostTriple = HostModule->getTargetTriple();
FileName =
sys::path::filename(ExecutableName).str() + "-host-rpc-" + HostTriple;
auto TempFileOrErr = createOutputFile(FileName, "bc");
if (!TempFileOrErr)
reportError(TempFileOrErr.takeError());
int FD = -1;
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
reportError(errorCodeToError(EC));
auto Out = std::make_unique<llvm::raw_fd_ostream>(FD, true);
WriteBitcodeToFile(*HostModule, *Out);
return true;
}
std::unique_ptr<lto::LTO> createHostRPCLTO(StringRef HostTriple) {
const llvm::Triple Triple(HostTriple);
lto::Config Conf;
lto::ThinBackend Backend;
Backend =
lto::createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
// TODO: host arch?
// Conf.CPU = Arch.str();
Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple);
// TODO: host features?
// Conf.MAttrs = Features;
Conf.CGOptLevel = *CodeGenOpt::getLevel(3);
Conf.OptLevel = 3;
Conf.UseDefaultPipeline = true;
Conf.DefaultTriple = Triple.getTriple();
LTOError = false;
Conf.DiagHandler = diagnosticHandler;
Conf.PTO.LoopVectorization = Conf.OptLevel > 1;
Conf.PTO.SLPVectorization = Conf.OptLevel > 1;
Conf.CGFileType = CGFT_ObjectFile;
Conf.HasWholeProgramVisibility = false;
return std::make_unique<lto::LTO>(std::move(Conf), Backend);
}
Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
SmallVectorImpl<StringRef> &OutputFiles,
const ArgList &Args) {
@@ -645,12 +707,51 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
BitcodeOutput.push_back(*TempFileOrErr);
return false;
};
auto AddHostModuleAddr = [&](size_t, const Module &M) {
if (!HostModule)
return true;
Module &CM = const_cast<Module &>(M);
auto *MD = CM.getOrInsertNamedMetadata("llvm.hostrpc.hostmodule");
MD->clearOperands();
MD->addOperand(MDTuple::get(
CM.getContext(), {ConstantAsMetadata::get(ConstantInt::get(
Type::getInt64Ty(CM.getContext()),
reinterpret_cast<uintptr_t>(HostModule)))}));
return true;
};
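// Note: this smuggles the in-process host Module* to the HostRPC pass by
// storing the pointer value as an i64 constant in named metadata; it only
// works because the pass runs in the same address space as the linker wrapper.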
// We assume visibility of the whole program if every input file was bitcode.
auto Features = getTargetFeatures(BitcodeInputFiles);
auto LTOBackend = Args.hasArg(OPT_embed_bitcode)
? createLTO(Args, Features, OutputBitcode)
: createLTO(Args, Features);
auto LTOBackend =
Args.hasArg(OPT_embed_bitcode)
? createLTO(Args, Features, AddHostModuleAddr, OutputBitcode)
: createLTO(Args, Features, AddHostModuleAddr);
LLVMContext &Ctx = LTOBackend->getContext();
StringRef HostTriple =
Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple());
std::unique_ptr<Module> HostModulePtr;
if (!IsHostModuleGenerated) {
HostModulePtr = std::make_unique<Module>(
sys::path::filename(ExecutableName).str() + "-host-rpc.bc", Ctx);
HostModule = HostModulePtr.get();
HostModulePtr->setTargetTriple(HostTriple);
std::string Msg;
const Target *T =
TargetRegistry::lookupTarget(HostModule->getTargetTriple(), Msg);
if (!T)
return createStringError(inconvertibleErrorCode(), Msg);
auto Options =
codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple(HostTriple));
StringRef CPU = "";
StringRef Features = "";
std::unique_ptr<TargetMachine> TM(
T->createTargetMachine(HostTriple, CPU, Features, Options, Reloc::PIC_,
HostModule->getCodeModel()));
HostModule->setDataLayout(TM->createDataLayout());
}
// We need to resolve the symbols so the LTO backend knows which symbols need
// to be kept or can be internalized. This is a simplified symbol resolution
@@ -743,6 +844,57 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
if (Error Err = LTOBackend->run(AddStream))
return Err;
std::string HostModuleTempFile;
bool ValidHostModule = writeHostModule(HostModuleTempFile);
// Reset the HostModule pointer.
HostModulePtr.reset();
HostModule = nullptr;
// TODO: this is really redundant code.
if (ValidHostModule) {
auto HostLTO = createHostRPCLTO(HostTriple);
std::string HostBitCodeFile = HostModuleTempFile + ".bc";
auto BufferOrError = MemoryBuffer::getFile(HostBitCodeFile);
if (!BufferOrError)
reportError(createFileError(HostBitCodeFile, BufferOrError.getError()));
Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr =
llvm::lto::InputFile::create(*BufferOrError.get());
if (!BitcodeFileOrErr)
return BitcodeFileOrErr.takeError();
const auto Symbols = (*BitcodeFileOrErr)->symbols();
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
size_t Idx = 0;
for (auto &Sym : Symbols) {
(void)Sym;
lto::SymbolResolution &Res = Resolutions[Idx++];
Res.ExportDynamic = true;
Res.VisibleToRegularObj = true;
Res.LinkerRedefined = false;
Res.Prevailing = true;
}
if (Error Err = HostLTO->add(std::move(*BitcodeFileOrErr), Resolutions))
return Err;
auto RPCAddStream =
[&](size_t Task,
const Twine &ModuleName) -> std::unique_ptr<CachedFileStream> {
int FD = -1;
auto TempFileOrErr = createOutputFile(
sys::path::filename(ExecutableName) + "-host-rpc-" + HostTriple, "o");
if (!TempFileOrErr)
reportError(TempFileOrErr.takeError());
HostRPCObjFile = *TempFileOrErr;
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
reportError(errorCodeToError(EC));
return std::make_unique<CachedFileStream>(
std::make_unique<llvm::raw_fd_ostream>(FD, true));
};
if (Error Err = HostLTO->run(RPCAddStream))
return Err;
}
if (LTOError)
return createStringError(inconvertibleErrorCode(),
"Errors encountered inside the LTO pipeline.");
@@ -1105,6 +1257,9 @@ linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
WrappedOutput.push_back(*OutputOrErr);
}
if (!HostRPCObjFile.empty())
WrappedOutput.push_back(HostRPCObjFile);
return WrappedOutput;
}

View File

@@ -25,6 +25,19 @@ enum OMPTgtExecModeFlags : unsigned char {
OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
};
enum OMPTgtHostRPCArgType {
// No need to copy.
OMP_HOST_RPC_ARG_SCALAR = 0,
OMP_HOST_RPC_ARG_PTR = 1,
// Copy to the host.
OMP_HOST_RPC_ARG_COPY_TO = OMP_HOST_RPC_ARG_PTR | (1 << 1),
// Copy back from the host.
OMP_HOST_RPC_ARG_COPY_FROM = OMP_HOST_RPC_ARG_PTR | (1 << 2),
// Copy both to and from the host.
OMP_HOST_RPC_ARG_COPY_TOFROM =
OMP_HOST_RPC_ARG_COPY_TO | OMP_HOST_RPC_ARG_COPY_FROM,
};
} // end namespace omp
} // end namespace llvm
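
A small, hedged illustration of how these copy-direction bits compose and how a runtime consumer (not part of this hunk) could test them:

#include <cassert>

// Mirrors the OMPTgtHostRPCArgType values defined above.
enum HostRPCArgType {
  ArgScalar = 0,
  ArgPtr = 1,
  ArgCopyTo = ArgPtr | (1 << 1),
  ArgCopyFrom = ArgPtr | (1 << 2),
  ArgCopyToFrom = ArgCopyTo | ArgCopyFrom, // == 0b111
};

int main() {
  int T = ArgCopyToFrom;
  assert(T & ArgPtr);    // every copy kind implies a pointer argument
  assert(T & (1 << 1));  // needs a copy to the host before the call
  assert(T & (1 << 2));  // needs a copy back after the call returns
  return 0;
}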

View File

@@ -222,7 +222,9 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
__OMP_RTL(omp_get_thread_num, false, Int32, )
__OMP_RTL(omp_get_bulk_thread_num, false, Int32, )
__OMP_RTL(omp_get_num_threads, false, Int32, )
__OMP_RTL(omp_get_bulk_num_threads, false, Int32, )
__OMP_RTL(omp_get_max_threads, false, Int32, )
__OMP_RTL(omp_in_parallel, false, Int32, )
__OMP_RTL(omp_get_dynamic, false, Int32, )
@@ -471,6 +473,8 @@ __OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr, Int32,
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
__OMP_RTL(malloc, false, VoidPtr, SizeTy)
__OMP_RTL(free, false, Void, VoidPtr)
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
@@ -484,6 +488,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
__OMP_RTL(__kmpc_launch_parallel_51_kernel, false, Void, Int8Ptr, Int32, Int32,
Int32, VoidPtrPtr, Int64)
__OMP_RTL(__last, false, Void, )
#undef __OMP_RTL
@@ -668,6 +675,8 @@ __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_bulk_thread_num, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_bulk_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_dynamic, GetterAttrs, SExt, ParamAttrs())

View File

@@ -288,6 +288,9 @@ public:
/// by LTO but might not be visible from bitcode symbol table.
static ArrayRef<const char*> getRuntimeLibcallSymbols();
/// Returns the context.
LLVMContext &getContext() { return RegularLTO.Ctx; }
private:
Config Conf;

View File

@@ -0,0 +1,31 @@
//===- Transforms/IPO/HostRPC.h - Automatic host RPC ------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_IPO_HOSTRPC_H
#define LLVM_TRANSFORMS_IPO_HOSTRPC_H
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/IR/PassManager.h"
namespace llvm {
class HostRPCPass : public PassInfoMixin<HostRPCPass> {
public:
HostRPCPass() : LTOPhase(ThinOrFullLTOPhase::None) {}
HostRPCPass(ThinOrFullLTOPhase LTOPhase) : LTOPhase(LTOPhase) {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
private:
const ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None;
};
} // namespace llvm
#endif // LLVM_TRANSFORMS_IPO_HOSTRPC_H

View File

@@ -0,0 +1,30 @@
//===--------------- CanonicalizeMainFunction.h -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility function to canonicalize main function.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
#define LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
#include "llvm/IR/PassManager.h"
namespace llvm {
/// A pass that canonicalizes the main function in a module.
class CanonicalizeMainFunctionPass
: public PassInfoMixin<CanonicalizeMainFunctionPass> {
public:
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
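
The pass body is not included in this commit, so purely as a hypothetical illustration of the header comment above: canonicalization would take a module entry point such as

int main(void) { return 0; }

and rewrite it to the canonical signature, leaving the new parameters unused:

int main(int argc, char *argv[]) { return 0; }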

View File

@@ -108,6 +108,7 @@
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include "llvm/Transforms/IPO/IROutliner.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
@@ -228,6 +229,7 @@
#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/Debugify.h"
#include "llvm/Transforms/Utils/EntryExitInstrumenter.h"

View File

@@ -51,6 +51,7 @@
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include "llvm/Transforms/IPO/IROutliner.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
@@ -120,6 +121,7 @@
#include "llvm/Transforms/Utils/AddDiscriminators.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
@@ -176,9 +178,9 @@ static cl::opt<bool> EnablePostPGOLoopRotation(
"enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden,
cl::desc("Run the loop rotation transformation after PGO instrumentation"));
static cl::opt<bool> EnableGlobalAnalyses(
"enable-global-analyses", cl::init(true), cl::Hidden,
cl::desc("Enable inter-procedural analyses"));
static cl::opt<bool>
EnableGlobalAnalyses("enable-global-analyses", cl::init(true), cl::Hidden,
cl::desc("Enable inter-procedural analyses"));
static cl::opt<bool>
RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
@@ -273,6 +275,13 @@ cl::opt<bool> EnableMemProfContextDisambiguation(
"enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
static cl::opt<bool> EnableHostRPC("enable-host-rpc", cl::init(false),
cl::Hidden, cl::desc("Enable HostRPC pass"));
static cl::opt<bool> EnableCanonicalizeMainFunction(
"enable-canonicalize-main-function", cl::init(false), cl::Hidden,
cl::desc("Enable CanonicalizeMainFunction pass"));
PipelineTuningOptions::PipelineTuningOptions() {
LoopInterleaving = true;
LoopVectorization = true;
@@ -1042,6 +1051,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
}
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
// Try to perform OpenMP specific optimizations on the module. This is a
// (quick!) no-op if there are no OpenMP runtime calls present in the module.
MPM.addPass(OpenMPOptPass());
@@ -1061,11 +1073,10 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
// and prior to optimizing globals.
// FIXME: This position in the pipeline hasn't been carefully considered in
// years, it should be re-analyzed.
MPM.addPass(IPSCCPPass(
IPSCCPOptions(/*AllowFuncSpec=*/
Level != OptimizationLevel::Os &&
Level != OptimizationLevel::Oz &&
!isLTOPreLink(Phase))));
MPM.addPass(IPSCCPPass(IPSCCPOptions(/*AllowFuncSpec=*/
Level != OptimizationLevel::Os &&
Level != OptimizationLevel::Oz &&
!isLTOPreLink(Phase))));
// Attach metadata to indirect call sites indicating the set of functions
// they may target at run-time. This should follow IPSCCP.
@@ -1471,7 +1482,7 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
ModulePassManager
PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
if (Level == OptimizationLevel::O0)
return buildO0DefaultPipeline(Level, /*LTOPreLink*/true);
return buildO0DefaultPipeline(Level, /*LTOPreLink*/ true);
ModulePassManager MPM;
@@ -1594,6 +1605,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// in the current module.
MPM.addPass(CrossDSOCFIPass());
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
if (Level == OptimizationLevel::O0) {
// The WPD and LowerTypeTest passes need to run at -O0 to lower type
// metadata and intrinsics.
@@ -1624,6 +1638,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));
if (EnableHostRPC)
MPM.addPass(HostRPCPass(ThinOrFullLTOPhase::FullLTOPostLink));
// Remove unused virtual tables to improve the quality of code generated by
// whole-program devirtualization and bitset lowering.
MPM.addPass(GlobalDCEPass());
@@ -1996,6 +2013,9 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
return MPM;
}
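
With the cl::opt flags above and the pass-name registrations in the PassRegistry.def hunk that follows, the new passes can be exercised from opt. Illustrative invocations (assumed, not taken from the patch):

opt -passes=host-rpc device-linked.bc -S -o device-rpc.ll
opt -passes=canonicalize-main-function input.bc -o canon.bc
opt -passes='lto<O3>' -enable-host-rpc linked.bc -o out.bc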

View File

@@ -47,6 +47,7 @@ MODULE_PASS("openmp-opt", OpenMPOptPass())
MODULE_PASS("openmp-opt-postlink", OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink))
MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass())
MODULE_PASS("canonicalize-main-function", CanonicalizeMainFunctionPass())
MODULE_PASS("cg-profile", CGProfilePass())
MODULE_PASS("check-debugify", NewPMCheckDebugifyPass())
MODULE_PASS("constmerge", ConstantMergePass())
@@ -128,6 +129,7 @@ MODULE_PASS("sanmd-module", SanitizerBinaryMetadataPass())
MODULE_PASS("memprof-module", ModuleMemProfilerPass())
MODULE_PASS("poison-checking", PoisonCheckingPass())
MODULE_PASS("pseudo-probe-update", PseudoProbeUpdatePass())
MODULE_PASS("host-rpc", HostRPCPass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS

View File

@@ -383,12 +383,25 @@ static bool getPotentialCopiesOfMemoryValue(
dbgs() << "Underlying object is a valid nullptr, giving up.\n";);
return false;
}
// TODO: Use assumed noalias return.
if (!isa<AllocaInst>(&Obj) && !isa<GlobalVariable>(&Obj) &&
!(IsLoad ? isAllocationFn(&Obj, TLI) : isNoAliasCall(&Obj))) {
LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << Obj
<< "\n";);
return false;
if (!isa<AllocaInst>(&Obj) && !isa<GlobalVariable>(&Obj)) {
auto *CB = dyn_cast<CallBase>(&Obj);
bool R = false;
if (CB) {
auto &AAN =
A.getOrCreateAAFor<AANoAlias>(IRPosition::callsite_returned(*CB),
&QueryingAA, DepClassTy::OPTIONAL);
if (AAN.isValidState())
R = AAN.isAssumedNoAlias();
else
R = isNoAliasCall(&Obj);
if (!R)
R = isAllocationFn(&Obj, TLI);
}
if (!R) {
LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << Obj
<< "\n";);
return false;
}
}
if (auto *GV = dyn_cast<GlobalVariable>(&Obj))
if (!GV->hasLocalLinkage() &&
@@ -2081,7 +2094,8 @@ void Attributor::runTillFixpoint() {
LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: "
<< IterationCounter << "/" << MaxIterations
<< " iterations\n");
<< " iterations. #Changed AAs: " << ChangedAAs.size()
<< "\n");
// Reset abstract arguments not settled in a sound fixpoint by now. This
// happens when we stopped the fixpoint iteration early. Note that only the
@@ -2096,13 +2110,21 @@ void Attributor::runTillFixpoint() {
AbstractState &State = ChangedAA->getState();
if (!State.isAtFixpoint()) {
LLVM_DEBUG(
dbgs() << "\n[Attributor] Invalidate AA " << *ChangedAA
<< " as it is not at fix point after max iterations.\n");
State.indicatePessimisticFixpoint();
NumAttributesTimedOut++;
}
for (auto &DepIt : ChangedAA->Deps)
for (auto &DepIt : ChangedAA->Deps) {
LLVM_DEBUG(dbgs() << "\n[Attributor] Invalidate dependent AA "
<< *(DepIt.getPointer()) << " of AA " << *ChangedAA
<< "\n");
ChangedAAs.push_back(cast<AbstractAttribute>(DepIt.getPointer()));
}
ChangedAA->Deps.clear();
}

View File

@@ -1592,6 +1592,8 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
// might change while we iterate through a loop. For now, we give up if
// the PHI is not invariant.
if (isa<PHINode>(Usr)) {
if (!Usr->getType()->isPointerTy())
return false;
// Note the order here, the Usr access might change the map, CurPtr is
// already in it though.
bool IsFirstPHIUser = !OffsetInfoMap.count(Usr);
@@ -1814,6 +1816,31 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
return false;
}
if (auto *II = dyn_cast<ICmpInst>(Usr)) {
auto CheckIfUsedAsPred = [&](const Use &U, bool &Follow) {
const auto *UU = U.getUser();
if (isa<SelectInst>(UU))
return true;
if (isa<BranchInst>(UU))
return true;
LLVM_DEBUG(dbgs() << "[AAPointerInfo] ICmpInst user not handled " << *UU
<< "\n");
return false;
};
if (!A.checkForAllUses(CheckIfUsedAsPred, *this, *II,
/* CheckBBLivenessOnly */ true,
DepClassTy::OPTIONAL,
/* IgnoreDroppableUses */ true)) {
LLVM_DEBUG(
dbgs() << "[AAPointerInfo] Check for all uses failed for ICmpInst "
<< *II << "\n");
return false;
}
return true;
}
LLVM_DEBUG(dbgs() << "[AAPointerInfo] User not handled " << *Usr << "\n");
return false;
};
@@ -1932,7 +1959,6 @@ struct AAPointerInfoCallSiteArgument final : AAPointerInfoFloating {
const auto &NoCaptureAA =
A.getAAFor<AANoCapture>(*this, getIRPosition(), DepClassTy::OPTIONAL);
if (!NoCaptureAA.isAssumedNoCapture())
return indicatePessimisticFixpoint();
@@ -11827,10 +11853,35 @@ struct AAUnderlyingObjectsImpl
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override {}
// TODO: This is a temporary solution for terminals.
bool checkIfTerminals(Attributor &A, Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
Function *Scope = CI->getFunction();
const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*Scope);
LibFunc TLIFn;
if (TLI && TLI->getLibFunc(*CI, TLIFn)) {
if (TLIFn == LibFunc::LibFunc_malloc ||
TLIFn == LibFunc::LibFunc___kmpc_alloc_shared)
return true;
}
return false;
}
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
auto &Ptr = getAssociatedValue();
if (checkIfTerminals(A, &Ptr)) {
bool Changed = false;
Changed |= InterAssumedUnderlyingObjects.insert(&Ptr);
Changed |= IntraAssumedUnderlyingObjects.insert(&Ptr);
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
}
auto DoUpdate = [&](SmallSetVector<Value *, 8> &UnderlyingObjects,
AA::ValueScope Scope) {
bool UsedAssumedInformation = false;
@@ -11846,6 +11897,10 @@ struct AAUnderlyingObjectsImpl
for (unsigned I = 0; I < Values.size(); ++I) {
auto &VAC = Values[I];
auto *Obj = VAC.getValue();
if (checkIfTerminals(A, Obj)) {
Changed |= UnderlyingObjects.insert(Obj);
continue;
}
Value *UO = getUnderlyingObject(Obj);
if (UO && UO != VAC.getValue() && SeenObjects.insert(UO).second) {
const auto &OtherAA = A.getAAFor<AAUnderlyingObjects>(
@@ -11876,6 +11931,16 @@ struct AAUnderlyingObjectsImpl
continue;
}
if (auto *LI = dyn_cast<LoadInst>(Obj)) {
LLVM_DEBUG({
dbgs() << "[AAUnderlyingObjects] for CtxI ";
getCtxI()->print(dbgs());
dbgs() << " at position " << getIRPosition() << " has LoadInst ";
LI->print(dbgs());
dbgs() << '\n';
});
}
Changed |= UnderlyingObjects.insert(Obj);
}
@@ -11886,6 +11951,8 @@ struct AAUnderlyingObjectsImpl
Changed |= DoUpdate(IntraAssumedUnderlyingObjects, AA::Intraprocedural);
Changed |= DoUpdate(InterAssumedUnderlyingObjects, AA::Interprocedural);
LLVM_DEBUG(dumpState(dbgs()));
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
}
@@ -11924,6 +11991,19 @@ private:
return Changed;
}
void dumpState(raw_ostream &O) {
O << "Underlying objects:\nintra procedureal:\n";
for (auto *Obj : IntraAssumedUnderlyingObjects) {
Obj->print(O);
O << '\n';
}
O << "inter procedureal:\n";
for (auto *Obj : InterAssumedUnderlyingObjects) {
Obj->print(O);
O << '\n';
}
}
/// All the underlying objects collected so far via the intraprocedural scope.
SmallSetVector<Value *, 8> IntraAssumedUnderlyingObjects;
/// All the underlying objects collected so far via the interprocedural scope.

View File

@@ -19,6 +19,7 @@ add_llvm_component_library(LLVMipo
GlobalDCE.cpp
GlobalOpt.cpp
GlobalSplit.cpp
HostRPC.cpp
HotColdSplitting.cpp
IPO.cpp
IROutliner.cpp

View File

@@ -0,0 +1,887 @@
//===- Transforms/IPO/HostRPC.cpp - Automatic host RPC ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements automatic host RPC: calls to functions that are not
// available on the device are rewritten to be forwarded to the host.
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include <cstdint>
#define DEBUG_TYPE "host-rpc"
using namespace llvm;
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
static cl::opt<bool>
UseDummyHostModule("host-rpc-use-dummy-host-module", cl::init(false),
cl::Hidden,
cl::desc("Use dummy host module if there no host module "
"attached to the device module"));
namespace {
enum class HostRPCRuntimeFunction {
#define __OMPRTL_HOST_RPC(_ENUM) OMPRTL_##_ENUM
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper),
__OMPRTL_HOST_RPC(__last),
#undef __OMPRTL_HOST_RPC
};
#define __OMPRTL_HOST_RPC(_ENUM) \
auto OMPRTL_##_ENUM = HostRPCRuntimeFunction::OMPRTL_##_ENUM;
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper)
#undef __OMPRTL_HOST_RPC
// TODO: Remove those functions that are implemented in the device runtime.
static constexpr const char *InternalPrefix[] = {
"__kmp", "llvm.", "nvm.",
"omp_", "vprintf", "malloc",
"free", "__keep_alive", "__llvm_omp_vprintf"};
bool isInternalFunction(Function &F) {
auto Name = F.getName();
for (auto *P : InternalPrefix)
if (Name.startswith(P))
return true;
return false;
}
std::string typeToString(Type *T) {
if (T->is16bitFPTy())
return "f16";
if (T->isFloatTy())
return "f32";
if (T->isDoubleTy())
return "f64";
if (T->isPointerTy())
return "ptr";
if (T->isStructTy())
return std::string(T->getStructName());
if (T->isIntegerTy())
return "i" + std::to_string(T->getIntegerBitWidth());
LLVM_DEBUG(dbgs() << "[HostRPC] unknown type " << *T
<< " for typeToString.\n";);
llvm_unreachable("unknown type");
}
class HostRPC {
/// LLVM context instance
LLVMContext &Context;
/// Device module.
Module &M;
/// Host module
Module &HM;
/// Data layout of the device module.
DataLayout DL;
IRBuilder<> Builder;
/// External functions we are operating on.
SmallSetVector<Function *, 8> FunctionWorkList;
/// Attributor instance.
Attributor &A;
// Types
Type *Int8PtrTy;
Type *VoidTy;
Type *Int32Ty;
Type *Int64Ty;
StructType *ArgInfoTy;
// Values
Constant *NullPtr;
Constant *NullInt64;
struct CallSiteInfo {
CallInst *CI = nullptr;
SmallVector<Type *> Params;
};
struct HostRPCArgInfo {
Value *BasePtr = nullptr;
Constant *Type = nullptr;
Value *Size = nullptr;
};
/// Table of host wrapper functions, indexed by the wrapper (call) number.
SmallVector<Function *> HostEntryTable;
EnumeratedArray<Function *, HostRPCRuntimeFunction,
HostRPCRuntimeFunction::OMPRTL___last>
RFIs;
SmallVector<std::pair<CallInst *, CallInst *>> CallInstMap;
Constant *getConstantInt64(uint64_t Val) {
return ConstantInt::get(Int64Ty, Val);
}
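// For a variadic callee the wrapper name also encodes the types of the
// trailing arguments. As an illustration (hypothetical call site, not taken
// from this patch), fprintf(fp, "%d", 42) yields
// "__kmpc_host_rpc_wrapper_fprintf_i32": only the arguments past the fixed
// parameters are appended via typeToString.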
static std::string getWrapperFunctionName(Function *F, CallSiteInfo &CSI) {
std::string Name = "__kmpc_host_rpc_wrapper_" + std::string(F->getName());
if (!F->isVarArg())
return Name;
for (unsigned I = F->getFunctionType()->getNumParams();
I < CSI.Params.size(); ++I) {
Name.push_back('_');
Name.append(typeToString(CSI.Params[I]));
}
return Name;
}
void registerAAs();
Value *convertToInt64Ty(Value *V);
Value *convertFromInt64TyTo(Value *V, Type *TargetTy);
Constant *convertToInt64Ty(Constant *C);
Constant *convertFromInt64TyTo(Constant *C, Type *T);
// int device_wrapper(call_no, arg_info, ...) {
// void *desc = __kmpc_host_rpc_get_desc(call_no, num_args, arg_info);
// __kmpc_host_rpc_add_arg(desc, arg1, sizeof(arg1));
// __kmpc_host_rpc_add_arg(desc, arg2, sizeof(arg2));
// ...
// int r = (int)__kmpc_host_rpc_send_and_wait(desc);
// return r;
// }
Function *getDeviceWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
// void host_wrapper(desc) {
// int arg1 = (int)__kmpc_host_rpc_get_arg(desc, 0);
// float arg2 = (float)__kmpc_host_rpc_get_arg(desc, 1);
// char *arg3 = (char *)__kmpc_host_rpc_get_arg(desc, 2);
// ...
// int r = actual_call(arg1, arg2, arg3, ...);
//   __kmpc_host_rpc_set_ret_val(desc, (int64_t)r);
// }
Function *getHostWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
bool rewriteWithHostRPC(Function *F);
void emitHostWrapperInvoker();
bool recollectInformation();
public:
HostRPC(Module &DeviceModule, Module &HostModule, Attributor &A)
: Context(DeviceModule.getContext()), M(DeviceModule), HM(HostModule),
DL(M.getDataLayout()), Builder(Context), A(A) {
assert(&M.getContext() == &HM.getContext() &&
"device and host modules have different context");
#define __OMP_TYPE(TYPE) TYPE = Type::get##TYPE(Context)
__OMP_TYPE(Int8PtrTy);
__OMP_TYPE(VoidTy);
__OMP_TYPE(Int32Ty);
__OMP_TYPE(Int64Ty);
#undef __OMP_TYPE
NullPtr = Constant::getNullValue(Int8PtrTy);
NullInt64 = Constant::getNullValue(Int64Ty);
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...) \
{ \
SmallVector<Type *> Params{__VA_ARGS__}; \
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
Function *F = (MOD).getFunction(#_ENUM); \
if (!F) \
F = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage, \
#_ENUM, (MOD)); \
RFIs[OMPRTL_##_ENUM] = F; \
}
__OMP_RTL(__kmpc_host_rpc_get_desc, M, false, Int8PtrTy, Int32Ty, Int32Ty,
Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_add_arg, M, false, VoidTy, Int8PtrTy, Int64Ty,
Int32Ty)
__OMP_RTL(__kmpc_host_rpc_send_and_wait, M, false, Int64Ty, Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
__OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
Int64Ty)
__OMP_RTL(__kmpc_host_rpc_invoke_host_wrapper, HM, false, VoidTy, Int32Ty,
Int8PtrTy)
#undef __OMP_RTL
ArgInfoTy = StructType::create({Int64Ty, Int64Ty, Int64Ty, Int8PtrTy},
"struct.arg_info_t");
}
bool run();
};
Value *HostRPC::convertToInt64Ty(Value *V) {
if (auto *C = dyn_cast<Constant>(V))
return convertToInt64Ty(C);
Type *T = V->getType();
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreatePtrToInt(V, Int64Ty);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
if (T->isFloatingPointTy()) {
V = Builder.CreateBitCast(
V, Type::getIntNTy(V->getContext(), T->getScalarSizeInBits()));
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
}
llvm_unreachable("unknown cast to int64_t");
}
Value *HostRPC::convertFromInt64TyTo(Value *V, Type *T) {
if (auto *C = dyn_cast<Constant>(V))
return convertFromInt64TyTo(C, T);
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreateIntToPtr(V, T);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, T, /* isSigned */ true);
if (T->isFloatingPointTy()) {
V = Builder.CreateIntCast(
V, Type::getIntNTy(V->getContext(), T->getScalarSizeInBits()),
/* isSigned */ true);
return Builder.CreateBitCast(V, T);
}
llvm_unreachable("unknown cast from int64_t");
}
Constant *HostRPC::convertToInt64Ty(Constant *C) {
Type *T = C->getType();
if (T == Int64Ty)
return C;
if (T->isPointerTy())
return ConstantExpr::getPtrToInt(C, Int64Ty);
if (T->isIntegerTy())
return ConstantExpr::getIntegerCast(C, Int64Ty, /* isSigned */ true);
if (T->isFloatingPointTy()) {
C = ConstantExpr::getBitCast(
C, Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()));
return ConstantExpr::getIntegerCast(C, Int64Ty, /* isSigned */ true);
}
llvm_unreachable("unknown cast to int64_t");
}
Constant *HostRPC::convertFromInt64TyTo(Constant *C, Type *T) {
if (T == Int64Ty)
return C;
if (T->isPointerTy())
return ConstantExpr::getIntToPtr(C, T);
if (T->isIntegerTy())
return ConstantExpr::getIntegerCast(C, T, /* isSigned */ true);
if (T->isFloatingPointTy()) {
C = ConstantExpr::getIntegerCast(
C, Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()),
/* isSigned */ true);
return ConstantExpr::getBitCast(C, T);
}
llvm_unreachable("unknown cast from int64_t");
}
void HostRPC::registerAAs() {
for (auto *F : FunctionWorkList)
for (User *U : F->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
for (unsigned I = 0; I < CI->arg_size(); ++I) {
Value *Operand = CI->getArgOperand(I);
if (!Operand->getType()->isPointerTy())
continue;
A.getOrCreateAAFor<AAUnderlyingObjects>(
IRPosition::callsite_argument(*CI, I),
/* QueryingAA */ nullptr, DepClassTy::NONE);
}
}
}
bool HostRPC::recollectInformation() {
FunctionWorkList.clear();
for (Function &F : M) {
// If the function is already defined, it definitely does not require RPC.
if (!F.isDeclaration())
continue;
// If it is an internal function, skip it as well.
if (isInternalFunction(F))
continue;
// If there is no use of the function, skip it.
if (F.use_empty())
continue;
FunctionWorkList.insert(&F);
}
return !FunctionWorkList.empty();
}
bool HostRPC::run() {
bool Changed = false;
if (!recollectInformation())
return Changed;
Changed = true;
// Add a couple of assumptions to the RPC functions so that AAs do not give
// up just because the implementations of those functions are unknown.
for (Function &F : M) {
if (!F.isDeclaration())
continue;
F.addFnAttr(Attribute::NoRecurse);
for (auto &Arg : F.args())
if (Arg.getType()->isPointerTy())
Arg.addAttr(Attribute::NoCapture);
if (!F.isVarArg())
continue;
for (User *U : F.users()) {
auto *CB = dyn_cast<CallBase>(U);
if (!CB)
continue;
for (unsigned I = F.getFunctionType()->getNumParams(); I < CB->arg_size();
++I) {
Value *Arg = CB->getArgOperand(I);
if (Arg->getType()->isPointerTy())
CB->addParamAttr(I, Attribute::NoCapture);
}
}
}
LLVM_DEBUG(M.dump());
registerAAs();
ChangeStatus Status = A.run();
if (!recollectInformation())
return Status == ChangeStatus::CHANGED;
for (Function *F : FunctionWorkList)
Changed |= rewriteWithHostRPC(F);
if (!Changed)
return Changed;
for (auto Itr = CallInstMap.rbegin(); Itr != CallInstMap.rend(); ++Itr) {
auto *CI = Itr->first;
auto *NewCI = Itr->second;
CI->replaceAllUsesWith(NewCI);
CI->eraseFromParent();
}
for (Function *F : FunctionWorkList)
if (F->user_empty())
F->eraseFromParent();
emitHostWrapperInvoker();
return Changed;
}
bool HostRPC::rewriteWithHostRPC(Function *F) {
SmallVector<CallInst *> WorkList;
for (User *U : F->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
WorkList.push_back(CI);
}
if (WorkList.empty())
return false;
for (CallInst *CI : WorkList) {
CallSiteInfo CSI;
CSI.CI = CI;
unsigned NumArgs = CI->arg_size();
for (unsigned I = 0; I < NumArgs; ++I)
CSI.Params.push_back(CI->getArgOperand(I)->getType());
std::string WrapperName = getWrapperFunctionName(F, CSI);
Function *DeviceWrapperFn = getDeviceWrapperFunction(WrapperName, F, CSI);
Function *HostWrapperFn = getHostWrapperFunction(WrapperName, F, CSI);
int32_t WrapperNumber = -1;
for (unsigned I = 0; I < HostEntryTable.size(); ++I) {
if (HostEntryTable[I] == HostWrapperFn) {
WrapperNumber = I;
break;
}
}
if (WrapperNumber == -1) {
WrapperNumber = HostEntryTable.size();
HostEntryTable.push_back(HostWrapperFn);
}
auto CheckIfIdentifierPtr = [this](const Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
Function *Callee = CI->getCalledFunction();
// Indirect calls have no statically known callee.
if (!Callee)
return false;
if (this->FunctionWorkList.count(Callee))
return true;
return Callee->getName().startswith("__kmpc_host_rpc_wrapper_");
};
auto CheckIfDynAlloc = [](Value *V) -> CallInst * {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return nullptr;
Function *Callee = CI->getCalledFunction();
if (!Callee)
return nullptr;
auto Name = Callee->getName();
if (Name == "malloc" || Name == "__kmpc_alloc_shared")
return CI;
return nullptr;
};
auto CheckIfStdIO = [](Value *V) -> GlobalVariable * {
auto *LI = dyn_cast<LoadInst>(V);
if (!LI)
return nullptr;
auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
if (!GV)
return nullptr;
auto Name = GV->getName();
if (Name == "stdout" || Name == "stderr" || Name == "stdin")
return GV;
return nullptr;
};
auto CheckIfGlobalVariable = [](Value *V) {
if (auto *GV = dyn_cast<GlobalVariable>(V))
return GV;
if (auto *LI = dyn_cast<LoadInst>(V))
if (auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()))
return GV;
return static_cast<GlobalVariable *>(nullptr);
};
auto CheckIfNullPtr = [](Value *V) {
if (!V->getType()->isPointerTy())
return false;
return V == Constant::getNullValue(V->getType());
};
auto HandleDirectUse = [&](Value *Ptr, HostRPCArgInfo &AI,
bool IsPointer = false) {
AI.BasePtr = Ptr;
AI.Type = getConstantInt64(IsPointer ? ArgType::OMP_HOST_RPC_ARG_PTR
: ArgType::OMP_HOST_RPC_ARG_SCALAR);
AI.Size = NullInt64;
};
SmallVector<SmallVector<HostRPCArgInfo>> ArgInfo;
bool IsConstantArgInfo = true;
for (unsigned I = 0; I < CI->arg_size(); ++I) {
ArgInfo.emplace_back();
auto &AII = ArgInfo.back();
Value *Operand = CI->getArgOperand(I);
// Check if scalar type.
if (!Operand->getType()->isPointerTy()) {
AII.emplace_back();
HandleDirectUse(Operand, AII.back());
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
continue;
}
if (CheckIfNullPtr(Operand))
continue;
auto Pred = [&](Value &Obj) {
if (CheckIfNullPtr(&Obj))
return true;
bool IsConstantArgument = false;
if (!F->isVarArg() &&
F->hasParamAttribute(I, Attribute::AttrKind::ReadOnly))
IsConstantArgument = true;
HostRPCArgInfo AI;
if (auto *IO = CheckIfStdIO(&Obj)) {
HandleDirectUse(IO, AI, /* IsPointer */ true);
} else if (CheckIfIdentifierPtr(&Obj)) {
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
HandleDirectUse(Operand, AI, /* IsPointer */ true);
} else if (auto *GV = CheckIfGlobalVariable(&Obj)) {
AI.BasePtr = GV;
AI.Size = getConstantInt64(DL.getTypeStoreSize(GV->getValueType()));
AI.Type =
getConstantInt64(GV->isConstant() || IsConstantArgument
? ArgType::OMP_HOST_RPC_ARG_COPY_TO
: ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM);
} else if (CheckIfDynAlloc(&Obj)) {
// This case is handled at runtime, so there is nothing to do here.
return true;
} else if (isa<AllocaInst>(&Obj)) {
llvm_unreachable("alloca instruction needs to be handled!");
} else {
LLVM_DEBUG({
dbgs() << "[HostRPC] warning: call site " << *CI << ", operand "
<< *Operand << ", underlying object " << Obj
<< " cannot be handled.\n";
});
return true;
}
AII.push_back(std::move(AI));
return true;
};
auto &AAUO = A.getOrCreateAAFor<AAUnderlyingObjects>(
IRPosition::callsite_argument(*CI, I), nullptr, DepClassTy::NONE);
if (!AAUO.forallUnderlyingObjects(Pred))
llvm_unreachable("internal error");
}
// Reset the insert point to the call site.
Builder.SetInsertPoint(CI);
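// Build the argument descriptor: each call argument gets a (possibly empty)
// singly linked list of struct.arg_info_t nodes {base, type, size, next},
// and arg_info itself is an array with one list head per argument. If all
// entries are constant, the descriptor is emitted as a global instead.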
Value *ArgInfoVal = nullptr;
if (!IsConstantArgInfo) {
ArgInfoVal = Builder.CreateAlloca(Int8PtrTy, getConstantInt64(NumArgs),
"arg_info");
for (unsigned I = 0; I < NumArgs; ++I) {
auto &AII = ArgInfo[I];
Value *Next = NullPtr;
for (auto &AI : AII) {
Value *AIV = Builder.CreateAlloca(ArgInfoTy);
Value *AIIArg =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(0)});
Builder.Insert(AIIArg);
Builder.CreateStore(convertToInt64Ty(AI.BasePtr), AIIArg);
Value *AIIType =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(1)});
Builder.Insert(AIIType);
Builder.CreateStore(AI.Type, AIIType);
Value *AIISize =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(2)});
Builder.Insert(AIISize);
Builder.CreateStore(AI.Size, AIISize);
Value *AIINext =
GetElementPtrInst::Create(Int8PtrTy, AIV, {getConstantInt64(3)});
Builder.Insert(AIINext);
Builder.CreateStore(Next, AIINext);
Next = AIV;
}
Value *AIIV = GetElementPtrInst::Create(Int8PtrTy, ArgInfoVal,
{getConstantInt64(I)});
Builder.Insert(AIIV);
Builder.CreateStore(Next, AIIV);
}
} else {
SmallVector<Constant *> ArgInfoInitVar;
for (auto &AII : ArgInfo) {
Constant *Last = NullPtr;
for (auto &AI : AII) {
auto *Arg = cast<Constant>(AI.BasePtr);
auto *CS =
ConstantStruct::get(ArgInfoTy, {convertToInt64Ty(Arg), AI.Type,
cast<Constant>(AI.Size), Last});
auto *GV = new GlobalVariable(
M, ArgInfoTy, /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, CS);
Last = GV;
}
ArgInfoInitVar.push_back(Last);
}
Constant *ArgInfoInit = ConstantArray::get(
ArrayType::get(Int8PtrTy, NumArgs), ArgInfoInitVar);
ArgInfoVal = new GlobalVariable(
M, ArrayType::get(Int8PtrTy, NumArgs), /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info");
}
SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
ArgInfoVal};
for (Value *Operand : CI->args())
Args.push_back(Operand);
CallInst *NewCall = Builder.CreateCall(DeviceWrapperFn, Args);
CallInstMap.emplace_back(CI, NewCall);
}
return true;
}
Function *HostRPC::getDeviceWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI) {
Function *WrapperFn = M.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
// return_type device_wrapper(int32_t call_no, void *arg_info, ...)
SmallVector<Type *> Params{Int32Ty, Int8PtrTy};
Params.append(CSI.Params);
Type *RetTy = F->getReturnType();
FunctionType *FT = FunctionType::get(RetTy, Params, /*isVarArg*/ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::InternalLinkage,
WrapperName, M);
// Emit the body of the device wrapper
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
// Skip call_no and arg_info.
constexpr const unsigned NumArgSkipped = 2;
Value *Desc = nullptr;
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
Desc = Builder.CreateCall(
Fn,
{WrapperFn->getArg(0),
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
WrapperFn->getArg(1)},
"desc");
}
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
Value *V = convertToInt64Ty(WrapperFn->getArg(I));
Builder.CreateCall(
Fn, {Desc, V, ConstantInt::get(Int32Ty, I - NumArgSkipped)});
}
}
Value *RetVal =
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});
if (RetTy->isVoidTy()) {
Builder.CreateRetVoid();
return WrapperFn;
}
if (RetTy != RetVal->getType())
RetVal = convertFromInt64TyTo(RetVal, RetTy);
Builder.CreateRet(RetVal);
return WrapperFn;
}
Function *HostRPC::getHostWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI) {
Function *WrapperFn = HM.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
SmallVector<Type *> Params{Int8PtrTy};
FunctionType *FT = FunctionType::get(VoidTy, Params, /* isVarArg */ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage,
WrapperName, HM);
Value *Desc = WrapperFn->getArg(0);
// Emit the body of the host wrapper
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
SmallVector<Value *> Args;
for (unsigned I = 0; I < CSI.CI->arg_size(); ++I) {
Value *V = Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_arg],
{Desc, ConstantInt::get(Int32Ty, I)});
Args.push_back(convertFromInt64TyTo(V, CSI.Params[I]));
}
// The host callee that will be called eventually by the host wrapper.
Function *HostCallee = HM.getFunction(F->getName());
if (!HostCallee)
HostCallee = Function::Create(F->getFunctionType(), F->getLinkage(),
F->getName(), HM);
Value *RetVal = Builder.CreateCall(HostCallee, Args);
if (!RetVal->getType()->isVoidTy()) {
RetVal = convertToInt64Ty(RetVal);
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_set_ret_val],
{Desc, RetVal});
}
Builder.CreateRetVoid();
return WrapperFn;
}
void HostRPC::emitHostWrapperInvoker() {
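// The emitted invoker dispatches on the call number; its shape is roughly:
//   void __kmpc_host_rpc_invoke_host_wrapper(int32_t call_no, void *desc) {
//     switch (call_no) {
//     case 0: host_wrapper_0(desc); break;
//     case 1: host_wrapper_1(desc); break;
//     default: return;
//     }
//   }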
IRBuilder<> Builder(Context);
unsigned NumEntries = HostEntryTable.size();
Function *F = RFIs[OMPRTL___kmpc_host_rpc_invoke_host_wrapper];
F->setDLLStorageClass(
GlobalValue::DLLStorageClassTypes::DLLExportStorageClass);
Value *CallNo = F->getArg(0);
Value *Desc = F->getArg(1);
SmallVector<BasicBlock *> SwitchBBs;
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", F);
BasicBlock *ReturnBB = BasicBlock::Create(Context, "return", F);
// Emit code for the return bb.
Builder.SetInsertPoint(ReturnBB);
Builder.CreateRetVoid();
// Create BB for each host entry and emit function call.
for (unsigned I = 0; I < NumEntries; ++I) {
BasicBlock *BB = BasicBlock::Create(Context, "invoke.bb", F, ReturnBB);
SwitchBBs.push_back(BB);
Builder.SetInsertPoint(BB);
Builder.CreateCall(HostEntryTable[I], {Desc});
Builder.CreateBr(ReturnBB);
}
// Emit code for the entry BB.
Builder.SetInsertPoint(EntryBB);
SwitchInst *Switch = Builder.CreateSwitch(CallNo, ReturnBB, NumEntries);
for (unsigned I = 0; I < NumEntries; ++I)
Switch->addCase(ConstantInt::get(cast<IntegerType>(Int32Ty), I),
SwitchBBs[I]);
}
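// The driver is expected to stash the address of the in-memory host module in
// the "llvm.hostrpc.hostmodule" named metadata as an integer constant; it is
// decoded back into a Module pointer here and the metadata is dropped.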
Module *getHostModule(Module &M) {
auto *MD = M.getNamedMetadata("llvm.hostrpc.hostmodule");
if (!MD || MD->getNumOperands() == 0)
return nullptr;
auto *Node = MD->getOperand(0);
assert(Node->getNumOperands() == 1 && "invalid named metadata");
auto *CAM = dyn_cast<ConstantAsMetadata>(Node->getOperand(0));
if (!CAM)
return nullptr;
auto *CI = cast<ConstantInt>(CAM->getValue());
Module *Mod = reinterpret_cast<Module *>(CI->getZExtValue());
M.eraseNamedMetadata(MD);
return Mod;
}
} // namespace
PreservedAnalyses HostRPCPass::run(Module &M, ModuleAnalysisManager &AM) {
std::unique_ptr<Module> DummyHostModule;
Module *HostModule = nullptr;
if (UseDummyHostModule) {
DummyHostModule =
std::make_unique<Module>("dummy-host-rpc.bc", M.getContext());
HostModule = DummyHostModule.get();
} else {
HostModule = getHostModule(M);
}
if (!HostModule)
return PreservedAnalyses::all();
bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
// The pass only runs when it is invoked directly (with the dummy host module)
// or at LTO link time.
if (!UseDummyHostModule && !PostLink)
return PreservedAnalyses::all();
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
AnalysisGetter AG(FAM);
CallGraphUpdater CGUpdater;
BumpPtrAllocator Allocator;
AttributorConfig AC(CGUpdater);
AC.DefaultInitializeLiveInternals = false;
AC.RewriteSignatures = false;
AC.PassName = DEBUG_TYPE;
AC.MaxFixpointIterations = 1024;
InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
SetVector<Function *> Functions;
Attributor A(Functions, InfoCache, AC);
HostRPC RPC(M, *HostModule, A);
bool Changed = RPC.run();
LLVM_DEBUG({
if (Changed && UseDummyHostModule) {
M.dump();
HostModule->dump();
}
});
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

View File

@@ -53,6 +53,7 @@
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <optional>
@@ -135,6 +136,15 @@ static cl::opt<bool>
cl::desc("Enables more verbose remarks."), cl::Hidden,
cl::init(false));
static cl::opt<bool>
EnableParallel51Split("openmp-opt-enable-parallel-51-split",
cl::desc("Enable the kernel split at parallel_51."),
cl::Hidden, cl::init(false));
static cl::opt<bool> EnableParallel51SplitMultiTeams(
"openmp-opt-enable-parallel-51-split-multi-teams",
cl::desc("Enable multi-teams support for the parallel 51 kernel."),
cl::Hidden, cl::init(false));
static cl::opt<unsigned>
SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
cl::desc("Maximal number of attributor iterations."),
@@ -812,6 +822,28 @@ private:
}
};
bool canUseMultiTeam(Function *OutlinedFn, OMPInformationCache &OMPInfoCache) {
// TODO: Analyze OutlinedFn to prove that multi-team execution is safe; for
// now this is gated solely by the command line flag.
return EnableParallel51SplitMultiTeams;
}
void collectReachingKernels(KernelSet &Kernels, Function *F,
SmallVector<Kernel> &ReachingKernels,
SmallPtrSetImpl<Function *> &Visited) {
// Guard against cycles among mutually recursive functions.
if (!Visited.insert(F).second)
return;
if (Kernels.count(F)) {
if (F->hasFnAttribute("omp_parallel_51_kernel"))
ReachingKernels.push_back(F);
return;
}
for (User *U : F->users()) {
auto *I = dyn_cast<Instruction>(U);
if (!I)
continue;
collectReachingKernels(Kernels, I->getFunction(), ReachingKernels, Visited);
}
}
struct OpenMPOpt {
using OptimizationRemarkGetter =
@@ -841,6 +873,11 @@ struct OpenMPOpt {
<< OMPInfoCache.ModuleSlice.size() << " functions\n");
if (IsModulePass) {
if (EnableParallel51Split)
Changed |= splitKernels();
OMPInfoCache.recollectUses();
Changed |= runAttributor(IsModulePass);
// Recollect uses, in case Attributor deleted any.
@@ -940,6 +977,307 @@ struct OpenMPOpt {
}
private:
/// Create a new kernel for each function call to __kmpc_parallel_51.
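/// The generated kernel has roughly this shape (illustrative):
///   void __omp_offloading_parallel_51_from_<fn>(int32_t gtid,
///       int32_t if_expr, int32_t num_threads, void **args, int64_t nargs) {
///     <SPMD target init>
///     __kmpc_parallel_51(ident, gtid, if_expr, num_threads, proc_bind, fn,
///                        wrapper_fn, args, nargs);
///     <target deinit>
///   }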
bool splitKernels() {
bool Changed = false;
OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
if (!Parallel51RFI.Declaration)
return Changed;
SmallVector<CallInst *> WorkItems;
for (User *U : Parallel51RFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
Function *F = CI->getFunction();
// If the parent function already resulted from a kernel split, skip it.
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
WorkItems.push_back(CI);
}
if (WorkItems.empty())
return Changed;
Changed = true;
auto CreateNewKernel = [](LLVMContext &Ctx, Module &M,
OpenMPIRBuilder &IRBuilder, Function *Parallel51,
const CallInst *CI) {
SmallVector<Type *, 4> ArgTypes;
// int32_t gtid
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// int32_t if_expr
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// int32_t num_threads
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// void **args
ArgTypes.push_back(Type::getInt8PtrTy(Ctx));
// int64_t nargs
ArgTypes.push_back(Type::getInt64Ty(Ctx));
FunctionType *FT =
FunctionType::get(Type::getVoidTy(Ctx), ArgTypes, false);
std::string KernelName = "__omp_offloading_parallel_51_from_";
KernelName += CI->getFunction()->getName();
// Sanitize the kernel name by replacing '.' with '_'.
std::replace(KernelName.begin(), KernelName.end(), '.', '_');
Function *K =
Function::Create(FT, GlobalValue::WeakODRLinkage, KernelName, &M);
K->setVisibility(GlobalValue::ProtectedVisibility);
// exec mode global variable
GlobalVariable *ModeGV = new GlobalVariable(
M, Type::getInt8Ty(Ctx), /*isConstant=*/true,
GlobalValue::WeakAnyLinkage,
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD),
K->getName() + "_exec_mode");
appendToCompilerUsed(M, {ModeGV});
// Attach "kernel" metadata to the new kernel.
NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
Metadata *MDVals[] = {ConstantAsMetadata::get(K),
MDString::get(Ctx, "kernel"),
ConstantAsMetadata::get(ConstantInt::get(
llvm::Type::getInt32Ty(Ctx), 1))};
MD->addOperand(MDNode::get(Ctx, MDVals));
// Set kernel attributes.
K->setAttributes(
AttributeList::get(Ctx, AttributeList::FunctionIndex,
CI->getFunction()->getAttributes().getFnAttrs()));
K->removeFnAttr("omp_target_thread_limit");
K->removeFnAttr("omp_target_num_teams");
K->addFnAttr("omp_parallel_51_kernel");
K->addFnAttr("kernel");
K->getArg(3)->addAttr(Attribute::NoAlias);
auto *EntryBB = BasicBlock::Create(Ctx, "entry", K);
OpenMPIRBuilder::LocationDescription TargetInitLoc(
{EntryBB, EntryBB->end()});
auto IP = IRBuilder.createTargetInit(TargetInitLoc, /* IsSPMD */ true);
BasicBlock *UserCodeBB = IP.getBlock();
auto UseDirectIfPossible = [](Value *LHS, Value *RHS) -> Value * {
if (isa<Constant>(LHS))
return LHS;
return RHS;
};
Value *Ident = CI->getOperand(0);
Value *TId = UseDirectIfPossible(CI->getOperand(1), K->getArg(0));
Value *IfExpr = UseDirectIfPossible(CI->getOperand(2), K->getArg(1));
Value *NumThreads = UseDirectIfPossible(CI->getOperand(3), K->getArg(2));
Value *ProcBind = CI->getOperand(4);
Value *Fn = CI->getOperand(5);
Value *WrapperFn = CI->getOperand(6);
Value *Args = K->getArg(3);
Value *NArgs = UseDirectIfPossible(CI->getOperand(8), K->getArg(4));
(void)CallInst::Create(Parallel51,
{Ident, TId, IfExpr, NumThreads, ProcBind, Fn,
WrapperFn, Args, NArgs},
"", UserCodeBB);
OpenMPIRBuilder::LocationDescription TargetDeInitLoc(
{UserCodeBB, UserCodeBB->end()});
IRBuilder.createTargetDeinit(TargetDeInitLoc, /* IsSPMD */ true);
IRBuilder.Builder.CreateRetVoid();
return K;
};
auto EnableMultiTeam = [](Function *OutlinedFn) {
SmallVector<CallInst *> WorkItems;
for (BasicBlock &BB : *OutlinedFn)
for (Instruction &I : BB) {
CallInst *C = dyn_cast<CallInst>(&I);
if (!C)
continue;
Function *Callee = C->getCalledFunction();
// Skip indirect calls, which have no statically known callee.
if (!Callee)
continue;
auto CalleeName = Callee->getName();
if (CalleeName == "__kmpc_for_static_init_4" ||
CalleeName == "__kmpc_for_static_init_4u" ||
CalleeName == "__kmpc_for_static_init_8" ||
CalleeName == "__kmpc_for_static_init_8u")
WorkItems.push_back(C);
}
for (CallInst *C : WorkItems) {
constexpr const unsigned SchedTypeArgNum = 2;
Value *SchedTypeVal = C->getOperand(SchedTypeArgNum);
ConstantInt *SchedTypeCI = cast<ConstantInt>(SchedTypeVal);
int32_t SchedType = SchedTypeCI->getSExtValue();
Value *NewSchedType = SchedTypeVal;
if (SchedType == /*kmp_sched_static_chunk*/ 33)
NewSchedType = ConstantInt::get(
SchedTypeCI->getType(),
/*kmp_sched_distr_parallel_static_noloop_chunk*/ 100);
else if (SchedType == /*kmp_sched_static_nochunk*/ 34)
NewSchedType = ConstantInt::get(
SchedTypeCI->getType(),
/*kmp_sched_distr_parallel_static_noloop_nochunk*/ 101);
C->setOperand(SchedTypeArgNum, NewSchedType);
}
};
auto &Ctx = M.getContext();
for (CallInst *CI : WorkItems) {
Function *K = CreateNewKernel(Ctx, M, OMPInfoCache.OMPBuilder,
Parallel51RFI.Declaration, CI);
OMPInfoCache.Kernels.insert(K);
constexpr const unsigned OutlinedFnArgNum = 5;
Function *OutlinedFn =
dyn_cast<Function>(CI->getOperand(OutlinedFnArgNum));
assert(OutlinedFn && "arg fn is not a function");
if (canUseMultiTeam(OutlinedFn, OMPInfoCache))
EnableMultiTeam(OutlinedFn);
IRBuilder<> Builder(CI);
FunctionCallee Callee =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_launch_parallel_51_kernel);
Constant *KernelName = ConstantDataArray::getString(Ctx, K->getName());
GlobalVariable *KernelNameVar =
new GlobalVariable(M, KernelName->getType(), /*isConstant=*/true,
GlobalValue::WeakAnyLinkage, KernelName);
Value *Args[6] = {KernelNameVar, CI->getArgOperand(1),
CI->getArgOperand(2), CI->getArgOperand(3),
CI->getArgOperand(7), CI->getArgOperand(8)};
CallInst *C = Builder.CreateCall(Callee, Args);
OMPInfoCache.setCallingConvention(Callee, C);
CI->eraseFromParent();
}
// Check all the uses of __kmpc_parallel_51 again. If it is only used in
// split parallel kernels, we can mark the main kernels as SPMD mode as well.
bool OnlyUsedByParallelKernel = true;
for (User *U : Parallel51RFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
Function *F = CI->getFunction();
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
OnlyUsedByParallelKernel = false;
break;
}
if (OnlyUsedByParallelKernel) {
for (auto K : OMPInfoCache.Kernels) {
Attribute Attr = K->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
GlobalVariable *ExecMode =
M.getGlobalVariable((K->getName() + "_exec_mode").str());
assert(ExecMode && "kernel without exec mode");
assert(ExecMode->getInitializer() &&
"ExecMode doesn't have initializer!");
assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
"ExecMode is not an integer!");
const int8_t ExecModeVal =
cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
// Kernel is already in SPMD mode, skip.
if (ExecModeVal & OMP_TGT_EXEC_MODE_SPMD)
continue;
auto *NewExecModeC =
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD);
ExecMode->setInitializer(NewExecModeC);
OMPInformationCache::RuntimeFunctionInfo &TargetInitRFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
OMPInformationCache::RuntimeFunctionInfo &TargetDeInitRFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
CallInst *InitCI = nullptr;
CallInst *DeInitCI = nullptr;
for (User *U : TargetInitRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (CI->getFunction() == K) {
InitCI = CI;
break;
}
}
for (User *U : TargetDeInitRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (CI->getFunction() == K) {
DeInitCI = CI;
break;
}
}
assert(InitCI && DeInitCI && "kernel without init and deinit");
InitCI->setArgOperand(1, NewExecModeC);
InitCI->setArgOperand(2,
ConstantInt::getNullValue(Type::getInt1Ty(Ctx)));
DeInitCI->setArgOperand(1, NewExecModeC);
}
}
// Check the use of omp_get_thread_num and omp_get_num_threads.
OMPInformationCache::RuntimeFunctionInfo &ThreadNumRFI =
OMPInfoCache.RFIs[OMPRTL_omp_get_thread_num];
OMPInformationCache::RuntimeFunctionInfo &NumThreadsRFI =
OMPInfoCache.RFIs[OMPRTL_omp_get_num_threads];
auto CheckIfOnlyUsedByParallelKernel = [&](CallInst *CI) {
SmallVector<Kernel> ReachingKernels;
SmallPtrSet<Function *, 8> Visited;
collectReachingKernels(OMPInfoCache.Kernels, CI->getFunction(),
ReachingKernels, Visited);
return ReachingKernels.size() < 2;
};
SmallVector<CallInst *> WorkList;
for (User *U : ThreadNumRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (!CheckIfOnlyUsedByParallelKernel(CI))
continue;
WorkList.push_back(CI);
}
for (User *U : NumThreadsRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (!CheckIfOnlyUsedByParallelKernel(CI))
continue;
WorkList.push_back(CI);
}
for (CallInst *CI : WorkList) {
Function *Callee = nullptr;
if (CI->getCalledFunction() == ThreadNumRFI.Declaration)
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_thread_num].Declaration;
if (CI->getCalledFunction() == NumThreadsRFI.Declaration)
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_num_threads].Declaration;
assert(Callee && "unknown callee");
auto &Builder = OMPInfoCache.OMPBuilder.Builder;
Builder.SetInsertPoint(CI);
Value *C = Builder.CreateCall(Callee);
CI->replaceAllUsesWith(C);
CI->eraseFromParent();
}
return Changed;
}
/// Merge parallel regions when it is safe.
bool mergeParallelRegions() {
const unsigned CallbackCalleeOperand = 2;
@@ -3062,7 +3400,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
if (EDAA.getState().isValidState()) {
const auto &CalleeED = EDAA.getFunctionExecutionDomain();
ED.IsReachedFromAlignedBarrierOnly =
CalleeED.IsReachedFromAlignedBarrierOnly;
AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
ED.EncounteredNonLocalSideEffect |=
@@ -3553,7 +3891,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
[&](const IRPosition &IRP, const AbstractAttribute *AA,
bool &UsedAssumedInformation) -> std::optional<Value *> {
return nullptr;
};
Attributor::SimplifictionCallbackTy ModeSimplifyCB =

View File

@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMTransformUtils
CallGraphUpdater.cpp
CanonicalizeAliases.cpp
CanonicalizeFreezeInLoops.cpp
CanonicalizeMainFunction.cpp
CloneFunction.cpp
CloneModule.cpp
CodeExtractor.cpp

View File

@@ -0,0 +1,103 @@
//===-------------- CanonicalizeMainFunction.cpp ----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility pass to canonicalize the main function.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "canonicalize-main-function"
static cl::opt<std::string>
MainFunctionName("canonical-main-function-name",
cl::desc("New main function name"),
cl::value_desc("main function name"));
bool rewriteMainFunction(Function &F) {
if (F.arg_size() == 2 && F.getReturnType()->isIntegerTy(32))
return false;
auto &Ctx = F.getContext();
auto &DL = F.getParent()->getDataLayout();
auto *Int32Ty = IntegerType::getInt32Ty(Ctx);
auto *PtrTy = PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace());
FunctionType *NewFnTy =
FunctionType::get(Int32Ty, {Int32Ty, PtrTy}, /* isVarArg */ false);
Function *NewFn =
Function::Create(NewFnTy, F.getLinkage(), F.getAddressSpace(), "");
F.getParent()->getFunctionList().insert(F.getIterator(), NewFn);
NewFn->takeName(&F);
NewFn->copyAttributesFrom(&F);
NewFn->setSubprogram(F.getSubprogram());
F.setSubprogram(nullptr);
NewFn->splice(NewFn->begin(), &F);
if (!F.getReturnType()->isIntegerTy(32)) {
SmallVector<ReturnInst *> WorkList;
for (BasicBlock &BB : *NewFn)
for (Instruction &I : BB) {
auto *RI = dyn_cast<ReturnInst>(&I);
if (!RI)
continue;
assert(RI->getReturnValue() == nullptr &&
"a void main function must not return a value");
WorkList.push_back(RI);
}
for (auto *RI : WorkList) {
(void)ReturnInst::Create(Ctx, ConstantInt::getNullValue(Int32Ty), RI);
RI->eraseFromParent();
}
}
if (F.arg_size() == NewFn->arg_size())
for (unsigned I = 0; I < NewFn->arg_size(); ++I) {
Argument *OldArg = F.getArg(I);
Argument *NewArg = NewFn->getArg(I);
NewArg->takeName(OldArg);
OldArg->replaceAllUsesWith(NewArg);
}
return true;
}
PreservedAnalyses CanonicalizeMainFunctionPass::run(Module &M,
ModuleAnalysisManager &AM) {
Function *MainFunc = nullptr;
for (Function &F : M)
if (F.getName() == "main") {
assert(MainFunc == nullptr && "more than one main function");
MainFunc = &F;
}
if (MainFunc == nullptr)
return PreservedAnalyses::all();
bool Changed = false;
if (!MainFunctionName.empty() && MainFunc->getName() != MainFunctionName) {
MainFunc->setName(MainFunctionName);
Changed = true;
}
if (rewriteMainFunction(*MainFunc)) {
MainFunc->eraseFromParent();
Changed = true;
}
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

View File

@@ -87,6 +87,7 @@ set(include_files
${include_directory}/Interface.h
${include_directory}/LibC.h
${include_directory}/Mapping.h
${include_directory}/Memory.h
${include_directory}/State.h
${include_directory}/Synchronization.h
${include_directory}/Types.h
@@ -96,6 +97,7 @@ set(include_files
set(src_files
${source_directory}/Configuration.cpp
${source_directory}/Debug.cpp
${source_directory}/HostRPC.cpp
${source_directory}/Kernel.cpp
${source_directory}/LibC.cpp
${source_directory}/Mapping.cpp
@@ -109,6 +111,14 @@ set(src_files
${source_directory}/Workshare.cpp
)
if (LIBOMPTARGET_DEVICE_BUILTIN_ALLOCATOR)
list(APPEND src_files ${source_directory}/BuiltinAllocator.cpp)
elseif (LIBOMPTARGET_GENERIC_ALLOCATOR)
list(APPEND src_files ${source_directory}/GenericAllocator.cpp)
else()
list(APPEND src_files ${source_directory}/TeamAllocator.cpp)
endif()
set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512)
set(link_opt_flags -O3 -openmp-opt-disable -attributor-enable=module)
set(link_export_flag -passes=internalize -internalize-public-api-file=${source_directory}/exports)
@@ -123,11 +133,20 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
-nocudalib -nogpulib -nostdinc
-fopenmp -fopenmp-cuda-mode
-Wno-unknown-cuda-version
-DOMPTARGET_DEVICE_RUNTIME
-I${include_directory}
-I${devicertl_base_directory}/../include
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
)
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_DEBUG)
list(APPEND bc_flags "-DHOSTRPC_DEBUG")
endif()
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_PROFILING)
list(APPEND bc_flags "-DHOSTRPC_PROFILING")
endif()
# first create an object target
add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
function(compileDeviceRTLLibrary target_cpu target_name target_triple)

View File

@@ -32,6 +32,12 @@ void __assert_fail(const char *assertion, const char *file, unsigned line,
__assert_assume(expr); \
}
#define assert(expr) \
{ \
if (!(expr)) \
__assert_fail(#expr, __FILE__, __LINE__, __PRETTY_FUNCTION__); \
}
///}
#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);

View File

@@ -355,6 +355,22 @@ int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal);
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
///}
/// Host RPC
///
///{
void *__kmpc_host_rpc_get_desc(int32_t CallNo, int32_t NumArgs, void *ArgInfo);
void __kmpc_host_rpc_add_arg(void *Desc, int64_t Arg, int32_t ArgNum);
int64_t __kmpc_host_rpc_send_and_wait(void *Desc);
///}
/// Launch parallel kernel
///
///{
void __kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
int32_t if_expr, int32_t num_threads,
void **args, int64_t nargs);
///}
}
#endif

View File

@@ -14,11 +14,115 @@
#include "Types.h"
struct FILE;
extern FILE *stdin;
extern FILE *stdout;
extern FILE *stderr;
#ifndef _ASM_GENERIC_ERRNO_BASE_H
#define _ASM_GENERIC_ERRNO_BASE_H
#define EPERM 1 /* Operation not permitted */
#define ENOENT 2 /* No such file or directory */
#define ESRCH 3 /* No such process */
#define EINTR 4 /* Interrupted system call */
#define EIO 5 /* I/O error */
#define ENXIO 6 /* No such device or address */
#define E2BIG 7 /* Argument list too long */
#define ENOEXEC 8 /* Exec format error */
#define EBADF 9 /* Bad file number */
#define ECHILD 10 /* No child processes */
#define EAGAIN 11 /* Try again */
#define ENOMEM 12 /* Out of memory */
#define EACCES 13 /* Permission denied */
#define EFAULT 14 /* Bad address */
#define ENOTBLK 15 /* Block device required */
#define EBUSY 16 /* Device or resource busy */
#define EEXIST 17 /* File exists */
#define EXDEV 18 /* Cross-device link */
#define ENODEV 19 /* No such device */
#define ENOTDIR 20 /* Not a directory */
#define EISDIR 21 /* Is a directory */
#define EINVAL 22 /* Invalid argument */
#define ENFILE 23 /* File table overflow */
#define EMFILE 24 /* Too many open files */
#define ENOTTY 25 /* Not a typewriter */
#define ETXTBSY 26 /* Text file busy */
#define EFBIG 27 /* File too large */
#define ENOSPC 28 /* No space left on device */
#define ESPIPE 29 /* Illegal seek */
#define EROFS 30 /* Read-only file system */
#define EMLINK 31 /* Too many links */
#define EPIPE 32 /* Broken pipe */
#define EDOM 33 /* Math argument out of domain of func */
#define ERANGE 34 /* Math result not representable */
#endif
#define errno (*__errno_location())
extern "C" {
int memcmp(const void *lhs, const void *rhs, size_t count);
int printf(const char *format, ...);
long strtol(const char *str, char **str_end, int base);
int strcmp(const char *lhs, const char *rhs);
void *calloc(size_t num, size_t size);
int strcasecmp(const char *string1, const char *string2);
void exit(int exit_code);
size_t strlen(const char *str);
int atoi(const char *str);
char *strcpy(char *dest, const char *src);
int stat(const char *path, struct stat *buf);
int *__errno_location();
char *strcat(char *dest, const char *src);
void perror(const char *s);
int strncmp(const char *lhs, const char *rhs, size_t count);
char *strncpy(char *dest, const char *src, size_t count);
char *strchr(const char *str, int ch);
char *strtok(char *str, const char *delim);
const unsigned short **__ctype_b_loc(void);
void *realloc(void *ptr, size_t new_size);
void qsort(void *const pbase, size_t total_elems, size_t size,
int (*comp)(const void *, const void *));
int gettimeofday(struct timeval *tv, struct timezone *tz);
char *__xpg_basename(const char *path);
void srand(unsigned seed);
int rand();
int abs(int n);
void *memcpy(void *dest, const void *src, size_t count);
double atof(const char *str);
double strtod(const char *str, char **ptr);
}
#endif

View File

@@ -0,0 +1,39 @@
//===--- Memory.h - OpenMP device runtime memory allocator ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares the interface of the device runtime memory allocator.
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_MEMORY_H
#define OMPTARGET_MEMORY_H
#include "Types.h"
extern "C" {
__attribute__((leaf)) void *malloc(size_t Size);
__attribute__((leaf)) void free(void *Ptr);
}
namespace ompx {
namespace memory {
struct MemoryAllocationInfo {
void *BasePtr = nullptr;
size_t Size = 0;
bool isValid() const { return BasePtr; }
};
/// Get the memory allocation information if pointer \p P is in the range of
/// one of the buffers allocated by \p malloc.
MemoryAllocationInfo getMemoryAllocationInfo(void *P);
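/// A sketch of the intended use (illustrative only; transferToHost is a
/// hypothetical consumer):
///   MemoryAllocationInfo MAI = getMemoryAllocationInfo(Ptr);
///   if (MAI.isValid())
///     transferToHost(MAI.BasePtr, MAI.Size);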
} // namespace memory
} // namespace ompx
#endif

View File

@@ -29,6 +29,9 @@ enum OrderingTy {
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering);
/// Atomically add \p V to \p *Addr with system scope (hence the Sys suffix).
int32_t addSys(int32_t *Addr, int32_t V);
/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
/// result is stored in \p *Addr;
/// {
@@ -128,6 +131,60 @@ void system(atomic::OrderingTy Ordering);
} // namespace fence
namespace atomic {
/// Atomically load \p Addr with \p Ordering semantics.
uint32_t load(uint32_t *Addr, int Ordering);
/// Atomically load \p Addr with \p Ordering semantics.
uint64_t load(uint64_t *Addr, int Ordering);
/// Atomically store \p V to \p Addr with \p Ordering semantics.
void store(uint32_t *Addr, uint32_t V, int Ordering);
void store(uint64_t *Addr, uint64_t V, int Ordering);
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
uint32_t add(uint32_t *Addr, uint32_t V, int Ordering);
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);
} // namespace atomic
namespace mutex {
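/// A simple ticket lock: each waiter atomically draws the next ticket and
/// spins until NowServing reaches it, which yields FIFO acquisition order.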
class TicketLock {
uint64_t NowServing = 0;
uint64_t NextTicket = 0;
public:
TicketLock() = default;
TicketLock(const TicketLock &) = delete;
TicketLock(TicketLock &&) = delete;
void lock();
void unlock();
};
template <typename T> class LockGuard {
T &Lock;
public:
explicit LockGuard(T &L) : Lock(L) { Lock.lock(); }
~LockGuard() { Lock.unlock(); }
};
} // namespace mutex
} // namespace ompx
#endif

View File

@@ -36,6 +36,7 @@ using size_t = decltype(sizeof(char));
// TODO: Properly implement this
using intptr_t = int64_t;
using uintptr_t = uint64_t;
using ptrdiff_t = intptr_t;
static_assert(sizeof(int8_t) == 1, "type size mismatch");
static_assert(sizeof(uint8_t) == 1, "type size mismatch");
@@ -83,6 +84,11 @@ enum kmp_sched_t {
kmp_sched_distr_static_nochunk = 92,
kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
// The following two are for direct GPU compilation, where there is no outer
// loop to distribute the workload among teams but we still need to do so.
kmp_sched_distr_parallel_static_noloop_chunk = 100,
kmp_sched_distr_parallel_static_noloop_nochunk = 101,
kmp_sched_default = kmp_sched_static_nochunk,
kmp_sched_unordered_first = kmp_sched_static_chunk,
kmp_sched_unordered_last = kmp_sched_auto,

View File

@@ -16,6 +16,9 @@
#pragma omp begin declare target device_type(nohost)
extern "C" double omp_get_wtime();
extern "C" int printf(const char *, ...);
namespace ompx {
namespace utils {
@@ -37,8 +40,8 @@ template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
}
/// Advance \p Ptr by \p Bytes bytes.
template <typename Ty1, typename Ty2> inline Ty1 advance(Ty1 Ptr, Ty2 Bytes) {
return reinterpret_cast<Ty1>(reinterpret_cast<char *>(Ptr) + Bytes);
}
/// Return the first bit set in \p V.
@@ -82,12 +85,39 @@ template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
return *((DstTy *)(&V));
}
template <typename Ty = char>
ptrdiff_t getPtrDiff(const void *End, const void *Begin) {
return reinterpret_cast<const Ty *>(End) -
reinterpret_cast<const Ty *>(Begin);
}
inline bool isInRange(void *Ptr, void *BasePtr, int64_t Offset) {
ptrdiff_t Diff = getPtrDiff(Ptr, BasePtr);
return Diff >= 0 && Diff < Offset;
}
inline intptr_t ptrtoint(void *Ptr) { return reinterpret_cast<intptr_t>(Ptr); }
template <typename T> T min(T a, T b) { return a < b ? a : b; }
/// A pointer variable that has by design an `undef` value. Use with care.
__attribute__((loader_uninitialized)) static void *const UndefPtr;
#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
class SimpleProfiler {
const char *HeadLine = nullptr;
double Start;
public:
SimpleProfiler(const char *HL) : HeadLine(HL), Start(omp_get_wtime()) {}
~SimpleProfiler() {
double End = omp_get_wtime();
printf("%s --> %lf s.\n", HeadLine, End - Start);
}
};
} // namespace utils
} // namespace ompx

View File

@@ -0,0 +1,40 @@
//===------- BuiltinAllocator.cpp - Builtin GPU memory allocator -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A trivial allocator that forwards to the target's builtin malloc and free.
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Memory.h"
#include "Utils.h"
using namespace ompx;
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) { return {}; }
void init() {}
} // namespace memory
} // namespace ompx
extern "C" {
void *realloc(void *ptr, size_t new_size) {
void *NewPtr = malloc(new_size);
if (!NewPtr)
return nullptr;
if (ptr) {
// The builtin allocator does not track allocation sizes, so we can only
// copy new_size bytes; this over-reads if the old buffer was smaller.
__builtin_memcpy(NewPtr, ptr, new_size);
free(ptr);
}
return NewPtr;
}
}
#pragma omp end declare target

View File

@@ -0,0 +1,17 @@
target_sources(omptarget.devicertl PRIVATE
Configuration.cpp
Debug.cpp
HostRPC.cpp
GenericAllocator.cpp
Kernel.cpp
LibC.cpp
Mapping.cpp
Misc.cpp
Parallelism.cpp
Reduction.cpp
State.cpp
Synchronization.cpp
Tasking.cpp
Utils.cpp
WarpAllocator.cpp
Workshare.cpp)

View File

@@ -0,0 +1,308 @@
//===------- GenericAllocator.cpp - Generic GPU memory allocator -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A generic free-list based GPU memory allocator backed by a preallocated heap.
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Utils.h"
using namespace ompx;
char *CONSTANT(omptarget_device_heap_buffer)
__attribute__((used, retain, weak, visibility("protected")));
size_t CONSTANT(omptarget_device_heap_size)
__attribute__((used, retain, weak, visibility("protected")));
namespace {
size_t HeapCurPos = 0;
mutex::TicketLock HeapLock;
mutex::TicketLock AllocationListLock;
mutex::TicketLock FreeListLock;
constexpr const size_t Alignment = 16;
struct SimpleLinkListNode {
template <typename T> friend struct SimpleLinkList;
protected:
SimpleLinkListNode *Prev = nullptr;
SimpleLinkListNode *Next = nullptr;
bool operator>(const SimpleLinkListNode &RHS) const {
const auto This = reinterpret_cast<int64_t>(this);
const auto RHSThis = reinterpret_cast<int64_t>(&RHS);
return This > RHSThis;
}
SimpleLinkListNode() = default;
SimpleLinkListNode(const SimpleLinkListNode &) = delete;
SimpleLinkListNode(SimpleLinkListNode &&) = delete;
};
struct AllocationMetadata final : SimpleLinkListNode {
public:
size_t getUserSize() const;
void *getUserAddr() const {
return const_cast<void *>(reinterpret_cast<const void *>(this + 1));
}
size_t getSize() const { return Size; }
void setSize(size_t V) { Size = V; }
bool isInRange(void *Ptr) {
return utils::isInRange(Ptr, this + 1, getUserSize());
}
static AllocationMetadata *getFromUserAddr(void *P) {
return getFromAddr(P) - 1;
}
static AllocationMetadata *getFromAddr(void *P) {
return reinterpret_cast<AllocationMetadata *>(P);
}
private:
size_t Size = 0;
int64_t Reserved = 0;
};
constexpr const size_t AllocationMetadataSize = sizeof(AllocationMetadata);
static_assert(AllocationMetadataSize == 32,
"expect the metadata size to be 32");
size_t AllocationMetadata::getUserSize() const {
return Size - AllocationMetadataSize;
}
template <typename T> struct SimpleLinkList {
struct iterator {
friend SimpleLinkList;
T *operator->() { return reinterpret_cast<T *>(Node); }
T &operator*() { return reinterpret_cast<T &>(*Node); }
iterator operator++() {
Node = Node->Next;
return *this;
}
bool operator==(const iterator &RHS) const { return Node == RHS.Node; }
bool operator!=(const iterator &RHS) const { return !(*this == RHS); }
iterator next() const {
iterator Itr;
Itr.Node = Node->Next;
return Itr;
}
private:
SimpleLinkListNode *Node = nullptr;
};
iterator begin() {
iterator Itr;
Itr.Node = Head.Next;
return Itr;
}
iterator end() { return {}; }
void insert(SimpleLinkListNode *Node) { insertImpl(&Head, Node); }
void remove(iterator Itr) {
SimpleLinkListNode *Node = Itr.Node;
remove(Node);
}
void remove(SimpleLinkListNode *Node) { removeImpl(Node); }
bool empty() const { return Head.Next == nullptr; }
private:
static void insertImpl(SimpleLinkListNode *Current,
SimpleLinkListNode *Node) {
SimpleLinkListNode *OldNext = Current->Next;
Node->Prev = Current;
Node->Next = OldNext;
if (OldNext)
OldNext->Prev = Node;
Current->Next = Node;
}
static void removeImpl(SimpleLinkListNode *Node) {
SimpleLinkListNode *Prev = Node->Prev;
SimpleLinkListNode *Next = Node->Next;
Prev->Next = Next;
if (Next)
Next->Prev = Prev;
Node->Prev = nullptr;
Node->Next = nullptr;
}
/// Check if the node is in the list.
bool checkExist(SimpleLinkListNode *Node) const {
SimpleLinkListNode *P = Head.Next;
while (P) {
if (P == Node)
return true;
P = P->Next;
}
return false;
}
static bool checkSanity(SimpleLinkListNode *Current,
SimpleLinkListNode *Next) {
return Current->Next == Next && (!Next || Next->Prev == Current);
}
static bool checkSanity(SimpleLinkListNode *Prev, SimpleLinkListNode *Current,
SimpleLinkListNode *Next) {
return Prev->Next == Current && Current->Next == Next &&
Current->Prev == Prev && (!Next || Next->Prev == Current);
}
static bool checkDangling(SimpleLinkListNode *Node) {
return Node->Next == nullptr && Node->Prev == nullptr;
}
SimpleLinkListNode Head;
};
SimpleLinkList<AllocationMetadata> AllocationList;
SimpleLinkList<AllocationMetadata> FreeList;
constexpr const int SplitRatio = 5;
#ifndef LIBOMPTARGET_MEMORY_PROFILING
struct MemoryProfiler {
MemoryProfiler(const char *S) {}
~MemoryProfiler() {}
};
#else
struct MemoryProfiler : utils::SimpleProfiler {
MemoryProfiler(const char *S) : utils::SimpleProfiler(S) {}
~MemoryProfiler() { utils::SimpleProfiler::~SimpleProfiler(); }
};
#endif
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
mutex::LockGuard ALG(AllocationListLock);
for (auto Itr = AllocationList.begin(); Itr != AllocationList.end(); ++Itr) {
AllocationMetadata *MD = &(*Itr);
if (MD->isInRange(P))
return {MD->getUserAddr(), MD->getUserSize()};
}
return {};
}
} // namespace memory
} // namespace ompx
extern "C" {
void *memset(void *dest, int ch, size_t count);
void *malloc(size_t Size) {
MemoryProfiler Profiler(__FUNCTION__);
Size = utils::align_up(Size + AllocationMetadataSize, Alignment);
AllocationMetadata *MD = nullptr;
{
mutex::LockGuard FLG(FreeListLock);
auto Itr = FreeList.begin();
for (; Itr != FreeList.end(); ++Itr) {
if (Itr->getSize() >= Size)
break;
}
bool Found = Itr != FreeList.end() && (Itr->getSize() / Size < SplitRatio);
if (Found) {
MD = &(*Itr);
FreeList.remove(Itr);
}
}
if (MD) {
mutex::LockGuard ALG(AllocationListLock);
AllocationList.insert(MD);
return MD->getUserAddr();
}
{
mutex::LockGuard LG(HeapLock);
if (Size + HeapCurPos < omptarget_device_heap_size) {
void *R = omptarget_device_heap_buffer + HeapCurPos;
(void)atomic::add(&HeapCurPos, Size, atomic::acq_rel);
MD = AllocationMetadata::getFromAddr(R);
}
}
if (MD) {
// We need to reset the metadata header in case of any dirty data.
memset(MD, 0, AllocationMetadataSize);
MD->setSize(Size);
mutex::LockGuard ALG(AllocationListLock);
AllocationList.insert(MD);
return MD->getUserAddr();
}
printf("out of heap memory! size=%lu, cur=%lu.\n", Size, HeapCurPos);
printf("%s:%d\n", __FILE__, __LINE__);
__builtin_trap();
}
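// Reuse-policy sketch (hypothetical sizes): with SplitRatio == 5, a request
// whose padded size is 96 bytes reuses the first free block of at least 96
// bytes only if that block is less than 5x too big (e.g. a 256-byte block,
// since 256 / 96 < 5). A 1024-byte free block fails the ratio test and the
// request falls through to the bump pointer instead, so tiny requests never
// pin a huge block without splitting it.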
void free(void *P) {
MemoryProfiler Profiler(__FUNCTION__);
if (!P)
return;
auto *MD = AllocationMetadata::getFromUserAddr(P);
{
mutex::LockGuard ALG(AllocationListLock);
AllocationList.remove(MD);
}
{
mutex::LockGuard FLG(FreeListLock);
FreeList.insert(MD);
}
}
void *realloc(void *ptr, size_t new_size) {
MemoryProfiler Profiler(__FUNCTION__);
void *NewPtr = malloc(new_size);
if (!NewPtr)
return nullptr;
// realloc(nullptr, n) must behave like malloc(n).
if (!ptr)
return NewPtr;
auto *OldMD = AllocationMetadata::getFromUserAddr(ptr);
assert(ptr == OldMD->getUserAddr());
__builtin_memcpy(NewPtr, ptr, utils::min(OldMD->getUserSize(), new_size));
free(ptr);
return NewPtr;
}
void __kmpc_target_init_allocator() {}
}
#pragma omp end declare target

View File

@@ -0,0 +1,420 @@
//===------- HostRPC.cpp - Implementation of host RPC ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "HostRPC.h"
#include "Debug.h"
#include "LibC.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#ifdef HOSTRPC_DEBUG
#define DEBUG_PREFIX "host-rpc-device"
#define DP(FMT, ...) \
{ printf("%s --> " FMT, DEBUG_PREFIX, __VA_ARGS__); }
#else
#define DP(FMT, ...)
#endif
using namespace ompx;
using namespace hostrpc;
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
Descriptor *omptarget_hostrpc_descriptor
__attribute__((used, retain, weak, visibility("protected")));
int32_t *omptarget_hostrpc_futex
__attribute__((used, retain, weak, visibility("protected")));
char *omptarget_hostrpc_memory_buffer
__attribute__((used, retain, weak, visibility("protected")));
size_t omptarget_hostrpc_memory_buffer_size
__attribute__((used, retain, weak, visibility("protected")));
#ifdef HOSTRPC_PROFILING
int32_t HostRPCId;
double GetDescStart;
double GetDescEnd;
double AddArgStart;
double AddArgEnd;
double IssueAndWaitStart;
double IssueAndWaitEnd;
double CopyBackStart;
double CopyBackEnd;
#endif
namespace {
size_t HostRPCMemoryBufferCurrentPosition = 0;
constexpr const size_t Alignment = 16;
// FIXME: For now we only allow one thread requesting host RPC.
mutex::TicketLock HostRPCLock;
void *HostRPCMemAlloc(size_t Size) {
Size = utils::align_up(Size, Alignment);
if (Size + HostRPCMemoryBufferCurrentPosition <
omptarget_hostrpc_memory_buffer_size) {
void *R =
omptarget_hostrpc_memory_buffer + HostRPCMemoryBufferCurrentPosition;
atomic::add(&HostRPCMemoryBufferCurrentPosition, Size, atomic::acq_rel);
return R;
}
printf("%s:%d\n", __FILE__, __LINE__);
__builtin_trap();
return nullptr;
}
// For now we just reset the buffer.
void HostRPCMemReset() { HostRPCMemoryBufferCurrentPosition = 0; }
static_assert(sizeof(intptr_t) == sizeof(int64_t), "pointer size mismatch");
struct HostRPCArgInfo {
void *BasePtr;
int64_t Type;
int64_t Size;
HostRPCArgInfo *Next;
};
struct HostRPCPointerMapEntry {
void *BasePtr;
void *MappedBasePtr;
int64_t Size;
int64_t Kind;
};
void *getMappedPointer(Descriptor *D, void *BasePtr, int64_t Size,
int64_t Offset, int64_t Kind) {
assert(D->ArgMap && "ArgMap should not be nullptr");
HostRPCPointerMapEntry *MapTable =
reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
int I = 0;
for (; I < D->NumArgs && MapTable[I].BasePtr; ++I)
if (MapTable[I].BasePtr == BasePtr)
return utils::advance(MapTable[I].MappedBasePtr, Offset);
MapTable[I].BasePtr = BasePtr;
MapTable[I].MappedBasePtr = HostRPCMemAlloc(Size);
MapTable[I].Size = Size;
MapTable[I].Kind = Kind;
if (Kind & ArgType::OMP_HOST_RPC_ARG_COPY_TO) {
__builtin_memcpy(MapTable[I].MappedBasePtr, BasePtr, Size);
DP("getMappedPointer: copy %ld bytes memory from %p to %p.\n", Size,
BasePtr, MapTable[I].MappedBasePtr);
}
return utils::advance(MapTable[I].MappedBasePtr, Offset);
}
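// Mapping sketch: the first time a base pointer is seen it gets a staging
// buffer in host RPC memory (populated for COPY_TO kinds); later lookups of
// the same base return the same staging buffer plus the offset, so two
// arguments pointing into one buffer map consistently:
//
//   void *A = getMappedPointer(D, Base, 64, 0, Kind);  // allocates + copies
//   void *B = getMappedPointer(D, Base, 64, 16, Kind); // B == (char *)A + 16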
void copybackIfNeeded(Descriptor *D) {
if (!D->ArgMap)
return;
auto *MapTable = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
for (int I = 0; I < D->NumArgs && MapTable[I].BasePtr; ++I)
if (MapTable[I].Kind & ArgType::OMP_HOST_RPC_ARG_COPY_FROM) {
__builtin_memcpy(MapTable[I].BasePtr, MapTable[I].MappedBasePtr,
MapTable[I].Size);
DP("copybackIfNeeded: copy %ld bytes memory from %p to %p.\n",
MapTable[I].Size, MapTable[I].MappedBasePtr, MapTable[I].BasePtr);
}
}
} // namespace
extern "C" {
__attribute__((noinline, used)) void *
__kmpc_host_rpc_get_desc(int32_t CallId, int32_t NumArgs, void *ArgInfo) {
assert(omptarget_hostrpc_descriptor && omptarget_hostrpc_futex &&
"no host rpc pointer");
DP("device: stdin=%p, stdout=%p, stderr=%p\n", stdin, stdout, stderr);
DP("get desc for request (id=%d), NumArgs=%d, ArgInfo=%p.\n", CallId, NumArgs,
ArgInfo);
#ifdef HOSTRPC_DEBUG
{
void **AIs = reinterpret_cast<void **>(ArgInfo);
for (int I = 0; I < NumArgs; ++I)
DP("ArgInfo[%d]=%p.\n", I, AIs[I]);
}
#endif
HostRPCLock.lock();
#ifdef HOSTRPC_PROFILING
HostRPCId = CallId;
GetDescStart = omp_get_wtime();
#endif
// TODO: change it after we support a queue-like data structure.
Descriptor *D = omptarget_hostrpc_descriptor;
D->Id = CallId;
D->ArgInfo = reinterpret_cast<void **>(ArgInfo);
D->NumArgs = NumArgs;
D->Status = EXEC_STAT_CREATED;
D->ReturnValue = 0;
D->Args =
reinterpret_cast<Argument *>(HostRPCMemAlloc(sizeof(Argument) * NumArgs));
D->ArgMap = HostRPCMemAlloc(sizeof(HostRPCPointerMapEntry) * NumArgs);
assert((!NumArgs || (D->Args && D->ArgMap)) && "out of host rpc memory!");
// Reset the map table.
auto *ArgMap = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
for (int I = 0; I < NumArgs; ++I)
ArgMap[I].BasePtr = nullptr;
#ifdef HOSTRPC_PROFILING
GetDescEnd = omp_get_wtime();
AddArgStart = omp_get_wtime();
#endif
return D;
}
__attribute__((noinline, used)) void
__kmpc_host_rpc_add_arg(void *Desc, int64_t ArgVal, int32_t ArgNum) {
auto *D = reinterpret_cast<Descriptor *>(Desc);
assert(ArgNum < D->NumArgs && "out-of-range arguments");
Argument &ArgInDesc = D->Args[ArgNum];
DP("add arg (no=%d), arg=%lx to request (id=%d).\n", ArgNum, ArgVal, D->Id);
// This early exit handles both a null pointer and a zero scalar: a zero
// value is encoded the same way whether it is a pointer or a scalar.
if (ArgVal == 0) {
ArgInDesc.Value = 0;
ArgInDesc.ArgType = Type::ARG_LITERAL;
DP("arg (no=%d) is null, done.\n", ArgNum);
return;
}
void *ArgPtr = reinterpret_cast<void *>(ArgVal);
if (ArgPtr == stdin || ArgPtr == stdout || ArgPtr == stderr) {
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is stdin/stdout/stderr, done.\n", ArgNum);
return;
}
const auto *AI = reinterpret_cast<HostRPCArgInfo *>(D->ArgInfo[ArgNum]);
DP("try to find arg (no=%d) from args AI=%p\n", ArgNum, AI);
if (AI) {
// Let's first check if Arg is a scalar.
if (AI->Type == ArgType::OMP_HOST_RPC_ARG_SCALAR) {
assert(AI->BasePtr == ArgPtr && "invalid scalar argument info");
assert(AI->Next == nullptr && "invalid scalar argument info");
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_LITERAL;
DP("arg (no=%d) is scalar, done.\n", ArgNum);
return;
}
// Then let's see if it is a literal pointer that we don't need to copy.
auto *P = AI;
while (P) {
if (P->Type == ArgType::OMP_HOST_RPC_ARG_PTR && P->BasePtr == ArgPtr) {
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is literal pointer, done.\n", ArgNum);
return;
}
P = P->Next;
}
// Next we check if it is within the range of any buffer described in
// argument info.
P = AI;
while (P) {
if ((P->Type & ArgType::OMP_HOST_RPC_ARG_PTR) && P->Size) {
if (utils::isInRange(ArgPtr, P->BasePtr, P->Size)) {
auto Size = P->Size;
auto Offset = utils::getPtrDiff(ArgPtr, P->BasePtr);
ArgInDesc.Value = utils::ptrtoint(
getMappedPointer(D, P->BasePtr, Size, Offset, P->Type));
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("found a match for arg (no=%d). done.\n", ArgNum);
return;
}
}
P = P->Next;
}
}
// If we cannot find a match in the argument info, we assume the pointer
// comes from a dynamic allocation.
memory::MemoryAllocationInfo MAI = memory::getMemoryAllocationInfo(ArgPtr);
if (MAI.isValid()) {
auto Size = MAI.Size;
auto Offset = utils::getPtrDiff(ArgPtr, MAI.BasePtr);
ArgInDesc.Value = utils::ptrtoint(
getMappedPointer(D, MAI.BasePtr, Size, Offset,
/* Kind */ ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM));
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is from malloc. done.\n", ArgNum);
return;
}
printf("request (id=%d) arg (no=%d, val=%p) is unknown. send it to host "
"directly.\n",
D->Id, ArgNum, ArgPtr);
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
}
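// Lowering sketch (hypothetical call id and arg info): the compiler is
// expected to rewrite a device-side call such as fprintf(F, "%d", X) into
// the three-step protocol below; __kmpc_launch_parallel_51_kernel further
// down follows exactly this pattern by hand.
//
//   void *D = __kmpc_host_rpc_get_desc(CALLID_fprintf, 3, ArgInfo);
//   __kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(F), 0);
//   __kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(Fmt), 1);
//   __kmpc_host_rpc_add_arg(D, X, 2);
//   int64_t Ret = __kmpc_host_rpc_send_and_wait(D);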
__attribute__((noinline, used)) int64_t
__kmpc_host_rpc_send_and_wait(void *Desc) {
auto *D = reinterpret_cast<Descriptor *>(Desc);
int32_t Id = D->Id;
#ifdef HOSTRPC_PROFILING
AddArgEnd = omp_get_wtime();
IssueAndWaitStart = omp_get_wtime();
#endif
atomic::add(omptarget_hostrpc_futex, 1U, atomic::acq_rel);
// A system fence is required to make sure futex on the host is also
// updated if USM is supported.
fence::system(atomic::seq_cst);
DP("sent request (id=%d) to host. waiting for finish.\n", Id);
unsigned NS = 8;
while (atomic::addSys(omptarget_hostrpc_futex, 0)) {
asm volatile("nanosleep.u32 %0;" : : "r"(NS));
// if (NS < 64)
// NS *= 2;
// fence::system(atomic::seq_cst);
}
#ifdef HOSTRPC_PROFILING
IssueAndWaitEnd = omp_get_wtime();
#endif
DP("finish waiting for request (id=%d).\n", Id);
int64_t Ret = D->ReturnValue;
assert((!D->NumArgs || D->ArgMap) && "arg map should not be nullptr");
#ifdef HOSTRPC_PROFILING
CopyBackStart = omp_get_wtime();
#endif
if (D->ArgMap) {
DP("copy memory back for request (id=%d).\n", Id);
copybackIfNeeded(D);
DP("finish copy memory back for request (id=%d).\n", Id);
}
#ifdef HOSTRPC_PROFILING
CopyBackEnd = omp_get_wtime();
#endif
HostRPCMemReset();
// We can unlock now since everything transient has been copied back.
// TODO: If we have a queue, we don't need this step.
HostRPCLock.unlock();
DP("request (id=%d) is done with return code=%lx.\n", Id, Ret);
#ifdef HOSTRPC_PROFILING
printf("[host-rpc-profiling-device] id=%d, init=%lf, add_arg=%lf, wait=%lf, "
"copy=%lf.\n",
HostRPCId, GetDescEnd - GetDescStart, AddArgEnd - AddArgStart,
IssueAndWaitEnd - IssueAndWaitStart, CopyBackEnd - CopyBackStart);
#endif
return Ret;
}
__attribute__((noinline, used)) void
__kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
int32_t if_expr, int32_t num_threads,
void **args, int64_t nargs) {
constexpr const int64_t NumArgs = 6;
HostRPCArgInfo ArgInfoArray[NumArgs];
void *ArgInfo[NumArgs];
for (unsigned I = 0; I < NumArgs; ++I) {
ArgInfoArray[I].BasePtr = 0;
ArgInfoArray[I].Type = ArgType::OMP_HOST_RPC_ARG_SCALAR;
ArgInfoArray[I].Size = 0;
ArgInfoArray[I].Next = nullptr;
ArgInfo[I] = &ArgInfoArray[I];
}
auto *D = (Descriptor *)__kmpc_host_rpc_get_desc(0, NumArgs, (void *)ArgInfo);
// Set up arg info struct.
ArgInfoArray[0].BasePtr = const_cast<char *>(name);
ArgInfoArray[0].Type = ArgType::OMP_HOST_RPC_ARG_COPY_TO;
ArgInfoArray[0].Size = strlen(name) + 1;
ArgInfoArray[1].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(gtid));
ArgInfoArray[2].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(if_expr));
ArgInfoArray[3].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(num_threads));
ArgInfoArray[5].BasePtr = reinterpret_cast<void *>(nargs);
// We need to treat args differently from the scalar arguments because the
// args buffer itself might live on the stack, so we stage a copy first.
ArgInfoArray[4].Size = sizeof(void *) * nargs;
void *Args = nullptr;
if (nargs) {
Args = HostRPCMemAlloc(ArgInfoArray[4].Size);
__builtin_memcpy(Args, args, ArgInfoArray[4].Size);
}
ArgInfoArray[4].BasePtr = Args;
D->Id = CALLID___kmpc_launch_parallel_51_kernel;
__kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(name), 0);
__kmpc_host_rpc_add_arg(D, gtid, 1);
__kmpc_host_rpc_add_arg(D, if_expr, 2);
__kmpc_host_rpc_add_arg(D, num_threads, 3);
__kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(Args), 4);
__kmpc_host_rpc_add_arg(D, nargs, 5);
(void)__kmpc_host_rpc_send_and_wait(D);
}
}
#pragma omp end declare target

View File

@@ -13,6 +13,7 @@
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"

View File

@@ -7,6 +7,12 @@
//===----------------------------------------------------------------------===//
#include "LibC.h"
#include "Debug.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Utils.h"
using namespace ompx;
#pragma omp begin declare target device_type(nohost)
@@ -32,8 +38,324 @@ int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
return -1;
}
} // namespace impl
#pragma omp end declare variant
// Dummy variable for stdio.
int StdInDummyVar;
int StdOutDummyVar;
int StdErrDummyVar;
struct FILE;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdin =
(FILE *)&StdInDummyVar;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdout =
(FILE *)&StdOutDummyVar;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stderr =
(FILE *)&StdErrDummyVar;
typedef int (*__compar_fn_t)(const void *, const void *);
typedef int (*__compar_d_fn_t)(const void *, const void *, void *);
namespace {
const int32_t ToLowerMapTable[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 'a', 'b',
'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 91, 92, 93, 94, 95, 96,
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 123, 124, 125, 126,
127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const int32_t *ToLowerPTable = ToLowerMapTable + 128;
#define X(x) (((x) / 256 | (x)*256) % 65536)
static const unsigned short BLocTable[] = {
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x320), X(0x220), X(0x220),
X(0x220), X(0x220), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x160),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8),
X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x8d5), X(0x8d5), X(0x8d5),
X(0x8d5), X(0x8d5), X(0x8d5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x200), 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
};
#undef X
static const unsigned short *BLocTablePTable = BLocTable + 128;
int ErrNo __attribute__((used, retain));
static unsigned long int RandomNext = 1;
#define SWAP(a, b, size) \
do { \
size_t __size = (size); \
char *__a = (a), *__b = (b); \
do { \
char __tmp = *__a; \
*__a++ = *__b; \
*__b++ = __tmp; \
} while (--__size > 0); \
} while (0)
/* Discontinue quicksort algorithm when partition gets below this size.
This particular magic number was chosen to work best on a Sun 4/260. */
#define MAX_THRESH 4
/* Stack node declarations used to store unfulfilled partition obligations. */
typedef struct {
char *lo;
char *hi;
} stack_node;
/* The next 4 #defines implement a very fast in-line stack abstraction. */
/* The stack needs log (total_elements) entries (we could even subtract
log(MAX_THRESH)). Since total_elements has type size_t, we get as
upper bound for log (total_elements):
bits per byte (CHAR_BIT) * sizeof(size_t). */
#define CHAR_BIT 8
#define STACK_SIZE (CHAR_BIT * sizeof(size_t))
#define PUSH(low, high) ((void)((top->lo = (low)), (top->hi = (high)), ++top))
#define POP(low, high) ((void)(--top, (low = top->lo), (high = top->hi)))
#define STACK_NOT_EMPTY (stack < top)
/* Order size using quicksort. This implementation incorporates
four optimizations discussed in Sedgewick:
1. Non-recursive, using an explicit stack of pointers that store the
next array partition to sort. To save time, this maximum amount
of space required to store an array of SIZE_MAX is allocated on the
stack. Assuming a 32-bit (64 bit) integer for size_t, this needs
only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes).
Pretty cheap, actually.
2. Chose the pivot element using a median-of-three decision tree.
This reduces the probability of selecting a bad pivot value and
eliminates certain extraneous comparisons.
3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
insertion sort to order the MAX_THRESH items within each partition.
This is a big win, since insertion sort is faster for small, mostly
sorted array segments.
4. The larger of the two sub-partitions is always pushed onto the
stack first, with the algorithm then concentrating on the
smaller partition. This *guarantees* no more than log (total_elems)
stack size is needed (actually O(1) in this case)! */
void _quicksort(void *const pbase, size_t total_elems, size_t size,
__compar_d_fn_t cmp, void *arg) {
char *base_ptr = (char *)pbase;
const size_t max_thresh = MAX_THRESH * size;
if (total_elems == 0)
/* Avoid lossage with unsigned arithmetic below. */
return;
if (total_elems > MAX_THRESH) {
char *lo = base_ptr;
char *hi = &lo[size * (total_elems - 1)];
stack_node stack[STACK_SIZE];
stack_node *top = stack;
PUSH(nullptr, nullptr);
while (STACK_NOT_EMPTY) {
char *left_ptr;
char *right_ptr;
/* Select median value from among LO, MID, and HI. Rearrange
LO and HI so the three values are sorted. This lowers the
probability of picking a pathological pivot value and
skips a comparison for both the LEFT_PTR and RIGHT_PTR in
the while loops. */
char *mid = lo + size * ((hi - lo) / size >> 1);
if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
SWAP(mid, lo, size);
if ((*cmp)((void *)hi, (void *)mid, arg) < 0)
SWAP(mid, hi, size);
else
goto jump_over;
if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
SWAP(mid, lo, size);
jump_over:;
left_ptr = lo + size;
right_ptr = hi - size;
/* Here's the famous ``collapse the walls'' section of quicksort.
Gotta like those tight inner loops! They are the main reason
that this algorithm runs much faster than others. */
do {
while ((*cmp)((void *)left_ptr, (void *)mid, arg) < 0)
left_ptr += size;
while ((*cmp)((void *)mid, (void *)right_ptr, arg) < 0)
right_ptr -= size;
if (left_ptr < right_ptr) {
SWAP(left_ptr, right_ptr, size);
if (mid == left_ptr)
mid = right_ptr;
else if (mid == right_ptr)
mid = left_ptr;
left_ptr += size;
right_ptr -= size;
} else if (left_ptr == right_ptr) {
left_ptr += size;
right_ptr -= size;
break;
}
} while (left_ptr <= right_ptr);
/* Set up pointers for next iteration. First determine whether
left and right partitions are below the threshold size. If so,
ignore one or both. Otherwise, push the larger partition's
bounds on the stack and continue sorting the smaller one. */
if ((size_t)(right_ptr - lo) <= max_thresh) {
if ((size_t)(hi - left_ptr) <= max_thresh)
/* Ignore both small partitions. */
POP(lo, hi);
else
/* Ignore small left partition. */
lo = left_ptr;
} else if ((size_t)(hi - left_ptr) <= max_thresh)
/* Ignore small right partition. */
hi = right_ptr;
else if ((right_ptr - lo) > (hi - left_ptr)) {
/* Push larger left partition indices. */
PUSH(lo, right_ptr);
lo = left_ptr;
} else {
/* Push larger right partition indices. */
PUSH(left_ptr, hi);
hi = right_ptr;
}
}
}
/* Once the BASE_PTR array is partially sorted by quicksort the rest
is completely sorted using insertion sort, since this is efficient
for partitions below MAX_THRESH size. BASE_PTR points to the beginning
of the array to sort, and END_PTR points at the very last element in
the array (*not* one beyond it!). */
#define min(x, y) ((x) < (y) ? (x) : (y))
{
char *const end_ptr = &base_ptr[size * (total_elems - 1)];
char *tmp_ptr = base_ptr;
char *thresh = min(end_ptr, base_ptr + max_thresh);
char *run_ptr;
/* Find smallest element in first threshold and place it at the
array's beginning. This is the smallest array element,
and the operation speeds up insertion sort's inner loop. */
for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
if ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
tmp_ptr = run_ptr;
if (tmp_ptr != base_ptr)
SWAP(tmp_ptr, base_ptr, size);
/* Insertion sort, running from left-hand-side up to right-hand-side. */
run_ptr = base_ptr + size;
while ((run_ptr += size) <= end_ptr) {
tmp_ptr = run_ptr - size;
while ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
tmp_ptr -= size;
tmp_ptr += size;
if (tmp_ptr != run_ptr) {
char *trav;
trav = run_ptr + size;
while (--trav >= run_ptr) {
char c = *trav;
char *hi, *lo;
for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
*hi = *lo;
*hi = c;
}
}
}
}
}
} // namespace
extern "C" {
int memcmp(const void *lhs, const void *rhs, size_t count) {
@@ -51,6 +373,415 @@ int memcmp(const void *lhs, const void *rhs, size_t count) {
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
return impl::omp_vprintf(Format, Arguments, Size);
}
// -----------------------------------------------------------------------------
#ifndef ULONG_MAX
#define ULONG_MAX ((unsigned long)(~0L)) /* 0xFFFFFFFF */
#endif
#ifndef LONG_MAX
#define LONG_MAX ((long)(ULONG_MAX >> 1)) /* 0x7FFFFFFF */
#endif
#ifndef LONG_MIN
#define LONG_MIN ((long)(~LONG_MAX)) /* 0x80000000 */
#endif
#ifndef ISDIGIT
#define ISDIGIT(C) ((C) >= '0' && (C) <= '9')
#endif
#ifndef ISSPACE
#define ISSPACE(C) ((C) == ' ')
#endif
#ifndef ISALPHA
#define ISALPHA(C) (((C) >= 'a' && (C) <= 'z') || ((C) >= 'A' && (C) <= 'Z'))
#endif
#ifndef ISUPPER
#define ISUPPER(C) ((C) >= 'A' && (C) <= 'Z')
#endif
long strtol(const char *nptr, char **endptr, int base) {
const char *s = nptr;
unsigned long acc;
int c;
unsigned long cutoff;
int neg = 0, any, cutlim;
/*
* Skip white space and pick up leading +/- sign if any.
* If base is 0, allow 0x for hex and 0 for octal, else
* assume decimal; if base is already 16, allow 0x.
*/
do {
c = *s++;
} while (ISSPACE(c));
if (c == '-') {
neg = 1;
c = *s++;
} else if (c == '+')
c = *s++;
if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X')) {
c = s[1];
s += 2;
base = 16;
}
if (base == 0)
base = c == '0' ? 8 : 10;
/*
* Compute the cutoff value between legal numbers and illegal
* numbers. That is the largest legal value, divided by the
* base. An input number that is greater than this value, if
* followed by a legal input character, is too big. One that
* is equal to this value may be valid or not; the limit
* between valid and invalid numbers is then based on the last
* digit. For instance, if the range for longs is
* [-2147483648..2147483647] and the input base is 10,
* cutoff will be set to 214748364 and cutlim to either
* 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
* a value > 214748364, or equal but the next digit is > 7 (or 8),
* the number is too big, and we will return a range error.
*
* Set any if any `digits' consumed; make it negative to indicate
* overflow.
*/
cutoff = neg ? -(unsigned long)LONG_MIN : LONG_MAX;
cutlim = cutoff % (unsigned long)base;
cutoff /= (unsigned long)base;
for (acc = 0, any = 0;; c = *s++) {
if (ISDIGIT(c))
c -= '0';
else if (ISALPHA(c))
c -= ISUPPER(c) ? 'A' - 10 : 'a' - 10;
else
break;
if (c >= base)
break;
if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
any = -1;
else {
any = 1;
acc *= base;
acc += c;
}
}
if (any < 0) {
acc = neg ? LONG_MIN : LONG_MAX;
errno = ERANGE;
} else if (neg)
acc = -acc;
if (endptr != 0)
*endptr = const_cast<char *>(any ? s - 1 : nptr);
return (acc);
}
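// Examples: strtol("42", nullptr, 10) == 42; strtol("0x1A", nullptr, 0) ==
// 26 (the 0x prefix selects base 16); strtol("077", nullptr, 0) == 63; on
// overflow the result saturates to LONG_MAX/LONG_MIN and errno is set to
// ERANGE.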
int strcmp(const char *lhs, const char *rhs) {
// Advance while both strings have equal, non-NUL characters, then return
// the difference at the first mismatch (0 when both end together).
while (*lhs != '\0' && *lhs == *rhs) {
++lhs;
++rhs;
}
return (unsigned char)*lhs - (unsigned char)*rhs;
}
void *calloc(size_t num, size_t size) {
// Note: no overflow check on num * size; callers are trusted here.
size_t bytes = num * size;
char *p = (char *)malloc(bytes);
if (!p)
return p;
char *q = p;
// Zero word-sized chunks first, then finish the tail byte by byte so we
// never write past the allocation when bytes is not a multiple of
// sizeof(int).
while (bytes - (size_t)(q - p) >= sizeof(int)) {
*(int *)q = 0;
q += sizeof(int);
}
while ((size_t)(q - p) < bytes) {
*q = 0;
q++;
}
return p;
}
void exit(int exit_code) { asm volatile("exit;"); }
size_t strlen(const char *str) {
// Count every character up to (but not including) the terminating NUL;
// leading spaces count too.
size_t r = 0;
while (*str != '\0') {
++r;
++str;
}
return r;
}
char *strcpy(char *dest, const char *src) {
char *pd = dest;
const char *ps = src;
while (*ps != '\0')
*(pd++) = *(ps++);
*pd = '\0';
return dest;
}
int *__errno_location() { return &ErrNo; }
char *strcat(char *dest, const char *src) {
char *pd = dest;
const char *ps = src;
while (*pd != '\0')
++pd;
while (*ps != '\0')
*(pd++) = *(ps++);
*pd = '\0';
return dest;
}
void perror(const char *s) { printf("%s", s); }
int strncmp(const char *lhs, const char *rhs, size_t count) {
size_t c = 0;
while (c < count && *lhs != '\0' && *rhs != '\0') {
if (*lhs != *rhs)
return (unsigned char)*lhs - (unsigned char)*rhs;
++lhs;
++rhs;
++c;
}
if (c == count)
return 0;
// One (or both) strings ended early; the difference of the terminators
// orders the shorter string first.
return (unsigned char)*lhs - (unsigned char)*rhs;
}
char *strncpy(char *dest, const char *src, size_t count) {
char *pd = dest;
const char *ps = src;
size_t c = 0;
while (*ps != '\0' && c < count) {
*(pd++) = *(ps++);
++c;
}
// Pad with NULs up to count, as the standard requires.
while (c < count) {
*(pd++) = '\0';
++c;
}
return dest;
}
char *strchr(const char *s, int c) {
do {
if (*s == c)
return const_cast<char *>(s);
} while (*s++);
return nullptr;
}
char *strtok(char *str, const char *delim) {
static char *s = nullptr;
char *tok;
if (str == nullptr) {
if (s == nullptr)
return nullptr;
} else
s = str;
for (size_t i = 0; (*s != '\0'); s++) {
for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
;
if (delim[i] == '\0')
break;
}
if (*s == '\0')
return s = nullptr;
tok = s++;
for (size_t i = 0; (*s != '\0'); s++) {
for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
;
if (delim[i] != '\0')
break;
}
if (*s != '\0') {
*s = '\0';
s++;
}
return tok;
}
void srand(unsigned seed) { RandomNext = seed; }
int rand() {
RandomNext = RandomNext * 1103515245 + 12345;
return static_cast<unsigned int>((RandomNext / 65536)) % 2147483647;
}
int abs(int n) { return n > 0 ? n : -n; }
void *memcpy(void *dest, const void *src, size_t count) {
__builtin_memcpy(dest, src, count);
return dest;
}
unsigned long strtoul(const char *str, char **str_end, int base) {
// Note: only base 10 is handled here; the base parameter is ignored and
// only leading spaces are skipped.
unsigned long res = 0;
while (*str == ' ')
++str;
while (*str >= '0' && *str <= '9') {
res = res * 10 + *str - '0';
++str;
}
if (str_end)
*str_end = const_cast<char *>(str);
return res;
}
// Conversion helper shared by strtod and atof. atof is defined in terms of
// strtod, so the conversion must not call back into atof (that would recurse
// forever); this helper handles sign, integer part, fraction, and decimal
// exponent. INF/NAN spellings are recognized by strtod only for end-pointer
// purposes and convert to 0.0 here.
static double __strtod_convert(const char *p) {
while (ISSPACE(*p))
++p;
int neg = 0;
if (*p == '+' || *p == '-')
neg = (*p++ == '-');
double value = 0.0;
while (ISDIGIT(*p))
value = value * 10.0 + (*p++ - '0');
if (*p == '.') {
++p;
double scale = 0.1;
while (ISDIGIT(*p)) {
value += (*p++ - '0') * scale;
scale *= 0.1;
}
}
if (*p == 'e' || *p == 'E') {
const char *q = p + 1;
int exp_neg = 0;
if (*q == '+' || *q == '-')
exp_neg = (*q++ == '-');
if (ISDIGIT(*q)) {
int exp = 0;
while (ISDIGIT(*q))
exp = exp * 10 + (*q++ - '0');
double factor = exp_neg ? 0.1 : 10.0;
while (exp--)
value *= factor;
}
}
return neg ? -value : value;
}
double atof(const char *s) { return strtod(s, nullptr); }
int atoi(const char *str) { return (int)strtol(str, nullptr, 10); }
double strtod(const char *str, char **ptr) {
char *p;
if (ptr == (char **)0)
return __strtod_convert(str);
p = const_cast<char *>(str);
while (ISSPACE(*p))
++p;
if (*p == '+' || *p == '-')
++p;
/* INF or INFINITY. */
if ((p[0] == 'i' || p[0] == 'I') && (p[1] == 'n' || p[1] == 'N') &&
(p[2] == 'f' || p[2] == 'F')) {
if ((p[3] == 'i' || p[3] == 'I') && (p[4] == 'n' || p[4] == 'N') &&
(p[5] == 'i' || p[5] == 'I') && (p[6] == 't' || p[6] == 'T') &&
(p[7] == 'y' || p[7] == 'Y')) {
*ptr = p + 8;
return __strtod_convert(str);
} else {
*ptr = p + 3;
return __strtod_convert(str);
}
}
/* NAN or NAN(foo). */
if ((p[0] == 'n' || p[0] == 'N') && (p[1] == 'a' || p[1] == 'A') &&
(p[2] == 'n' || p[2] == 'N')) {
p += 3;
if (*p == '(') {
++p;
while (*p != '\0' && *p != ')')
++p;
if (*p == ')')
++p;
}
*ptr = p;
return __strtod_convert(str);
}
/* digits, with 0 or 1 periods in it. */
if (ISDIGIT(*p) || *p == '.') {
int got_dot = 0;
while (ISDIGIT(*p) || (!got_dot && *p == '.')) {
if (*p == '.')
got_dot = 1;
++p;
}
/* Exponent. */
if (*p == 'e' || *p == 'E') {
int i;
i = 1;
if (p[i] == '+' || p[i] == '-')
++i;
if (ISDIGIT(p[i])) {
while (ISDIGIT(p[i]))
++i;
*ptr = p + i;
return __strtod_convert(str);
}
}
*ptr = p;
return __strtod_convert(str);
}
/* Didn't find any digits. Doesn't look like a number. */
*ptr = const_cast<char *>(str);
return 0.0;
}
void *memset(void *dest, int ch, size_t count) {
auto *P = reinterpret_cast<unsigned char *>(dest);
for (size_t I = 0; I < count; ++I, ++P)
*P = (unsigned char)ch;
return dest;
}
int tolower(int ch) {
if (ch >= 'A' && ch <= 'Z')
return ch + 32;
return ch;
}
char *strstr(const char *s1, const char *s2) {
const size_t len = strlen(s2);
while (*s1) {
// Use strncmp rather than memcmp so we never read past the end of s1.
if (!strncmp(s1, s2, len))
return const_cast<char *>(s1);
++s1;
}
return nullptr;
}
char *__xpg_basename(const char *path) { return const_cast<char *>(path); }
const unsigned short **__ctype_b_loc(void) { return &BLocTablePTable; }
const int32_t **__ctype_tolower_loc(void) { return &ToLowerPTable; }
void qsort(void *b, size_t n, size_t s, __compar_fn_t cmp) {
return _quicksort(b, n, s, (__compar_d_fn_t)cmp, nullptr);
}
int strcasecmp(const char *s1, const char *s2) {
const unsigned char *p1 = (const unsigned char *)s1;
const unsigned char *p2 = (const unsigned char *)s2;
int result;
if (p1 == p2)
return 0;
while ((result = tolower(*p1) - tolower(*p2++)) == 0)
if (*p1++ == '\0')
break;
return result;
}
}
#pragma omp end declare target

View File

@@ -212,9 +212,14 @@ uint32_t mapping::getThreadIdInBlock() {
uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
uint32_t mapping::getBlockSize(bool IsSPMD) {
uint32_t BlockSize =
mapping::getNumberOfProcessorElements() - (!IsSPMD * impl::getWarpSize());
return BlockSize;
uint32_t BlockSize = mapping::getNumberOfProcessorElements();
if (IsSPMD)
return BlockSize;
uint32_t WarpSize = impl::getWarpSize();
bool IsFullLastWarp = !(BlockSize % WarpSize);
if (OMP_LIKELY(IsFullLastWarp))
return BlockSize - WarpSize;
return BlockSize - BlockSize % WarpSize;
}
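// Example: with 1024 processor elements and a warp size of 32, generic mode
// reserves the last (full) warp for the main thread, giving 992. With 1000
// elements the last warp is partial (1000 % 32 == 8), so only those 8
// threads are reserved and the result is again 992.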
uint32_t mapping::getBlockSize() {
return mapping::getBlockSize(mapping::isSPMDMode());

View File

@@ -47,6 +47,9 @@ using namespace ompx;
namespace {
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
if (NumThreadsClause != -1)
return NumThreadsClause;
uint32_t NThreadsICV =
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
uint32_t NumThreads = mapping::getBlockSize();

View File

@@ -13,6 +13,7 @@
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
@@ -36,36 +37,6 @@ extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
namespace {
/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{
extern "C" {
__attribute__((leaf)) void *malloc(uint64_t Size);
__attribute__((leaf)) void free(void *Ptr);
}
///}
/// AMDGCN implementations of the shuffle sync idiom.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
extern "C" {
void *malloc(uint64_t Size) {
// TODO: Use some preallocated space for dynamic malloc.
return nullptr;
}
void free(void *Ptr) {}
}
#pragma omp end declare variant
///}
/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
@@ -261,7 +232,9 @@ void state::enterDataEnvironment(IdentTy *Ident) {
static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
uint32_t Bytes = sizeof(ThreadStates[0]) * mapping::getBlockSize();
uint32_t Bytes =
sizeof(ThreadStates[0]) *
(mapping::getBlockSize() + (mapping::isSPMDMode() ? 0 : 1));
void *ThreadStatesPtr =
memory::allocGlobal(Bytes, "Thread state array allocation");
if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
@@ -349,6 +322,15 @@ int omp_get_thread_num(void) {
return omp_get_ancestor_thread_num(omp_get_level());
}
int omp_get_bulk_thread_num() {
ASSERT(mapping::isSPMDMode());
int BId = mapping::getBlockId();
int BSize = mapping::getBlockSize(/* IsSPMD */ true);
int TId = mapping::getThreadIdInBlock();
int Id = BId * BSize + TId;
return returnValIfLevelIsActive(omp_get_level(), Id, 0);
}
int omp_get_team_size(int Level) {
return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
}
@@ -357,6 +339,11 @@ int omp_get_num_threads(void) {
return omp_get_level() > 1 ? 1 : state::ParallelTeamSize;
}
int omp_get_bulk_num_threads(void) {
ASSERT(mapping::isSPMDMode());
return mapping::getKernelSize();
}
int omp_get_thread_limit(void) { return mapping::getBlockSize(); }
int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

View File

@@ -115,6 +115,8 @@ uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
}
///}
int32_t atomicAddSys(int32_t *Address, int32_t Val);
// Forward declarations to be defined for AMDGCN and NVPTX.
uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering);
void namedBarrierInit();
@@ -313,6 +315,10 @@ uint32_t atomicInc(uint32_t *Address, uint32_t Val,
return __nvvm_atom_inc_gen_ui(Address, Val);
}
int32_t atomicAddSys(int32_t *Address, int32_t Val) {
return __nvvm_atom_sys_add_gen_i(Address, Val);
}
void namedBarrierInit() {}
void namedBarrier() {
@@ -484,10 +490,26 @@ uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
return impl::atomicInc(Addr, V, Ordering);
}
int32_t atomic::addSys(int32_t *Addr, int32_t Val) {
return impl::atomicAddSys(Addr, Val);
}
void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
void mutex::TicketLock::lock() {
uint64_t MyTicket = atomic::add(&NextTicket, 1, atomic::seq_cst);
while (atomic::load(&NowServing, atomic::aquire) != MyTicket)
;
fence::kernel(atomic::aquire);
}
void mutex::TicketLock::unlock() {
fence::kernel(atomic::release);
atomic::add(&NowServing, 1, atomic::seq_cst);
}
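// Usage sketch: tickets are served in FIFO order, so waiters cannot starve.
//
//   mutex::TicketLock L;
//   {
//     mutex::LockGuard LG(L); // lock() takes a ticket and spins until served
//     // ... critical section ...
//   }                         // unlock() advances NowServing by one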
extern "C" {
void __kmpc_ordered(IdentTy *Loc, int32_t TId) { FunctionTracingRAII(); }

View File

@@ -0,0 +1,304 @@
//===------- WarpAllocator.cpp - Warp memory allocator --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A warp-level allocator that statically partitions the device heap among
// the lanes of a warp and, in this variant, among team slots within each
// lane. Allocation bumps a per-partition top-of-stack on the fast path and
// falls back to a first-fit scan with block splitting when the partition's
// watermark is exhausted.
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
using namespace ompx;
char *CONSTANT(omptarget_device_heap_buffer)
__attribute__((used, retain, weak, visibility("protected")));
size_t CONSTANT(omptarget_device_heap_size)
__attribute__((used, retain, weak, visibility("protected")));
namespace {
constexpr const size_t Alignment = 16;
constexpr const size_t FirstThreadRatio = 40;
constexpr const size_t SplitThreshold = Alignment * 4;
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator;
class WarpAllocatorEntry {
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> friend struct WarpAllocator;
/// If Size is less than 0, the entry is allocated (in use).
int64_t Size = 0;
/// PrevSize is also supposed to be greater than or equal to 0. When it is 0,
/// it is the first entry of the buffer.
int64_t PrevSize = 0;
public:
bool isFirst() const { return !PrevSize; }
size_t getSize() const { return abs(Size); }
void setSize(size_t V) { Size = V; }
void setPrevSize(WarpAllocatorEntry *Prev) {
PrevSize = Prev ? Prev->getSize() : 0;
}
size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }
// Note: isUsed cannot simply be !isUnused (or vice versa) because when Size
// is 0 the entry is uninitialized and is neither used nor unused.
bool isUsed() const { return Size < 0; }
bool isUnused() const { return Size > 0; }
void setUsed() {
assert(isUnused() && "the entry is in use");
Size *= -1;
}
void setUnused() {
assert(isUsed() && "the entry is not in use");
Size *= -1;
}
char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }
WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }
static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }
static WarpAllocatorEntry *fromPtr(void *Ptr) {
return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
}
};
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
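// Encoding sketch: each 16-byte header immediately precedes its user data,
// the sign of Size marks the in-use state, and PrevSize == 0 marks the first
// entry, so the partition forms an implicit doubly linked list:
//
//   [Size=-96, Prev=0][80 user bytes][Size=48, Prev=96][32 user bytes]...
//    ^ first, in use                  ^ free; getPrev() steps back 96 bytes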
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
void init() {
if (mapping::isSPMDMode() &&
(mapping::getThreadIdInBlock() || mapping::getBlockId()))
return;
size_t HeapSize = omptarget_device_heap_size;
FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
size_t OtherThreadHeapSize =
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
size_t TeamHeapSize = FirstThreadHeapSize / TEAM_SIZE;
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
FirstTeamSize = TeamHeapSize;
char *LastLimit = omptarget_device_heap_buffer;
for (int I = 0; I < WARP_SIZE; ++I) {
for (int J = 0; J < TEAM_SIZE; ++J) {
Entries[I][J] = nullptr;
Limits[I][J] = LastLimit + TeamHeapSize * (J + 1);
}
LastLimit += I ? OtherThreadHeapSize : FirstThreadHeapSize;
Limits[I][TEAM_SIZE - 1] = LastLimit;
TeamHeapSize = OtherThreadHeapSize / TEAM_SIZE;
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
}
}
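// Partitioning sketch (hypothetical 64 MiB heap, WARP_SIZE == 32,
// FirstThreadRatio == 40): lane 0 receives ~25.6 MiB and each of the other
// 31 lanes receives ~1.24 MiB; every lane's region is further divided into
// TEAM_SIZE team slots, with all boundaries rounded down to the 16-byte
// Alignment.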
void *allocate(size_t Size) {
int32_t TeamSlot = getTeamSlot();
int32_t TIdInWarp = mapping::getThreadIdInWarp();
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
// Error out early if the requested size is larger than the entire block.
if (Size > getBlockSize(TIdInWarp, TeamSlot))
return nullptr;
WarpAllocatorEntry *E = nullptr;
{
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
auto *LastEntry = Entries[TIdInWarp][TeamSlot];
auto *NewWatermark = (LastEntry ? LastEntry->getEndPtr()
: getBlockBegin(TIdInWarp, TeamSlot)) +
Size;
if (NewWatermark >= Limits[TIdInWarp][TeamSlot]) {
E = findMemorySlow(Size, TIdInWarp, TeamSlot);
} else {
E = LastEntry ? LastEntry->getNext()
: WarpAllocatorEntry::fromPtr(
getBlockBegin(TIdInWarp, TeamSlot));
E->setSize(Size);
E->setPrevSize(LastEntry);
Entries[TIdInWarp][TeamSlot] = E;
}
if (!E)
return nullptr;
E->setUsed();
}
return E->getUserPtr();
}
void deallocate(void *Ptr) {
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
auto TeamSlot = getTeamSlot();
auto TIdInWarp = mapping::getThreadIdInWarp();
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
E->setUnused();
// Is last entry?
if (E == Entries[TIdInWarp][TeamSlot]) {
do {
E = E->getPrev();
} while (!E->isFirst() && !E->isUsed());
Entries[TIdInWarp][TeamSlot] = E;
}
}
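// Deallocation sketch: freeing only flips the sign bit; the top-of-stack
// (Entries[...]) retreats lazily, and only when the top-most entry itself is
// freed, stopping at the last in-use entry or at the first entry:
//
//   A = allocate(..); B = allocate(..);
//   deallocate(A);    // A marked unused, top stays at B
//   deallocate(B);    // top walks back over B toward the block start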
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
if (!utils::isInRange(P, omptarget_device_heap_buffer,
omptarget_device_heap_size))
return {};
auto TeamSlot = getTeamSlot();
auto TIdInWarp = mapping::getThreadIdInWarp();
for (int I = TIdInWarp; I < TIdInWarp + WARP_SIZE; ++I) {
int TId = I % WARP_SIZE;
for (int J = TeamSlot; J < TeamSlot + TEAM_SIZE; ++J) {
int SId = J % TEAM_SIZE;
if (P < getBlockBegin(TId, SId) || P >= getBlockEnd(TId, SId))
continue;
// Index with the wrapped ids so we stay inside the fixed-size arrays.
mutex::LockGuard LG(Locks[TId][SId]);
WarpAllocatorEntry *E = Entries[TId][SId];
if (!E)
return {};
if (E->getEndPtr() <= P)
return {};
do {
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
if (!E->isUsed())
return {};
return {E->getUserPtr(), E->getUserSize()};
}
E = E->getPrev();
} while (!E->isFirst());
}
}
return {};
}
private:
char *getBlockBegin(int32_t TIdInWarp, int32_t TeamSlot) const {
if (TeamSlot)
return Limits[TIdInWarp][TeamSlot - 1];
if (TIdInWarp)
return Limits[TIdInWarp - 1][TEAM_SIZE - 1];
return omptarget_device_heap_buffer;
}
char *getBlockEnd(int32_t TIdInWarp, int32_t TeamSlot) const {
return Limits[TIdInWarp][TeamSlot];
}
size_t getBlockSize(int32_t TIdInWarp, int32_t TeamSlot) const {
return getBlockEnd(TIdInWarp, TeamSlot) -
getBlockBegin(TIdInWarp, TeamSlot);
}
static int32_t getTeamSlot() { return mapping::getBlockId() % TEAM_SIZE; }
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp,
int32_t TeamSlot) {
char *Ptr = getBlockBegin(TIdInWarp, TeamSlot);
char *Limit = getBlockEnd(TIdInWarp, TeamSlot);
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
do {
if (!E->isUsed() && E->getSize() >= Size)
break;
E = E->getNext();
if (reinterpret_cast<char *>(E) + Size > Limit)
return nullptr;
} while (1);
size_t OldSize = E->getSize();
if (OldSize - Size >= SplitThreshold) {
auto *OldNext = E->getNext();
E->setSize(Size);
auto *LeftOverE = E->getNext();
LeftOverE->setPrevSize(E);
LeftOverE->setSize(OldSize - Size);
OldNext->setPrevSize(LeftOverE);
}
return E;
}
WarpAllocatorEntry *Entries[WARP_SIZE][TEAM_SIZE];
char *Limits[WARP_SIZE][TEAM_SIZE];
mutex::TicketLock Locks[WARP_SIZE][TEAM_SIZE];
size_t FirstThreadHeapSize;
size_t FirstTeamSize;
};
WarpAllocator<32, 16> Allocator;
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
return Allocator.getMemoryAllocationInfo(P);
}
} // namespace memory
} // namespace ompx
extern "C" {
void *malloc(size_t Size) {
if (!Size)
return nullptr;
void *P = Allocator.allocate(Size);
assert(P && "allocator out of memory");
assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
"misaligned address");
return P;
}
void free(void *P) {
if (!P)
return;
Allocator.deallocate(P);
}
void *realloc(void *ptr, size_t new_size) {
void *NewPtr = malloc(new_size);
if (!NewPtr)
return nullptr;
// realloc(nullptr, n) must behave like malloc(n).
if (!ptr)
return NewPtr;
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
__builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
free(ptr);
return NewPtr;
}
void __kmpc_target_init_allocator() { Allocator.init(); }
}
#pragma omp end declare target

View File

@@ -0,0 +1,275 @@
//===------- WarpAllocator.cpp - Warp memory allocator --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A variant of the warp allocator that partitions the device heap among the
// lanes of a warp only (no team dimension), with bump allocation on the fast
// path and a first-fit scan with block splitting on the slow path.
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
using namespace ompx;
char *CONSTANT(omptarget_device_heap_buffer)
__attribute__((used, retain, weak, visibility("protected")));
size_t CONSTANT(omptarget_device_heap_size)
__attribute__((used, retain, weak, visibility("protected")));
namespace {
constexpr const size_t Alignment = 16;
constexpr const size_t FirstThreadRatio = 40;
constexpr const size_t SplitThreshold = Alignment * 16;
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
template <int32_t WARP_SIZE> struct WarpAllocator;
class WarpAllocatorEntry {
template <int32_t WARP_SIZE> friend struct WarpAllocator;
/// If Size is less than 0, the entry is allocated (in use).
int64_t Size = 0;
/// PrevSize is also supposed to be greater than or equal to 0. When it is 0,
/// it is the first entry of the buffer.
int64_t PrevSize = 0;
public:
bool isFirst() const { return !PrevSize; }
size_t getSize() const { return abs(Size); }
void setSize(size_t V) { Size = V; }
void setPrevSize(WarpAllocatorEntry *Prev) {
PrevSize = Prev ? Prev->getSize() : 0;
}
size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }
// Note: isUsed cannot simply be !isUnused (or vice versa) because when Size
// is 0 the entry is uninitialized and is neither used nor unused.
bool isUsed() const { return Size < 0; }
bool isUnused() const { return Size > 0; }
void setUsed() {
assert(isUnused() && "the entry is in use");
Size *= -1;
}
void setUnused() {
assert(isUsed() && "the entry is not in use");
Size *= -1;
}
char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }
WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }
static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }
static WarpAllocatorEntry *fromPtr(void *Ptr) {
return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
}
};
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
template <int32_t WARP_SIZE> struct WarpAllocator {
void init() {
if (mapping::isSPMDMode() &&
(mapping::getThreadIdInBlock() || mapping::getBlockId()))
return;
size_t HeapSize = omptarget_device_heap_size;
size_t FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
size_t OtherThreadHeapSize =
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
for (int I = 0; I < WARP_SIZE; ++I) {
Entries[I] = nullptr;
size_t PrivateOffset = OtherThreadHeapSize * I + FirstThreadHeapSize;
Limits[I] = omptarget_device_heap_buffer + PrivateOffset;
}
}
void *allocate(size_t Size) {
int32_t TIdInWarp = mapping::getThreadIdInWarp();
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
// Error out early if the requested size is larger than the entire block.
if (Size > getBlockSize(TIdInWarp))
return nullptr;
WarpAllocatorEntry *E = nullptr;
{
mutex::LockGuard LG(Locks[TIdInWarp]);
auto *LastEntry = Entries[TIdInWarp];
auto *NewWatermark =
(LastEntry ? LastEntry->getEndPtr() : getBlockBegin(TIdInWarp)) +
Size;
if (NewWatermark >= Limits[TIdInWarp]) {
E = findMemorySlow(Size, TIdInWarp);
} else {
E = LastEntry ? LastEntry->getNext()
: WarpAllocatorEntry::fromPtr(getBlockBegin(TIdInWarp));
E->setSize(Size);
E->setPrevSize(LastEntry);
Entries[TIdInWarp] = E;
}
}
if (!E)
return nullptr;
assert(E->isUnused() && "entry is not set to use properly");
E->setUsed();
return E->getUserPtr();
}
void deallocate(void *Ptr) {
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
auto TIdInWarp = mapping::getThreadIdInWarp();
mutex::LockGuard LG(Locks[TIdInWarp]);
E->setUnused();
// Is last entry?
if (E == Entries[TIdInWarp]) {
do {
E = E->getPrev();
} while (!E->isFirst() && !E->isUsed());
Entries[TIdInWarp] = E;
}
}
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
if (!utils::isInRange(P, omptarget_device_heap_buffer,
omptarget_device_heap_size))
return {};
auto TIdInWarp = mapping::getThreadIdInWarp();
// Index with the wrapped lane id so we stay inside the WARP_SIZE arrays.
for (int I = TIdInWarp; I < TIdInWarp + WARP_SIZE; ++I) {
int TId = I % WARP_SIZE;
if (P < getBlockBegin(TId) || P >= getBlockEnd(TId))
continue;
mutex::LockGuard LG(Locks[TId]);
WarpAllocatorEntry *E = Entries[TId];
if (!E)
return {};
if (E->getEndPtr() <= P)
return {};
do {
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
if (!E->isUsed())
return {};
return {E->getUserPtr(), E->getUserSize()};
}
E = E->getPrev();
} while (!E->isFirst());
}
return {};
}
private:
char *getBlockBegin(int32_t TIdInWarp) const {
return TIdInWarp ? Limits[TIdInWarp - 1] : omptarget_device_heap_buffer;
}
char *getBlockEnd(int32_t TIdInWarp) const { return Limits[TIdInWarp]; }
size_t getBlockSize(int32_t TIdInWarp) const {
return getBlockEnd(TIdInWarp) - getBlockBegin(TIdInWarp);
}
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp) {
char *Ptr = getBlockBegin(TIdInWarp);
char *Limit = getBlockEnd(TIdInWarp);
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
do {
if (!E->isUsed() && E->getSize() >= Size)
break;
E = E->getNext();
if (reinterpret_cast<char *>(E) + Size > Limit)
return nullptr;
} while (1);
size_t OldSize = E->getSize();
if (OldSize - Size >= SplitThreshold) {
auto *OldNext = E->getNext();
E->setSize(Size);
auto *LeftOverE = E->getNext();
LeftOverE->setPrevSize(E);
LeftOverE->setSize(OldSize - Size);
OldNext->setPrevSize(LeftOverE);
}
return E;
}
WarpAllocatorEntry *Entries[WARP_SIZE];
char *Limits[WARP_SIZE];
mutex::TicketLock Locks[WARP_SIZE];
};
WarpAllocator<32> Allocator;
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
return Allocator.getMemoryAllocationInfo(P);
}
} // namespace memory
} // namespace ompx
extern "C" {
void *malloc(size_t Size) {
if (!Size)
return nullptr;
void *P = Allocator.allocate(Size);
assert(P && "allocator out of memory");
assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
"misaligned address");
return P;
}
void free(void *P) {
if (!P)
return;
Allocator.deallocate(P);
}
void *realloc(void *ptr, size_t new_size) {
void *NewPtr = malloc(new_size);
if (!NewPtr)
return nullptr;
// realloc(nullptr, n) must behave like malloc(n).
if (!ptr)
return NewPtr;
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
__builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
free(ptr);
return NewPtr;
}
void __kmpc_target_init_allocator() { Allocator.init(); }
}
#pragma omp end declare target

View File

@@ -111,6 +111,38 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
stride = loopSize; // make sure we only do 1 chunk per warp
}
static void ForStaticNoLoopNoChunk(int &last, T &lb, T &ub, ST &stride,
ST &chunk, T entityId,
T numberOfEntities) {
T loopSize = ub - lb + 1;
chunk = loopSize / numberOfEntities;
// We have more entities than iterations.
if (chunk == 0) {
chunk = 1;
lb = lb + entityId * chunk;
T inputUb = ub;
ub = lb + chunk - 1;
last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
return;
}
T leftOver = loopSize - chunk * numberOfEntities;
if (entityId < leftOver) {
chunk++;
lb = lb + entityId * chunk;
} else {
lb = lb + entityId * chunk + leftOver;
}
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
}
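// Worked example: 10 iterations over 4 entities gives chunk == 2 with a
// leftover of 2, so entities 0 and 1 take 3 iterations each ([0,2], [3,5])
// and entities 2 and 3 take 2 each ([6,7], [8,9]); only entity 3 sees
// last == 1.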
////////////////////////////////////////////////////////////////////////////////
// Support for Static Init
@@ -184,6 +216,19 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_parallel_static_noloop_chunk: {
ForStaticChunk(lastiter, lb, ub, stride, chunk,
numberOfActiveOMPThreads * omp_get_team_num() + gtid,
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_parallel_static_noloop_nochunk: {
ForStaticNoLoopNoChunk(lastiter, lb, ub, stride, chunk,
numberOfActiveOMPThreads * omp_get_team_num() +
gtid,
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
default: {
// ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,

View File

@@ -6,6 +6,62 @@ _ZN4ompx*
IsSPMDMode
memcmp
printf
__assert_fail
__ctype_b_loc
__ctype_tolower_loc
__xpg_basename
abs
atof
atoi
calloc
clock
clock_gettime
exit
fclose
feof
fflush
fgets
fopen
fprintf
fputs
fread
free
fscanf
fseek
ftell
fwrite
getc
gettimeofday
gmtime
malloc
memcpy
memset
pclose
perror
popen
qsort
rand
realloc
rewind
sprintf
srand
sscanf
stat
strcasecmp
strcat
strchr
strcmp
strcpy
strftime
strlen
strncmp
strncpy
strstr
strtod
strtok
strtol
strtoul
time
tolower

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=False)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=True)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,46 @@
//===------- Main.c - Direct compilation program start point ------ C -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <string.h>
extern int __user_main(int, char *[]);
extern void __kmpc_target_init_allocator(void);
#ifndef LOADER_THREAD_LIMIT
#ifdef SINGLE_THREAD_EXECUTION
#define LOADER_THREAD_LIMIT 1
#else
#define LOADER_THREAD_LIMIT 1024
#endif
#endif
#ifndef LOADER_NUM_TEAMS
#define LOADER_NUM_TEAMS 1
#endif
int main(int argc, char *argv[]) {
#pragma omp target enter data map(to: argv[:argc])
for (int I = 0; I < argc; ++I) {
#pragma omp target enter data map(to: argv[I][:strlen(argv[I]) + 1])
}
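// Note: map(to: argv[:argc]) above only transfers the array of pointers;
// each string's characters (including the terminating NUL, hence the +1)
// must be mapped separately, which is what the loop above does.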
int Ret = 0;
#pragma omp target enter data map(to: Ret)
#pragma omp target teams num_teams(1) thread_limit(1)
{ __kmpc_target_init_allocator(); }
#pragma omp target teams num_teams(LOADER_NUM_TEAMS) thread_limit(LOADER_THREAD_LIMIT)
{
Ret = __user_main(argc, argv);
}
#pragma omp target exit data map(from: Ret)
return Ret;
}

View File

@@ -0,0 +1,14 @@
//===------- UserWrapper.h - User code wrapper --------------- C++ --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
#define OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
#pragma omp begin declare target device_type(nohost)
#endif
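Because the driver force-includes this header (via the -include flag in the wrapper script below), every user translation unit is conceptually compiled as if wrapped like this (a sketch; where the region is closed is an assumption, presumably at the end of the translation unit):

#pragma omp begin declare target device_type(nohost)
// ... entire user source file: all functions are compiled as device-only,
// and no host fallback versions are emitted ...
#pragma omp end declare target // assumed to be implied at end of the TU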


@@ -0,0 +1,146 @@
import argparse
import os
import pathlib
import subprocess
import sys
import tempfile

cwd = os.path.dirname(os.path.realpath(__file__))

source_file_suffix = [".c", ".cpp", ".cu", ".hip", ".cc", ".cxx"]


def print_version():
    cp = subprocess.run(["clang", "--version"])
    if cp.returncode != 0:
        sys.exit(cp.returncode)


def compile_loader(loader_name, targets, macros, verbose, dry_run):
    cmd = [
        "clang",
        "-c",
        "-fopenmp",
        "-foffload-lto",
        "-fopenmp-offload-mandatory",
        "-o",
        loader_name,
    ]
    cmd += macros
    if targets:
        for arch in targets:
            cmd.append("--offload-arch={}".format(arch))
    else:
        cmd.append("--offload-arch=native")
    cmd.append(os.path.join(cwd, "Main.c"))
    if verbose:
        print(" ".join(cmd), file=sys.stderr)
    if dry_run:
        print(" ".join(cmd), file=sys.stderr)
        return
    cp = subprocess.run(cmd)
    if cp.returncode != 0:
        sys.exit(cp.returncode)


def invoke_clang(is_cpp, args, targets, verbose, dry_run):
    cmd = [
        "clang++" if is_cpp else "clang",
        "-fopenmp",
        "-foffload-lto",
        "-fopenmp-offload-mandatory",
        "-fopenmp-globalize-to-global-space",
        "-include",
        os.path.join(cwd, "UserWrapper.h"),
        "--save-temps",
        "-rdynamic",
        "-mllvm",
        "-enable-host-rpc",
        "-mllvm",
        "-openmp-opt-disable-state-machine-rewrite",
        "-mllvm",
        "-enable-canonicalize-main-function",
        "-mllvm",
        "-canonical-main-function-name=__user_main",
    ]
    if targets:
        for arch in targets:
            cmd.append("--offload-arch={}".format(arch))
    else:
        cmd.append("--offload-arch=native")
    cmd += args
    if verbose:
        print(" ".join(cmd))
    if dry_run:
        print(" ".join(cmd))
        return
    cp = subprocess.run(cmd)
    if cp.returncode != 0:
        sys.exit(cp.returncode)


def run(is_cpp=False):
    parser = argparse.ArgumentParser(
        prog="clang-gpu", description="clang LLVM GPU compiler"
    )
    parser.add_argument(
        "-c",
        action="store_true",
        help="Only run preprocess, compile, and assemble steps",
    )
    parser.add_argument(
        "-v", action="store_true", help="Show commands to run and use verbose output"
    )
    parser.add_argument(
        "--version", action="store_true", help="Print version information"
    )
    parser.add_argument(
        "-###",
        action="store_true",
        help="Print (but do not run) the commands to run for this compilation",
    )
    parser.add_argument(
        "--offload-arch",
        type=str,
        help=(
            "Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g."
            " sm_35). If 'native' is used the compiler will detect locally installed"
            " architectures. For HIP offloading, the device architecture can be"
            " followed by target ID features delimited by a colon (e.g."
            " gfx908:xnack+:sramecc-). May be specified more than once."
        ),
        nargs="*",
    )
    args, fwd_args = parser.parse_known_args()

    if args.version:
        print_version()
        return

    if args.c:
        fwd_args.append("-c")
    if args.v:
        fwd_args.append("-v")

    dry_run = vars(args)["###"]

    loader_name = None
    temp_files = []
    if not args.c:
        tf = tempfile.NamedTemporaryFile()
        loader_name = "{}.o".format(tf.name)
        # Forward any user-provided -D macros so the LOADER_* defaults in
        # Main.c can be overridden.
        macros = []
        for arg in fwd_args:
            if arg.startswith("-D"):
                macros.append(arg)
        compile_loader(loader_name, args.offload_arch, macros, args.v, dry_run)
        fwd_args.append(loader_name)
        temp_files.append(loader_name)

    invoke_clang(is_cpp, fwd_args, args.offload_arch, args.v, dry_run)

    for f in temp_files:
        if os.path.isfile(f):
            os.unlink(f)
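The -mllvm flags above drive the source-level effects worth spelling out. A sketch of the main-function canonicalization (illustrative; the real rewrite happens on LLVM IR, not on source code):

// Before: user code as written.
int main(int argc, char *argv[]) { return argc - 1; }

// After -enable-canonicalize-main-function with
// -canonical-main-function-name=__user_main, the definition is effectively:
extern "C" int __user_main(int argc, char *argv[]) { return argc - 1; }
// The loader object compiled from Main.c above supplies the real main(),
// which maps argv to the device and launches __user_main inside a
// target teams region.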


@@ -0,0 +1,67 @@
//===------- HostRPC.h - Host RPC ---------------------------- C++ --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
#define OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
#ifdef OMPTARGET_DEVICE_RUNTIME
#include "Types.h"
#else
#include <cstdint>
#endif
namespace hostrpc {

/// RPC call identifiers. Note: negative values only; non-negative values are
/// reserved for compiler-generated functions.
enum CallId {
  CALLID___kmpc_launch_parallel_51_kernel = -1,
  CALLID_invalid = -2147483648,
};

/// Execution status.
enum ExecutionStatus {
  EXEC_STAT_CREATED = 0,
  EXEC_STAT_DONE = 1,
};

/// Argument kind.
enum Type {
  ARG_LITERAL = 0,
  ARG_POINTER = 1,
};

struct Argument {
  intptr_t Value;
  int64_t ArgType;
};

struct Descriptor {
  // The following members are used by both host and device.
  int32_t Id;
  struct Argument *Args;
  int64_t NumArgs;
  volatile int64_t Status;
  volatile int64_t ReturnValue;
  // The following members are only used by the device.
  void **ArgInfo;
  void *ArgMap;
};

/// A wrapper of hostrpc::Descriptor that is only used between the plugins and
/// libomptarget. It carries the three stdio global variables.
struct DescriptorWrapper {
  Descriptor D;
  void *StdIn = nullptr;
  void *StdOut = nullptr;
  void *StdErr = nullptr;
};

} // namespace hostrpc
#endif
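The device-side client is not part of this header, but the host loop in the CUDA plugin below implies the handshake. A heavily hedged sketch of a device-side call (everything besides the hostrpc types is a hypothetical stand-in; the real pointers come from the omptarget_hostrpc_descriptor and omptarget_hostrpc_futex globals the plugin installs):

#include "HostRPC.h" // the header above

// Hypothetical device-side client sketch, not code from this patch.
int64_t invokeHostRPC(hostrpc::Descriptor *Desc, int32_t *Futex, int32_t Id,
                      hostrpc::Argument *Args, int64_t NumArgs) {
  Desc->Id = Id;
  Desc->Args = Args;
  Desc->NumArgs = NumArgs;
  Desc->Status = hostrpc::EXEC_STAT_CREATED;
  // Publish the request: the host server polls this word, acts when it
  // becomes non-zero, and decrements it once the request is handled.
  __atomic_fetch_add(Futex, 1, __ATOMIC_RELEASE);
  while (Desc->Status != hostrpc::EXEC_STAT_DONE)
    ; // spin until the host marks the request done
  return Desc->ReturnValue;
}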


@@ -433,6 +433,10 @@ int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
void __tgt_set_info_flag(uint32_t);
int __tgt_print_device_info(int64_t DeviceId);
// Host RPC support
int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNo);
void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal);
#ifdef __cplusplus
}
#endif
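These two entry points are what host-side wrappers use to unpack a request and publish its result. A hedged sketch of what a generated wrapper for fclose might look like (the wrapper name is hypothetical; the dispatch through __kmpc_host_rpc_invoke_host_wrapper is inferred from the invoker lookup in the plugin code below):

#include <cstdint>
#include <cstdio>

extern "C" int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNo);
extern "C" void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal);

// Hypothetical generated wrapper: unpack the FILE* argument, run the real
// libc call on the host, and store the return value for the device.
extern "C" void __host_wrapper_fclose(void *Wrapper) {
  FILE *F = reinterpret_cast<FILE *>(__kmpc_host_rpc_get_arg(Wrapper, 0));
  __kmpc_host_rpc_set_ret_val(Wrapper, fclose(F));
}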


@@ -57,7 +57,7 @@ target_include_directories(omptarget.rtl.cuda PRIVATE ${LIBOMPTARGET_INCLUDE_DIR
# Install plugin under the lib destination folder.
install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
set_target_properties(omptarget.rtl.cuda PROPERTIES
INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
CXX_VISIBILITY_PRESET protected)


@@ -11,19 +11,29 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DynamicLibrary.h"
#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <vector>
#include <sys/stat.h>
#include <sys/time.h>
#include "Debug.h"
#include "DeviceEnvironment.h"
#include "HostRPC.h"
#include "omptarget.h"
#include "omptargetplugin.h"
@@ -39,6 +49,7 @@
#include "llvm/Frontend/OpenMP/OMPConstants.h"
using namespace llvm;
using namespace std::chrono_literals;
// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
@@ -98,6 +109,8 @@ struct KernelTy {
};
namespace {
std::atomic<bool> IsHostRPCEnabled{false};
bool checkResult(CUresult Err, const char *ErrMsg) {
if (Err == CUDA_SUCCESS)
return true;
@@ -107,6 +120,369 @@ bool checkResult(CUresult Err, const char *ErrMsg) {
return false;
}
class ArgumentExtractor {
  hostrpc::Descriptor &D;

public:
  ArgumentExtractor(hostrpc::Descriptor &D) : D(D) {}

  template <typename T> T getArg(unsigned Idx) {
    assert(Idx < D.NumArgs && "unexpected argument index");
    if constexpr (std::is_pointer<T>::value)
      return reinterpret_cast<T>(D.Args[Idx].Value);
    else
      return static_cast<T>(D.Args[Idx].Value);
  }
};

struct Parallel51KernelInfo {
  std::string Name;
  CUfunction Kernel;
  int MaxNumThreads;
};

std::unordered_map<std::string, Parallel51KernelInfo> RPCKernelInfo;

template <typename T> void *toVoidPtr(T Val) {
  if constexpr (std::is_pointer<T>::value)
    return reinterpret_cast<void *>(Val);
  else
    return reinterpret_cast<void *>(static_cast<intptr_t>(Val));
}
bool handle___kmpc_launch_parallel_51_kernel(hostrpc::Descriptor &D,
                                             CUmodule Module,
                                             CUstream Stream) {
  assert(D.NumArgs == 6);

  ArgumentExtractor AE(D);
  auto KernelName = AE.getArg<const char *>(0);
  auto GTid = AE.getArg<int32_t>(1);
  auto IfExpr = AE.getArg<int32_t>(2);
  auto NumThreads = AE.getArg<int32_t>(3);
  auto Args = AE.getArg<void **>(4);
  auto NArgs = AE.getArg<int64_t>(5);

  DP("[host-rpc] got a parallel_51 kernel request: %s.\n", KernelName);

  CUresult Err;
  CUfunction Func;
  int MaxNumThreads;
  // Cache the looked-up function and its max block size per kernel name.
  auto Itr = RPCKernelInfo.find(KernelName);
  if (Itr == RPCKernelInfo.end()) {
    Err = cuModuleGetFunction(&Func, Module, KernelName);
    if (Err != CUDA_SUCCESS) {
      REPORT("Loading '%s' failed\n", KernelName);
      CUDA_ERR_STRING(Err);
      return false;
    }
    Err = cuFuncGetAttribute(&MaxNumThreads,
                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
    if (Err != CUDA_SUCCESS) {
      REPORT("Failed to get max threads per block for kernel '%s'\n",
             KernelName);
      CUDA_ERR_STRING(Err);
      return false;
    }
    Parallel51KernelInfo Info{KernelName, Func, MaxNumThreads};
    RPCKernelInfo[KernelName] = std::move(Info);
  } else {
    Func = Itr->second.Kernel;
    MaxNumThreads = Itr->second.MaxNumThreads;
  }

  if (NumThreads > 0 && NumThreads < MaxNumThreads)
    MaxNumThreads = NumThreads;

  void *LaunchArgs[5] = {toVoidPtr(GTid), toVoidPtr(IfExpr),
                         toVoidPtr(NumThreads), Args, toVoidPtr(NArgs)};
  // cuLaunchKernel expects an array of pointers to the argument values.
  void *Params[5];
  for (int I = 0; I < 5; ++I)
    Params[I] = &LaunchArgs[I];

  int GridDimX = 1024;
  if (const char *EnvStr = getenv("LIBOMPTARGET_PARALLEL_KERNEL_GRID_SIZE"))
    GridDimX = std::stoi(EnvStr);

  Err = cuLaunchKernel(Func, /* gridDimX */ GridDimX, /* gridDimY */ 1,
                       /* gridDimZ */ 1, MaxNumThreads,
                       /* blockDimY */ 1, /* blockDimZ */ 1, 0, Stream, Params,
                       nullptr);
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to launch the new kernel %s\n", KernelName);
    CUDA_ERR_STRING(Err);
    return false;
  }

  Err = cuStreamSynchronize(Stream);
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to synchronize the stream " DPxMOD "\n", DPxPTR(Stream));
    CUDA_ERR_STRING(Err);
    return false;
  }

  DP("[host-rpc] successfully launched new kernel %s.\n", KernelName);

  return true;
}
const char *DescriptorVarName = "omptarget_hostrpc_descriptor";
const char *FutexVarName = "omptarget_hostrpc_futex";
const char *MemBufVarName = "omptarget_hostrpc_memory_buffer";
const char *MemBufSizeVarName = "omptarget_hostrpc_memory_buffer_size";
bool initHostRPCServer(CUmodule Module, CUcontext Context,
                       CUdeviceptr &Descriptor, CUdeviceptr &Futex) {
  DP("start to init host RPC server...\n");

  CUresult Err = cuCtxSetCurrent(Context);
  if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
    return false;

  // Look up a module global and verify it has the expected size. `Err` is
  // captured by reference so the failure paths below report the right error.
  auto CheckGlobal = [Module, &Err](CUdeviceptr &Ptr, const char *Name,
                                    size_t Size) {
    size_t CUSize;
    Err = cuModuleGetGlobal(&Ptr, &CUSize, Module, Name);
    if (Err != CUDA_SUCCESS)
      return false;
    if (CUSize != Size)
      return false;
    return true;
  };

  CUdeviceptr DescriptorVar;
  CUdeviceptr FutexVar;
  CUdeviceptr MemBufVar;
  CUdeviceptr MemBufSizeVar;
  if (!CheckGlobal(DescriptorVar, DescriptorVarName,
                   sizeof(hostrpc::Descriptor *))) {
    REPORT("Loading global '%s' failed\n", DescriptorVarName);
    CUDA_ERR_STRING(Err);
    return false;
  }
  if (!CheckGlobal(FutexVar, FutexVarName, sizeof(int32_t *))) {
    REPORT("Loading global '%s' failed\n", FutexVarName);
    CUDA_ERR_STRING(Err);
    return false;
  }
  if (!CheckGlobal(MemBufVar, MemBufVarName, sizeof(char *))) {
    REPORT("Loading global '%s' failed\n", MemBufVarName);
    CUDA_ERR_STRING(Err);
    return false;
  }
  if (!CheckGlobal(MemBufSizeVar, MemBufSizeVarName, sizeof(size_t))) {
    REPORT("Loading global '%s' failed\n", MemBufSizeVarName);
    CUDA_ERR_STRING(Err);
    return false;
  }

  // The descriptor and the futex word live in managed memory so both sides
  // can read and write them directly.
  Err = cuMemAllocManaged(&Descriptor, sizeof(hostrpc::Descriptor),
                          CU_MEM_ATTACH_GLOBAL);
  if (Err != CUDA_SUCCESS) {
    DP("Failed to allocate USM for descriptor.\n");
    CUDA_ERR_STRING(Err);
    return false;
  }
  Err = cuMemAdvise(Descriptor, sizeof(hostrpc::Descriptor),
                    CU_MEM_ADVISE_SET_ACCESSED_BY, 0);
  if (Err != CUDA_SUCCESS) {
    DP("Failed to cuMemAdvise USM for descriptor.\n");
    CUDA_ERR_STRING(Err);
    return false;
  }

  Err = cuMemAllocManaged(&Futex, sizeof(int32_t), CU_MEM_ATTACH_GLOBAL);
  if (Err != CUDA_SUCCESS) {
    DP("Failed to allocate USM for futex.\n");
    CUDA_ERR_STRING(Err);
    return false;
  }
  Err = cuMemAdvise(Futex, sizeof(int32_t), CU_MEM_ADVISE_SET_ACCESSED_BY, 0);
  if (Err != CUDA_SUCCESS) {
    DP("Failed to cuMemAdvise USM for futex.\n");
    CUDA_ERR_STRING(Err);
    return false;
  }
  *reinterpret_cast<int32_t *>(Futex) = 0;

  CUdeviceptr MemBuf;
  // 128MB
  const size_t Size = 134217728;
  Err = cuMemAllocManaged(&MemBuf, Size, CU_MEM_ATTACH_GLOBAL);
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to allocate USM for host rpc buffer\n");
    CUDA_ERR_STRING(Err);
    return false;
  }
  Err = cuMemAdvise(MemBuf, Size, CU_MEM_ADVISE_SET_ACCESSED_BY, 0);
  if (Err != CUDA_SUCCESS) {
    DP("Failed to cuMemAdvise USM for membuf.\n");
    CUDA_ERR_STRING(Err);
    return false;
  }

  // Publish the allocations to the device-side globals.
  Err = cuMemcpyHtoD(DescriptorVar, &Descriptor, sizeof(hostrpc::Descriptor *));
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to set %s to " DPxMOD ".\n", DescriptorVarName,
           DPxPTR(Descriptor));
    CUDA_ERR_STRING(Err);
    return false;
  }
  Err = cuMemcpyHtoD(FutexVar, &Futex, sizeof(int32_t *));
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to set %s to " DPxMOD ".\n", FutexVarName, DPxPTR(Futex));
    CUDA_ERR_STRING(Err);
    return false;
  }
  Err = cuMemcpyHtoD(MemBufVar, &MemBuf, sizeof(char *));
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to set %s to " DPxMOD ".\n", MemBufVarName, DPxPTR(MemBuf));
    CUDA_ERR_STRING(Err);
    return false;
  }
  Err = cuMemcpyHtoD(MemBufSizeVar, &Size, sizeof(size_t));
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to set %s to %zu.\n", MemBufSizeVarName, Size);
    CUDA_ERR_STRING(Err);
    return false;
  }

  return true;
}
class HostRPCInvokerWrapper {
  void (*Invoker)(int32_t, void *) = nullptr;
  std::unique_ptr<llvm::sys::DynamicLibrary> DL;
  std::once_flag Flag;

  // Lazily resolve __kmpc_host_rpc_invoke_host_wrapper from the host
  // program itself (hence -rdynamic in the driver).
  void initInvoker() {
    std::string ErrMsg;
    DL = std::make_unique<sys::DynamicLibrary>(
        sys::DynamicLibrary::getPermanentLibrary(nullptr, &ErrMsg));
    assert(DL->isValid() && "invalid DL");
    *((void **)&Invoker) =
        DL->getAddressOfSymbol("__kmpc_host_rpc_invoke_host_wrapper");
    assert(Invoker && "Invoker is nullptr");
  }

public:
  void invoke(int32_t CallNo, void *Desc) {
    std::call_once(Flag, &HostRPCInvokerWrapper::initInvoker, this);
    Invoker(CallNo, Desc);
  }
};
void runHostRPCServer(CUmodule Module, CUcontext Context,
                      CUdeviceptr DescriptorPtr, CUdeviceptr FutexPtr) {
  CUresult Err = cuCtxSetCurrent(Context);
  if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
    return;

  CUstream Stream;
  Err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to create a stream to launch a new kernel\n");
    CUDA_ERR_STRING(Err);
    return;
  }

  HostRPCInvokerWrapper Invoker;
  hostrpc::DescriptorWrapper Wrapper;

  auto CheckGlobal = [Module](CUdeviceptr &Ptr, const char *Name,
                              size_t Size) {
    size_t CUSize;
    CUresult Err = cuModuleGetGlobal(&Ptr, &CUSize, Module, Name);
    if (Err != CUDA_SUCCESS)
      return false;
    if (CUSize != Size)
      return false;
    return true;
  };

  // Record the device-side stdio placeholder addresses so host wrappers can
  // remap them to the host's stdin/stdout/stderr.
  {
    CUdeviceptr DevPtr;
    if (CheckGlobal(DevPtr, "StdInDummyVar", sizeof(int)))
      Wrapper.StdIn = reinterpret_cast<void *>(DevPtr);
    if (CheckGlobal(DevPtr, "StdOutDummyVar", sizeof(int)))
      Wrapper.StdOut = reinterpret_cast<void *>(DevPtr);
    if (CheckGlobal(DevPtr, "StdErrDummyVar", sizeof(int)))
      Wrapper.StdErr = reinterpret_cast<void *>(DevPtr);
    DP("[host-rpc] host: stdin=%p, stdout=%p, stderr=%p\n", Wrapper.StdIn,
       Wrapper.StdOut, Wrapper.StdErr);
  }

  while (IsHostRPCEnabled) {
    // Wait until the device publishes a request by bumping the futex word.
    if (__atomic_load_n(reinterpret_cast<uint32_t *>(FutexPtr),
                        __ATOMIC_ACQUIRE) == 0) {
      // std::this_thread::sleep_for(50ns);
      continue;
    }

    auto HostRPCD2HStart = std::chrono::high_resolution_clock::now();
    // Get the descriptor.
    Wrapper.D = *reinterpret_cast<hostrpc::Descriptor *>(DescriptorPtr);
    auto HostRPCD2HEnd = std::chrono::high_resolution_clock::now();

    // If the client still didn't reset the descriptor, skip it.
    if (Wrapper.D.Status == hostrpc::EXEC_STAT_DONE) {
      // std::this_thread::sleep_for(50ns);
      continue;
    }

    DP("[host-rpc] got a request (id=%d).\n", Wrapper.D.Id);

    auto HostRPCHandleStart = std::chrono::high_resolution_clock::now();
    bool HandleResult = false;
    switch (Wrapper.D.Id) {
    case hostrpc::CALLID___kmpc_launch_parallel_51_kernel:
      HandleResult =
          handle___kmpc_launch_parallel_51_kernel(Wrapper.D, Module, Stream);
      break;
    default:
      // Compiler-generated calls are dispatched through libomptarget.
      Invoker.invoke(Wrapper.D.Id, &Wrapper);
      HandleResult = true;
    }
    Wrapper.D.Status = hostrpc::EXEC_STAT_DONE;
    if (!HandleResult)
      Wrapper.D.ReturnValue = 0;
    auto HostRPCHandleEnd = std::chrono::high_resolution_clock::now();

    DP("[host-rpc] finished request (id=%d) with retval 0x%lx.\n",
       Wrapper.D.Id, Wrapper.D.ReturnValue);

    auto HostRPCH2DStart = std::chrono::high_resolution_clock::now();
    // Update the descriptor and futex word on the device.
    *reinterpret_cast<hostrpc::Descriptor *>(DescriptorPtr) = Wrapper.D;
    __atomic_fetch_sub(reinterpret_cast<uint32_t *>(FutexPtr), 1,
                       __ATOMIC_ACQ_REL);
    auto HostRPCH2DEnd = std::chrono::high_resolution_clock::now();

    DP("[host-rpc-profiling-host] id=%d, D2H: %ld, handle=%ld, H2D=%ld.\n",
       Wrapper.D.Id, (HostRPCD2HEnd - HostRPCD2HStart).count(),
       (HostRPCHandleEnd - HostRPCHandleStart).count(),
       (HostRPCH2DEnd - HostRPCH2DStart).count());
  }
}
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
@@ -366,6 +742,9 @@ class DeviceRTLTy {
  std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
  std::mutex PeerAccessMatrixLock;

  std::vector<std::vector<std::thread>> HostRPCServers;
  std::vector<std::vector<int>> HostRPCInitState;

  /// A class responsible for interacting with the device's native runtime
  /// library to allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
@@ -528,6 +907,8 @@ public:
    Modules.resize(NumberOfDevices);
    StreamPool.resize(NumberOfDevices);
    EventPool.resize(NumberOfDevices);
    HostRPCServers.resize(NumberOfDevices);
    HostRPCInitState.resize(NumberOfDevices);
    PeerAccessMatrix.resize(NumberOfDevices);
    for (auto &V : PeerAccessMatrix)
      V.resize(NumberOfDevices, PeerAccessState::Unkown);
@@ -764,6 +1145,11 @@ public:
}
  int deinitDevice(const int DeviceId) {
    // Stop the host RPC server threads before tearing the device down.
    IsHostRPCEnabled = false;
    for (auto &T : HostRPCServers[DeviceId])
      T.join();

    auto IsInitialized = InitializedFlags[DeviceId];
    if (!IsInitialized)
      return OFFLOAD_SUCCESS;
@@ -968,6 +1354,107 @@ public:
}
}
    // Initialize the device heap buffer.
    {
      const char *BufferVarName = "omptarget_device_heap_buffer";
      const char *SizeVarName = "omptarget_device_heap_size";
      CUdeviceptr BufferVarPtr;
      CUdeviceptr SizeVarPtr;
      size_t BufferVarSize;
      size_t SizeVarSize;
      Err = cuModuleGetGlobal(&BufferVarPtr, &BufferVarSize, Module,
                              BufferVarName);
      if (Err == CUDA_SUCCESS) {
        if (BufferVarSize != sizeof(uint64_t)) {
          REPORT("Global heap buffer pointer '%s' - size mismatch (%zu "
                 "!= %zu)\n",
                 BufferVarName, BufferVarSize, sizeof(uint64_t));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }
        Err = cuModuleGetGlobal(&SizeVarPtr, &SizeVarSize, Module,
                                SizeVarName);
        if (Err == CUDA_SUCCESS) {
          if (SizeVarSize != sizeof(uint64_t)) {
            REPORT("Global heap size variable '%s' - size mismatch (%zu "
                   "!= %zu)\n",
                   SizeVarName, SizeVarSize, sizeof(uint64_t));
            CUDA_ERR_STRING(Err);
            return nullptr;
          }
          size_t FreeGPUMemory = 0;
          size_t TotalGPUMemory = 0;
          // Fall back to 12GB if the free memory cannot be queried.
          size_t HeapSize = 12884901888U;
          Err = cuMemGetInfo(&FreeGPUMemory, &TotalGPUMemory);
          if (Err == CUDA_SUCCESS)
            HeapSize = FreeGPUMemory * 50 / 100; // use half of free memory
          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
               "Allocate %zu bytes device memory for heap.\n", HeapSize);
          CUdeviceptr BufferPtr;
          Err = cuMemAlloc(&BufferPtr, HeapSize);
          if (Err != CUDA_SUCCESS) {
            REPORT("Error when allocating heap buffer, size = %zu\n",
                   HeapSize);
            CUDA_ERR_STRING(Err);
            return nullptr;
          }
          Err = cuMemcpyHtoD(BufferVarPtr, &BufferPtr, BufferVarSize);
          if (Err != CUDA_SUCCESS) {
            REPORT("Error when copying data from host to device. Pointers: "
                   "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                   DPxPTR(&BufferPtr), DPxPTR(BufferVarPtr), BufferVarSize);
            CUDA_ERR_STRING(Err);
            return nullptr;
          }
          Err = cuMemcpyHtoD(SizeVarPtr, &HeapSize, SizeVarSize);
          if (Err != CUDA_SUCCESS) {
            REPORT("Error when copying data from host to device. Pointers: "
                   "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                   DPxPTR(&HeapSize), DPxPTR(SizeVarPtr), SizeVarSize);
            CUDA_ERR_STRING(Err);
            return nullptr;
          }
          DP("Successfully set heap buffer. omptarget_device_heap_buffer "
             "= " DPxMOD ", omptarget_device_heap_size = %zu\n",
             DPxPTR(BufferPtr), HeapSize);
        } else {
          DP("Finding global heap size variable '%s' - symbol missing.\n",
             SizeVarName);
          DP("Continuing, assuming this image does not require heap "
             "allocation.\n");
        }
      } else {
        DP("Finding global heap buffer pointer '%s' - symbol missing.\n",
           BufferVarName);
        DP("Continuing, assuming this image does not require heap "
           "allocation.\n");
      }
    }

    // Start the host RPC server thread.
    IsHostRPCEnabled = true;
    CUdeviceptr DescriptorPtr;
    CUdeviceptr FutexPtr;
    if (!initHostRPCServer(Module, DeviceData[DeviceId].Context,
                           DescriptorPtr, FutexPtr)) {
      REPORT("Failed to init host RPC server\n");
      return nullptr;
    }
    HostRPCServers[DeviceId].emplace_back(runHostRPCServer, Module,
                                          DeviceData[DeviceId].Context,
                                          DescriptorPtr, FutexPtr);

    return getOffloadEntriesTable(DeviceId);
  }
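For context, the two heap globals initialized above back the device-side allocator that the loader primes via __kmpc_target_init_allocator() (called from Main.c above). That allocator's implementation is not part of this diff, so the following bump-allocator sketch is purely an assumption about its general shape, with a hypothetical device_heap_alloc helper:

#include <cstdint>

// Hypothetical device-side view; the real allocator is not shown in this
// patch. The two globals are the ones the plugin fills with cuMemcpyHtoD.
extern "C" {
void *omptarget_device_heap_buffer;
uint64_t omptarget_device_heap_size;
}

static char *HeapNext;

extern "C" void __kmpc_target_init_allocator(void) {
  HeapNext = static_cast<char *>(omptarget_device_heap_buffer);
}

// Minimal bump allocation out of the plugin-provided heap; no free and no
// synchronization, illustrative only.
extern "C" void *device_heap_alloc(uint64_t Size) {
  char *P = HeapNext;
  uint64_t Used = P - static_cast<char *>(omptarget_device_heap_buffer);
  if (Used + Size > omptarget_device_heap_size)
    return nullptr;
  HeapNext = P + Size;
  return P;
}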
@@ -1560,8 +2047,8 @@ int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *Image,
                                       __tgt_image_info *Info) {
  if (!__tgt_rtl_is_valid_binary(Image))
    return false;
  // if (!__tgt_rtl_is_valid_binary(Image))
  //   return false;

  // A subarchitecture was not specified. Assume it is compatible.
  if (!Info || !Info->Arch)


@@ -64,7 +64,8 @@ VERS1.0 {
    __tgt_interop_init;
    __tgt_interop_use;
    __tgt_interop_destroy;
    ompt_libomptarget_connect;
    __kmpc_host_rpc_get_arg;
    __kmpc_host_rpc_set_ret_val;
  local:
    *;
};


@@ -16,6 +16,7 @@
#include "private.h"
#include "rtl.h"
#include "HostRPC.h"
#include "Utilities.h"
#include <cassert>
@@ -435,3 +436,33 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
  delete AsyncInfo;
  *AsyncHandle = nullptr;
}

// Host RPC support functions.

EXTERN int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNum) {
  auto *W = reinterpret_cast<hostrpc::DescriptorWrapper *>(Wrapper);
  auto &SD = W->D;
  assert(ArgNum < SD.NumArgs && "out-of-range argument");
  int64_t ArgVal = SD.Args[ArgNum].Value;
  void *ArgPtr = reinterpret_cast<void *>(ArgVal);
  DP("[host-rpc] get argno=%d arg=%lx...\n", ArgNum, ArgVal);
  // Remap the device-side stdio placeholder addresses to the host's own
  // standard streams.
  if (W->StdIn && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
      ArgPtr == W->StdIn)
    return (int64_t)stdin;
  if (W->StdOut && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
      ArgPtr == W->StdOut)
    return (int64_t)stdout;
  if (W->StdErr && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
      ArgPtr == W->StdErr)
    return (int64_t)stderr;
  return ArgVal;
}

EXTERN void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal) {
  auto &SD = reinterpret_cast<hostrpc::DescriptorWrapper *>(Wrapper)->D;
  SD.ReturnValue = RetVal;
}