Compare commits
4 Commits
globalisel
...
llvm-test-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b19051093e | ||
|
|
de7beadb39 | ||
|
|
25d5970544 | ||
|
|
20f98716b0 |
65
auto-host-rpc/device.ll
Normal file
65
auto-host-rpc/device.ll
Normal file
@@ -0,0 +1,65 @@
|
||||
; ModuleID = 'test-openmp-nvptx64-nvidia-cuda-sm_75.bc'
|
||||
source_filename = "/home/shiltian/Documents/vscode/llvm-project/auto-host-rpc/test.c"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.ddd = type { i32, i32, float }
|
||||
|
||||
@__omp_rtl_debug_kind = weak_odr hidden local_unnamed_addr constant i32 0
|
||||
@__omp_rtl_assume_teams_oversubscription = weak_odr hidden local_unnamed_addr constant i32 0
|
||||
@__omp_rtl_assume_threads_oversubscription = weak_odr hidden local_unnamed_addr constant i32 0
|
||||
@__omp_rtl_assume_no_thread_state = weak_odr hidden local_unnamed_addr constant i32 0
|
||||
@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden local_unnamed_addr constant i32 0
|
||||
@.str = private unnamed_addr constant [9 x i8] c"main.cpp\00", align 1
|
||||
@.str1 = private unnamed_addr constant [2 x i8] c"r\00", align 1
|
||||
@.str2 = private unnamed_addr constant [3 x i8] c"%d\00", align 1
|
||||
@.str3 = private unnamed_addr constant [7 x i8] c"%f%d%s\00", align 1
|
||||
@.str4 = private unnamed_addr constant [6 x i8] c"hello\00", align 1
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
define hidden void @foo() local_unnamed_addr #0 {
|
||||
entry:
|
||||
%d = tail call align 16 dereferenceable_or_null(12) ptr @__kmpc_alloc_shared(i64 12) #4
|
||||
%call = tail call noalias ptr @fopen(ptr noundef nonnull @.str, ptr noundef nonnull @.str1) #5
|
||||
%call1 = tail call i32 (ptr, ptr, ...) @fprintf(ptr noundef %call, ptr noundef nonnull @.str2, i32 noundef 6) #5
|
||||
%call2 = tail call i32 (ptr, ptr, ...) @fprintf(ptr noundef %call, ptr noundef nonnull @.str3, double noundef 6.000000e+00, i32 noundef 1, ptr noundef nonnull @.str4) #5
|
||||
%a = getelementptr inbounds %struct.ddd, ptr %d, i64 0, i32 1
|
||||
%call3 = tail call i32 (ptr, ptr, ...) @fscanf(ptr noundef %call, ptr noundef nonnull @.str2, ptr noundef nonnull %a) #5
|
||||
tail call void @__kmpc_free_shared(ptr %d, i64 12)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nofree nosync nounwind allocsize(0)
|
||||
declare ptr @__kmpc_alloc_shared(i64) local_unnamed_addr #1
|
||||
|
||||
; Function Attrs: convergent
|
||||
declare noalias ptr @fopen(ptr noundef, ptr noundef) local_unnamed_addr #2
|
||||
|
||||
; Function Attrs: convergent
|
||||
declare i32 @fprintf(ptr noundef, ptr noundef, ...) local_unnamed_addr #2
|
||||
|
||||
; Function Attrs: convergent
|
||||
declare i32 @fscanf(ptr noundef, ptr noundef, ...) local_unnamed_addr #2
|
||||
|
||||
; Function Attrs: nosync nounwind
|
||||
declare void @__kmpc_free_shared(ptr allocptr nocapture, i64) local_unnamed_addr #3
|
||||
|
||||
attributes #0 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx77,+sm_75" }
|
||||
attributes #1 = { nofree nosync nounwind allocsize(0) }
|
||||
attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx77,+sm_75" }
|
||||
attributes #3 = { nosync nounwind }
|
||||
attributes #4 = { nounwind }
|
||||
attributes #5 = { convergent nounwind }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2, !3, !4, !5}
|
||||
!llvm.ident = !{!6, !7}
|
||||
!nvvm.annotations = !{}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 7]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 7, !"openmp", i32 50}
|
||||
!3 = !{i32 7, !"openmp-device", i32 50}
|
||||
!4 = !{i32 8, !"PIC Level", i32 2}
|
||||
!5 = !{i32 7, !"frame-pointer", i32 2}
|
||||
!6 = !{!"clang version 16.0.0"}
|
||||
!7 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
||||
560
auto-host-rpc/main.cpp
Normal file
560
auto-host-rpc/main.cpp
Normal file
@@ -0,0 +1,560 @@
|
||||
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"

#include <string>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
static LLVMContext Context;
|
||||
static codegen::RegisterCodeGenFlags RCGF;
|
||||
static constexpr const char *InternalPrefix[] = {"__kmp", "llvm.", "nvm.",
|
||||
"omp_"};
|
||||
|
||||
std::string typeToString(Type *T) {
|
||||
if (T->is16bitFPTy())
|
||||
return "f16";
|
||||
if (T->isFloatTy())
|
||||
return "f32";
|
||||
if (T->isDoubleTy())
|
||||
return "f64";
|
||||
if (T->isPointerTy())
|
||||
return "ptr";
|
||||
if (T->isStructTy())
|
||||
return std::string(T->getStructName());
|
||||
if (T->isIntegerTy())
|
||||
return "i" + std::to_string(T->getIntegerBitWidth());
|
||||
|
||||
llvm_unreachable("unknown type");
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace llvm {
|
||||
|
||||
enum class HostRPCRuntimeFunction {
|
||||
#define __OMPRTL_HOST_RPC(_ENUM) OMPRTL_##_ENUM
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc),
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg),
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg),
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait),
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_ret_val),
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val),
|
||||
__OMPRTL_HOST_RPC(__last),
|
||||
#undef __OMPRTL_HOST_RPC
|
||||
};
|
||||
|
||||
#define __OMPRTL_HOST_RPC(_ENUM) \
|
||||
auto OMPRTL_##_ENUM = HostRPCRuntimeFunction::OMPRTL_##_ENUM;
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc)
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg)
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg)
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait)
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_ret_val)
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val)
|
||||
#undef __OMPRTL_HOST_RPC
|
||||
|
||||
enum OMPHostRPCArgType {
|
||||
// No need to copy.
|
||||
OMP_HOST_RPC_ARG_SCALAR = 0,
|
||||
OMP_HOST_RPC_ARG_PTR = 1,
|
||||
// Copy to host.
|
||||
OMP_HOST_RPC_ARG_PTR_COPY_TO = 2,
|
||||
// Copy to device
|
||||
OMP_HOST_RPC_ARG_PTR_COPY_FROM = 3,
|
||||
// TODO: Do we have a tofrom pointer?
|
||||
OMP_HOST_RPC_ARG_PTR_COPY_TOFROM = 4,
|
||||
};
|
||||
|
||||
// struct HostRPCArgInfo {
|
||||
// // OMPHostRPCArgType
|
||||
// int64_t Type;
|
||||
// int64_t Size;
|
||||
// };
|
||||
|
||||
class AutoHostRPC {
|
||||
LLVMContext &Context;
|
||||
// Device module
|
||||
Module &DM;
|
||||
// Host module
|
||||
Module &HM;
|
||||
// Types
|
||||
Type *Int8PtrTy;
|
||||
Type *VoidTy;
|
||||
Type *Int32Ty;
|
||||
Type *Int64Ty;
|
||||
StructType *ArgInfoTy;
|
||||
|
||||
struct CallSiteInfo {
|
||||
CallInst *CI = nullptr;
|
||||
SmallVector<Type *> Params;
|
||||
};
|
||||
|
||||
struct HostRPCArgInfo {
|
||||
// OMPHostRPCArgType
|
||||
Constant *Type;
|
||||
Value *Size;
|
||||
};
|
||||
|
||||
//
|
||||
SmallVector<Function *> HostEntryTable;
|
||||
|
||||
EnumeratedArray<Function *, HostRPCRuntimeFunction,
|
||||
HostRPCRuntimeFunction::OMPRTL___last>
|
||||
RFIs;
|
||||
|
||||
static std::string getWrapperFunctionName(Function *F, CallSiteInfo &CSI) {
|
||||
std::string Name = "__kmpc_host_rpc_wrapper_" + std::string(F->getName());
|
||||
if (!F->isVarArg())
|
||||
return Name;
|
||||
|
||||
for (unsigned I = F->getFunctionType()->getNumParams();
|
||||
I < CSI.Params.size(); ++I) {
|
||||
Name.push_back('_');
|
||||
Name.append(typeToString(CSI.Params[I]));
|
||||
}
|
||||
|
||||
return Name;
|
||||
}
|
||||
|
||||
static bool isInternalFunction(Function &F) {
|
||||
auto Name = F.getName();
|
||||
|
||||
for (auto P : InternalPrefix)
|
||||
if (Name.startswith(P))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
Value *convertToInt64Ty(IRBuilder<> &Builder, Value *V);
|
||||
|
||||
Value *convertFromInt64TyTo(IRBuilder<> &Builder, Value *V, Type *TargetTy);
|
||||
|
||||
// int device_wrapper(call_no, arg_info, ...) {
|
||||
// void *desc = __kmpc_host_rpc_get_desc(call_no, num_args, arg_info);
|
||||
// __kmpc_host_rpc_add_arg(desc, arg1, sizeof(arg1));
|
||||
// __kmpc_host_rpc_add_arg(desc, arg2, sizeof(arg2));
|
||||
// ...
|
||||
// __kmpc_host_rpc_send_and_wait(desc);
|
||||
// int r = (int)__kmpc_host_rpc_get_ret_val(desc);
|
||||
// return r;
|
||||
// }
|
||||
Function *getDeviceWrapperFunction(StringRef WrapperName, Function *F,
|
||||
CallSiteInfo &CSI);
|
||||
|
||||
// void host_wrapper(desc) {
|
||||
// int arg1 = (int)__kmpc_host_rpc_get_arg(desc, 0);
|
||||
// float arg2 = (float)__kmpc_host_rpc_get_arg(desc, 1);
|
||||
// char *arg3 = (char *)__kmpc_host_rpc_get_arg(desc, 2);
|
||||
// ...
|
||||
// int r = actual_call(arg1, arg2, arg3, ...);
|
||||
// __kmpc_host_rpc_set_ret_val(ptr(desc, (int64_t)r);
|
||||
// }
|
||||
Function *getHostWrapperFunction(StringRef WrapperName, Function *F,
|
||||
CallSiteInfo &CSI);
|
||||
|
||||
bool rewriteWithHostRPC(Function *F);
|
||||
|
||||
public:
|
||||
AutoHostRPC(Module &DeviceModule, Module &HostModule)
|
||||
: Context(DeviceModule.getContext()), DM(DeviceModule), HM(HostModule) {
|
||||
assert(&DeviceModule.getContext() == &HostModule.getContext() &&
|
||||
"device and host modules have different context");
|
||||
|
||||
#define __OMP_TYPE(TYPE) TYPE = Type::get##TYPE(Context)
|
||||
__OMP_TYPE(Int8PtrTy);
|
||||
__OMP_TYPE(VoidTy);
|
||||
__OMP_TYPE(Int32Ty);
|
||||
__OMP_TYPE(Int64Ty);
|
||||
#undef __OMP_TYPE
|
||||
|
||||
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...) \
|
||||
{ \
|
||||
SmallVector<Type *> Params{__VA_ARGS__}; \
|
||||
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
|
||||
RFIs[OMPRTL_##_ENUM] = Function::Create( \
|
||||
FT, GlobalValue::LinkageTypes::InternalLinkage, #_ENUM, MOD); \
|
||||
}
|
||||
__OMP_RTL(__kmpc_host_rpc_get_desc, DM, false, Int8PtrTy, Int32Ty, Int32Ty,
|
||||
Int8PtrTy)
|
||||
__OMP_RTL(__kmpc_host_rpc_add_arg, DM, false, VoidTy, Int8PtrTy, Int64Ty,
|
||||
Int64Ty)
|
||||
__OMP_RTL(__kmpc_host_rpc_send_and_wait, DM, false, VoidTy, Int8PtrTy)
|
||||
__OMP_RTL(__kmpc_host_rpc_get_ret_val, DM, false, Int64Ty, Int8PtrTy)
|
||||
__OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
|
||||
__OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
|
||||
Int64Ty)
|
||||
#undef __OMP_RTL
|
||||
|
||||
ArgInfoTy = StructType::create({Int64Ty, Int64Ty}, "struct.arg_info_t");
|
||||
}
|
||||
|
||||
bool run();
|
||||
};
|
||||
|
||||
Value *AutoHostRPC::convertToInt64Ty(IRBuilder<> &Builder, Value *V) {
|
||||
Type *T = V->getType();
|
||||
|
||||
if (T == Int64Ty)
|
||||
return V;
|
||||
|
||||
if (T->isPointerTy())
|
||||
return Builder.CreatePtrToInt(V, Int64Ty);
|
||||
|
||||
if (T->isIntegerTy())
|
||||
return Builder.CreateIntCast(V, Int64Ty, false);
|
||||
|
||||
if (T->isFloatingPointTy()) {
|
||||
if (T->isFloatTy())
|
||||
V = Builder.CreateFPToSI(V, Int32Ty);
|
||||
return Builder.CreateFPToSI(V, Int64Ty);
|
||||
}
|
||||
|
||||
llvm_unreachable("unknown cast to int64_t");
|
||||
}
|
||||
|
||||
Value *AutoHostRPC::convertFromInt64TyTo(IRBuilder<> &Builder, Value *V,
|
||||
Type *T) {
|
||||
if (T == Int64Ty)
|
||||
return V;
|
||||
|
||||
if (T->isPointerTy())
|
||||
return Builder.CreateIntToPtr(V, T);
|
||||
|
||||
if (T->isIntegerTy())
|
||||
return Builder.CreateIntCast(V, T, /* isSigned */ true);
|
||||
|
||||
if (T->isFloatingPointTy()) {
|
||||
if (T->isFloatTy())
|
||||
V = Builder.CreateIntCast(V, Int32Ty, /* isSigned */ true);
|
||||
V = Builder.CreateSIToFP(V, T);
|
||||
return V;
|
||||
}
|
||||
|
||||
llvm_unreachable("unknown cast from int64_t");
|
||||
}
|
||||
|
||||
bool AutoHostRPC::run() {
|
||||
bool Changed = false;
|
||||
|
||||
SmallVector<Function *> WorkList;
|
||||
|
||||
for (Function &F : DM) {
|
||||
// If the function is already defined, it definitely does not require RPC.
|
||||
if (!F.isDeclaration())
|
||||
continue;
|
||||
|
||||
// If it is an internal function, skip it as well.
|
||||
if (isInternalFunction(F))
|
||||
continue;
|
||||
|
||||
// If there is no use of the function, skip it.
|
||||
if (F.use_empty())
|
||||
continue;
|
||||
|
||||
WorkList.push_back(&F);
|
||||
}
|
||||
|
||||
if (WorkList.empty())
|
||||
return Changed;
|
||||
|
||||
for (Function *F : WorkList)
|
||||
Changed |= rewriteWithHostRPC(F);
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool AutoHostRPC::rewriteWithHostRPC(Function *F) {
|
||||
bool Changed = false;
|
||||
|
||||
SmallVector<CallInst *> WorkList;
|
||||
|
||||
for (User *U : F->users()) {
|
||||
auto *CI = dyn_cast<CallInst>(U);
|
||||
if (!CI)
|
||||
continue;
|
||||
WorkList.push_back(CI);
|
||||
}
|
||||
|
||||
if (WorkList.empty())
|
||||
return Changed;
|
||||
|
||||
for (CallInst *CI : WorkList) {
|
||||
CallSiteInfo CSI;
|
||||
CSI.CI = CI;
|
||||
unsigned NumArgs = CI->arg_size();
|
||||
for (unsigned I = 0; I < NumArgs; ++I)
|
||||
CSI.Params.push_back(CI->getArgOperand(I)->getType());
|
||||
|
||||
std::string WrapperName = getWrapperFunctionName(F, CSI);
|
||||
Function *DeviceWrapperFn = getDeviceWrapperFunction(WrapperName, F, CSI);
|
||||
Function *HostWrapperFn = getHostWrapperFunction(WrapperName, F, CSI);
|
||||
|
||||
int32_t WrapperNumber = -1;
|
||||
for (unsigned I = 0; I < HostEntryTable.size(); ++I) {
|
||||
if (HostEntryTable[I] == HostWrapperFn) {
|
||||
WrapperNumber = I;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (WrapperNumber == -1) {
|
||||
WrapperNumber = HostEntryTable.size();
|
||||
HostEntryTable.push_back(HostWrapperFn);
|
||||
}
|
||||
|
||||
DataLayout DL = DM.getDataLayout();
|
||||
IRBuilder<> Builder(CI);
|
||||
|
||||
auto CheckIfIdentifierPtr = [](const Value *V) {
|
||||
auto *CI = dyn_cast<CallInst>(V);
|
||||
if (!CI)
|
||||
return false;
|
||||
Function *Callee = CI->getCalledFunction();
|
||||
return Callee->getName().startswith("__kmpc_host_rpc_wrapper_");
|
||||
};
|
||||
|
||||
auto CheckIfAlloca = [](const Value *V) {
|
||||
auto *CI = dyn_cast<CallInst>(V);
|
||||
if (!CI)
|
||||
return false;
|
||||
Function *Callee = CI->getCalledFunction();
|
||||
return Callee->getName() == "__kmpc_alloc_shared" ||
|
||||
Callee->getName() == "malloc";
|
||||
};
|
||||
|
||||
SmallVector<HostRPCArgInfo> ArgInfos;
|
||||
bool IsConstantArgInfo = true;
|
||||
|
||||
for (Value *Op : CI->args()) {
|
||||
if (!Op->getType()->isPointerTy()) {
|
||||
HostRPCArgInfo AI{
|
||||
ConstantInt::get(Int64Ty,
|
||||
OMPHostRPCArgType::OMP_HOST_RPC_ARG_SCALAR),
|
||||
ConstantInt::getNullValue(Int64Ty)};
|
||||
ArgInfos.push_back(std::move(AI));
|
||||
continue;
|
||||
}
|
||||
|
||||
Value *SizeVal = nullptr;
|
||||
OMPHostRPCArgType ArgType = OMP_HOST_RPC_ARG_PTR_COPY_TOFROM;
|
||||
|
||||
SmallVector<const Value *> Objects;
|
||||
getUnderlyingObjects(Op, Objects);
|
||||
|
||||
// TODO: Handle phi node
|
||||
if (Objects.size() != 1)
|
||||
llvm_unreachable("we can't handle phi node yet");
|
||||
|
||||
auto *Obj = Objects.front();
|
||||
if (CheckIfIdentifierPtr(Obj)) {
|
||||
ArgType = OMP_HOST_RPC_ARG_SCALAR;
|
||||
SizeVal = ConstantInt::getNullValue(Int64Ty);
|
||||
} else if (CheckIfAlloca(Obj)) {
|
||||
auto *CI = dyn_cast<CallInst>(Obj);
|
||||
SizeVal = CI->getOperand(0);
|
||||
if (!isa<Constant>(SizeVal))
|
||||
IsConstantArgInfo = false;
|
||||
} else {
|
||||
if (auto *GV = dyn_cast<GlobalVariable>(Obj)) {
|
||||
SizeVal = ConstantInt::get(Int64Ty,
|
||||
DL.getTypeStoreSize(GV->getValueType()));
|
||||
if (GV->isConstant())
|
||||
ArgType = OMP_HOST_RPC_ARG_PTR_COPY_TO;
|
||||
if (GV->isConstant() && GV->hasInitializer()) {
|
||||
// TODO: If the global variable is contant, we can do some
|
||||
// optimization.
|
||||
}
|
||||
} else {
|
||||
// TODO: fix that when it occurs
|
||||
llvm_unreachable("cannot handle unknown type");
|
||||
}
|
||||
}
|
||||
|
||||
HostRPCArgInfo AI{ConstantInt::get(Int64Ty, ArgType), SizeVal};
|
||||
ArgInfos.push_back(std::move(AI));
|
||||
}
|
||||
|
||||
Value *ArgInfo = nullptr;
|
||||
|
||||
if (!IsConstantArgInfo) {
|
||||
ArgInfo = Builder.CreateAlloca(
|
||||
ArgInfoTy, ConstantInt::get(Int64Ty, NumArgs), "arg_info");
|
||||
for (unsigned I = 0; I < NumArgs; ++I) {
|
||||
Value *AII = GetElementPtrInst::Create(
|
||||
ArrayType::get(ArgInfoTy, NumArgs), ArgInfo,
|
||||
{ConstantInt::getNullValue(Int64Ty), ConstantInt::get(Int64Ty, I)});
|
||||
Value *AIIType = GetElementPtrInst::Create(
|
||||
ArgInfoTy, AII, {ConstantInt::get(Int64Ty, 0)});
|
||||
Value *AIISize = GetElementPtrInst::Create(
|
||||
ArgInfoTy, AII, {ConstantInt::get(Int64Ty, 1)});
|
||||
Builder.Insert(AII);
|
||||
Builder.Insert(AIIType);
|
||||
Builder.Insert(AIISize);
|
||||
Builder.CreateStore(ArgInfos[I].Type, AIIType);
|
||||
Builder.CreateStore(ArgInfos[I].Size, AIISize);
|
||||
}
|
||||
} else {
|
||||
SmallVector<Constant *> ArgInfoInitVar;
|
||||
for (auto &AI : ArgInfos) {
|
||||
auto *CS =
|
||||
ConstantStruct::get(ArgInfoTy, {AI.Type, cast<Constant>(AI.Size)});
|
||||
ArgInfoInitVar.push_back(CS);
|
||||
}
|
||||
Constant *ArgInfoInit = ConstantArray::get(
|
||||
ArrayType::get(ArgInfoTy, NumArgs), ArgInfoInitVar);
|
||||
ArgInfo = new GlobalVariable(
|
||||
DM, ArrayType::get(ArgInfoTy, NumArgs), /* isConstant */ true,
|
||||
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info");
|
||||
}
|
||||
|
||||
SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
|
||||
ArgInfo};
|
||||
for (Value *Op : CI->args())
|
||||
Args.push_back(Op);
|
||||
|
||||
CallInst *NewCall = Builder.CreateCall(DeviceWrapperFn, Args);
|
||||
|
||||
CI->replaceAllUsesWith(NewCall);
|
||||
CI->eraseFromParent();
|
||||
}
|
||||
|
||||
F->eraseFromParent();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
Function *AutoHostRPC::getDeviceWrapperFunction(StringRef WrapperName,
|
||||
Function *F,
|
||||
CallSiteInfo &CSI) {
|
||||
Function *WrapperFn = DM.getFunction(WrapperName);
|
||||
if (WrapperFn)
|
||||
return WrapperFn;
|
||||
|
||||
// return_type device_wrapper(int32_t call_no, void *arg_info, ...)
|
||||
SmallVector<Type *> Params{Int32Ty, Int8PtrTy};
|
||||
Params.append(CSI.Params);
|
||||
|
||||
Type *RetTy = F->getReturnType();
|
||||
|
||||
FunctionType *FT = FunctionType::get(RetTy, Params, /*isVarArg*/ false);
|
||||
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::WeakODRLinkage,
|
||||
WrapperName, DM);
|
||||
|
||||
// Emit the body of the device wrapper
|
||||
IRBuilder<> Builder(Context);
|
||||
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
|
||||
Builder.SetInsertPoint(EntryBB);
|
||||
|
||||
// skip call_no and arg_info.
|
||||
constexpr const unsigned NumArgSkipped = 2;
|
||||
|
||||
Value *Desc = nullptr;
|
||||
{
|
||||
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
|
||||
Desc = Builder.CreateCall(
|
||||
Fn,
|
||||
{WrapperFn->getArg(0),
|
||||
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
|
||||
WrapperFn->getArg(1)},
|
||||
"desc");
|
||||
}
|
||||
|
||||
{
|
||||
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
|
||||
for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
|
||||
Value *V = convertToInt64Ty(Builder, WrapperFn->getArg(I));
|
||||
Builder.CreateCall(Fn, {Desc, V, ConstantInt::getNullValue(Int64Ty)});
|
||||
}
|
||||
}
|
||||
|
||||
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});
|
||||
|
||||
if (RetTy->isVoidTy()) {
|
||||
Builder.CreateRetVoid();
|
||||
return WrapperFn;
|
||||
}
|
||||
|
||||
Value *RetVal =
|
||||
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_ret_val], {Desc});
|
||||
if (RetTy != RetVal->getType())
|
||||
RetVal = convertFromInt64TyTo(Builder, RetVal, RetTy);
|
||||
|
||||
Builder.CreateRet(RetVal);
|
||||
|
||||
return WrapperFn;
|
||||
}
|
||||
|
||||
Function *AutoHostRPC::getHostWrapperFunction(StringRef WrapperName,
|
||||
Function *F, CallSiteInfo &CSI) {
|
||||
Function *WrapperFn = HM.getFunction(WrapperName);
|
||||
if (WrapperFn)
|
||||
return WrapperFn;
|
||||
|
||||
SmallVector<Type *> Params{Int8PtrTy};
|
||||
FunctionType *FT = FunctionType::get(VoidTy, Params, /* isVarArg */ false);
|
||||
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage,
|
||||
WrapperName, HM);
|
||||
|
||||
Value *Desc = WrapperFn->getArg(0);
|
||||
|
||||
// Emit the body of the host wrapper
|
||||
IRBuilder<> Builder(Context);
|
||||
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
|
||||
Builder.SetInsertPoint(EntryBB);
|
||||
|
||||
SmallVector<Value *> Args;
|
||||
for (unsigned I = 0; I < CSI.CI->arg_size(); ++I) {
|
||||
Value *V = Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_arg],
|
||||
{Desc, ConstantInt::get(Int32Ty, I)});
|
||||
Args.push_back(convertFromInt64TyTo(Builder, V, CSI.Params[I]));
|
||||
}
|
||||
|
||||
// The host callee that will be called eventually by the host wrapper.
|
||||
Function *HostCallee = HM.getFunction(F->getName());
|
||||
if (!HostCallee)
|
||||
HostCallee = Function::Create(F->getFunctionType(), F->getLinkage(),
|
||||
F->getName(), HM);
|
||||
|
||||
Value *RetVal = Builder.CreateCall(HostCallee, Args);
|
||||
RetVal = convertToInt64Ty(Builder, RetVal);
|
||||
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_set_ret_val], {Desc, RetVal});
|
||||
Builder.CreateRetVoid();
|
||||
|
||||
return WrapperFn;
|
||||
}
|
||||
|
||||
} // namespace llvm
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
InitializeAllTargets();
|
||||
InitializeAllTargetMCs();
|
||||
InitializeAllAsmPrinters();
|
||||
InitializeAllAsmParsers();
|
||||
|
||||
SMDiagnostic Err;
|
||||
std::unique_ptr<Module> DM = parseIRFile("device.ll", Err, Context);
|
||||
if (!DM)
|
||||
return 1;
|
||||
Module HM("host-rpc.bc", Context);
|
||||
// get the right target triple
|
||||
HM.setTargetTriple(Triple::normalize("x86-64"));
|
||||
|
||||
AutoHostRPC RPC(*DM, HM);
|
||||
(void)RPC.run();
|
||||
|
||||
DM->dump();
|
||||
// HM.dump();
|
||||
|
||||
return 0;
|
||||
}
|
||||
19
auto-host-rpc/test.c
Normal file
19
auto-host-rpc/test.c
Normal file
@@ -0,0 +1,19 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#pragma omp begin declare target device_type(nohost)
|
||||
|
||||
struct ddd {
|
||||
int num;
|
||||
int a;
|
||||
float b;
|
||||
};
|
||||
|
||||
void foo() {
|
||||
FILE *fp = fopen("main.cpp", "r");
|
||||
struct ddd d;
|
||||
fprintf(fp, "%d", 6);
|
||||
fprintf(fp, "%f%d%s", 6.0f, 1, "hello");
|
||||
fscanf(fp, "%d", &d.a);
|
||||
}
|
||||
|
||||
#pragma omp end declare target
|
||||
@@ -258,6 +258,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
|
||||
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
|
||||
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
|
||||
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
|
||||
LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Globalize to global space for the globalized variables")
|
||||
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
|
||||
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
|
||||
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
|
||||
|
||||
@@ -3497,6 +3497,10 @@ def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested
|
||||
HelpText<"Assert no nested parallel regions in the GPU">,
|
||||
MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
|
||||
|
||||
def fopenmp_globalize_to_global_space : Flag<["-"], "fopenmp-globalize-to-global-space">,
|
||||
HelpText<"Globalize to global space for the globalized variables">,
|
||||
MarshallingInfoFlag<LangOpts<"OpenMPGlobalizeToGlobalSpace">>;
|
||||
|
||||
} // let Group = f_Group
|
||||
} // let Visibility = [ClangOption, CC1Option, FC1Option]
|
||||
} // let Flags = [NoArgumentUnused, HelpHidden]
|
||||
|
||||
@@ -2530,48 +2530,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
|
||||
(IPD->getParameterKind() == ImplicitParamKind::ThreadPrivateVar);
|
||||
}
|
||||
|
||||
Address DeclPtr = Address::invalid();
|
||||
Address AllocaPtr = Address::invalid();
|
||||
bool DoStore = false;
|
||||
bool IsScalar = hasScalarEvaluationKind(Ty);
|
||||
bool UseIndirectDebugAddress = false;
|
||||
|
||||
// If we already have a pointer to the argument, reuse the input pointer.
|
||||
if (Arg.isIndirect()) {
|
||||
DeclPtr = Arg.getIndirectAddress();
|
||||
DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
|
||||
// Indirect argument is in alloca address space, which may be different
|
||||
// from the default address space.
|
||||
auto AllocaAS = CGM.getASTAllocaAddressSpace();
|
||||
auto *V = DeclPtr.getPointer();
|
||||
AllocaPtr = DeclPtr;
|
||||
|
||||
// For truly ABI indirect arguments -- those that are not `byval` -- store
|
||||
// the address of the argument on the stack to preserve debug information.
|
||||
ABIArgInfo ArgInfo = CurFnInfo->arguments()[ArgNo - 1].info;
|
||||
if (ArgInfo.isIndirect())
|
||||
UseIndirectDebugAddress = !ArgInfo.getIndirectByVal();
|
||||
if (UseIndirectDebugAddress) {
|
||||
auto PtrTy = getContext().getPointerType(Ty);
|
||||
AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy),
|
||||
D.getName() + ".indirect_addr");
|
||||
EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy);
|
||||
}
|
||||
|
||||
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
|
||||
auto DestLangAS =
|
||||
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
|
||||
if (SrcLangAS != DestLangAS) {
|
||||
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
|
||||
CGM.getDataLayout().getAllocaAddrSpace());
|
||||
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
|
||||
auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
|
||||
DeclPtr =
|
||||
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
|
||||
*this, V, SrcLangAS, DestLangAS, T, true),
|
||||
DeclPtr.isKnownNonNull());
|
||||
}
|
||||
|
||||
auto PushCleanupIfNeeded = [this, Ty, &D](Address DeclPtr) {
|
||||
// Push a destructor cleanup for this parameter if the ABI requires it.
|
||||
// Don't push a cleanup in a thunk for a method that will also emit a
|
||||
// cleanup.
|
||||
@@ -2587,87 +2546,124 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
|
||||
EHStack.stable_begin();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Check if the parameter address is controlled by OpenMP runtime.
|
||||
Address OpenMPLocalAddr =
|
||||
getLangOpts().OpenMP
|
||||
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
|
||||
: Address::invalid();
|
||||
if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
|
||||
DeclPtr = OpenMPLocalAddr;
|
||||
AllocaPtr = DeclPtr;
|
||||
};
|
||||
|
||||
Address DeclPtr = Address::invalid();
|
||||
Address AllocaPtr = Address::invalid();
|
||||
Address OpenMPLocalAddr =
|
||||
getLangOpts().OpenMP
|
||||
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
|
||||
: Address::invalid();
|
||||
bool DoStore = false;
|
||||
bool IsScalar = hasScalarEvaluationKind(Ty);
|
||||
bool UseIndirectDebugAddress = false;
|
||||
if (OpenMPLocalAddr.isValid()) {
|
||||
DeclPtr = OpenMPLocalAddr;
|
||||
AllocaPtr = DeclPtr;
|
||||
LValue Dst = MakeAddrLValue(DeclPtr, Ty);
|
||||
if (Arg.isIndirect()) {
|
||||
LValue Src = MakeAddrLValue(Arg.getIndirectAddress(), Ty);
|
||||
callCStructCopyConstructor(Dst, Src);
|
||||
PushCleanupIfNeeded(Arg.getIndirectAddress());
|
||||
} else {
|
||||
// Otherwise, create a temporary to hold the value.
|
||||
EmitStoreOfScalar(Arg.getDirectValue(), Dst, /* isInitialization */ true);
|
||||
}
|
||||
} else {
|
||||
// If we already have a pointer to the argument, reuse the input pointer.
|
||||
if (Arg.isIndirect()) {
|
||||
DeclPtr = Arg.getIndirectAddress();
|
||||
DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
|
||||
// Indirect argument is in alloca address space, which may be different
|
||||
// from the default address space.
|
||||
auto AllocaAS = CGM.getASTAllocaAddressSpace();
|
||||
auto *V = DeclPtr.getPointer();
|
||||
AllocaPtr = DeclPtr;
|
||||
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
|
||||
auto DestLangAS =
|
||||
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
|
||||
if (SrcLangAS != DestLangAS) {
|
||||
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
|
||||
CGM.getDataLayout().getAllocaAddrSpace());
|
||||
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
|
||||
auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
|
||||
DeclPtr =
|
||||
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
|
||||
*this, V, SrcLangAS, DestLangAS, T, true),
|
||||
DeclPtr.isKnownNonNull());
|
||||
}
|
||||
PushCleanupIfNeeded(DeclPtr);
|
||||
} else {
|
||||
// Create a temporary to hold the value.
|
||||
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
|
||||
D.getName() + ".addr", &AllocaPtr);
|
||||
DoStore = true;
|
||||
}
|
||||
DoStore = true;
|
||||
}
|
||||
|
||||
llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
|
||||
llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
|
||||
|
||||
LValue lv = MakeAddrLValue(DeclPtr, Ty);
|
||||
if (IsScalar) {
|
||||
Qualifiers qs = Ty.getQualifiers();
|
||||
if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
|
||||
// We honor __attribute__((ns_consumed)) for types with lifetime.
|
||||
// For __strong, it's handled by just skipping the initial retain;
|
||||
// otherwise we have to balance out the initial +1 with an extra
|
||||
// cleanup to do the release at the end of the function.
|
||||
bool isConsumed = D.hasAttr<NSConsumedAttr>();
|
||||
LValue lv = MakeAddrLValue(DeclPtr, Ty);
|
||||
if (IsScalar) {
|
||||
Qualifiers qs = Ty.getQualifiers();
|
||||
if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
|
||||
// We honor __attribute__((ns_consumed)) for types with lifetime.
|
||||
// For __strong, it's handled by just skipping the initial retain;
|
||||
// otherwise we have to balance out the initial +1 with an extra
|
||||
// cleanup to do the release at the end of the function.
|
||||
bool isConsumed = D.hasAttr<NSConsumedAttr>();
|
||||
|
||||
// If a parameter is pseudo-strong then we can omit the implicit retain.
|
||||
if (D.isARCPseudoStrong()) {
|
||||
assert(lt == Qualifiers::OCL_Strong &&
|
||||
"pseudo-strong variable isn't strong?");
|
||||
assert(qs.hasConst() && "pseudo-strong variable should be const!");
|
||||
lt = Qualifiers::OCL_ExplicitNone;
|
||||
}
|
||||
// If a parameter is pseudo-strong then we can omit the implicit retain.
|
||||
if (D.isARCPseudoStrong()) {
|
||||
assert(lt == Qualifiers::OCL_Strong &&
|
||||
"pseudo-strong variable isn't strong?");
|
||||
assert(qs.hasConst() && "pseudo-strong variable should be const!");
|
||||
lt = Qualifiers::OCL_ExplicitNone;
|
||||
}
|
||||
|
||||
// Load objects passed indirectly.
|
||||
if (Arg.isIndirect() && !ArgVal)
|
||||
ArgVal = Builder.CreateLoad(DeclPtr);
|
||||
// Load objects passed indirectly.
|
||||
if (Arg.isIndirect() && !ArgVal)
|
||||
ArgVal = Builder.CreateLoad(DeclPtr);
|
||||
|
||||
if (lt == Qualifiers::OCL_Strong) {
|
||||
if (!isConsumed) {
|
||||
if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
|
||||
// use objc_storeStrong(&dest, value) for retaining the
|
||||
// object. But first, store a null into 'dest' because
|
||||
// objc_storeStrong attempts to release its old value.
|
||||
llvm::Value *Null = CGM.EmitNullConstant(D.getType());
|
||||
EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
|
||||
EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
|
||||
DoStore = false;
|
||||
if (lt == Qualifiers::OCL_Strong) {
|
||||
if (!isConsumed) {
|
||||
if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
|
||||
// use objc_storeStrong(&dest, value) for retaining the
|
||||
// object. But first, store a null into 'dest' because
|
||||
// objc_storeStrong attempts to release its old value.
|
||||
llvm::Value *Null = CGM.EmitNullConstant(D.getType());
|
||||
EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
|
||||
EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
|
||||
DoStore = false;
|
||||
} else
|
||||
// Don't use objc_retainBlock for block pointers, because we
|
||||
// don't want to Block_copy something just because we got it
|
||||
// as a parameter.
|
||||
ArgVal = EmitARCRetainNonBlock(ArgVal);
|
||||
}
|
||||
} else {
|
||||
// Push the cleanup for a consumed parameter.
|
||||
if (isConsumed) {
|
||||
ARCPreciseLifetime_t precise =
|
||||
(D.hasAttr<ObjCPreciseLifetimeAttr>() ? ARCPreciseLifetime
|
||||
: ARCImpreciseLifetime);
|
||||
EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(),
|
||||
ArgVal, precise);
|
||||
}
|
||||
|
||||
if (lt == Qualifiers::OCL_Weak) {
|
||||
EmitARCInitWeak(DeclPtr, ArgVal);
|
||||
DoStore = false; // The weak init is a store, no need to do two.
|
||||
}
|
||||
else
|
||||
// Don't use objc_retainBlock for block pointers, because we
|
||||
// don't want to Block_copy something just because we got it
|
||||
// as a parameter.
|
||||
ArgVal = EmitARCRetainNonBlock(ArgVal);
|
||||
}
|
||||
} else {
|
||||
// Push the cleanup for a consumed parameter.
|
||||
if (isConsumed) {
|
||||
ARCPreciseLifetime_t precise = (D.hasAttr<ObjCPreciseLifetimeAttr>()
|
||||
? ARCPreciseLifetime : ARCImpreciseLifetime);
|
||||
EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(), ArgVal,
|
||||
precise);
|
||||
}
|
||||
|
||||
if (lt == Qualifiers::OCL_Weak) {
|
||||
EmitARCInitWeak(DeclPtr, ArgVal);
|
||||
DoStore = false; // The weak init is a store, no need to do two.
|
||||
}
|
||||
// Enter the cleanup scope.
|
||||
EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
|
||||
}
|
||||
|
||||
// Enter the cleanup scope.
|
||||
EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
|
||||
}
|
||||
}
|
||||
|
||||
// Store the initial value into the alloca.
|
||||
if (DoStore)
|
||||
EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
|
||||
// Store the initial value into the alloca.
|
||||
if (DoStore)
|
||||
EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
|
||||
}
|
||||
|
||||
setAddrOfLocalVar(&D, DeclPtr);
|
||||
|
||||
|
||||
@@ -1083,10 +1083,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
|
||||
|
||||
// Allocate space for the variable to be globalized
|
||||
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
|
||||
llvm::CallBase *VoidPtr =
|
||||
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
|
||||
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
|
||||
AllocArgs, VD->getName());
|
||||
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
|
||||
OMPBuilder.getOrCreateRuntimeFunction(
|
||||
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
|
||||
? OMPRTL_malloc
|
||||
: OMPRTL___kmpc_alloc_shared),
|
||||
AllocArgs, VD->getName());
|
||||
// FIXME: We should use the variables actual alignment as an argument.
|
||||
VoidPtr->addRetAttr(llvm::Attribute::get(
|
||||
CGM.getLLVMContext(), llvm::Attribute::Alignment,
|
||||
@@ -1116,12 +1118,14 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
|
||||
I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
|
||||
LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
|
||||
CGM.getContext().getDeclAlign(VD),
|
||||
|
||||
AlignmentSource::Decl);
|
||||
I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));
|
||||
}
|
||||
I->getSecond().MappedParams->apply(CGF);
|
||||
}
|
||||
|
||||
|
||||
bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
|
||||
const VarDecl *VD) const {
|
||||
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
|
||||
@@ -1149,10 +1153,12 @@ CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
|
||||
|
||||
// Allocate space for this VLA object to be globalized.
|
||||
llvm::Value *AllocArgs[] = {Size};
|
||||
llvm::CallBase *VoidPtr =
|
||||
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
|
||||
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
|
||||
AllocArgs, VD->getName());
|
||||
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
|
||||
OMPBuilder.getOrCreateRuntimeFunction(
|
||||
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
|
||||
? OMPRTL_malloc
|
||||
: OMPRTL___kmpc_alloc_shared),
|
||||
AllocArgs, VD->getName());
|
||||
VoidPtr->addRetAttr(llvm::Attribute::get(
|
||||
CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
|
||||
|
||||
@@ -1178,20 +1184,29 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
|
||||
// globalized in the prolog (i.e. emitGenericVarsProlog).
|
||||
for (const auto &AddrSizePair :
|
||||
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
|
||||
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
|
||||
CGM.getModule(), OMPRTL___kmpc_free_shared),
|
||||
{AddrSizePair.first, AddrSizePair.second});
|
||||
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
|
||||
CGF.EmitRuntimeCall(
|
||||
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
|
||||
{AddrSizePair.first});
|
||||
else
|
||||
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
|
||||
CGM.getModule(), OMPRTL___kmpc_free_shared),
|
||||
{AddrSizePair.first, AddrSizePair.second});
|
||||
}
|
||||
// Deallocate the memory for each globalized value
|
||||
for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
|
||||
const auto *VD = cast<VarDecl>(Rec.first);
|
||||
I->getSecond().MappedParams->restore(CGF);
|
||||
|
||||
llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
|
||||
CGF.getTypeSize(VD->getType())};
|
||||
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
|
||||
CGM.getModule(), OMPRTL___kmpc_free_shared),
|
||||
FreeArgs);
|
||||
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
|
||||
CGF.EmitRuntimeCall(
|
||||
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
|
||||
{Rec.second.GlobalizedVal});
|
||||
else
|
||||
CGF.EmitRuntimeCall(
|
||||
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
|
||||
OMPRTL___kmpc_free_shared),
|
||||
{Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6502,6 +6502,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
|
||||
CmdArgs.push_back("-fopenmp-offload-mandatory");
|
||||
if (Args.hasArg(options::OPT_fopenmp_force_usm))
|
||||
CmdArgs.push_back("-fopenmp-force-usm");
|
||||
if (Args.hasArg(options::OPT_fopenmp_globalize_to_global_space))
|
||||
CmdArgs.push_back("-fopenmp-globalize-to-global-space");
|
||||
break;
|
||||
default:
|
||||
// By default, if Clang doesn't know how to generate useful OpenMP code
|
||||
|
||||
33
clang/test/OpenMP/globalization_byval_struct.c
Normal file
33
clang/test/OpenMP/globalization_byval_struct.c
Normal file
@@ -0,0 +1,33 @@
|
||||
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
|
||||
// expected-no-diagnostics
|
||||
|
||||
extern int printf(const char *, ...);
|
||||
|
||||
struct S {
|
||||
int a;
|
||||
float b;
|
||||
};
|
||||
|
||||
// CHECK: define{{.*}}void @test(%struct.S* noundef byval(%struct.S) align {{[0-9]+}} [[arg:%[0-9a-zA-Z]+]])
|
||||
// CHECK: [[g:%[0-9a-zA-Z]+]] = call align {{[0-9]+}} i8* @__kmpc_alloc_shared
|
||||
// CHECK: bitcast i8* [[g]] to %struct.S*
|
||||
// CHECK: bitcast %struct.S* [[arg]] to i8**
|
||||
// CHECK: call void [[cc:@__copy_constructor[_0-9a-zA-Z]+]]
|
||||
// CHECK: void [[cc]]
|
||||
void test(struct S s) {
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < s.a; ++i) {
|
||||
printf("%i : %i : %f\n", i, s.a, s.b);
|
||||
}
|
||||
}
|
||||
|
||||
void foo() {
|
||||
#pragma omp target teams num_teams(1)
|
||||
{
|
||||
struct S s;
|
||||
s.a = 7;
|
||||
s.b = 11;
|
||||
test(s);
|
||||
}
|
||||
}
|
||||
@@ -137,6 +137,13 @@ static constexpr OptTable::Info InfoTable[] = {
|
||||
#undef OPTION
|
||||
};
|
||||
|
||||
/// Host RPC module that will be shared to the corresponding pass.
|
||||
Module *HostModule = nullptr;
|
||||
/// We only need to generate the host RPC module once.
|
||||
bool IsHostModuleGenerated = false;
|
||||
/// Host RPC object file.
|
||||
StringRef HostRPCObjFile;
|
||||
|
||||
class WrapperOptTable : public opt::GenericOptTable {
|
||||
public:
|
||||
WrapperOptTable() : opt::GenericOptTable(InfoTable) {}
|
||||
@@ -613,10 +620,12 @@ std::vector<std::string> getTargetFeatures(ArrayRef<OffloadFile> InputFiles) {
|
||||
return UnifiedFeatures;
|
||||
}
|
||||
|
||||
template <typename ModuleHook = function_ref<bool(size_t, const Module &)>>
|
||||
template <typename PreHookTy = function_ref<bool(size_t, const Module &)>,
|
||||
typename PostHookTy = function_ref<bool(size_t, const Module &)>>
|
||||
std::unique_ptr<lto::LTO> createLTO(
|
||||
const ArgList &Args, const std::vector<std::string> &Features,
|
||||
ModuleHook Hook = [](size_t, const Module &) { return true; }) {
|
||||
PreHookTy PreHook = [](size_t, const Module &) { return true; },
|
||||
PostHookTy PostHook = [](size_t, const Module &) { return true; }) {
|
||||
const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
|
||||
// We need to remove AMD's target-id from the processor if present.
|
||||
StringRef Arch = Args.getLastArgValue(OPT_arch_EQ).split(":").first;
|
||||
@@ -671,7 +680,9 @@ std::unique_ptr<lto::LTO> createLTO(
|
||||
return true;
|
||||
};
|
||||
}
|
||||
Conf.PostOptModuleHook = Hook;
|
||||
|
||||
Conf.PreOptModuleHook = PreHook;
|
||||
Conf.PostOptModuleHook = PostHook;
|
||||
Conf.CGFileType = (Triple.isNVPTX() || SaveTemps)
|
||||
? CodeGenFileType::AssemblyFile
|
||||
: CodeGenFileType::ObjectFile;
|
||||
@@ -690,6 +701,58 @@ bool isValidCIdentifier(StringRef S) {
|
||||
[](char C) { return C == '_' || isAlnum(C); });
|
||||
}
|
||||
|
||||
bool writeHostModule(std::string &FileName) {
|
||||
if (!HostModule)
|
||||
return false;
|
||||
if (HostModule->getFunctionList().empty())
|
||||
return false;
|
||||
|
||||
auto HostTriple = HostModule->getTargetTriple();
|
||||
FileName =
|
||||
sys::path::filename(ExecutableName).str() + "-host-rpc-" + HostTriple;
|
||||
auto TempFileOrErr = createOutputFile(FileName, "bc");
|
||||
if (!TempFileOrErr)
|
||||
reportError(TempFileOrErr.takeError());
|
||||
int FD = -1;
|
||||
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
|
||||
reportError(errorCodeToError(EC));
|
||||
|
||||
auto Out = std::make_unique<llvm::raw_fd_ostream>(FD, true);
|
||||
WriteBitcodeToFile(*HostModule, *Out);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::unique_ptr<lto::LTO> createHostRPCLTO(StringRef HostTriple) {
|
||||
const llvm::Triple Triple(HostTriple);
|
||||
lto::Config Conf;
|
||||
lto::ThinBackend Backend;
|
||||
Backend =
|
||||
lto::createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
|
||||
|
||||
// TODO: host arch?
|
||||
// Conf.CPU = Arch.str();
|
||||
Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple);
|
||||
|
||||
// TODO: host features?
|
||||
// Conf.MAttrs = Features;
|
||||
Conf.CGOptLevel = *CodeGenOpt::getLevel(3);
|
||||
Conf.OptLevel = 3;
|
||||
Conf.UseDefaultPipeline = true;
|
||||
Conf.DefaultTriple = Triple.getTriple();
|
||||
|
||||
LTOError = false;
|
||||
Conf.DiagHandler = diagnosticHandler;
|
||||
|
||||
Conf.PTO.LoopVectorization = Conf.OptLevel > 1;
|
||||
Conf.PTO.SLPVectorization = Conf.OptLevel > 1;
|
||||
Conf.CGFileType = CodeGenFileType::ObjectFile;
|
||||
|
||||
Conf.HasWholeProgramVisibility = false;
|
||||
|
||||
return std::make_unique<lto::LTO>(std::move(Conf), Backend);
|
||||
}
|
||||
|
||||
Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
|
||||
SmallVectorImpl<StringRef> &OutputFiles,
|
||||
const ArgList &Args) {
|
||||
@@ -775,14 +838,52 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
|
||||
BitcodeOutput.push_back(*TempFileOrErr);
|
||||
return false;
|
||||
};
|
||||
auto AddHostModuleAddr = [&](size_t, const Module &M) {
|
||||
if (!HostModule)
|
||||
return true;
|
||||
|
||||
Module &CM = const_cast<Module &>(M);
|
||||
auto *MD = CM.getOrInsertNamedMetadata("llvm.hostrpc.hostmodule");
|
||||
MD->clearOperands();
|
||||
MD->addOperand(MDTuple::get(
|
||||
CM.getContext(), {ConstantAsMetadata::get(ConstantInt::get(
|
||||
Type::getInt64Ty(CM.getContext()),
|
||||
reinterpret_cast<uintptr_t>(HostModule)))}));
|
||||
return true;
|
||||
};
|
||||
|
||||
// We assume visibility of the whole program if every input file was bitcode.
|
||||
auto Features = getTargetFeatures(BitcodeInputFiles);
|
||||
auto LTOBackend = Args.hasArg(OPT_embed_bitcode) ||
|
||||
Args.hasArg(OPT_builtin_bitcode_EQ) ||
|
||||
Args.hasArg(OPT_clang_backend)
|
||||
? createLTO(Args, Features, OutputBitcode)
|
||||
: createLTO(Args, Features);
|
||||
? createLTO(Args, Features, AddHostModuleAddr, OutputBitcode)
|
||||
: createLTO(Args, Features, AddHostModuleAddr);
|
||||
|
||||
LLVMContext &Ctx = LTOBackend->getContext();
|
||||
StringRef HostTriple =
|
||||
Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple());
|
||||
std::unique_ptr<Module> HostModulePtr;
|
||||
if (!IsHostModuleGenerated) {
|
||||
HostModulePtr = std::make_unique<Module>(
|
||||
sys::path::filename(ExecutableName).str() + "-host-rpc.bc", Ctx);
|
||||
HostModule = HostModulePtr.get();
|
||||
HostModulePtr->setTargetTriple(HostTriple);
|
||||
|
||||
std::string Msg;
|
||||
const Target *T =
|
||||
TargetRegistry::lookupTarget(HostModule->getTargetTriple(), Msg);
|
||||
if (!T)
|
||||
return createStringError(inconvertibleErrorCode(), Msg);
|
||||
auto Options =
|
||||
codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple(HostTriple));
|
||||
StringRef CPU = "";
|
||||
StringRef Features = "";
|
||||
std::unique_ptr<TargetMachine> TM(
|
||||
T->createTargetMachine(HostTriple, CPU, Features, Options, Reloc::PIC_,
|
||||
HostModule->getCodeModel()));
|
||||
HostModule->setDataLayout(TM->createDataLayout());
|
||||
}
|
||||
|
||||
// We need to resolve the symbols so the LTO backend knows which symbols need
|
||||
// to be kept or can be internalized. This is a simplified symbol resolution
|
||||
@@ -806,6 +907,7 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
|
||||
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
|
||||
size_t Idx = 0;
|
||||
for (auto &Sym : Symbols) {
|
||||
|
||||
lto::SymbolResolution &Res = Resolutions[Idx++];
|
||||
|
||||
// We will use this as the prevailing symbol definition in LTO unless
|
||||
@@ -876,6 +978,57 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
|
||||
if (Error Err = LTOBackend->run(AddStream))
|
||||
return Err;
|
||||
|
||||
std::string HostModuleTempFile;
|
||||
bool ValidHostModule = writeHostModule(HostModuleTempFile);
|
||||
// Reset the HostModule pointer.
|
||||
HostModulePtr.reset();
|
||||
HostModule = nullptr;
|
||||
// TODO: this is really redundant code.
|
||||
if (ValidHostModule) {
|
||||
auto HostLTO = createHostRPCLTO(HostTriple);
|
||||
|
||||
std::string HostBitCodeFile = HostModuleTempFile + ".bc";
|
||||
auto BufferOrError = MemoryBuffer::getFile(HostBitCodeFile);
|
||||
if (!BufferOrError)
|
||||
reportError(createFileError(HostBitCodeFile, BufferOrError.getError()));
|
||||
Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr =
|
||||
llvm::lto::InputFile::create(*BufferOrError.get());
|
||||
if (!BitcodeFileOrErr)
|
||||
return BitcodeFileOrErr.takeError();
|
||||
|
||||
const auto Symbols = (*BitcodeFileOrErr)->symbols();
|
||||
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
|
||||
size_t Idx = 0;
|
||||
for (auto &Sym : Symbols) {
|
||||
(void)Sym;
|
||||
lto::SymbolResolution &Res = Resolutions[Idx++];
|
||||
Res.ExportDynamic = true;
|
||||
Res.VisibleToRegularObj = true;
|
||||
Res.LinkerRedefined = false;
|
||||
Res.Prevailing = true;
|
||||
}
|
||||
if (Error Err = HostLTO->add(std::move(*BitcodeFileOrErr), Resolutions))
|
||||
return Err;
|
||||
|
||||
auto RPCAddStream =
|
||||
[&](size_t Task,
|
||||
const Twine &ModuleName) -> std::unique_ptr<CachedFileStream> {
|
||||
int FD = -1;
|
||||
auto TempFileOrErr = createOutputFile(
|
||||
sys::path::filename(ExecutableName) + "-host-rpc-" + HostTriple, "o");
|
||||
if (!TempFileOrErr)
|
||||
reportError(TempFileOrErr.takeError());
|
||||
HostRPCObjFile = *TempFileOrErr;
|
||||
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
|
||||
reportError(errorCodeToError(EC));
|
||||
return std::make_unique<CachedFileStream>(
|
||||
std::make_unique<llvm::raw_fd_ostream>(FD, true));
|
||||
};
|
||||
|
||||
if (Error Err = HostLTO->run(RPCAddStream))
|
||||
return Err;
|
||||
}
|
||||
|
||||
if (LTOError)
|
||||
return createStringError(inconvertibleErrorCode(),
|
||||
"Errors encountered inside the LTO pipeline.");
|
||||
@@ -1244,6 +1397,9 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
|
||||
WrappedOutput.push_back(*OutputOrErr);
|
||||
}
|
||||
|
||||
if (!HostRPCObjFile.empty())
|
||||
WrappedOutput.push_back(HostRPCObjFile);
|
||||
|
||||
return WrappedOutput;
|
||||
}
|
||||
|
||||
|
||||
@@ -181,6 +181,10 @@ set(TARGET_LIBC_ENTRYPOINTS
|
||||
|
||||
# gpu/rpc.h entrypoints
|
||||
libc.src.gpu.rpc_host_call
|
||||
libc.src.gpu.rpc_open_port
|
||||
libc.src.gpu.rpc_send_n
|
||||
libc.src.gpu.rpc_recv_n
|
||||
libc.src.gpu.rpc_close_port
|
||||
)
|
||||
|
||||
set(TARGET_LIBM_ENTRYPOINTS
|
||||
|
||||
@@ -604,6 +604,7 @@ if(LIBC_TARGET_OS_IS_GPU)
|
||||
DEPENDS
|
||||
.llvm_libc_common_h
|
||||
.llvm-libc-types.rpc_opcodes_t
|
||||
.llvm-libc-types.rpc_port_t
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
@@ -11,7 +11,10 @@
|
||||
|
||||
#include <__llvm-libc-common.h>
|
||||
|
||||
#include <llvm-libc-types/size_t.h>
|
||||
|
||||
#include <llvm-libc-types/rpc_opcodes_t.h>
|
||||
#include <llvm-libc-types/rpc_port_t.h>
|
||||
|
||||
%%public_api()
|
||||
|
||||
|
||||
@@ -93,6 +93,7 @@ add_header(socklen_t HDR socklen_t.h)
|
||||
add_header(struct_sockaddr_un HDR struct_sockaddr_un.h)
|
||||
add_header(struct_sockaddr HDR struct_sockaddr.h)
|
||||
add_header(rpc_opcodes_t HDR rpc_opcodes_t.h)
|
||||
add_header(rpc_port_t HDR rpc_port_t.h)
|
||||
add_header(ACTION HDR ACTION.h)
|
||||
add_header(ENTRY HDR ENTRY.h)
|
||||
add_header(struct_hsearch_data HDR struct_hsearch_data.h)
|
||||
|
||||
16
libc/include/llvm-libc-types/rpc_port_t.h
Normal file
16
libc/include/llvm-libc-types/rpc_port_t.h
Normal file
@@ -0,0 +1,16 @@
|
||||
//===-- Definition of type rpc_port_t -------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __LLVM_LIBC_TYPES_RPC_PORT_T_H__
|
||||
#define __LLVM_LIBC_TYPES_RPC_PORT_T_H__
|
||||
|
||||
typedef struct {
|
||||
__UINT8_TYPE__ reserved[32];
|
||||
} rpc_port_t;
|
||||
|
||||
#endif // __LLVM_LIBC_TYPES_RPC_PORT_T_H__
|
||||
@@ -15,6 +15,7 @@ typedef enum : unsigned short {
|
||||
RPC_TEST_NOOP = 1 << 15,
|
||||
RPC_TEST_INCREMENT,
|
||||
RPC_TEST_INTERFACE,
|
||||
RPC_TEST_EXTERNAL,
|
||||
RPC_TEST_STREAM,
|
||||
} rpc_test_opcode_t;
|
||||
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
def RPCPortT : NamedType<"rpc_port_t">;
|
||||
def RPCPortPtrT : PtrType<RPCPortT>;
|
||||
|
||||
def GPUExtensions : StandardSpec<"GPUExtensions"> {
|
||||
HeaderSpec RPC = HeaderSpec<
|
||||
"gpu/rpc.h",
|
||||
[], // Macros
|
||||
[], // Types
|
||||
[RPCPortT, SizeTType], // Types
|
||||
[], // Enumerations
|
||||
[
|
||||
FunctionSpec<
|
||||
@@ -10,6 +13,26 @@ def GPUExtensions : StandardSpec<"GPUExtensions"> {
|
||||
RetValSpec<VoidType>,
|
||||
[ArgSpec<VoidPtr>, ArgSpec<VoidPtr>, ArgSpec<SizeTType>]
|
||||
>,
|
||||
FunctionSpec<
|
||||
"rpc_open_port",
|
||||
RetValSpec<RPCPortT>,
|
||||
[ArgSpec<UnsignedIntType>]
|
||||
>,
|
||||
FunctionSpec<
|
||||
"rpc_send_n",
|
||||
RetValSpec<VoidType>,
|
||||
[ArgSpec<RPCPortPtrT>, ArgSpec<VoidPtr>, ArgSpec<SizeTType>]
|
||||
>,
|
||||
FunctionSpec<
|
||||
"rpc_recv_n",
|
||||
RetValSpec<VoidType>,
|
||||
[ArgSpec<RPCPortPtrT>, ArgSpec<VoidPtr>, ArgSpec<SizeTPtr>]
|
||||
>,
|
||||
FunctionSpec<
|
||||
"rpc_close_port",
|
||||
RetValSpec<VoidType>,
|
||||
[ArgSpec<RPCPortPtrT>]
|
||||
>,
|
||||
]
|
||||
>;
|
||||
let Headers = [
|
||||
|
||||
@@ -25,12 +25,14 @@ namespace LIBC_NAMESPACE::cpp {
|
||||
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
|
||||
#endif
|
||||
|
||||
// This implementation of bit_cast requires trivially-constructible To, to avoid
|
||||
// UB in the implementation.
|
||||
template <typename To, typename From>
|
||||
LIBC_INLINE constexpr cpp::enable_if_t<
|
||||
(sizeof(To) == sizeof(From)) &&
|
||||
// Implementation of bit_cast that cannot use the compiler builtin must be
|
||||
// trivially-constructible To, to avoid UB in the implementation.
|
||||
#if !LIBC_HAS_BUILTIN(__builtin_bit_cast)
|
||||
cpp::is_trivially_constructible<To>::value &&
|
||||
#endif
|
||||
cpp::is_trivially_copyable<To>::value &&
|
||||
cpp::is_trivially_copyable<From>::value,
|
||||
To>
|
||||
|
||||
@@ -323,6 +323,7 @@ public:
|
||||
LIBC_INLINE void send_n(const void *src, uint64_t size);
|
||||
template <typename A>
|
||||
LIBC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
|
||||
LIBC_INLINE void recv_n(void *dst, uint64_t *size);
|
||||
|
||||
LIBC_INLINE uint16_t get_opcode() const {
|
||||
return process.header[index].opcode;
|
||||
@@ -359,7 +360,9 @@ struct Client {
|
||||
: process(port_count, buffer) {}
|
||||
|
||||
using Port = rpc::Port<false>;
|
||||
template <uint16_t opcode> LIBC_INLINE Port open();
|
||||
|
||||
template <uint16_t opcode> LIBC_INLINE Port open() { return open(opcode); }
|
||||
LIBC_INLINE Port open(uint16_t opcode);
|
||||
|
||||
private:
|
||||
Process<false> process;
|
||||
@@ -484,6 +487,14 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper routine to simplify the interface when recieving from the GPU using
|
||||
/// thread private pointers to the underly value and the destination pointer
|
||||
/// contains enough data to recieve the values.
|
||||
template <bool T> LIBC_INLINE void Port<T>::recv_n(void *dst, uint64_t *size) {
|
||||
void **dst_ptr = &dst;
|
||||
recv_n(dst_ptr, size, [=](uint64_t) { return dst; });
|
||||
}
|
||||
|
||||
/// Receives an arbitrarily sized data buffer across the shared channel in
|
||||
/// multiples of the packet length. The \p alloc function is called with the
|
||||
/// size of the data so that we can initialize the size of the \p dst buffer.
|
||||
@@ -522,9 +533,9 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
|
||||
/// is, there are send operations pending that haven't been serviced on this
|
||||
/// port. Each port instance uses an associated \p opcode to tell the server
|
||||
/// what to do. The Client interface provides the appropriate lane size to the
|
||||
/// port using the platform's returned value.
|
||||
template <uint16_t opcode>
|
||||
[[clang::convergent]] LIBC_INLINE Client::Port Client::open() {
|
||||
/// port using the platform's returned value. It is required that \p opcode is
|
||||
/// uniform between all the lanes for this to work.
|
||||
[[clang::convergent]] LIBC_INLINE Client::Port Client::open(uint16_t opcode) {
|
||||
// Repeatedly perform a naive linear scan for a port that can be opened to
|
||||
// send data.
|
||||
for (uint32_t index = gpu::get_cluster_id();; ++index) {
|
||||
|
||||
@@ -995,24 +995,13 @@ LIBC_INLINE_VAR constexpr bool is_big_int_v = is_big_int<T>::value;
|
||||
|
||||
namespace cpp {
|
||||
|
||||
// Specialization of cpp::bit_cast ('bit.h') from T to BigInt.
|
||||
template <typename To, typename From>
|
||||
LIBC_INLINE constexpr cpp::enable_if_t<
|
||||
(sizeof(To) == sizeof(From)) && cpp::is_trivially_copyable<To>::value &&
|
||||
cpp::is_trivially_copyable<From>::value && is_big_int<To>::value,
|
||||
To>
|
||||
bit_cast(const From &from) {
|
||||
To out;
|
||||
using Storage = decltype(out.val);
|
||||
out.val = cpp::bit_cast<Storage>(from);
|
||||
return out;
|
||||
}
|
||||
|
||||
// Specialization of cpp::bit_cast ('bit.h') from BigInt to T.
|
||||
template <typename To, size_t Bits>
|
||||
LIBC_INLINE constexpr cpp::enable_if_t<
|
||||
sizeof(To) == sizeof(UInt<Bits>) &&
|
||||
#if !LIBC_HAS_BUILTIN(__builtin_bit_cast)
|
||||
cpp::is_trivially_constructible<To>::value &&
|
||||
#endif
|
||||
cpp::is_trivially_copyable<To>::value &&
|
||||
cpp::is_trivially_copyable<UInt<Bits>>::value,
|
||||
To>
|
||||
|
||||
@@ -8,3 +8,58 @@ add_entrypoint_object(
|
||||
libc.src.__support.RPC.rpc_client
|
||||
libc.src.__support.GPU.utils
|
||||
)
|
||||
|
||||
add_entrypoint_object(
|
||||
rpc_open_port
|
||||
SRCS
|
||||
rpc_open_port.cpp
|
||||
HDRS
|
||||
rpc_open_port.h
|
||||
DEPENDS
|
||||
libc.src.__support.RPC.rpc_client
|
||||
libc.src.__support.GPU.utils
|
||||
)
|
||||
|
||||
add_entrypoint_object(
|
||||
rpc_close_port
|
||||
SRCS
|
||||
rpc_close_port.cpp
|
||||
HDRS
|
||||
rpc_close_port.h
|
||||
DEPENDS
|
||||
libc.src.__support.RPC.rpc_client
|
||||
libc.src.__support.GPU.utils
|
||||
)
|
||||
|
||||
add_entrypoint_object(
|
||||
rpc_get_buffer
|
||||
SRCS
|
||||
rpc_get_buffer.cpp
|
||||
HDRS
|
||||
rpc_get_buffer.h
|
||||
DEPENDS
|
||||
libc.src.__support.RPC.rpc_client
|
||||
libc.src.__support.GPU.utils
|
||||
)
|
||||
|
||||
add_entrypoint_object(
|
||||
rpc_send_n
|
||||
SRCS
|
||||
rpc_send_n.cpp
|
||||
HDRS
|
||||
rpc_send_n.h
|
||||
DEPENDS
|
||||
libc.src.__support.RPC.rpc_client
|
||||
libc.src.__support.GPU.utils
|
||||
)
|
||||
|
||||
add_entrypoint_object(
|
||||
rpc_recv_n
|
||||
SRCS
|
||||
rpc_recv_n.cpp
|
||||
HDRS
|
||||
rpc_recv_n.h
|
||||
DEPENDS
|
||||
libc.src.__support.RPC.rpc_client
|
||||
libc.src.__support.GPU.utils
|
||||
)
|
||||
|
||||
25
libc/src/gpu/rpc_close_port.cpp
Normal file
25
libc/src/gpu/rpc_close_port.cpp
Normal file
@@ -0,0 +1,25 @@
|
||||
//===---------- GPU implementation of the external RPC port interface -----===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "src/gpu/rpc_close_port.h"
|
||||
|
||||
#include "src/__support/GPU/utils.h"
|
||||
#include "src/__support/RPC/rpc_client.h"
|
||||
#include "src/__support/common.h"
|
||||
|
||||
namespace LIBC_NAMESPACE {
|
||||
|
||||
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
|
||||
|
||||
LLVM_LIBC_FUNCTION(void, rpc_close_port, (rpc_port_t * handle)) {
|
||||
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
|
||||
port.close();
|
||||
*handle = cpp::bit_cast<rpc_port_t>(port);
|
||||
}
|
||||
|
||||
} // namespace LIBC_NAMESPACE
|
||||
20
libc/src/gpu/rpc_close_port.h
Normal file
20
libc/src/gpu/rpc_close_port.h
Normal file
@@ -0,0 +1,20 @@
|
||||
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H
|
||||
#define LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H
|
||||
|
||||
#include <gpu/rpc.h>
|
||||
|
||||
namespace LIBC_NAMESPACE {
|
||||
|
||||
void rpc_close_port(rpc_port_t *handle);
|
||||
|
||||
} // namespace LIBC_NAMESPACE
|
||||
|
||||
#endif // LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H
|
||||
26
libc/src/gpu/rpc_open_port.cpp
Normal file
26
libc/src/gpu/rpc_open_port.cpp
Normal file
@@ -0,0 +1,26 @@
|
||||
//===---------- GPU implementation of the external RPC port interface -----===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "src/gpu/rpc_open_port.h"
|
||||
|
||||
#include "src/__support/CPP/bit.h"
|
||||
#include "src/__support/GPU/utils.h"
|
||||
#include "src/__support/RPC/rpc_client.h"
|
||||
#include "src/__support/common.h"
|
||||
|
||||
namespace LIBC_NAMESPACE {
|
||||
|
||||
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
|
||||
|
||||
LLVM_LIBC_FUNCTION(rpc_port_t, rpc_open_port, (unsigned opcode)) {
|
||||
uint32_t uniform = gpu::broadcast_value(gpu::get_lane_mask(), opcode);
|
||||
rpc::Client::Port port = rpc::client.open(static_cast<uint16_t>(uniform));
|
||||
return cpp::bit_cast<rpc_port_t>(port);
|
||||
}
|
||||
|
||||
} // namespace LIBC_NAMESPACE
|
||||
20
libc/src/gpu/rpc_open_port.h
Normal file
20
libc/src/gpu/rpc_open_port.h
Normal file
@@ -0,0 +1,20 @@
|
||||
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H
|
||||
#define LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H
|
||||
|
||||
#include <gpu/rpc.h>
|
||||
|
||||
namespace LIBC_NAMESPACE {
|
||||
|
||||
rpc_port_t rpc_open_port(unsigned opcode);
|
||||
|
||||
} // namespace LIBC_NAMESPACE
|
||||
|
||||
#endif // LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H
|
||||
26
libc/src/gpu/rpc_recv_n.cpp
Normal file
26
libc/src/gpu/rpc_recv_n.cpp
Normal file
@@ -0,0 +1,26 @@
|
||||
//===---------- GPU implementation of the external RPC port interface -----===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "src/gpu/rpc_recv_n.h"
|
||||
|
||||
#include "src/__support/GPU/utils.h"
|
||||
#include "src/__support/RPC/rpc_client.h"
|
||||
#include "src/__support/common.h"
|
||||
|
||||
namespace LIBC_NAMESPACE {
|
||||
|
||||
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
|
||||
|
||||
LLVM_LIBC_FUNCTION(void, rpc_recv_n,
|
||||
(rpc_port_t * handle, void *dst, size_t *size)) {
|
||||
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
|
||||
port.recv_n(dst, reinterpret_cast<uint64_t *>(size));
|
||||
*handle = cpp::bit_cast<rpc_port_t>(port);
|
||||
}
|
||||
|
||||
} // namespace LIBC_NAMESPACE
|
||||
20
libc/src/gpu/rpc_recv_n.h
Normal file
20
libc/src/gpu/rpc_recv_n.h
Normal file
@@ -0,0 +1,20 @@
|
||||
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_LIBC_SRC_GPU_RPC_RECV_H
|
||||
#define LLVM_LIBC_SRC_GPU_RPC_RECV_H
|
||||
|
||||
#include <gpu/rpc.h>
|
||||
|
||||
namespace LIBC_NAMESPACE {
|
||||
|
||||
void rpc_recv_n(rpc_port_t *handle, void *dst, size_t *size);
|
||||
|
||||
} // namespace LIBC_NAMESPACE
|
||||
|
||||
#endif // LLVM_LIBC_SRC_GPU_RPC_RECV_H
|
||||
26
libc/src/gpu/rpc_send_n.cpp
Normal file
26
libc/src/gpu/rpc_send_n.cpp
Normal file
@@ -0,0 +1,26 @@
|
||||
//===---------- GPU implementation of the external RPC port interface -----===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "src/gpu/rpc_send_n.h"
|
||||
|
||||
#include "src/__support/GPU/utils.h"
|
||||
#include "src/__support/RPC/rpc_client.h"
|
||||
#include "src/__support/common.h"
|
||||
|
||||
namespace LIBC_NAMESPACE {
|
||||
|
||||
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
|
||||
|
||||
LLVM_LIBC_FUNCTION(void, rpc_send_n,
|
||||
(rpc_port_t * handle, const void *src, size_t size)) {
|
||||
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
|
||||
port.send_n(src, size);
|
||||
*handle = cpp::bit_cast<rpc_port_t>(port);
|
||||
}
|
||||
|
||||
} // namespace LIBC_NAMESPACE
|
||||
20
libc/src/gpu/rpc_send_n.h
Normal file
20
libc/src/gpu/rpc_send_n.h
Normal file
@@ -0,0 +1,20 @@
|
||||
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_LIBC_SRC_GPU_RPC_SEND_H
|
||||
#define LLVM_LIBC_SRC_GPU_RPC_SEND_H
|
||||
|
||||
#include <gpu/rpc.h>
|
||||
|
||||
namespace LIBC_NAMESPACE {
|
||||
|
||||
void rpc_send_n(rpc_port_t *handle, const void *src, size_t size);
|
||||
|
||||
} // namespace LIBC_NAMESPACE
|
||||
|
||||
#endif // LLVM_LIBC_SRC_GPU_RPC_SEND_H
|
||||
@@ -44,6 +44,21 @@ add_integration_test(
|
||||
rpc_interface_test.cpp
|
||||
)
|
||||
|
||||
add_integration_test(
|
||||
startup_rpc_external_interface_test
|
||||
SUITE libc-startup-tests
|
||||
SRCS
|
||||
rpc_external_interface_test.cpp
|
||||
DEPENDS
|
||||
libc.src.gpu.rpc_open_port
|
||||
libc.src.gpu.rpc_send_n
|
||||
libc.src.gpu.rpc_recv_n
|
||||
libc.src.gpu.rpc_close_port
|
||||
LOADER_ARGS
|
||||
--threads 32
|
||||
--blocks 8
|
||||
)
|
||||
|
||||
add_integration_test(
|
||||
startup_rpc_stream_test
|
||||
SUITE libc-startup-tests
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
//===-- Loader test to check the external RPC interface with the loader ---===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <gpu/rpc.h>
|
||||
|
||||
#include "src/gpu/rpc_close_port.h"
|
||||
#include "src/gpu/rpc_open_port.h"
|
||||
#include "src/gpu/rpc_recv_n.h"
|
||||
#include "src/gpu/rpc_send_n.h"
|
||||
|
||||
#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
|
||||
#include "src/__support/GPU/utils.h"
|
||||
#include "src/__support/RPC/rpc_client.h"
|
||||
#include "test/IntegrationTest/test.h"
|
||||
|
||||
using namespace LIBC_NAMESPACE;
|
||||
|
||||
static void test_interface() {
|
||||
uint32_t num_additions =
|
||||
10 + 10 * gpu::get_thread_id() + 10 * gpu::get_block_id();
|
||||
uint64_t cnt = 0;
|
||||
for (uint32_t i = 0; i < num_additions; ++i) {
|
||||
size_t size = sizeof(uint64_t);
|
||||
rpc_port_t port = LIBC_NAMESPACE::rpc_open_port(RPC_TEST_EXTERNAL);
|
||||
LIBC_NAMESPACE::rpc_send_n(&port, &cnt, size);
|
||||
LIBC_NAMESPACE::rpc_recv_n(&port, &cnt, &size);
|
||||
LIBC_NAMESPACE::rpc_close_port(&port);
|
||||
ASSERT_TRUE(size == sizeof(uint64_t));
|
||||
}
|
||||
|
||||
ASSERT_TRUE(cnt == num_additions && "Invalid number of increments");
|
||||
}
|
||||
|
||||
TEST_MAIN(int argc, char **argv, char **envp) {
|
||||
test_interface();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -11,8 +11,8 @@
|
||||
|
||||
#include "utils/gpu/server/llvmlibc_rpc_server.h"
|
||||
|
||||
#include "llvm-libc-types/rpc_opcodes_t.h"
|
||||
#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
|
||||
#include "llvm-libc-types/rpc_opcodes_t.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
@@ -222,6 +222,28 @@ inline void register_rpc_callbacks(uint32_t device_id) {
|
||||
}
|
||||
},
|
||||
nullptr);
|
||||
|
||||
// Register the stream test handler.
|
||||
rpc_register_callback(
|
||||
device_id, static_cast<rpc_opcode_t>(RPC_TEST_EXTERNAL),
|
||||
[](rpc_port_t port, void *data) {
|
||||
uint64_t sizes[lane_size] = {0};
|
||||
void *dst[lane_size] = {nullptr};
|
||||
rpc_recv_n(
|
||||
port, dst, sizes,
|
||||
[](uint64_t size, void *) -> void * { return new char[size]; },
|
||||
nullptr);
|
||||
for (uint64_t i = 0; i < lane_size; ++i) {
|
||||
if (dst[i])
|
||||
*reinterpret_cast<uint64_t *>(dst[i]) += 1;
|
||||
}
|
||||
rpc_send_n(port, dst, sizes);
|
||||
for (uint64_t i = 0; i < lane_size; ++i) {
|
||||
if (dst[i])
|
||||
delete[] reinterpret_cast<uint8_t *>(dst[i]);
|
||||
}
|
||||
},
|
||||
nullptr);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -25,6 +25,19 @@ enum OMPTgtExecModeFlags : unsigned char {
|
||||
OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
|
||||
};
|
||||
|
||||
enum OMPTgtHostRPCArgType {
|
||||
// No need to copy.
|
||||
OMP_HOST_RPC_ARG_SCALAR = 0,
|
||||
OMP_HOST_RPC_ARG_PTR = 1,
|
||||
// Copy to device.
|
||||
OMP_HOST_RPC_ARG_COPY_TO = OMP_HOST_RPC_ARG_PTR | (1 << 1),
|
||||
// Copy to device.
|
||||
OMP_HOST_RPC_ARG_COPY_FROM = OMP_HOST_RPC_ARG_PTR | (1 << 2),
|
||||
// Copy to and from device.
|
||||
OMP_HOST_RPC_ARG_COPY_TOFROM =
|
||||
OMP_HOST_RPC_ARG_COPY_TO | OMP_HOST_RPC_ARG_COPY_FROM,
|
||||
};
|
||||
|
||||
} // end namespace omp
|
||||
} // end namespace llvm
|
||||
|
||||
|
||||
@@ -227,7 +227,9 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
|
||||
__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
|
||||
|
||||
__OMP_RTL(omp_get_thread_num, false, Int32, )
|
||||
__OMP_RTL(omp_get_bulk_thread_num, false, Int32, )
|
||||
__OMP_RTL(omp_get_num_threads, false, Int32, )
|
||||
__OMP_RTL(omp_get_bulk_num_threads, false, Int32, )
|
||||
__OMP_RTL(omp_get_max_threads, false, Int32, )
|
||||
__OMP_RTL(omp_in_parallel, false, Int32, )
|
||||
__OMP_RTL(omp_get_dynamic, false, Int32, )
|
||||
@@ -490,6 +492,8 @@ __OMP_RTL(__kmpc_reduction_get_fixed_buffer, false, VoidPtr, )
|
||||
|
||||
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
|
||||
|
||||
__OMP_RTL(malloc, false, VoidPtr, SizeTy)
|
||||
__OMP_RTL(free, false, Void, VoidPtr)
|
||||
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
|
||||
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
|
||||
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
|
||||
@@ -503,6 +507,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
|
||||
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
|
||||
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
|
||||
|
||||
__OMP_RTL(__kmpc_launch_parallel_51_kernel, false, Void, Int8Ptr, Int32, Int32,
|
||||
Int32, VoidPtrPtr, Int64)
|
||||
|
||||
__OMP_RTL(__last, false, Void, )
|
||||
|
||||
#undef __OMP_RTL
|
||||
@@ -710,6 +717,8 @@ __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
|
||||
|
||||
__OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
|
||||
__OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
|
||||
__OMP_RTL_ATTRS(omp_get_bulk_thread_num, GetterAttrs, SExt, ParamAttrs())
|
||||
__OMP_RTL_ATTRS(omp_get_bulk_num_threads, GetterAttrs, SExt, ParamAttrs())
|
||||
__OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
|
||||
__OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
|
||||
__OMP_RTL_ATTRS(omp_get_dynamic, GetterAttrs, SExt, ParamAttrs())
|
||||
|
||||
@@ -60,6 +60,9 @@ struct Config {
|
||||
bool VerifyEach = false;
|
||||
bool DisableVerify = false;
|
||||
|
||||
/// Use the standard optimization pipeline.
|
||||
bool UseDefaultPipeline = false;
|
||||
|
||||
/// Flag to indicate that the optimizer should not assume builtins are present
|
||||
/// on the target.
|
||||
bool Freestanding = false;
|
||||
|
||||
@@ -303,6 +303,9 @@ public:
|
||||
/// by LTO but might not be visible from bitcode symbol table.
|
||||
static ArrayRef<const char*> getRuntimeLibcallSymbols();
|
||||
|
||||
/// Returns the context.
|
||||
LLVMContext &getContext() { return RegularLTO.Ctx; }
|
||||
|
||||
private:
|
||||
Config Conf;
|
||||
|
||||
|
||||
31
llvm/include/llvm/Transforms/IPO/HostRPC.h
Normal file
31
llvm/include/llvm/Transforms/IPO/HostRPC.h
Normal file
@@ -0,0 +1,31 @@
|
||||
//===- Transform/IPO/HostRPC.h - Code of automatic host rpc -----*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TRANSFORMS_IPO_HOSTRPC_H
|
||||
#define LLVM_TRANSFORMS_IPO_HOSTRPC_H
|
||||
|
||||
#include "llvm/Analysis/CGSCCPassManager.h"
|
||||
#include "llvm/IR/PassManager.h"
|
||||
|
||||
namespace llvm {
|
||||
class HostRPCPass : public PassInfoMixin<HostRPCPass> {
|
||||
public:
|
||||
HostRPCPass() : LTOPhase(ThinOrFullLTOPhase::None) {}
|
||||
HostRPCPass(ThinOrFullLTOPhase LTOPhase) : LTOPhase(LTOPhase) {}
|
||||
|
||||
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
|
||||
|
||||
private:
|
||||
const ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None;
|
||||
};
|
||||
} // namespace llvm
|
||||
|
||||
#endif // LLVM_TRANSFORMS_IPO_HOSTRPC_H
|
||||
@@ -0,0 +1,30 @@
|
||||
//===--------------- CanonicalizeMainFunction.h -----------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Utility function to canonicalize main function.
|
||||
// The canonical main function is defined as: int main(int argc, char *argv[]);
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
|
||||
#define LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
|
||||
|
||||
#include "llvm/IR/PassManager.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
/// A pass that canonicalizes main function in a module.
|
||||
class CanonicalizeMainFunctionPass
|
||||
: public PassInfoMixin<CanonicalizeMainFunctionPass> {
|
||||
public:
|
||||
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
|
||||
};
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif // LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
|
||||
@@ -330,6 +330,8 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
|
||||
report_fatal_error(Twine("unable to parse pass pipeline description '") +
|
||||
Conf.OptPipeline + "': " + toString(std::move(Err)));
|
||||
}
|
||||
} else if (Conf.UseDefaultPipeline) {
|
||||
MPM.addPass(PB.buildPerModuleDefaultPipeline(OL));
|
||||
} else if (IsThinLTO) {
|
||||
MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary));
|
||||
} else {
|
||||
|
||||
@@ -141,6 +141,7 @@
|
||||
#include "llvm/Transforms/IPO/GlobalDCE.h"
|
||||
#include "llvm/Transforms/IPO/GlobalOpt.h"
|
||||
#include "llvm/Transforms/IPO/GlobalSplit.h"
|
||||
#include "llvm/Transforms/IPO/HostRPC.h"
|
||||
#include "llvm/Transforms/IPO/HotColdSplitting.h"
|
||||
#include "llvm/Transforms/IPO/IROutliner.h"
|
||||
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
|
||||
@@ -264,6 +265,7 @@
|
||||
#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
|
||||
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
|
||||
#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
|
||||
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
|
||||
#include "llvm/Transforms/Utils/CountVisits.h"
|
||||
#include "llvm/Transforms/Utils/DXILUpgrade.h"
|
||||
#include "llvm/Transforms/Utils/Debugify.h"
|
||||
|
||||
@@ -53,6 +53,7 @@
|
||||
#include "llvm/Transforms/IPO/GlobalDCE.h"
|
||||
#include "llvm/Transforms/IPO/GlobalOpt.h"
|
||||
#include "llvm/Transforms/IPO/GlobalSplit.h"
|
||||
#include "llvm/Transforms/IPO/HostRPC.h"
|
||||
#include "llvm/Transforms/IPO/HotColdSplitting.h"
|
||||
#include "llvm/Transforms/IPO/IROutliner.h"
|
||||
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
|
||||
@@ -126,6 +127,7 @@
|
||||
#include "llvm/Transforms/Utils/AddDiscriminators.h"
|
||||
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
|
||||
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
|
||||
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
|
||||
#include "llvm/Transforms/Utils/CountVisits.h"
|
||||
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
|
||||
#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
|
||||
@@ -182,9 +184,9 @@ static cl::opt<bool> EnablePostPGOLoopRotation(
|
||||
"enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden,
|
||||
cl::desc("Run the loop rotation transformation after PGO instrumentation"));
|
||||
|
||||
static cl::opt<bool> EnableGlobalAnalyses(
|
||||
"enable-global-analyses", cl::init(true), cl::Hidden,
|
||||
cl::desc("Enable inter-procedural analyses"));
|
||||
static cl::opt<bool>
|
||||
EnableGlobalAnalyses("enable-global-analyses", cl::init(true), cl::Hidden,
|
||||
cl::desc("Enable inter-procedural analyses"));
|
||||
|
||||
static cl::opt<bool>
|
||||
RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
|
||||
@@ -306,6 +308,13 @@ cl::opt<bool> EnableMemProfContextDisambiguation(
|
||||
extern cl::opt<bool> EnableInferAlignmentPass;
|
||||
} // namespace llvm
|
||||
|
||||
static cl::opt<bool> EnableHostRPC("enable-host-rpc", cl::init(false),
|
||||
cl::Hidden, cl::desc("Enable HostRPC pass"));
|
||||
|
||||
static cl::opt<bool> EnableCanonicalizeMainFunction(
|
||||
"enable-canonicalize-main-function", cl::init(false), cl::Hidden,
|
||||
cl::desc("Enable CanonicalizeMainFunction pass"));
|
||||
|
||||
PipelineTuningOptions::PipelineTuningOptions() {
|
||||
LoopInterleaving = true;
|
||||
LoopVectorization = true;
|
||||
@@ -1091,6 +1100,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
|
||||
PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
|
||||
}
|
||||
|
||||
if (EnableCanonicalizeMainFunction)
|
||||
MPM.addPass(CanonicalizeMainFunctionPass());
|
||||
|
||||
// Try to perform OpenMP specific optimizations on the module. This is a
|
||||
// (quick!) no-op if there are no OpenMP runtime calls present in the module.
|
||||
MPM.addPass(OpenMPOptPass());
|
||||
@@ -1110,11 +1122,10 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
|
||||
// and prior to optimizing globals.
|
||||
// FIXME: This position in the pipeline hasn't been carefully considered in
|
||||
// years, it should be re-analyzed.
|
||||
MPM.addPass(IPSCCPPass(
|
||||
IPSCCPOptions(/*AllowFuncSpec=*/
|
||||
Level != OptimizationLevel::Os &&
|
||||
Level != OptimizationLevel::Oz &&
|
||||
!isLTOPreLink(Phase))));
|
||||
MPM.addPass(IPSCCPPass(IPSCCPOptions(/*AllowFuncSpec=*/
|
||||
Level != OptimizationLevel::Os &&
|
||||
Level != OptimizationLevel::Oz &&
|
||||
!isLTOPreLink(Phase))));
|
||||
|
||||
// Attach metadata to indirect call sites indicating the set of functions
|
||||
// they may target at run-time. This should follow IPSCCP.
|
||||
@@ -1588,7 +1599,7 @@ PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO,
|
||||
ModulePassManager
|
||||
PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
|
||||
if (Level == OptimizationLevel::O0)
|
||||
return buildO0DefaultPipeline(Level, /*LTOPreLink*/true);
|
||||
return buildO0DefaultPipeline(Level, /*LTOPreLink*/ true);
|
||||
|
||||
ModulePassManager MPM;
|
||||
|
||||
@@ -1711,6 +1722,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
|
||||
// in the current module.
|
||||
MPM.addPass(CrossDSOCFIPass());
|
||||
|
||||
if (EnableCanonicalizeMainFunction)
|
||||
MPM.addPass(CanonicalizeMainFunctionPass());
|
||||
|
||||
if (Level == OptimizationLevel::O0) {
|
||||
// The WPD and LowerTypeTest passes need to run at -O0 to lower type
|
||||
// metadata and intrinsics.
|
||||
@@ -1741,6 +1755,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
|
||||
// Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
|
||||
MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));
|
||||
|
||||
if (EnableHostRPC)
|
||||
MPM.addPass(HostRPCPass(ThinOrFullLTOPhase::FullLTOPostLink));
|
||||
|
||||
// Remove unused virtual tables to improve the quality of code generated by
|
||||
// whole-program devirtualization and bitset lowering.
|
||||
MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));
|
||||
@@ -2115,6 +2132,9 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
|
||||
|
||||
MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
|
||||
|
||||
if (EnableCanonicalizeMainFunction)
|
||||
MPM.addPass(CanonicalizeMainFunctionPass());
|
||||
|
||||
return MPM;
|
||||
}
|
||||
|
||||
|
||||
@@ -47,6 +47,7 @@ MODULE_PASS("attributor", AttributorPass())
|
||||
MODULE_PASS("attributor-light", AttributorLightPass())
|
||||
MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
|
||||
MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass())
|
||||
MODULE_PASS("canonicalize-main-function", CanonicalizeMainFunctionPass())
|
||||
MODULE_PASS("check-debugify", NewPMCheckDebugifyPass())
|
||||
MODULE_PASS("constmerge", ConstantMergePass())
|
||||
MODULE_PASS("coro-cleanup", CoroCleanupPass())
|
||||
@@ -141,6 +142,7 @@ MODULE_PASS("tsan-module", ModuleThreadSanitizerPass())
|
||||
MODULE_PASS("verify", VerifierPass())
|
||||
MODULE_PASS("view-callgraph", CallGraphViewerPass())
|
||||
MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
|
||||
MODULE_PASS("host-rpc", HostRPCPass())
|
||||
#undef MODULE_PASS
|
||||
|
||||
#ifndef MODULE_PASS_WITH_PARAMS
|
||||
|
||||
@@ -397,12 +397,25 @@ static bool getPotentialCopiesOfMemoryValue(
|
||||
dbgs() << "Underlying object is a valid nullptr, giving up.\n";);
|
||||
return false;
|
||||
}
|
||||
// TODO: Use assumed noalias return.
|
||||
if (!isa<AllocaInst>(&Obj) && !isa<GlobalVariable>(&Obj) &&
|
||||
!(IsLoad ? isAllocationFn(&Obj, TLI) : isNoAliasCall(&Obj))) {
|
||||
LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << Obj
|
||||
<< "\n";);
|
||||
return false;
|
||||
if (!isa<AllocaInst>(&Obj) && !isa<GlobalVariable>(&Obj)) {
|
||||
auto *CB = dyn_cast<CallBase>(&Obj);
|
||||
bool R = false;
|
||||
if (CB) {
|
||||
auto &AAN =
|
||||
*A.getOrCreateAAFor<AANoAlias>(IRPosition::callsite_returned(*CB),
|
||||
&QueryingAA, DepClassTy::OPTIONAL);
|
||||
if (AAN.isValidState())
|
||||
R = AAN.isAssumedNoAlias();
|
||||
else
|
||||
R = isNoAliasCall(&Obj);
|
||||
if (!R)
|
||||
R = isAllocationFn(&Obj, TLI);
|
||||
}
|
||||
if (!R) {
|
||||
LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << Obj
|
||||
<< "\n";);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (auto *GV = dyn_cast<GlobalVariable>(&Obj))
|
||||
if (!GV->hasLocalLinkage() &&
|
||||
@@ -2226,7 +2239,8 @@ void Attributor::runTillFixpoint() {
|
||||
|
||||
LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: "
|
||||
<< IterationCounter << "/" << MaxIterations
|
||||
<< " iterations\n");
|
||||
<< " iterations. #Changed AAs: " << ChangedAAs.size()
|
||||
<< "\n");
|
||||
|
||||
// Reset abstract arguments not settled in a sound fixpoint by now. This
|
||||
// happens when we stopped the fixpoint iteration early. Note that only the
|
||||
@@ -2241,13 +2255,21 @@ void Attributor::runTillFixpoint() {
|
||||
|
||||
AbstractState &State = ChangedAA->getState();
|
||||
if (!State.isAtFixpoint()) {
|
||||
LLVM_DEBUG(
|
||||
dbgs() << "\n[Attributor] Invalidate AA " << *ChangedAA
|
||||
<< " as it is not at fix point after max iterations.\n");
|
||||
|
||||
State.indicatePessimisticFixpoint();
|
||||
|
||||
NumAttributesTimedOut++;
|
||||
}
|
||||
|
||||
for (auto &DepIt : ChangedAA->Deps)
|
||||
for (auto &DepIt : ChangedAA->Deps) {
|
||||
LLVM_DEBUG(dbgs() << "\n[Attributor] Invalidate dependent AA "
|
||||
<< *(DepIt.getPointer()) << " of AA " << *ChangedAA
|
||||
<< "\n");
|
||||
ChangedAAs.push_back(cast<AbstractAttribute>(DepIt.getPointer()));
|
||||
}
|
||||
ChangedAA->Deps.clear();
|
||||
}
|
||||
|
||||
|
||||
@@ -1669,6 +1669,8 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
|
||||
// might change while we iterate through a loop. For now, we give up if
|
||||
// the PHI is not invariant.
|
||||
if (isa<PHINode>(Usr)) {
|
||||
if (!Usr->getType()->isPointerTy())
|
||||
return false;
|
||||
// Note the order here, the Usr access might change the map, CurPtr is
|
||||
// already in it though.
|
||||
bool IsFirstPHIUser = !OffsetInfoMap.count(Usr);
|
||||
@@ -1899,6 +1901,31 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (auto *II = dyn_cast<ICmpInst>(Usr)) {
|
||||
auto CheckIfUsedAsPred = [&](const Use &U, bool &Follow) {
|
||||
const auto *UU = U.getUser();
|
||||
if (isa<SelectInst>(UU))
|
||||
return true;
|
||||
if (isa<BranchInst>(UU))
|
||||
return true;
|
||||
|
||||
LLVM_DEBUG(dbgs() << "[AAPointerInfo] ICmpInst user not handled " << *UU
|
||||
<< "\n");
|
||||
return false;
|
||||
};
|
||||
if (!A.checkForAllUses(CheckIfUsedAsPred, *this, *II,
|
||||
/* CheckBBLivenessOnly */ true,
|
||||
DepClassTy::OPTIONAL,
|
||||
/* IgnoreDroppableUses */ true)) {
|
||||
LLVM_DEBUG(
|
||||
dbgs() << "[AAPointerInfo] Check for all uses failed for ICmpInst "
|
||||
<< *II << "\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
LLVM_DEBUG(dbgs() << "[AAPointerInfo] User not handled " << *Usr << "\n");
|
||||
return false;
|
||||
};
|
||||
@@ -2040,6 +2067,7 @@ struct AAPointerInfoCallSiteReturned final : AAPointerInfoFloating {
|
||||
};
|
||||
} // namespace
|
||||
|
||||
|
||||
/// -----------------------NoUnwind Function Attribute--------------------------
|
||||
|
||||
namespace {
|
||||
@@ -11899,10 +11927,35 @@ struct AAUnderlyingObjectsImpl
|
||||
/// See AbstractAttribute::trackStatistics()
|
||||
void trackStatistics() const override {}
|
||||
|
||||
// TODO: This is a temporary solution for terminals.
|
||||
bool checkIfTerminals(Attributor &A, Value *V) {
|
||||
auto *CI = dyn_cast<CallInst>(V);
|
||||
if (!CI)
|
||||
return false;
|
||||
|
||||
Function *Scope = CI->getFunction();
|
||||
const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*Scope);
|
||||
LibFunc TLIFn;
|
||||
if (TLI && TLI->getLibFunc(*CI, TLIFn)) {
|
||||
if (TLIFn == LibFunc::LibFunc_malloc ||
|
||||
TLIFn == LibFunc::LibFunc___kmpc_alloc_shared)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// See AbstractAttribute::updateImpl(...).
|
||||
ChangeStatus updateImpl(Attributor &A) override {
|
||||
auto &Ptr = getAssociatedValue();
|
||||
|
||||
if (checkIfTerminals(A, &Ptr)) {
|
||||
bool Changed = false;
|
||||
Changed |= InterAssumedUnderlyingObjects.insert(&Ptr);
|
||||
Changed |= IntraAssumedUnderlyingObjects.insert(&Ptr);
|
||||
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
|
||||
}
|
||||
|
||||
auto DoUpdate = [&](SmallSetVector<Value *, 8> &UnderlyingObjects,
|
||||
AA::ValueScope Scope) {
|
||||
bool UsedAssumedInformation = false;
|
||||
@@ -11918,6 +11971,10 @@ struct AAUnderlyingObjectsImpl
|
||||
for (unsigned I = 0; I < Values.size(); ++I) {
|
||||
auto &VAC = Values[I];
|
||||
auto *Obj = VAC.getValue();
|
||||
if (checkIfTerminals(A, Obj)) {
|
||||
Changed |= UnderlyingObjects.insert(Obj);
|
||||
continue;
|
||||
}
|
||||
Value *UO = getUnderlyingObject(Obj);
|
||||
if (UO && UO != VAC.getValue() && SeenObjects.insert(UO).second) {
|
||||
const auto *OtherAA = A.getAAFor<AAUnderlyingObjects>(
|
||||
@@ -11948,6 +12005,16 @@ struct AAUnderlyingObjectsImpl
|
||||
continue;
|
||||
}
|
||||
|
||||
if (auto *LI = dyn_cast<LoadInst>(Obj)) {
|
||||
LLVM_DEBUG({
|
||||
dbgs() << "[AAUnderlyingObjects] for CtxI ";
|
||||
getCtxI()->print(dbgs());
|
||||
dbgs() << " at position " << getIRPosition() << " has LoadInst ";
|
||||
LI->print(dbgs());
|
||||
dbgs() << '\n';
|
||||
});
|
||||
}
|
||||
|
||||
Changed |= UnderlyingObjects.insert(Obj);
|
||||
}
|
||||
|
||||
@@ -11958,6 +12025,8 @@ struct AAUnderlyingObjectsImpl
|
||||
Changed |= DoUpdate(IntraAssumedUnderlyingObjects, AA::Intraprocedural);
|
||||
Changed |= DoUpdate(InterAssumedUnderlyingObjects, AA::Interprocedural);
|
||||
|
||||
LLVM_DEBUG(dumpState(dbgs()));
|
||||
|
||||
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
|
||||
}
|
||||
|
||||
@@ -11996,6 +12065,19 @@ private:
|
||||
return Changed;
|
||||
}
|
||||
|
||||
void dumpState(raw_ostream &O) {
|
||||
O << "Underlying objects:\nintra procedureal:\n";
|
||||
for (auto *Obj : IntraAssumedUnderlyingObjects) {
|
||||
Obj->print(O);
|
||||
O << '\n';
|
||||
}
|
||||
O << "inter procedureal:\n";
|
||||
for (auto *Obj : InterAssumedUnderlyingObjects) {
|
||||
Obj->print(O);
|
||||
O << '\n';
|
||||
}
|
||||
}
|
||||
|
||||
/// All the underlying objects collected so far via intra procedural scope.
|
||||
SmallSetVector<Value *, 8> IntraAssumedUnderlyingObjects;
|
||||
/// All the underlying objects collected so far via inter procedural scope.
|
||||
|
||||
@@ -20,6 +20,7 @@ add_llvm_component_library(LLVMipo
|
||||
GlobalDCE.cpp
|
||||
GlobalOpt.cpp
|
||||
GlobalSplit.cpp
|
||||
HostRPC.cpp
|
||||
HotColdSplitting.cpp
|
||||
IPO.cpp
|
||||
IROutliner.cpp
|
||||
|
||||
890
llvm/lib/Transforms/IPO/HostRPC.cpp
Normal file
890
llvm/lib/Transforms/IPO/HostRPC.cpp
Normal file
@@ -0,0 +1,890 @@
|
||||
//===- Transform/IPO/HostRPC.h - Code of automatic host rpc -----*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Transforms/IPO/HostRPC.h"
|
||||
|
||||
#include "llvm/ADT/EnumeratedArray.h"
|
||||
#include "llvm/CodeGen/CommandFlags.h"
|
||||
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
|
||||
#include "llvm/IR/BasicBlock.h"
|
||||
#include "llvm/IR/Constant.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InstrTypes.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/LLVMContext.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IRReader/IRReader.h"
|
||||
#include "llvm/Support/Casting.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/SourceMgr.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/Transforms/IPO/Attributor.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#define DEBUG_TYPE "host-rpc"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
|
||||
|
||||
static cl::opt<bool>
|
||||
UseDummyHostModule("host-rpc-use-dummy-host-module", cl::init(false),
|
||||
cl::Hidden,
|
||||
cl::desc("Use dummy host module if there no host module "
|
||||
"attached to the device module"));
|
||||
|
||||
namespace {
|
||||
|
||||
enum class HostRPCRuntimeFunction {
|
||||
#define __OMPRTL_HOST_RPC(_ENUM) OMPRTL_##_ENUM
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc),
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg),
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg),
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait),
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val),
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper),
|
||||
__OMPRTL_HOST_RPC(__last),
|
||||
#undef __OMPRTL_HOST_RPC
|
||||
};
|
||||
|
||||
#define __OMPRTL_HOST_RPC(_ENUM) \
|
||||
auto OMPRTL_##_ENUM = HostRPCRuntimeFunction::OMPRTL_##_ENUM;
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc)
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg)
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg)
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait)
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val)
|
||||
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper)
|
||||
#undef __OMPRTL_HOST_RPC
|
||||
|
||||
// TODO: Remove those functions implemented in device runtime.
|
||||
static constexpr const char *InternalPrefix[] = {
|
||||
"__kmp", "llvm.", "nvm.",
|
||||
"omp_", "vprintf", "malloc",
|
||||
"free", "__keep_alive", "__llvm_omp_vprintf"};
|
||||
|
||||
/// Return true if \p F is a runtime/internal function (its name matches one of
/// the prefixes in InternalPrefix) and therefore must not be RPC-forwarded.
bool isInternalFunction(Function &F) {
  StringRef FnName = F.getName();

  for (const char *Prefix : InternalPrefix) {
    if (FnName.starts_with(Prefix))
      return true;
  }

  return false;
}
|
||||
|
||||
/// Map an IR type to the short name used when mangling wrapper function names
/// (e.g. "i32", "f64", "ptr", or the struct's own name). Aborts on types the
/// RPC mechanism cannot marshal.
std::string typeToString(Type *T) {
  if (T->isIntegerTy())
    return "i" + std::to_string(T->getIntegerBitWidth());
  if (T->isPointerTy())
    return "ptr";
  if (T->isStructTy())
    return std::string(T->getStructName());
  if (T->is16bitFPTy())
    return "f16";
  if (T->isFloatTy())
    return "f32";
  if (T->isDoubleTy())
    return "f64";

  LLVM_DEBUG(dbgs() << "[HostRPC] unknown type " << *T
                    << " for typeToString.\n";);

  llvm_unreachable("unknown type");
}
|
||||
|
||||
/// Rewrites calls to unresolved external functions in a device module into
/// RPC round-trips to the host: each such call goes through a device-side
/// wrapper (marshal args, send, wait) and a matching host-side wrapper
/// (unmarshal args, call the real function, post the return value).
class HostRPC {
  /// LLVM context instance (shared by both modules; asserted in the ctor).
  LLVMContext &Context;

  /// Device module.
  Module &M;
  /// Host module
  Module &HM;

  /// Data layout of the device module.
  DataLayout DL;

  /// Builder used for all IR emitted by this class.
  IRBuilder<> Builder;

  /// External functions we are operating on.
  SmallSetVector<Function *, 8> FunctionWorkList;

  /// Attributor instance.
  Attributor &A;

  // Types
  Type *Int8PtrTy;
  Type *VoidTy;
  Type *Int32Ty;
  Type *Int64Ty;
  // Per-argument marshaling record: { base, type, size, next } — a singly
  // linked list node (see rewriteWithHostRPC).
  StructType *ArgInfoTy;
  // Values
  Constant *NullPtr;
  Constant *NullInt64;

  /// A call site being rewritten plus the concrete types of all its
  /// arguments (needed to mangle wrappers for vararg callees).
  struct CallSiteInfo {
    CallInst *CI = nullptr;
    SmallVector<Type *> Params;
  };

  /// One marshaling entry for a call argument (or one of its underlying
  /// objects): the pointer/value itself, its ArgType tag, and its byte size.
  struct HostRPCArgInfo {
    Value *BasePtr = nullptr;
    Constant *Type = nullptr;
    Value *Size = nullptr;
  };

  /// Host wrappers in creation order; the index into this table is the
  /// call number dispatched on by emitHostWrapperInvoker.
  SmallVector<Function *> HostEntryTable;

  // Runtime function declarations, indexed by HostRPCRuntimeFunction.
  EnumeratedArray<Function *, HostRPCRuntimeFunction,
                  HostRPCRuntimeFunction::OMPRTL___last>
      RFIs;

  // (original call, replacement wrapper call) pairs; RAUW'ed in run().
  SmallVector<std::pair<CallInst *, CallInst *>> CallInstMap;

  Constant *getConstantInt64(uint64_t Val) {
    return ConstantInt::get(Int64Ty, Val);
  }

  /// Build "__kmpc_host_rpc_wrapper_<callee>" and, for vararg callees, append
  /// a type suffix per extra argument so different vararg shapes get
  /// different wrappers.
  static std::string getWrapperFunctionName(Function *F, CallSiteInfo &CSI) {
    std::string Name = "__kmpc_host_rpc_wrapper_" + std::string(F->getName());
    if (!F->isVarArg())
      return Name;

    for (unsigned I = F->getFunctionType()->getNumParams();
         I < CSI.Params.size(); ++I) {
      Name.push_back('_');
      Name.append(typeToString(CSI.Params[I]));
    }

    return Name;
  }

  /// Seed AAUnderlyingObjects queries for every pointer argument of every
  /// rewritten call site before the Attributor runs.
  void registerAAs();

  Value *convertToInt64Ty(Value *V);

  Value *convertFromInt64TyTo(Value *V, Type *TargetTy);

  Constant *convertToInt64Ty(Constant *C);

  Constant *convertFromInt64TyTo(Constant *C, Type *T);

  // int device_wrapper(call_no, arg_info, ...) {
  //   void *desc = __kmpc_host_rpc_get_desc(call_no, num_args, arg_info);
  //   __kmpc_host_rpc_add_arg(desc, arg1, sizeof(arg1));
  //   __kmpc_host_rpc_add_arg(desc, arg2, sizeof(arg2));
  //   ...
  //   int r = (int)__kmpc_host_rpc_send_and_wait(desc);
  //   return r;
  // }
  Function *getDeviceWrapperFunction(StringRef WrapperName, Function *F,
                                     CallSiteInfo &CSI);

  // void host_wrapper(desc) {
  //   int arg1 = (int)__kmpc_host_rpc_get_arg(desc, 0);
  //   float arg2 = (float)__kmpc_host_rpc_get_arg(desc, 1);
  //   char *arg3 = (char *)__kmpc_host_rpc_get_arg(desc, 2);
  //   ...
  //   int r = actual_call(arg1, arg2, arg3, ...);
  //   __kmpc_host_rpc_set_ret_val(ptr(desc, (int64_t)r);
  // }
  Function *getHostWrapperFunction(StringRef WrapperName, Function *F,
                                   CallSiteInfo &CSI);

  /// Rewrite all call sites of \p F to go through the device wrapper.
  bool rewriteWithHostRPC(Function *F);

  /// Emit the host-side switch that dispatches a call number to the right
  /// host wrapper.
  void emitHostWrapperInvoker();

  /// Rebuild FunctionWorkList from scratch; returns true if non-empty.
  bool recollectInformation();

public:
  HostRPC(Module &DeviceModule, Module &HostModule, Attributor &A)
      : Context(DeviceModule.getContext()), M(DeviceModule), HM(HostModule),
        DL(M.getDataLayout()), Builder(Context), A(A) {
    assert(&M.getContext() == &HM.getContext() &&
           "device and host modules have different context");

    Int8PtrTy = PointerType::getUnqual(Context);
    VoidTy = Type::getVoidTy(Context);
    Int32Ty = Type::getInt32Ty(Context);
    Int64Ty = Type::getInt64Ty(Context);
    NullPtr = ConstantInt::getNullValue(Int8PtrTy);
    NullInt64 = ConstantInt::getNullValue(Int64Ty);

// Declare (or reuse) each runtime function in the right module (MOD is the
// device module M or the host module HM) and record it in RFIs.
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...)                              \
  {                                                                            \
    SmallVector<Type *> Params{__VA_ARGS__};                                   \
    FunctionType *FT = FunctionType::get(RETTY, Params, VARARG);               \
    Function *F = (MOD).getFunction(#_ENUM);                                   \
    if (!F)                                                                    \
      F = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage,     \
                           #_ENUM, (MOD));                                     \
    RFIs[OMPRTL_##_ENUM] = F;                                                  \
  }
    __OMP_RTL(__kmpc_host_rpc_get_desc, M, false, Int8PtrTy, Int32Ty, Int32Ty,
              Int8PtrTy)
    __OMP_RTL(__kmpc_host_rpc_add_arg, M, false, VoidTy, Int8PtrTy, Int64Ty,
              Int32Ty)
    __OMP_RTL(__kmpc_host_rpc_send_and_wait, M, false, Int64Ty, Int8PtrTy)
    __OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
    __OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
              Int64Ty)
    __OMP_RTL(__kmpc_host_rpc_invoke_host_wrapper, HM, false, VoidTy, Int32Ty,
              Int8PtrTy)
#undef __OMP_RTL

    ArgInfoTy = StructType::create({Int64Ty, Int64Ty, Int64Ty, Int8PtrTy},
                                   "struct.arg_info_t");
  }

  /// Entry point; returns true if either module was modified.
  bool run();
};
|
||||
|
||||
/// Marshal \p V into an i64 for transport through the RPC argument channel.
/// Pointers are ptrtoint'ed, integers sign-extended, and floats bit-cast to
/// an integer of equal width and then sign-extended. Constants are folded via
/// the Constant overload.
Value *HostRPC::convertToInt64Ty(Value *V) {
  if (auto *AsConst = dyn_cast<Constant>(V))
    return convertToInt64Ty(AsConst);

  Type *Ty = V->getType();
  if (Ty == Int64Ty)
    return V;

  if (Ty->isPointerTy())
    return Builder.CreatePtrToInt(V, Int64Ty);

  if (Ty->isIntegerTy())
    return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);

  if (Ty->isFloatingPointTy()) {
    // Reinterpret the FP bits as an equally wide integer first so no value
    // conversion happens, then widen to i64.
    Type *BitsTy = Type::getIntNTy(V->getContext(), Ty->getScalarSizeInBits());
    Value *Bits = Builder.CreateBitCast(V, BitsTy);
    return Builder.CreateIntCast(Bits, Int64Ty, /* isSigned */ true);
  }

  llvm_unreachable("unknown cast to int64_t");
}
|
||||
|
||||
/// Unmarshal an i64 RPC value \p V back into type \p T — the exact inverse of
/// convertToInt64Ty: inttoptr for pointers, (signed) truncation for integers,
/// and truncate-then-bitcast for floats. Constants are folded via the
/// Constant overload.
Value *HostRPC::convertFromInt64TyTo(Value *V, Type *T) {
  if (auto *AsConst = dyn_cast<Constant>(V))
    return convertFromInt64TyTo(AsConst, T);

  if (T == Int64Ty)
    return V;

  if (T->isPointerTy())
    return Builder.CreateIntToPtr(V, T);

  if (T->isIntegerTy())
    return Builder.CreateIntCast(V, T, /* isSigned */ true);

  if (T->isFloatingPointTy()) {
    // Narrow to an integer of the target's width, then reinterpret the bits
    // as the FP type (mirrors the bitcast+extend done on the send side).
    Type *BitsTy = Type::getIntNTy(V->getContext(), T->getScalarSizeInBits());
    Value *Bits = Builder.CreateIntCast(V, BitsTy, /* isSigned */ true);
    return Builder.CreateBitCast(Bits, T);
  }

  llvm_unreachable("unknown cast from int64_t");
}
|
||||
|
||||
/// Constant-folding counterpart of convertToInt64Ty(Value *): marshal the
/// constant \p C into an i64 without emitting instructions.
///
/// The integer and FP cases previously dead-ended in llvm_unreachable because
/// ConstantExpr::getIntegerCast/getBitCast were removed from newer LLVM; fold
/// them directly via APInt/APFloat instead, mirroring the Value overload
/// (sext/trunc for integers, bitcast-then-sext for floats).
Constant *HostRPC::convertToInt64Ty(Constant *C) {
  Type *T = C->getType();

  if (T == Int64Ty)
    return C;

  if (T->isPointerTy())
    return ConstantExpr::getPtrToInt(C, Int64Ty);

  if (T->isIntegerTy()) {
    // Signed integer cast, folded: sign-extend or truncate the bits to 64.
    if (auto *CInt = dyn_cast<ConstantInt>(C))
      return ConstantInt::get(Int64Ty, CInt->getValue().sextOrTrunc(64));
    llvm_unreachable("unsupported non-ConstantInt integer constant");
  }

  if (T->isFloatingPointTy()) {
    // Reinterpret the FP bits as an integer of equal width, then sign-extend
    // to 64 bits — same semantics as the bitcast + sext on the Value path.
    if (auto *CFP = dyn_cast<ConstantFP>(C)) {
      APInt Bits = CFP->getValueAPF().bitcastToAPInt();
      return ConstantInt::get(Int64Ty, Bits.sextOrTrunc(64));
    }
    llvm_unreachable("unsupported cast from float to int64_t");
  }

  llvm_unreachable("unknown cast to int64_t");
}
|
||||
|
||||
/// Constant-folding counterpart of convertFromInt64TyTo(Value *, Type *):
/// unmarshal the i64 constant \p C back into type \p T without emitting
/// instructions.
///
/// The integer and FP cases previously dead-ended in llvm_unreachable because
/// ConstantExpr::getIntegerCast/getBitCast were removed from newer LLVM; fold
/// them via APInt/APFloat instead, mirroring the Value overload (sext/trunc
/// for integers, truncate-then-bitcast for floats).
Constant *HostRPC::convertFromInt64TyTo(Constant *C, Type *T) {
  if (T == Int64Ty)
    return C;

  if (T->isPointerTy())
    return ConstantExpr::getIntToPtr(C, T);

  if (T->isIntegerTy()) {
    // Signed integer cast, folded: sign-extend or truncate to T's width.
    if (auto *CInt = dyn_cast<ConstantInt>(C))
      return ConstantInt::get(
          T, CInt->getValue().sextOrTrunc(T->getIntegerBitWidth()));
    llvm_unreachable("unsupported non-ConstantInt integer constant");
  }

  if (T->isFloatingPointTy()) {
    // Narrow the i64 to T's bit width, then reinterpret the bits as T — the
    // inverse of the bitcast + sext done when marshaling.
    if (auto *CInt = dyn_cast<ConstantInt>(C)) {
      APInt Bits = CInt->getValue().sextOrTrunc(T->getScalarSizeInBits());
      return ConstantFP::get(T->getContext(),
                             APFloat(T->getFltSemantics(), Bits));
    }
    llvm_unreachable("unsupported cast from int64_t to float");
  }

  llvm_unreachable("unknown cast from int64_t");
}
|
||||
|
||||
void HostRPC::registerAAs() {
|
||||
for (auto *F : FunctionWorkList)
|
||||
for (User *U : F->users()) {
|
||||
auto *CI = dyn_cast<CallInst>(U);
|
||||
if (!CI)
|
||||
continue;
|
||||
|
||||
for (unsigned I = 0; I < CI->arg_size(); ++I) {
|
||||
Value *Operand = CI->getArgOperand(I);
|
||||
if (!Operand->getType()->isPointerTy())
|
||||
continue;
|
||||
A.getOrCreateAAFor<AAUnderlyingObjects>(
|
||||
IRPosition::callsite_argument(*CI, I),
|
||||
/* QueryingAA */ nullptr, DepClassTy::NONE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool HostRPC::recollectInformation() {
|
||||
FunctionWorkList.clear();
|
||||
|
||||
for (Function &F : M) {
|
||||
// If the function is already defined, it definitely does not require RPC.
|
||||
if (!F.isDeclaration())
|
||||
continue;
|
||||
|
||||
// If it is an internal function, skip it as well.
|
||||
if (isInternalFunction(F))
|
||||
continue;
|
||||
|
||||
// If there is no use of the function, skip it.
|
||||
if (F.use_empty())
|
||||
continue;
|
||||
|
||||
FunctionWorkList.insert(&F);
|
||||
}
|
||||
|
||||
return !FunctionWorkList.empty();
|
||||
}
|
||||
|
||||
/// Drive the whole rewrite: seed attributes, run the Attributor, rewrite all
/// RPC-requiring call sites, replace/erase the originals, and emit the host
/// dispatcher. Returns true if anything changed.
bool HostRPC::run() {
  bool Changed = false;

  if (!recollectInformation())
    return Changed;

  Changed = true;

  // We add a couple of assumptions to those RPC functions such that AAs will
  // not error out because of unknown implementation of those functions.
  for (Function &F : M) {
    if (!F.isDeclaration())
      continue;

    F.addFnAttr(Attribute::NoRecurse);

    // Mark every pointer argument nocapture on the declaration itself...
    for (auto &Arg : F.args())
      if (Arg.getType()->isPointerTy())
        Arg.addAttr(Attribute::NoCapture);

    if (!F.isVarArg())
      continue;

    // ...and, for vararg callees, also on the variadic call-site operands,
    // which the declaration's parameter list cannot cover.
    for (User *U : F.users()) {
      auto *CB = dyn_cast<CallBase>(U);
      if (!CB)
        continue;
      for (unsigned I = F.getFunctionType()->getNumParams(); I < CB->arg_size();
           ++I) {
        Value *Arg = CB->getArgOperand(I);
        if (Arg->getType()->isPointerTy())
          CB->addParamAttr(I, Attribute::NoCapture);
      }
    }
  }

  LLVM_DEBUG(M.dump());

  registerAAs();

  ChangeStatus Status = A.run();
  // The Attributor may have removed all uses we cared about; if nothing is
  // left, report only what the Attributor itself changed.
  if (!recollectInformation())
    return Status == ChangeStatus::CHANGED;

  for (Function *F : FunctionWorkList)
    Changed |= rewriteWithHostRPC(F);

  if (!Changed)
    return Changed;

  // Replace original calls with their wrapper calls. Iterated in reverse of
  // creation order — NOTE(review): presumably so later (nested) rewrites are
  // resolved before the calls they feed; confirm intent.
  for (auto Itr = CallInstMap.rbegin(); Itr != CallInstMap.rend(); ++Itr) {
    auto *CI = Itr->first;
    auto *NewCI = Itr->second;
    CI->replaceAllUsesWith(NewCI);
    CI->eraseFromParent();
  }

  // Drop the now-unreferenced external declarations.
  for (Function *F : FunctionWorkList)
    if (F->user_empty())
      F->eraseFromParent();

  emitHostWrapperInvoker();

  return Changed;
}
|
||||
|
||||
/// For every call site of the external function \p F: build (or reuse) a
/// device wrapper and a host wrapper, classify each argument's underlying
/// objects into marshaling records (arg_info), materialize the arg_info
/// either as a constant global (all-constant case) or as stack stores, and
/// emit the replacement call to the device wrapper. The original calls are
/// only recorded in CallInstMap here; run() performs the actual RAUW/erase.
bool HostRPC::rewriteWithHostRPC(Function *F) {
  SmallVector<CallInst *> WorkList;

  for (User *U : F->users()) {
    auto *CI = dyn_cast<CallInst>(U);
    if (!CI)
      continue;
    WorkList.push_back(CI);
  }

  if (WorkList.empty())
    return false;

  for (CallInst *CI : WorkList) {
    CallSiteInfo CSI;
    CSI.CI = CI;
    unsigned NumArgs = CI->arg_size();
    for (unsigned I = 0; I < NumArgs; ++I)
      CSI.Params.push_back(CI->getArgOperand(I)->getType());

    std::string WrapperName = getWrapperFunctionName(F, CSI);
    Function *DeviceWrapperFn = getDeviceWrapperFunction(WrapperName, F, CSI);
    Function *HostWrapperFn = getHostWrapperFunction(WrapperName, F, CSI);

    // The call number is the host wrapper's index in HostEntryTable; reuse an
    // existing slot or append a new one.
    int32_t WrapperNumber = -1;
    for (unsigned I = 0; I < HostEntryTable.size(); ++I) {
      if (HostEntryTable[I] == HostWrapperFn) {
        WrapperNumber = I;
        break;
      }
    }
    if (WrapperNumber == -1) {
      WrapperNumber = HostEntryTable.size();
      HostEntryTable.push_back(HostWrapperFn);
    }

    // True if the value is itself the result of an RPC'ed call (or of an
    // already-created wrapper) — i.e. a host-side handle passed straight
    // through. NOTE(review): getCalledFunction() can be null for indirect
    // calls; the starts_with below would then dereference null — confirm
    // indirect calls cannot reach here.
    auto CheckIfIdentifierPtr = [this](const Value *V) {
      auto *CI = dyn_cast<CallInst>(V);
      if (!CI)
        return false;
      Function *Callee = CI->getCalledFunction();
      if (this->FunctionWorkList.count(Callee))
        return true;
      return Callee->getName().starts_with("__kmpc_host_rpc_wrapper_");
    };

    // Dynamically allocated device memory (malloc/__kmpc_alloc_shared) is
    // left for the runtime to handle. Same NOTE(review) on null Callee.
    auto CheckIfDynAlloc = [](Value *V) -> CallInst * {
      auto *CI = dyn_cast<CallInst>(V);
      if (!CI)
        return nullptr;
      Function *Callee = CI->getCalledFunction();
      auto Name = Callee->getName();
      if (Name == "malloc" || Name == "__kmpc_alloc_shared")
        return CI;
      return nullptr;
    };

    // A load of the stdio globals (stdout/stderr/stdin) — forwarded as an
    // opaque host pointer.
    auto CheckIfStdIO = [](Value *V) -> GlobalVariable * {
      auto *LI = dyn_cast<LoadInst>(V);
      if (!LI)
        return nullptr;
      auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
      if (!GV)
        return nullptr;
      auto Name = GV->getName();
      if (Name == "stdout" || Name == "stderr" || Name == "stdin")
        return GV;
      return nullptr;
    };

    // A global variable used directly or through a single load.
    auto CheckIfGlobalVariable = [](Value *V) {
      if (auto *GV = dyn_cast<GlobalVariable>(V))
        return GV;
      if (auto *LI = dyn_cast<LoadInst>(V))
        if (auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()))
          return GV;
      return static_cast<GlobalVariable *>(nullptr);
    };

    auto CheckIfNullPtr = [](Value *V) {
      if (!V->getType()->isPointerTy())
        return false;
      return V == ConstantInt::getNullValue(V->getType());
    };

    // Record a value that is passed through as-is (scalar or opaque pointer):
    // no copy, size 0.
    auto HandleDirectUse = [&](Value *Ptr, HostRPCArgInfo &AI,
                               bool IsPointer = false) {
      AI.BasePtr = Ptr;
      AI.Type = getConstantInt64(IsPointer ? ArgType::OMP_HOST_RPC_ARG_PTR
                                           : ArgType::OMP_HOST_RPC_ARG_SCALAR);
      AI.Size = NullInt64;
    };

    // One list of records per call argument (a pointer argument may have
    // several underlying objects).
    SmallVector<SmallVector<HostRPCArgInfo>> ArgInfo;
    bool IsConstantArgInfo = true;

    for (unsigned I = 0; I < CI->arg_size(); ++I) {
      ArgInfo.emplace_back();
      auto &AII = ArgInfo.back();

      Value *Operand = CI->getArgOperand(I);

      // Check if scalar type.
      if (!Operand->getType()->isPointerTy()) {
        AII.emplace_back();
        HandleDirectUse(Operand, AII.back());
        IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
        continue;
      }

      // Null pointers need no marshaling record at all.
      if (CheckIfNullPtr(Operand))
        continue;

      // Classify each underlying object of the pointer argument.
      auto Pred = [&](Value &Obj) {
        if (CheckIfNullPtr(&Obj))
          return true;

        // A readonly parameter means the host only needs a copy-to, never a
        // copy-back.
        bool IsConstantArgument = false;
        if (!F->isVarArg() &&
            F->hasParamAttribute(I, Attribute::AttrKind::ReadOnly))
          IsConstantArgument = true;

        HostRPCArgInfo AI;

        if (auto *IO = CheckIfStdIO(&Obj)) {
          HandleDirectUse(IO, AI, /* IsPointer */ true);
        } else if (CheckIfIdentifierPtr(&Obj)) {
          IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
          HandleDirectUse(Operand, AI, /* IsPointer */ true);
        } else if (auto *GV = CheckIfGlobalVariable(&Obj)) {
          // Globals are copied by value; direction depends on constness.
          AI.BasePtr = GV;
          AI.Size = getConstantInt64(DL.getTypeStoreSize(GV->getValueType()));
          AI.Type =
              getConstantInt64(GV->isConstant() || IsConstantArgument
                                   ? ArgType::OMP_HOST_RPC_ARG_COPY_TO
                                   : ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM);
        } else if (CheckIfDynAlloc(&Obj)) {
          // We will handle this case at runtime so here we don't do anything.
          return true;
        } else if (isa<AllocaInst>(&Obj)) {
          llvm_unreachable("alloca instruction needs to be handled!");
        } else {
          LLVM_DEBUG({
            dbgs() << "[HostRPC] warning: call site " << *CI << ", operand "
                   << *Operand << ", underlying object " << Obj
                   << " cannot be handled.\n";
          });
          return true;
        }
        AII.push_back(std::move(AI));
        return true;
      };

      auto &AAUO = *A.getOrCreateAAFor<AAUnderlyingObjects>(
          IRPosition::callsite_argument(*CI, I), nullptr, DepClassTy::NONE);
      if (!AAUO.forallUnderlyingObjects(Pred))
        llvm_unreachable("internal error");
    }

    // Reset the insert point to the call site.
    Builder.SetInsertPoint(CI);

    Value *ArgInfoVal = nullptr;
    if (!IsConstantArgInfo) {
      // Runtime path: build a per-argument linked list of arg_info_t records
      // on the stack; arg_info is an array of list heads, one per argument.
      ArgInfoVal = Builder.CreateAlloca(Int8PtrTy, getConstantInt64(NumArgs),
                                        "arg_info");
      for (unsigned I = 0; I < NumArgs; ++I) {
        auto &AII = ArgInfo[I];
        Value *Next = NullPtr;
        for (auto &AI : AII) {
          Value *AIV = Builder.CreateAlloca(ArgInfoTy);
          // Field 0: base pointer/value, marshaled to i64.
          Value *AIIArg =
              GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(0)});
          Builder.Insert(AIIArg);
          Builder.CreateStore(convertToInt64Ty(AI.BasePtr), AIIArg);
          // Field 1: ArgType tag.
          Value *AIIType =
              GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(1)});
          Builder.Insert(AIIType);
          Builder.CreateStore(AI.Type, AIIType);
          // Field 2: byte size (0 for direct/scalar uses).
          Value *AIISize =
              GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(2)});
          Builder.Insert(AIISize);
          Builder.CreateStore(AI.Size, AIISize);
          // Field 3: next record in the list.
          Value *AIINext =
              GetElementPtrInst::Create(Int8PtrTy, AIV, {getConstantInt64(3)});
          Builder.Insert(AIINext);
          Builder.CreateStore(Next, AIINext);
          Next = AIV;
        }
        Value *AIIV = GetElementPtrInst::Create(Int8PtrTy, ArgInfoVal,
                                                {getConstantInt64(I)});
        Builder.Insert(AIIV);
        Builder.CreateStore(Next, AIIV);
      }
    } else {
      // Constant path: materialize the same structure as internal constant
      // globals instead of stack stores.
      SmallVector<Constant *> ArgInfoInitVar;
      for (auto &AII : ArgInfo) {
        Constant *Last = NullPtr;
        for (auto &AI : AII) {
          auto *Arg = cast<Constant>(AI.BasePtr);
          auto *CS =
              ConstantStruct::get(ArgInfoTy, {convertToInt64Ty(Arg), AI.Type,
                                              cast<Constant>(AI.Size), Last});
          auto *GV = new GlobalVariable(
              M, ArgInfoTy, /* isConstant */ true,
              GlobalValue::LinkageTypes::InternalLinkage, CS);
          Last = GV;
        }
        ArgInfoInitVar.push_back(Last);
      }
      Constant *ArgInfoInit = ConstantArray::get(
          ArrayType::get(Int8PtrTy, NumArgs), ArgInfoInitVar);
      ArgInfoVal = new GlobalVariable(
          M, ArrayType::get(Int8PtrTy, NumArgs), /* isConstant */ true,
          GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info");
    }

    // Wrapper signature: (call_no, arg_info, <original arguments...>).
    SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
                              ArgInfoVal};
    for (Value *Operand : CI->args())
      Args.push_back(Operand);

    CallInst *NewCall = Builder.CreateCall(DeviceWrapperFn, Args);

    // Defer RAUW/erase to run().
    CallInstMap.emplace_back(CI, NewCall);
  }

  return true;
}
|
||||
|
||||
/// Get or create the device-side wrapper for \p F: it acquires an RPC
/// descriptor, marshals every original argument into it as i64, sends the
/// request, blocks until the host replies, and returns the (unmarshaled)
/// result. Wrappers are cached by name in the device module.
Function *HostRPC::getDeviceWrapperFunction(StringRef WrapperName, Function *F,
                                            CallSiteInfo &CSI) {
  Function *WrapperFn = M.getFunction(WrapperName);
  if (WrapperFn)
    return WrapperFn;

  // return_type device_wrapper(int32_t call_no, void *arg_info, ...)
  SmallVector<Type *> Params{Int32Ty, Int8PtrTy};
  Params.append(CSI.Params);

  Type *RetTy = F->getReturnType();

  FunctionType *FT = FunctionType::get(RetTy, Params, /*isVarArg*/ false);
  WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::InternalLinkage,
                               WrapperName, M);

  // Emit the body of the device wrapper
  BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
  Builder.SetInsertPoint(EntryBB);

  // skip call_no and arg_info.
  constexpr const unsigned NumArgSkipped = 2;

  // desc = __kmpc_host_rpc_get_desc(call_no, num_real_args, arg_info)
  Value *Desc = nullptr;
  {
    Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
    Desc = Builder.CreateCall(
        Fn,
        {WrapperFn->getArg(0),
         ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
         WrapperFn->getArg(1)},
        "desc");
  }

  // __kmpc_host_rpc_add_arg(desc, (i64)arg, index) for each real argument.
  {
    Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
    for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
      Value *V = convertToInt64Ty(WrapperFn->getArg(I));
      Builder.CreateCall(
          Fn, {Desc, V, ConstantInt::get(Int32Ty, I - NumArgSkipped)});
    }
  }

  // Block until the host wrapper has run; the i64 result carries the return
  // value (meaningless for void callees).
  Value *RetVal =
      Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});

  if (RetTy->isVoidTy()) {
    Builder.CreateRetVoid();
    return WrapperFn;
  }

  if (RetTy != RetVal->getType())
    RetVal = convertFromInt64TyTo(RetVal, RetTy);

  Builder.CreateRet(RetVal);

  return WrapperFn;
}
|
||||
|
||||
/// Get or create the host-side wrapper for \p F: it pulls each argument back
/// out of the descriptor as i64, unmarshals it to the call-site type, invokes
/// the real host function, and posts the return value into the descriptor.
/// Wrappers are cached by name in the host module.
Function *HostRPC::getHostWrapperFunction(StringRef WrapperName, Function *F,
                                          CallSiteInfo &CSI) {
  Function *WrapperFn = HM.getFunction(WrapperName);
  if (WrapperFn)
    return WrapperFn;

  // void host_wrapper(void *desc) — externally visible so the dispatcher
  // (and the runtime) can call it.
  SmallVector<Type *> Params{Int8PtrTy};
  FunctionType *FT = FunctionType::get(VoidTy, Params, /* isVarArg */ false);
  WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage,
                               WrapperName, HM);

  Value *Desc = WrapperFn->getArg(0);

  // Emit the body of the host wrapper
  BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
  Builder.SetInsertPoint(EntryBB);

  // args[i] = (T_i)__kmpc_host_rpc_get_arg(desc, i)
  SmallVector<Value *> Args;
  for (unsigned I = 0; I < CSI.CI->arg_size(); ++I) {
    Value *V = Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_arg],
                                  {Desc, ConstantInt::get(Int32Ty, I)});
    Args.push_back(convertFromInt64TyTo(V, CSI.Params[I]));
  }

  // The host callee that will be called eventually by the host wrapper.
  Function *HostCallee = HM.getFunction(F->getName());
  if (!HostCallee)
    HostCallee = Function::Create(F->getFunctionType(), F->getLinkage(),
                                  F->getName(), HM);

  // Call the real function and, unless it is void, post the (marshaled)
  // result so send_and_wait on the device can return it.
  Value *RetVal = Builder.CreateCall(HostCallee, Args);
  if (!RetVal->getType()->isVoidTy()) {
    RetVal = convertToInt64Ty(RetVal);
    Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_set_ret_val],
                       {Desc, RetVal});
  }
  Builder.CreateRetVoid();

  return WrapperFn;
}
|
||||
|
||||
/// Emit the body of __kmpc_host_rpc_invoke_host_wrapper in the host module:
/// a switch on the call number that branches to a block calling the matching
/// host wrapper from HostEntryTable, then returns. Unknown call numbers fall
/// through to the plain return.
void HostRPC::emitHostWrapperInvoker() {
  // Deliberately shadows the member Builder: this emits into the host module
  // and must not disturb the member builder's insert point.
  IRBuilder<> Builder(Context);
  unsigned NumEntries = HostEntryTable.size();
  Function *F = RFIs[OMPRTL___kmpc_host_rpc_invoke_host_wrapper];
  // Exported so the host runtime can resolve the dispatcher symbol.
  F->setDLLStorageClass(
      GlobalValue::DLLStorageClassTypes::DLLExportStorageClass);
  Value *CallNo = F->getArg(0);
  Value *Desc = F->getArg(1);

  SmallVector<BasicBlock *> SwitchBBs;
  BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", F);
  BasicBlock *ReturnBB = BasicBlock::Create(Context, "return", F);

  // Emit code for the return bb.
  Builder.SetInsertPoint(ReturnBB);
  Builder.CreateRetVoid();

  // Create BB for each host entry and emit function call.
  for (unsigned I = 0; I < NumEntries; ++I) {
    BasicBlock *BB = BasicBlock::Create(Context, "invoke.bb", F, ReturnBB);
    SwitchBBs.push_back(BB);
    Builder.SetInsertPoint(BB);
    Builder.CreateCall(HostEntryTable[I], {Desc});
    Builder.CreateBr(ReturnBB);
  }

  // Emit code for the entry BB.
  Builder.SetInsertPoint(EntryBB);
  SwitchInst *Switch = Builder.CreateSwitch(CallNo, ReturnBB, NumEntries);
  for (unsigned I = 0; I < NumEntries; ++I)
    Switch->addCase(ConstantInt::get(cast<IntegerType>(Int32Ty), I),
                    SwitchBBs[I]);
}
|
||||
|
||||
/// Retrieve the host Module smuggled into the device module through the
/// "llvm.hostrpc.hostmodule" named metadata: the single operand is a
/// ConstantInt holding the host Module's address (an in-process handoff by
/// whoever set the metadata — only valid within the same compiler process).
/// The metadata is consumed (erased) on success. Returns nullptr when the
/// metadata is absent or malformed.
///
/// Fixes over the previous revision: "invliad" typo in the assert message,
/// and a checked dyn_cast instead of cast so a non-ConstantInt operand takes
/// the graceful nullptr path like the other malformed cases.
Module *getHostModule(Module &M) {
  auto *MD = M.getNamedMetadata("llvm.hostrpc.hostmodule");
  if (!MD || MD->getNumOperands() == 0)
    return nullptr;
  auto *Node = MD->getOperand(0);
  assert(Node->getNumOperands() == 1 && "invalid named metadata");
  auto *CAM = dyn_cast<ConstantAsMetadata>(Node->getOperand(0));
  if (!CAM)
    return nullptr;
  auto *CI = dyn_cast<ConstantInt>(CAM->getValue());
  if (!CI)
    return nullptr;
  Module *Mod = reinterpret_cast<Module *>(CI->getZExtValue());
  // Consume the metadata so it cannot leak into emitted IR.
  M.eraseNamedMetadata(MD);
  return Mod;
}
|
||||
} // namespace
|
||||
|
||||
/// Pass entry point: locate (or fabricate) the host module, set up an
/// Attributor, and run the HostRPC rewriting over the device module \p M.
PreservedAnalyses HostRPCPass::run(Module &M, ModuleAnalysisManager &AM) {
  // Keeps the fabricated host module alive for the duration of the pass.
  std::unique_ptr<Module> DummyHostModule;

  Module *HostModule = nullptr;

  if (UseDummyHostModule) {
    DummyHostModule =
        std::make_unique<Module>("dummy-host-rpc.bc", M.getContext());
    HostModule = DummyHostModule.get();
  } else {
    HostModule = getHostModule(M);
  }

  if (!HostModule)
    return PreservedAnalyses::all();

  // NOTE(review): ThinLTOPreLink is a pre-link phase, which contradicts the
  // "invoked at link time" comment below — possibly a typo for
  // ThinLTOPostLink; confirm the intended phase.
  bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
                  LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;

  // The pass will not run if it is not invoked directly or not invoked at link
  // time.
  if (!UseDummyHostModule && !PostLink)
    return PreservedAnalyses::all();

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  AnalysisGetter AG(FAM);

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;

  // Conservative Attributor setup: no signature rewriting, bounded fixpoint.
  AttributorConfig AC(CGUpdater);
  AC.DefaultInitializeLiveInternals = false;
  AC.RewriteSignatures = false;
  AC.PassName = DEBUG_TYPE;
  AC.MaxFixpointIterations = 1024;

  InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);

  // Empty function set: AAs are seeded explicitly by HostRPC::registerAAs.
  SetVector<Function *> Functions;
  Attributor A(Functions, InfoCache, AC);

  HostRPC RPC(M, *HostModule, A);
  bool Changed = RPC.run();

  LLVM_DEBUG({
    if (Changed && UseDummyHostModule) {
      M.dump();
      HostModule->dump();
    }
  });

  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
|
||||
@@ -56,6 +56,7 @@
|
||||
#include "llvm/Transforms/IPO/Attributor.h"
|
||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
|
||||
#include "llvm/Transforms/Utils/ModuleUtils.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <optional>
|
||||
@@ -138,6 +139,15 @@ static cl::opt<bool>
|
||||
cl::desc("Enables more verbose remarks."), cl::Hidden,
|
||||
cl::init(false));
|
||||
|
||||
static cl::opt<bool>
|
||||
EnableParallel51Split("openmp-opt-enable-parallel-51-split",
|
||||
cl::desc("Enable the kernel split at parallel_51."),
|
||||
cl::Hidden, cl::init(false));
|
||||
static cl::opt<bool> EnableParallel51SplitMultiTeams(
|
||||
"openmp-opt-enable-parallel-51-split-multi-teams",
|
||||
cl::desc("Enable multi-teams support for the parallel 51 kernel."),
|
||||
cl::Hidden, cl::init(false));
|
||||
|
||||
static cl::opt<unsigned>
|
||||
SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
|
||||
cl::desc("Maximal number of attributor iterations."),
|
||||
@@ -917,6 +927,28 @@ private:
|
||||
}
|
||||
};
|
||||
|
||||
/// Whether the parallel_51 split kernel derived from \p OutlinedFn may use
/// multiple teams. Currently gated solely by the command-line flag; the
/// parameters are kept for future, finer-grained checks.
bool canUseMultiTeam(Function *OutlinedFn, OMPInformationCache &OMPInfoCache) {
  return EnableParallel51SplitMultiTeams;
}
|
||||
|
||||
void collectReachingKernels(Function *F,
|
||||
SmallVector<Kernel> &ReachingKernels) {
|
||||
if (omp::isOpenMPKernel(*F)) {
|
||||
if (F->hasFnAttribute("omp_parallel_51_kernel"))
|
||||
ReachingKernels.push_back(F);
|
||||
return;
|
||||
}
|
||||
for (User *U : F->users()) {
|
||||
auto *I = dyn_cast<Instruction>(U);
|
||||
if (!I)
|
||||
continue;
|
||||
collectReachingKernels(I->getFunction(), ReachingKernels);
|
||||
}
|
||||
}
|
||||
|
||||
struct OpenMPOpt {
|
||||
|
||||
using OptimizationRemarkGetter =
|
||||
@@ -945,6 +977,11 @@ struct OpenMPOpt {
|
||||
<< " functions\n");
|
||||
|
||||
if (IsModulePass) {
|
||||
if (EnableParallel51Split)
|
||||
Changed |= splitKernels();
|
||||
|
||||
OMPInfoCache.recollectUses();
|
||||
|
||||
Changed |= runAttributor(IsModulePass);
|
||||
|
||||
// Recollect uses, in case Attributor deleted any.
|
||||
@@ -1047,6 +1084,308 @@ struct OpenMPOpt {
|
||||
}
|
||||
|
||||
private:
|
||||
/// Create a new kernel for each function call to __kmpc_parallel_51.
|
||||
bool splitKernels() {
|
||||
bool Changed = false;
|
||||
|
||||
OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
|
||||
OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
|
||||
|
||||
if (!Parallel51RFI.Declaration)
|
||||
return Changed;
|
||||
|
||||
SmallVector<CallInst *> WorkItems;
|
||||
for (User *U : Parallel51RFI.Declaration->users()) {
|
||||
auto *CI = dyn_cast<CallInst>(U);
|
||||
if (!CI)
|
||||
continue;
|
||||
|
||||
Function *F = CI->getFunction();
|
||||
// If the parent function is already from kernel split, skip it.
|
||||
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
|
||||
if (Attr.isValid())
|
||||
continue;
|
||||
|
||||
WorkItems.push_back(CI);
|
||||
}
|
||||
|
||||
if (WorkItems.empty())
|
||||
return Changed;
|
||||
|
||||
Changed = true;
|
||||
|
||||
auto CreateNewKernel = [](LLVMContext &Ctx, Module &M,
|
||||
OpenMPIRBuilder &IRBuilder, Function *Parallel51,
|
||||
const CallInst *CI) {
|
||||
SmallVector<Type *, 4> ArgTypes;
|
||||
// int32_t gtid
|
||||
ArgTypes.push_back(Type::getInt32Ty(Ctx));
|
||||
// int32_t if_expr
|
||||
ArgTypes.push_back(Type::getInt32Ty(Ctx));
|
||||
// int32_t num_threads
|
||||
ArgTypes.push_back(Type::getInt32Ty(Ctx));
|
||||
// void **args
|
||||
ArgTypes.push_back(PointerType::getUnqual(Ctx));
|
||||
// int64_t nargs
|
||||
ArgTypes.push_back(Type::getInt64Ty(Ctx));
|
||||
|
||||
FunctionType *FT =
|
||||
FunctionType::get(Type::getVoidTy(Ctx), ArgTypes, false);
|
||||
std::string KernelName = "__omp_offloading_parallel_51_from_";
|
||||
KernelName += CI->getFunction()->getName();
|
||||
// Sanitize the kernel name
|
||||
{
|
||||
size_t DotPos = KernelName.find('.');
|
||||
while (DotPos != std::string::npos) {
|
||||
KernelName[DotPos] = '_';
|
||||
DotPos = KernelName.find('.');
|
||||
}
|
||||
}
|
||||
Function *K =
|
||||
Function::Create(FT, GlobalValue::WeakODRLinkage, KernelName, &M);
|
||||
K->setVisibility(GlobalValue::ProtectedVisibility);
|
||||
// exec mode global variable
|
||||
GlobalVariable *ModeGV = new GlobalVariable(
|
||||
M, Type::getInt8Ty(Ctx), /*isConstant=*/true,
|
||||
GlobalValue::WeakAnyLinkage,
|
||||
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD),
|
||||
K->getName() + "_exec_mode");
|
||||
appendToCompilerUsed(M, {ModeGV});
|
||||
|
||||
// Attach "kernel" metadata to the new kernel.
|
||||
NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
|
||||
Metadata *MDVals[] = {ConstantAsMetadata::get(K),
|
||||
MDString::get(Ctx, "kernel"),
|
||||
ConstantAsMetadata::get(ConstantInt::get(
|
||||
llvm::Type::getInt32Ty(Ctx), 1))};
|
||||
MD->addOperand(MDNode::get(Ctx, MDVals));
|
||||
// Set kernel attributes.
|
||||
K->setAttributes(
|
||||
AttributeList::get(Ctx, AttributeList::FunctionIndex,
|
||||
CI->getFunction()->getAttributes().getFnAttrs()));
|
||||
K->removeFnAttr("omp_target_thread_limit");
|
||||
K->removeFnAttr("omp_target_num_teams");
|
||||
K->addFnAttr("omp_parallel_51_kernel");
|
||||
K->addFnAttr("kernel");
|
||||
K->getArg(3)->addAttr(Attribute::NoAlias);
|
||||
|
||||
auto *EntryBB = BasicBlock::Create(Ctx, "entry", K);
|
||||
OpenMPIRBuilder::LocationDescription TargetInitLoc(
|
||||
{EntryBB, EntryBB->end()});
|
||||
auto IP = IRBuilder.createTargetInit(TargetInitLoc, /* IsSPMD */ true);
|
||||
BasicBlock *UserCodeBB = IP.getBlock();
|
||||
|
||||
auto UseDirectIfPossible = [](Value *LHS, Value *RHS) -> Value * {
|
||||
if (isa<Constant>(LHS))
|
||||
return LHS;
|
||||
return RHS;
|
||||
};
|
||||
|
||||
Value *Ident = CI->getOperand(0);
|
||||
Value *TId = UseDirectIfPossible(CI->getOperand(1), K->getArg(0));
|
||||
Value *IfExpr = UseDirectIfPossible(CI->getOperand(2), K->getArg(1));
|
||||
Value *NumThreads = UseDirectIfPossible(CI->getOperand(3), K->getArg(2));
|
||||
Value *ProcBind = CI->getOperand(4);
|
||||
Value *Fn = CI->getOperand(5);
|
||||
Value *WrapperFn = CI->getOperand(6);
|
||||
Value *Args = K->getArg(3);
|
||||
Value *NArgs = UseDirectIfPossible(CI->getOperand(8), K->getArg(4));
|
||||
|
||||
(void)CallInst::Create(Parallel51,
|
||||
{Ident, TId, IfExpr, NumThreads, ProcBind, Fn,
|
||||
WrapperFn, Args, NArgs},
|
||||
"", UserCodeBB);
|
||||
|
||||
OpenMPIRBuilder::LocationDescription TargetDeInitLoc(
|
||||
{UserCodeBB, UserCodeBB->end()});
|
||||
IRBuilder.createTargetDeinit(TargetDeInitLoc, /* IsSPMD */ true);
|
||||
IRBuilder.Builder.CreateRetVoid();
|
||||
|
||||
return K;
|
||||
};
|
||||
|
||||
auto EnableMultiTeam = [](Function *OutlinedFn) {
|
||||
SmallVector<CallInst *> WorkItems;
|
||||
for (BasicBlock &BB : *OutlinedFn)
|
||||
for (Instruction &I : BB) {
|
||||
CallInst *C = dyn_cast<CallInst>(&I);
|
||||
if (!C)
|
||||
continue;
|
||||
Function *Callee = C->getCalledFunction();
|
||||
auto CalleeName = Callee->getName();
|
||||
if (CalleeName == "__kmpc_for_static_init_4" ||
|
||||
CalleeName == "__kmpc_for_static_init_4u" ||
|
||||
CalleeName == "__kmpc_for_static_init_8" ||
|
||||
CalleeName == "__kmpc_for_static_init_8u")
|
||||
WorkItems.push_back(C);
|
||||
}
|
||||
|
||||
for (CallInst *C : WorkItems) {
|
||||
constexpr const unsigned SchedTypeArgNum = 2;
|
||||
Value *SchedTypeVal = C->getOperand(SchedTypeArgNum);
|
||||
ConstantInt *SchedTypeCI = cast<ConstantInt>(SchedTypeVal);
|
||||
int32_t SchedType = SchedTypeCI->getSExtValue();
|
||||
Value *NewSchedType = SchedTypeVal;
|
||||
if (SchedType == /*kmp_sched_static_chunk*/ 33)
|
||||
NewSchedType = ConstantInt::get(SchedTypeCI->getType(), 100);
|
||||
else if (SchedType == /*kmp_sched_static_chunk*/ 34)
|
||||
NewSchedType = ConstantInt::get(SchedTypeCI->getType(), 101);
|
||||
C->setOperand(SchedTypeArgNum, NewSchedType);
|
||||
}
|
||||
};
|
||||
|
||||
auto &Ctx = M.getContext();
|
||||
|
||||
for (CallInst *CI : WorkItems) {
|
||||
Function *K = CreateNewKernel(Ctx, M, OMPInfoCache.OMPBuilder,
|
||||
Parallel51RFI.Declaration, CI);
|
||||
|
||||
constexpr const unsigned OutlinedFnArgNum = 5;
|
||||
Function *OutlinedFn =
|
||||
dyn_cast<Function>(CI->getOperand(OutlinedFnArgNum));
|
||||
assert(OutlinedFn && "arg fn is not a function");
|
||||
|
||||
if (canUseMultiTeam(OutlinedFn, OMPInfoCache))
|
||||
EnableMultiTeam(OutlinedFn);
|
||||
|
||||
auto &Ctx = M.getContext();
|
||||
IRBuilder<> Builder(CI);
|
||||
|
||||
FunctionCallee Callee =
|
||||
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
|
||||
M, OMPRTL___kmpc_launch_parallel_51_kernel);
|
||||
Constant *KernelName = ConstantDataArray::getString(Ctx, K->getName());
|
||||
GlobalVariable *KernelNameVar =
|
||||
new GlobalVariable(M, KernelName->getType(), /*isConstant=*/true,
|
||||
GlobalValue::WeakAnyLinkage, KernelName);
|
||||
Value *Args[6] = {KernelNameVar, CI->getArgOperand(1),
|
||||
CI->getArgOperand(2), CI->getArgOperand(3),
|
||||
CI->getArgOperand(7), CI->getArgOperand(8)};
|
||||
|
||||
CallInst *C = Builder.CreateCall(Callee, Args);
|
||||
OMPInfoCache.setCallingConvention(Callee, C);
|
||||
CI->eraseFromParent();
|
||||
}
|
||||
|
||||
// Check all the use of __kmpc_parallel_51 again. If it is just used in
|
||||
// parallel kernel, we can mark the main kernel as SPMD mode as well.
|
||||
bool OnlyUsedByParallelKernel = true;
|
||||
for (User *U : Parallel51RFI.Declaration->users()) {
|
||||
auto *CI = dyn_cast<CallInst>(U);
|
||||
if (!CI)
|
||||
continue;
|
||||
Function *F = CI->getFunction();
|
||||
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
|
||||
if (Attr.isValid())
|
||||
continue;
|
||||
OnlyUsedByParallelKernel = false;
|
||||
break;
|
||||
}
|
||||
|
||||
if (OnlyUsedByParallelKernel) {
|
||||
for (auto K : SCC) {
|
||||
if (!omp::isOpenMPKernel(*K))
|
||||
continue;
|
||||
Attribute Attr = K->getFnAttribute("omp_parallel_51_kernel");
|
||||
if (Attr.isValid())
|
||||
continue;
|
||||
GlobalVariable *ExecMode =
|
||||
M.getGlobalVariable((K->getName() + "_exec_mode").str());
|
||||
assert(ExecMode && "kernel without exec mode");
|
||||
assert(ExecMode->getInitializer() &&
|
||||
"ExecMode doesn't have initializer!");
|
||||
assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
|
||||
"ExecMode is not an integer!");
|
||||
const int8_t ExecModeVal =
|
||||
cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
|
||||
// Kernel is already in SPMD mode, skip.
|
||||
if (ExecModeVal & OMP_TGT_EXEC_MODE_SPMD)
|
||||
continue;
|
||||
auto *NewExecModeC =
|
||||
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD);
|
||||
ExecMode->setInitializer(NewExecModeC);
|
||||
OMPInformationCache::RuntimeFunctionInfo &TargetInitRFI =
|
||||
OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
|
||||
OMPInformationCache::RuntimeFunctionInfo &TargetDeInitRFI =
|
||||
OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
|
||||
CallInst *InitCI = nullptr;
|
||||
CallInst *DeInitCI = nullptr;
|
||||
for (User *U : TargetInitRFI.Declaration->users()) {
|
||||
auto *CI = dyn_cast<CallInst>(U);
|
||||
if (!CI)
|
||||
continue;
|
||||
if (CI->getFunction() == K) {
|
||||
InitCI = CI;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (User *U : TargetDeInitRFI.Declaration->users()) {
|
||||
auto *CI = dyn_cast<CallInst>(U);
|
||||
if (!CI)
|
||||
continue;
|
||||
if (CI->getFunction() == K) {
|
||||
DeInitCI = CI;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(InitCI && DeInitCI && "kernel without init and deinit");
|
||||
InitCI->setArgOperand(1, NewExecModeC);
|
||||
InitCI->setArgOperand(2,
|
||||
ConstantInt::getNullValue(Type::getInt1Ty(Ctx)));
|
||||
DeInitCI->setArgOperand(1, NewExecModeC);
|
||||
}
|
||||
}
|
||||
|
||||
// Check the use of omp_get_thread_num and omp_get_num_threads.
|
||||
OMPInformationCache::RuntimeFunctionInfo &ThreadNumRFI =
|
||||
OMPInfoCache.RFIs[OMPRTL_omp_get_thread_num];
|
||||
OMPInformationCache::RuntimeFunctionInfo &NumThreadsRFI =
|
||||
OMPInfoCache.RFIs[OMPRTL_omp_get_num_threads];
|
||||
|
||||
auto CheckIfOnlyUsedByParallelKernel = [&](CallInst *CI) {
|
||||
SmallVector<Kernel> ReachingKernels;
|
||||
collectReachingKernels(CI->getFunction(),
|
||||
ReachingKernels);
|
||||
return ReachingKernels.size() < 2;
|
||||
};
|
||||
|
||||
SmallVector<CallInst *> WorkList;
|
||||
|
||||
for (User *U : ThreadNumRFI.Declaration->users()) {
|
||||
auto *CI = dyn_cast<CallInst>(U);
|
||||
if (!CI)
|
||||
continue;
|
||||
if (!CheckIfOnlyUsedByParallelKernel(CI))
|
||||
continue;
|
||||
WorkList.push_back(CI);
|
||||
}
|
||||
for (User *U : NumThreadsRFI.Declaration->users()) {
|
||||
auto *CI = dyn_cast<CallInst>(U);
|
||||
if (!CI)
|
||||
continue;
|
||||
if (!CheckIfOnlyUsedByParallelKernel(CI))
|
||||
continue;
|
||||
WorkList.push_back(CI);
|
||||
}
|
||||
|
||||
for (CallInst *CI : WorkList) {
|
||||
Function *Callee = nullptr;
|
||||
if (CI->getCalledFunction() == ThreadNumRFI.Declaration)
|
||||
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_thread_num].Declaration;
|
||||
if (CI->getCalledFunction() == NumThreadsRFI.Declaration)
|
||||
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_num_threads].Declaration;
|
||||
assert(Callee && "unknown callee");
|
||||
|
||||
auto &Builder = OMPInfoCache.OMPBuilder.Builder;
|
||||
Builder.SetInsertPoint(CI);
|
||||
Value *C = Builder.CreateCall(Callee);
|
||||
CI->replaceAllUsesWith(C);
|
||||
CI->eraseFromParent();
|
||||
}
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
/// Merge parallel regions when it is safe.
|
||||
bool mergeParallelRegions() {
|
||||
const unsigned CallbackCalleeOperand = 2;
|
||||
|
||||
@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMTransformUtils
|
||||
CallGraphUpdater.cpp
|
||||
CanonicalizeAliases.cpp
|
||||
CanonicalizeFreezeInLoops.cpp
|
||||
CanonicalizeMainFunction.cpp
|
||||
CloneFunction.cpp
|
||||
CloneModule.cpp
|
||||
CodeExtractor.cpp
|
||||
|
||||
103
llvm/lib/Transforms/Utils/CanonicalizeMainFunction.cpp
Normal file
103
llvm/lib/Transforms/Utils/CanonicalizeMainFunction.cpp
Normal file
@@ -0,0 +1,103 @@
|
||||
//===-------------- CanonicalizeMainFunction.cpp ----------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Utility function to canonicalize main function.
|
||||
// The canonical main function is defined as: int main(int argc, char *argv[]);
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "canonicalize-main-function"
|
||||
|
||||
static cl::opt<std::string>
|
||||
MainFunctionName("canonical-main-function-name",
|
||||
cl::desc("New main function name"),
|
||||
cl::value_desc("main function name"));
|
||||
|
||||
bool rewriteMainFunction(Function &F) {
|
||||
if (F.arg_size() == 2 && F.getReturnType()->isIntegerTy(32))
|
||||
return false;
|
||||
|
||||
auto &Ctx = F.getContext();
|
||||
auto &DL = F.getParent()->getDataLayout();
|
||||
auto *Int32Ty = IntegerType::getInt32Ty(Ctx);
|
||||
auto *PtrTy = PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace());
|
||||
|
||||
FunctionType *NewFnTy =
|
||||
FunctionType::get(Int32Ty, {Int32Ty, PtrTy}, /* isVarArg */ false);
|
||||
Function *NewFn =
|
||||
Function::Create(NewFnTy, F.getLinkage(), F.getAddressSpace(), "");
|
||||
F.getParent()->getFunctionList().insert(F.getIterator(), NewFn);
|
||||
NewFn->takeName(&F);
|
||||
NewFn->copyAttributesFrom(&F);
|
||||
NewFn->setSubprogram(F.getSubprogram());
|
||||
F.setSubprogram(nullptr);
|
||||
NewFn->splice(NewFn->begin(), &F);
|
||||
|
||||
if (!F.getReturnType()->isIntegerTy(32)) {
|
||||
SmallVector<ReturnInst *> WorkList;
|
||||
for (BasicBlock &BB : *NewFn)
|
||||
for (Instruction &I : BB) {
|
||||
auto *RI = dyn_cast<ReturnInst>(&I);
|
||||
if (!RI)
|
||||
continue;
|
||||
assert(RI->getReturnValue() == nullptr &&
|
||||
"return value of a void main function is not nullptr");
|
||||
WorkList.push_back(RI);
|
||||
}
|
||||
for (auto *RI : WorkList) {
|
||||
(void)ReturnInst::Create(Ctx, ConstantInt::getNullValue(Int32Ty), RI);
|
||||
RI->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
if (F.arg_size() == NewFn->arg_size())
|
||||
for (unsigned I = 0; I < NewFn->arg_size(); ++I) {
|
||||
Argument *OldArg = F.getArg(I);
|
||||
Argument *NewArg = NewFn->getArg(I);
|
||||
NewArg->takeName(OldArg);
|
||||
OldArg->replaceAllUsesWith(NewArg);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
PreservedAnalyses CanonicalizeMainFunctionPass::run(Module &M,
|
||||
ModuleAnalysisManager &AM) {
|
||||
Function *MainFunc = nullptr;
|
||||
|
||||
for (Function &F : M)
|
||||
if (F.getName() == "main") {
|
||||
assert(MainFunc == nullptr && "more than one main function");
|
||||
MainFunc = &F;
|
||||
}
|
||||
|
||||
if (MainFunc == nullptr)
|
||||
return PreservedAnalyses::all();
|
||||
|
||||
bool Changed = false;
|
||||
|
||||
if (!MainFunctionName.empty() && MainFunc->getName() != MainFunctionName) {
|
||||
MainFunc->setName(MainFunctionName);
|
||||
Changed = true;
|
||||
}
|
||||
|
||||
if (rewriteMainFunction(*MainFunc)) {
|
||||
MainFunc->eraseFromParent();
|
||||
Changed = true;
|
||||
}
|
||||
|
||||
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
|
||||
}
|
||||
@@ -89,6 +89,7 @@ set(include_files
|
||||
${include_directory}/Interface.h
|
||||
${include_directory}/LibC.h
|
||||
${include_directory}/Mapping.h
|
||||
${include_directory}/Memory.h
|
||||
${include_directory}/State.h
|
||||
${include_directory}/Synchronization.h
|
||||
${include_directory}/Types.h
|
||||
@@ -99,6 +100,7 @@ set(src_files
|
||||
${source_directory}/Allocator.cpp
|
||||
${source_directory}/Configuration.cpp
|
||||
${source_directory}/Debug.cpp
|
||||
${source_directory}/HostRPC.cpp
|
||||
${source_directory}/Kernel.cpp
|
||||
${source_directory}/LibC.cpp
|
||||
${source_directory}/Mapping.cpp
|
||||
@@ -112,6 +114,14 @@ set(src_files
|
||||
${source_directory}/Workshare.cpp
|
||||
)
|
||||
|
||||
if (LIBOMPTARGET_DEVICE_BUILTIN_ALLOCATOR)
|
||||
list(APPEND src_files ${source_directory}/BuiltinAllocator.cpp)
|
||||
elseif (LIBOMPTARGET_GENERIC_ALLOCATOR)
|
||||
list(APPEND src_files ${source_directory}/GenericAllocator.cpp)
|
||||
else()
|
||||
list(APPEND src_files ${source_directory}/TeamAllocator.cpp)
|
||||
endif()
|
||||
|
||||
# We disable the slp vectorizer during the runtime optimization to avoid
|
||||
# vectorized accesses to the shared state. Generally, those are "good" but
|
||||
# the optimizer pipeline (esp. Attributor) does not fully support vectorized
|
||||
@@ -138,6 +148,14 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
|
||||
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
|
||||
)
|
||||
|
||||
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_DEBUG)
|
||||
list(APPEND bc_flags "-DHOSTRPC_DEBUG")
|
||||
endif()
|
||||
|
||||
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_PROFILING)
|
||||
list(APPEND bc_flags "-DHOSTRPC_PROFILING")
|
||||
endif()
|
||||
|
||||
# first create an object target
|
||||
add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
|
||||
function(compileDeviceRTLLibrary target_cpu target_name target_triple)
|
||||
|
||||
@@ -39,6 +39,12 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
|
||||
__builtin_trap(); \
|
||||
__builtin_unreachable();
|
||||
|
||||
#define assert(expr) \
|
||||
{ \
|
||||
if (!(expr)) \
|
||||
__assert_fail(#expr, __FILE__, __LINE__, __PRETTY_FUNCTION__); \
|
||||
}
|
||||
|
||||
///}
|
||||
|
||||
#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);
|
||||
|
||||
@@ -360,6 +360,22 @@ int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal);
|
||||
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
|
||||
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
|
||||
///}
|
||||
|
||||
/// Host RPC
|
||||
///
|
||||
/// {
|
||||
void *__kmpc_host_rpc_get_desc(int32_t CallNo, int32_t NumArgs, void *ArgInfo);
|
||||
void __kmpc_host_rpc_add_arg(void *Desc, int64_t Arg, int32_t ArgNum);
|
||||
int64_t __kmpc_host_rpc_send_and_wait(void *Desc);
|
||||
/// }
|
||||
|
||||
/// Launch parallel kernel
|
||||
///
|
||||
///{
|
||||
void __kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
|
||||
int32_t if_expr, int32_t num_threads,
|
||||
void **args, int64_t nargs);
|
||||
///}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -14,12 +14,116 @@
|
||||
|
||||
#include "Types.h"
|
||||
|
||||
struct FILE;
|
||||
extern FILE *stdin;
|
||||
extern FILE *stdout;
|
||||
extern FILE *stderr;
|
||||
|
||||
#ifndef _ASM_GENERIC_ERRNO_BASE_H
|
||||
#define _ASM_GENERIC_ERRNO_BASE_H
|
||||
|
||||
#define EPERM 1 /* Operation not permitted */
|
||||
#define ENOENT 2 /* No such file or directory */
|
||||
#define ESRCH 3 /* No such process */
|
||||
#define EINTR 4 /* Interrupted system call */
|
||||
#define EIO 5 /* I/O error */
|
||||
#define ENXIO 6 /* No such device or address */
|
||||
#define E2BIG 7 /* Argument list too long */
|
||||
#define ENOEXEC 8 /* Exec format error */
|
||||
#define EBADF 9 /* Bad file number */
|
||||
#define ECHILD 10 /* No child processes */
|
||||
#define EAGAIN 11 /* Try again */
|
||||
#define ENOMEM 12 /* Out of memory */
|
||||
#define EACCES 13 /* Permission denied */
|
||||
#define EFAULT 14 /* Bad address */
|
||||
#define ENOTBLK 15 /* Block device required */
|
||||
#define EBUSY 16 /* Device or resource busy */
|
||||
#define EEXIST 17 /* File exists */
|
||||
#define EXDEV 18 /* Cross-device link */
|
||||
#define ENODEV 19 /* No such device */
|
||||
#define ENOTDIR 20 /* Not a directory */
|
||||
#define EISDIR 21 /* Is a directory */
|
||||
#define EINVAL 22 /* Invalid argument */
|
||||
#define ENFILE 23 /* File table overflow */
|
||||
#define EMFILE 24 /* Too many open files */
|
||||
#define ENOTTY 25 /* Not a typewriter */
|
||||
#define ETXTBSY 26 /* Text file busy */
|
||||
#define EFBIG 27 /* File too large */
|
||||
#define ENOSPC 28 /* No space left on device */
|
||||
#define ESPIPE 29 /* Illegal seek */
|
||||
#define EROFS 30 /* Read-only file system */
|
||||
#define EMLINK 31 /* Too many links */
|
||||
#define EPIPE 32 /* Broken pipe */
|
||||
#define EDOM 33 /* Math argument out of domain of func */
|
||||
#define ERANGE 34 /* Math result not representable */
|
||||
|
||||
#endif
|
||||
|
||||
#define errno (*__errno_location())
|
||||
|
||||
extern "C" {
|
||||
|
||||
int memcmp(const void *lhs, const void *rhs, size_t count);
|
||||
void memset(void *dst, int C, size_t count);
|
||||
void *memset(void *dst, int C, size_t count);
|
||||
|
||||
int printf(const char *format, ...);
|
||||
|
||||
long strtol(const char *str, char **str_end, int base);
|
||||
|
||||
int strcmp(const char *lhs, const char *rhs);
|
||||
|
||||
void *calloc(size_t num, size_t size);
|
||||
|
||||
int strcasecmp(const char *string1, const char *string2);
|
||||
|
||||
void exit(int exit_code);
|
||||
|
||||
size_t strlen(const char *str);
|
||||
|
||||
int atoi(const char *str);
|
||||
|
||||
char *strcpy(char *dest, const char *src);
|
||||
|
||||
int stat(const char *path, struct stat *buf);
|
||||
|
||||
int *__errno_location();
|
||||
|
||||
char *strcat(char *dest, const char *src);
|
||||
|
||||
void perror(const char *s);
|
||||
|
||||
int strncmp(const char *lhs, const char *rhs, size_t count);
|
||||
|
||||
char *strncpy(char *dest, const char *src, size_t count);
|
||||
|
||||
char *strchr(const char *str, int ch);
|
||||
|
||||
char *strtok(char *str, const char *delim);
|
||||
|
||||
const unsigned short **__ctype_b_loc(void);
|
||||
|
||||
void *realloc(void *ptr, size_t new_size);
|
||||
|
||||
void qsort(void *const pbase, size_t total_elems, size_t size,
|
||||
int (*comp)(const void *, const void *));
|
||||
|
||||
int gettimeofday(struct timeval *tv, struct timezone *tz);
|
||||
|
||||
char *__xpg_basename(const char *path);
|
||||
|
||||
void srand(unsigned seed);
|
||||
|
||||
int rand();
|
||||
|
||||
int abs(int n);
|
||||
|
||||
void *memcpy(void *dest, const void *src, size_t count);
|
||||
|
||||
double atof(const char *str);
|
||||
|
||||
double strtod(const char *str, char **ptr);
|
||||
|
||||
long strtol(const char *nptr, char **endptr, int base);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
39
openmp/libomptarget/DeviceRTL/include/Memory.h
Normal file
39
openmp/libomptarget/DeviceRTL/include/Memory.h
Normal file
@@ -0,0 +1,39 @@
|
||||
//===--- Memory.h - OpenMP device runtime memory allocator -------- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef OMPTARGET_MEMORY_H
|
||||
#define OMPTARGET_MEMORY_H
|
||||
|
||||
#include "Types.h"
|
||||
|
||||
extern "C" {
|
||||
__attribute__((leaf)) void *malloc(size_t Size);
|
||||
__attribute__((leaf)) void free(void *Ptr);
|
||||
}
|
||||
|
||||
namespace ompx {
|
||||
namespace memory {
|
||||
|
||||
struct MemoryAllocationInfo {
|
||||
void *BasePtr = nullptr;
|
||||
size_t Size = 0;
|
||||
|
||||
bool isValid() const { return BasePtr; }
|
||||
};
|
||||
|
||||
/// Get the memory allocation information if pointer \p P is in the range of one
|
||||
/// of the buffer allocated by \p malloc.
|
||||
MemoryAllocationInfo getMemoryAllocationInfo(void *P);
|
||||
} // namespace memory
|
||||
} // namespace ompx
|
||||
|
||||
#endif
|
||||
@@ -36,6 +36,9 @@ enum MemScopeTy {
|
||||
uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
|
||||
MemScopeTy MemScope = MemScopeTy::all);
|
||||
|
||||
///
|
||||
int32_t addSys(int32_t *Addr, int32_t V);
|
||||
|
||||
/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
|
||||
/// result is stored in \p *Addr;
|
||||
/// {
|
||||
@@ -135,6 +138,60 @@ void system(atomic::OrderingTy Ordering);
|
||||
|
||||
} // namespace fence
|
||||
|
||||
namespace atomic {
|
||||
|
||||
/// Atomically load \p Addr with \p Ordering semantics.
|
||||
uint32_t load(uint32_t *Addr, int Ordering);
|
||||
|
||||
/// Atomically load \p Addr with \p Ordering semantics.
|
||||
uint64_t load(uint64_t *Addr, int Ordering);
|
||||
|
||||
/// Atomically store \p V to \p Addr with \p Ordering semantics.
|
||||
void store(uint32_t *Addr, uint32_t V, int Ordering);
|
||||
|
||||
void store(uint64_t *Addr, uint64_t V, int Ordering);
|
||||
|
||||
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
|
||||
uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);
|
||||
|
||||
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
|
||||
uint32_t add(uint32_t *Addr, uint32_t V, int Ordering);
|
||||
|
||||
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
|
||||
uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);
|
||||
|
||||
} // namespace atomic
|
||||
|
||||
namespace mutex {
|
||||
|
||||
class TicketLock {
|
||||
uint64_t NowServing = 0;
|
||||
uint64_t NextTicket = 0;
|
||||
|
||||
public:
|
||||
TicketLock() = default;
|
||||
|
||||
TicketLock(const TicketLock &) = delete;
|
||||
|
||||
TicketLock(TicketLock &&) = delete;
|
||||
|
||||
void lock();
|
||||
|
||||
void unlock();
|
||||
};
|
||||
|
||||
template <typename T> class LockGuard {
|
||||
T &Lock;
|
||||
|
||||
public:
|
||||
explicit LockGuard(T &L) : Lock(L) { Lock.lock(); }
|
||||
|
||||
~LockGuard() { Lock.unlock(); }
|
||||
};
|
||||
|
||||
} // namespace mutex
|
||||
|
||||
} // namespace ompx
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -36,6 +36,7 @@ using size_t = decltype(sizeof(char));
|
||||
// TODO: Properly implement this
|
||||
using intptr_t = int64_t;
|
||||
using uintptr_t = uint64_t;
|
||||
using ptrdiff_t = intptr_t;
|
||||
|
||||
static_assert(sizeof(int8_t) == 1, "type size mismatch");
|
||||
static_assert(sizeof(uint8_t) == 1, "type size mismatch");
|
||||
@@ -83,6 +84,11 @@ enum kmp_sched_t {
|
||||
kmp_sched_distr_static_nochunk = 92,
|
||||
kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
|
||||
|
||||
// The following two are for direct GPU compilation where we don't have outer
|
||||
// loop to distribute workload among teams but we still need to.
|
||||
kmp_sched_distr_parallel_static_noloop_chunk = 100,
|
||||
kmp_sched_distr_parallel_static_noloop_nochunk = 101,
|
||||
|
||||
kmp_sched_default = kmp_sched_static_nochunk,
|
||||
kmp_sched_unordered_first = kmp_sched_static_chunk,
|
||||
kmp_sched_unordered_last = kmp_sched_auto,
|
||||
|
||||
@@ -16,6 +16,9 @@
|
||||
|
||||
#pragma omp begin declare target device_type(nohost)
|
||||
|
||||
extern "C" double omp_get_wtime();
|
||||
extern "C" int printf(const char *, ...);
|
||||
|
||||
namespace ompx {
|
||||
namespace utils {
|
||||
|
||||
@@ -37,8 +40,8 @@ template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
|
||||
}
|
||||
|
||||
/// Advance \p Ptr by \p Bytes bytes.
|
||||
template <typename Ty1, typename Ty2> inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) {
|
||||
return reinterpret_cast<Ty1 *>(reinterpret_cast<char *>(Ptr) + Bytes);
|
||||
template <typename Ty1, typename Ty2> inline Ty1 advance(Ty1 Ptr, Ty2 Bytes) {
|
||||
return reinterpret_cast<Ty1>(reinterpret_cast<char *>(Ptr) + Bytes);
|
||||
}
|
||||
|
||||
/// Return the first bit set in \p V.
|
||||
@@ -82,12 +85,39 @@ template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
|
||||
return *((DstTy *)(&V));
|
||||
}
|
||||
|
||||
template <typename Ty = char>
|
||||
ptrdiff_t getPtrDiff(const void *End, const void *Begin) {
|
||||
return reinterpret_cast<const Ty *>(End) -
|
||||
reinterpret_cast<const Ty *>(Begin);
|
||||
}
|
||||
|
||||
inline bool isInRange(void *Ptr, void *BasePtr, int64_t Offset) {
|
||||
ptrdiff_t Diff = getPtrDiff(Ptr, BasePtr);
|
||||
return Diff >= 0 && Diff < Offset;
|
||||
}
|
||||
|
||||
inline intptr_t ptrtoint(void *Ptr) { return reinterpret_cast<intptr_t>(Ptr); }
|
||||
|
||||
template <typename T> T min(T a, T b) { return a < b ? a : b; }
|
||||
|
||||
/// A pointer variable that has by design an `undef` value. Use with care.
|
||||
[[clang::loader_uninitialized]] static void *const UndefPtr;
|
||||
|
||||
#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
|
||||
#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
|
||||
|
||||
class SimpleProfiler {
|
||||
const char *HeadLine = nullptr;
|
||||
double Start;
|
||||
|
||||
public:
|
||||
SimpleProfiler(const char *HL) : HeadLine(HL), Start(omp_get_wtime()) {}
|
||||
~SimpleProfiler() {
|
||||
double End = omp_get_wtime();
|
||||
printf("%s --> %lf s.\n", HeadLine, End - Start);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace utils
|
||||
} // namespace ompx
|
||||
|
||||
|
||||
40
openmp/libomptarget/DeviceRTL/src/BuiltinAllocator.cpp
Normal file
40
openmp/libomptarget/DeviceRTL/src/BuiltinAllocator.cpp
Normal file
@@ -0,0 +1,40 @@
|
||||
//===------- BuiltinAllocator.cpp - Generic GPU memory allocator -- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma omp begin declare target device_type(nohost)
|
||||
|
||||
#include "Debug.h"
|
||||
#include "Memory.h"
|
||||
#include "Utils.h"
|
||||
|
||||
using namespace ompx;
|
||||
|
||||
namespace ompx {
|
||||
namespace memory {
|
||||
MemoryAllocationInfo getMemoryAllocationInfo(void *P) { return {}; }
|
||||
|
||||
void init() {}
|
||||
} // namespace memory
|
||||
} // namespace ompx
|
||||
|
||||
extern "C" {
|
||||
|
||||
void *realloc(void *ptr, size_t new_size) {
|
||||
void *NewPtr = malloc(new_size);
|
||||
if (!NewPtr)
|
||||
return nullptr;
|
||||
__builtin_memcpy(NewPtr, ptr, new_size);
|
||||
free(ptr);
|
||||
return NewPtr;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp end declare target
|
||||
17
openmp/libomptarget/DeviceRTL/src/CMakeLists.txt
Normal file
17
openmp/libomptarget/DeviceRTL/src/CMakeLists.txt
Normal file
@@ -0,0 +1,17 @@
|
||||
target_sources(omptarget.devicertl PRIVATE
|
||||
Configuration.cpp
|
||||
Debug.cpp
|
||||
HostRPC.cpp
|
||||
GenericAllocator.cpp
|
||||
Kernel.cpp
|
||||
LibC.cpp
|
||||
Mapping.cpp
|
||||
Misc.cpp
|
||||
Parallelism.cpp
|
||||
Reduction.cpp
|
||||
State.cpp
|
||||
Synchronization.cpp
|
||||
Tasking.cpp
|
||||
Utils.cpp
|
||||
WarpAllocator.cpp
|
||||
Workshare.cpp)
|
||||
310
openmp/libomptarget/DeviceRTL/src/GenericAllocator.cpp
Normal file
310
openmp/libomptarget/DeviceRTL/src/GenericAllocator.cpp
Normal file
@@ -0,0 +1,310 @@
|
||||
//===------- GenericAllocator.cpp - Generic GPU memory allocator -- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma omp begin declare target device_type(nohost)
|
||||
|
||||
#include "Debug.h"
|
||||
#include "Memory.h"
|
||||
#include "Synchronization.h"
|
||||
#include "Utils.h"
|
||||
|
||||
using namespace ompx;
|
||||
|
||||
char *CONSTANT(omptarget_device_heap_buffer)
|
||||
__attribute__((used, retain, weak, visibility("protected")));
|
||||
|
||||
size_t CONSTANT(omptarget_device_heap_size)
|
||||
__attribute__((used, retain, weak, visibility("protected")));
|
||||
|
||||
namespace {
|
||||
size_t HeapCurPos = 0;
|
||||
mutex::TicketLock HeapLock;
|
||||
mutex::TicketLock AllocationListLock;
|
||||
mutex::TicketLock FreeListLock;
|
||||
constexpr const size_t Alignment = 16;
|
||||
|
||||
using intptr_t = int64_t;
|
||||
|
||||
struct SimpleLinkListNode {
|
||||
template <typename T> friend struct SimpleLinkList;
|
||||
|
||||
protected:
|
||||
SimpleLinkListNode *Prev = nullptr;
|
||||
SimpleLinkListNode *Next = nullptr;
|
||||
|
||||
bool operator>(const SimpleLinkListNode &RHS) const {
|
||||
const auto This = reinterpret_cast<int64_t>(this);
|
||||
const auto RHSThis = reinterpret_cast<int64_t>(&RHS);
|
||||
return This > RHSThis;
|
||||
}
|
||||
|
||||
SimpleLinkListNode() = default;
|
||||
SimpleLinkListNode(const SimpleLinkListNode &) = delete;
|
||||
SimpleLinkListNode(SimpleLinkListNode &&) = delete;
|
||||
};
|
||||
|
||||
struct AllocationMetadata final : SimpleLinkListNode {
|
||||
public:
|
||||
size_t getUserSize() const;
|
||||
void *getUserAddr() const {
|
||||
return const_cast<void *>(reinterpret_cast<const void *>(this + 1));
|
||||
}
|
||||
|
||||
size_t getSize() const { return Size; }
|
||||
void setSize(size_t V) { Size = V; }
|
||||
|
||||
bool isInRange(void *Ptr) {
|
||||
return utils::isInRange(Ptr, this + 1, getUserSize());
|
||||
}
|
||||
|
||||
static AllocationMetadata *getFromUserAddr(void *P) {
|
||||
return getFromAddr(P) - 1;
|
||||
}
|
||||
|
||||
static AllocationMetadata *getFromAddr(void *P) {
|
||||
return reinterpret_cast<AllocationMetadata *>(P);
|
||||
}
|
||||
|
||||
private:
|
||||
size_t Size = 0;
|
||||
int64_t Reserved = 0;
|
||||
};
|
||||
|
||||
constexpr const size_t AllocationMetadataSize = sizeof(AllocationMetadata);
|
||||
static_assert(AllocationMetadataSize == 32,
|
||||
"expect the metadata size to be 32");
|
||||
|
||||
size_t AllocationMetadata::getUserSize() const {
|
||||
return Size - AllocationMetadataSize;
|
||||
}
|
||||
|
||||
/// Minimal intrusive doubly-linked list over SimpleLinkListNode-derived
/// elements. A sentinel Head node anchors the list; the list owns no
/// storage and performs no allocation. Not internally synchronized —
/// callers guard access with the corresponding TicketLock.
template <typename T> struct SimpleLinkList {
  /// Forward iterator. A default-constructed iterator (Node == nullptr)
  /// is the end() sentinel, matching the null Next of the last element.
  struct iterator {
    friend SimpleLinkList;

    T *operator->() { return reinterpret_cast<T *>(Node); }
    T &operator*() { return reinterpret_cast<T &>(*Node); }

    // Pre-increment; advancing end() would dereference null, so don't.
    iterator operator++() {
      Node = Node->Next;
      return *this;
    }

    bool operator==(const iterator &RHS) const { return Node == RHS.Node; }
    bool operator!=(const iterator &RHS) const { return !(*this == RHS); }

    /// Iterator to the following element without mutating *this.
    iterator next() const {
      iterator Itr;
      Itr.Node = Node->Next;
      return Itr;
    }

  private:
    SimpleLinkListNode *Node = nullptr;
  };

  iterator begin() {
    iterator Itr;
    Itr.Node = Head.Next;
    return Itr;
  }

  // Default-constructed iterator == null node == one-past-last.
  iterator end() { return {}; }

  /// Insert Node at the front (right after the sentinel Head).
  void insert(SimpleLinkListNode *Node) { insertImpl(&Head, Node); }

  /// Unlink the element the iterator points at. The iterator is dead
  /// afterwards (the node's Prev/Next are cleared).
  void remove(iterator Itr) {
    SimpleLinkListNode *Node = Itr.Node;
    remove(Node);
  }

  /// Unlink Node. Precondition: Node is currently linked in this list.
  void remove(SimpleLinkListNode *Node) { removeImpl(Node); }

  bool empty() const { return Head.Next == nullptr; }

private:
  /// Splice Node in immediately after Current, fixing up all four links.
  static void insertImpl(SimpleLinkListNode *Current,
                         SimpleLinkListNode *Node) {
    SimpleLinkListNode *OldNext = Current->Next;
    Node->Prev = Current;
    Node->Next = OldNext;
    if (OldNext)
      OldNext->Prev = Node;
    Current->Next = Node;
  }

  /// Unlink Node and reset its link fields so checkDangling holds.
  /// Node->Prev is always non-null for a linked node (at minimum the
  /// sentinel Head), hence the unconditional Prev->Next store.
  static void removeImpl(SimpleLinkListNode *Node) {
    SimpleLinkListNode *Prev = Node->Prev;
    SimpleLinkListNode *Next = Node->Next;
    Prev->Next = Next;
    if (Next)
      Next->Prev = Prev;
    Node->Prev = nullptr;
    Node->Next = nullptr;
  }

  /// Check if the node is in the list. Linear scan; debugging aid.
  bool checkExist(SimpleLinkListNode *Node) const {
    SimpleLinkListNode *P = Head.Next;
    while (P) {
      if (P == Node)
        return true;
      P = P->Next;
    }
    return false;
  }

  /// Sanity check a forward link pair (debugging aid).
  static bool checkSanity(SimpleLinkListNode *Current,
                          SimpleLinkListNode *Next) {
    return Current->Next == Next && (!Next || Next->Prev == Current);
  }

  /// Sanity check a full Prev/Current/Next triple (debugging aid).
  static bool checkSanity(SimpleLinkListNode *Prev, SimpleLinkListNode *Current,
                          SimpleLinkListNode *Next) {
    return Prev->Next == Current && Current->Next == Next &&
           Current->Prev == Prev && (!Next || Next->Prev == Current);
  }

  /// True when Node is linked into no list at all.
  static bool checkDangling(SimpleLinkListNode *Node) {
    return Node->Next == nullptr && Node->Prev == nullptr;
  }

  // Sentinel anchor: Head.Next is the first real element (or null).
  SimpleLinkListNode Head;
};
|
||||
|
||||
/// All live allocations (guarded by AllocationListLock).
SimpleLinkList<AllocationMetadata> AllocationList;
/// Recycled chunks available for reuse (guarded by FreeListLock).
SimpleLinkList<AllocationMetadata> FreeList;

/// A free chunk is only reused when it is less than SplitRatio times larger
/// than the request, to limit internal fragmentation (see malloc).
constexpr const int SplitRatio = 5;

#ifndef LIBOMPTARGET_MEMORY_PROFILING
/// No-op stand-in when memory profiling is disabled: construction and
/// destruction compile away entirely.
struct MemoryProfiler {
  MemoryProfiler(const char *S) {}
  ~MemoryProfiler() {}
};
#else
/// RAII scope profiler for allocator entry points, backed by
/// utils::SimpleProfiler.
struct MemoryProfiler : utils::SimpleProfiler {
  MemoryProfiler(const char *S) : utils::SimpleProfiler(S) {}
  // NOTE: the previous version explicitly invoked
  // utils::SimpleProfiler::~SimpleProfiler() in the destructor body. The
  // base destructor then ran a second time implicitly when this destructor
  // finished — double destruction of the base subobject, which is undefined
  // behavior. The implicitly generated destructor already destroys the base
  // exactly once.
  ~MemoryProfiler() = default;
};
#endif
|
||||
} // namespace
|
||||
|
||||
namespace ompx {
|
||||
namespace memory {
|
||||
/// Look up the live allocation whose payload contains P.
/// Returns the allocation's user base address and user size, or a
/// default-constructed (invalid) MemoryAllocationInfo when P does not fall
/// inside any tracked allocation. Takes AllocationListLock for the scan.
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
  mutex::LockGuard Guard(AllocationListLock);
  for (AllocationMetadata &Entry : AllocationList)
    if (Entry.isInRange(P))
      return {Entry.getUserAddr(), Entry.getUserSize()};
  return {};
}
|
||||
|
||||
void init() {}
|
||||
} // namespace memory
|
||||
} // namespace ompx
|
||||
|
||||
extern "C" {
|
||||
|
||||
void *memset(void *dest, int ch, size_t count);
|
||||
|
||||
/// Device-side malloc. Strategy: (1) scan the free list for a recycled
/// chunk that fits and is not wastefully large; (2) otherwise bump-allocate
/// a fresh chunk from the device heap; (3) trap on exhaustion (no nullptr
/// return on the heap path — callers never see an out-of-memory result).
/// Each lock is held for as short a span as possible and never nested.
void *malloc(size_t Size) {
  MemoryProfiler Profiler(__FUNCTION__);

  // Grow the request to cover the metadata header and round up to the
  // allocator's alignment; Size is the WHOLE chunk size from here on.
  Size = utils::align_up(Size + AllocationMetadataSize, Alignment);

  AllocationMetadata *MD = nullptr;

  {
    mutex::LockGuard FLG(FreeListLock);

    // First-fit scan for a recycled chunk large enough.
    auto Itr = FreeList.begin();
    for (; Itr != FreeList.end(); ++Itr) {
      if (Itr->getSize() >= Size)
        break;
    }

    // Only reuse when the candidate is less than SplitRatio times the
    // request; a much larger chunk is left for a better-matching request
    // (chunks are never split).
    bool Found = Itr != FreeList.end() && (Itr->getSize() / Size < SplitRatio);
    if (Found) {
      MD = &(*Itr);
      FreeList.remove(Itr);
    }
  }

  // Reuse path: the chunk keeps its recorded size; only re-register it.
  if (MD) {
    mutex::LockGuard ALG(AllocationListLock);
    AllocationList.insert(MD);
    return MD->getUserAddr();
  }

  {
    mutex::LockGuard LG(HeapLock);

    // Bump-allocate from the device heap if it still fits. The atomic add
    // happens under HeapLock, so the read above and the update are
    // consistent; the atomic form additionally publishes the new position.
    if (Size + HeapCurPos < omptarget_device_heap_size) {
      void *R = omptarget_device_heap_buffer + HeapCurPos;
      (void)atomic::add(&HeapCurPos, Size, atomic::acq_rel);
      MD = AllocationMetadata::getFromAddr(R);
    }
  }

  if (MD) {
    // We need to reset the head in case of any dirty data.
    memset(MD, 0, AllocationMetadataSize);
    MD->setSize(Size);
    mutex::LockGuard ALG(AllocationListLock);
    AllocationList.insert(MD);
    return MD->getUserAddr();
  }

  // Out of memory: report and trap. HeapCurPos is read here without the
  // lock, so the printed value is best-effort only.
  printf("out of heap memory! size=%lu, cur=%lu.\n", Size, HeapCurPos);

  printf("%s:%d\n", __FILE__, __LINE__);
  __builtin_trap();
}
|
||||
|
||||
void free(void *P) {
|
||||
MemoryProfiler Profiler(__FUNCTION__);
|
||||
|
||||
if (!P)
|
||||
return;
|
||||
|
||||
auto *MD = AllocationMetadata::getFromUserAddr(P);
|
||||
|
||||
{
|
||||
mutex::LockGuard ALG(AllocationListLock);
|
||||
AllocationList.remove(MD);
|
||||
}
|
||||
|
||||
{
|
||||
mutex::LockGuard FLG(FreeListLock);
|
||||
FreeList.insert(MD);
|
||||
}
|
||||
}
|
||||
|
||||
void *realloc(void *ptr, size_t new_size) {
|
||||
MemoryProfiler Profiler(__FUNCTION__);
|
||||
|
||||
void *NewPtr = malloc(new_size);
|
||||
if (!NewPtr)
|
||||
return nullptr;
|
||||
auto *OldMD = AllocationMetadata::getFromUserAddr(ptr);
|
||||
assert(ptr == OldMD->getUserAddr());
|
||||
__builtin_memcpy(NewPtr, ptr, utils::min(OldMD->getUserSize(), new_size));
|
||||
free(ptr);
|
||||
return NewPtr;
|
||||
}
|
||||
|
||||
void __kmpc_target_init_allocator() {}
|
||||
}
|
||||
|
||||
#pragma omp end declare target
|
||||
420
openmp/libomptarget/DeviceRTL/src/HostRPC.cpp
Normal file
420
openmp/libomptarget/DeviceRTL/src/HostRPC.cpp
Normal file
@@ -0,0 +1,420 @@
|
||||
//===------- HostRPC.cpp - Implementation of host RPC ------------- C++ ---===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma omp begin declare target device_type(nohost)
|
||||
|
||||
#include "HostRPC.h"
|
||||
|
||||
#include "Debug.h"
|
||||
#include "LibC.h"
|
||||
#include "Memory.h"
|
||||
#include "Synchronization.h"
|
||||
#include "Types.h"
|
||||
#include "Utils.h"
|
||||
|
||||
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
|
||||
|
||||
#ifdef HOSTRPC_DEBUG
|
||||
#define DEBUG_PREFIX "host-rpc-device"
|
||||
#define DP(FMT, ...) \
|
||||
{ printf("%s --> " FMT, DEBUG_PREFIX, __VA_ARGS__); }
|
||||
#else
|
||||
#define DP(FMT, ...)
|
||||
#endif
|
||||
|
||||
using namespace ompx;
|
||||
using namespace hostrpc;
|
||||
|
||||
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
|
||||
|
||||
// Host/device shared state for the host-RPC protocol. All four symbols are
// allocated and initialized by the host plugin; `used, retain, weak` keeps
// them alive through device-code optimization and lets the host patch them.
/// The single in-flight request descriptor (see FIXME: one request at a time).
Descriptor *omptarget_hostrpc_descriptor
    __attribute__((used, retain, weak, visibility("protected")));
/// Futex word: the device increments it to signal a request; the host
/// resets it to zero when the request is serviced.
int32_t *omptarget_hostrpc_futex
    __attribute__((used, retain, weak, visibility("protected")));
/// Scratch buffer for argument staging, bump-allocated by HostRPCMemAlloc.
char *omptarget_hostrpc_memory_buffer
    __attribute__((used, retain, weak, visibility("protected")));
/// Capacity of the scratch buffer in bytes.
size_t omptarget_hostrpc_memory_buffer_size
    __attribute__((used, retain, weak, visibility("protected")));

#ifdef HOSTRPC_PROFILING
// Wall-clock timestamps for the four phases of one RPC; written by
// __kmpc_host_rpc_get_desc/add_arg/send_and_wait and printed at the end of
// send_and_wait. Only meaningful because a single request is in flight.
int32_t HostRPCId;
double GetDescStart;
double GetDescEnd;
double AddArgStart;
double AddArgEnd;
double IssueAndWaitStart;
double IssueAndWaitEnd;
double CopyBackStart;
double CopyBackEnd;
#endif
|
||||
|
||||
namespace {
/// Bump-pointer offset into the host-RPC scratch buffer; reset wholesale by
/// HostRPCMemReset after each request completes.
size_t HostRPCMemoryBufferCurrentPosition = 0;
constexpr const size_t Alignment = 16;

// FIXME: For now we only allow one thread requesting host RPC.
mutex::TicketLock HostRPCLock;

/// Bump-allocate Size bytes (rounded up to Alignment) from the host-RPC
/// scratch buffer. Traps on exhaustion. NOTE(review): the buffer position
/// is read non-atomically before the atomic add — safe only because all
/// callers run while holding HostRPCLock (taken in
/// __kmpc_host_rpc_get_desc); confirm if that invariant ever changes.
void *HostRPCMemAlloc(size_t Size) {
  Size = utils::align_up(Size, Alignment);

  if (Size + HostRPCMemoryBufferCurrentPosition <
      omptarget_hostrpc_memory_buffer_size) {
    void *R =
        omptarget_hostrpc_memory_buffer + HostRPCMemoryBufferCurrentPosition;
    atomic::add(&HostRPCMemoryBufferCurrentPosition, Size, atomic::acq_rel);
    return R;
  }

  // Out of scratch memory: report location and abort the kernel.
  printf("%s:%d\n", __FILE__, __LINE__);
  __builtin_trap();

  // Unreachable; keeps the compiler satisfied about the return path.
  return nullptr;
}
|
||||
|
||||
// For now we just reset the buffer. All scratch allocations of one request
// are discarded at once when the request finishes (called from
// __kmpc_host_rpc_send_and_wait); valid because only one request is in
// flight at a time.
void HostRPCMemReset() { HostRPCMemoryBufferCurrentPosition = 0; }

// The protocol stores pointers in int64_t slots; make sure that round-trips.
static_assert(sizeof(intptr_t) == sizeof(int64_t), "pointer size not match");
|
||||
|
||||
/// Compiler/caller-provided description of one RPC argument: the base of a
/// buffer it may point into, its ArgType kind flags, the buffer size, and a
/// singly-linked chain of alternative descriptions for the same argument
/// slot (walked in __kmpc_host_rpc_add_arg).
struct HostRPCArgInfo {
  void *BasePtr;
  int64_t Type;
  int64_t Size;
  HostRPCArgInfo *Next;
};

/// One row of the per-request pointer map: a device buffer (BasePtr, Size)
/// and the scratch-buffer copy (MappedBasePtr) the host actually sees.
/// Kind carries the ArgType copy-to/copy-from flags used for staging and
/// copy-back. A null BasePtr marks an unused row.
struct HostRPCPointerMapEntry {
  void *BasePtr;
  void *MappedBasePtr;
  int64_t Size;
  int64_t Kind;
};
|
||||
|
||||
/// Translate a device pointer into the host-visible scratch buffer.
/// Looks BasePtr up in the request's pointer map; on a miss, allocates a
/// scratch copy of the whole [BasePtr, BasePtr+Size) buffer, records it,
/// and stages the contents if Kind requests copy-to. Returns the mapped
/// base advanced by Offset, so interior pointers translate correctly.
void *getMappedPointer(Descriptor *D, void *BasePtr, int64_t Size,
                       int64_t Offset, int64_t Kind) {
  assert(D->ArgMap && "ArgMap should not be nullptr");

  HostRPCPointerMapEntry *MapTable =
      reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
  // Occupied rows are contiguous from the front (BasePtr != nullptr).
  int I = 0;
  for (; I < D->NumArgs && MapTable[I].BasePtr; ++I)
    if (MapTable[I].BasePtr == BasePtr)
      return utils::advance(MapTable[I].MappedBasePtr, Offset);

  // The table has exactly NumArgs rows. Each argument introduces at most
  // one mapping, so a free row should always exist — but guard the write
  // anyway: without this check, a full table would make the stores below
  // run one element past the end of the map.
  assert(I < D->NumArgs && "host RPC pointer map table overflow");

  MapTable[I].BasePtr = BasePtr;
  MapTable[I].MappedBasePtr = HostRPCMemAlloc(Size);
  MapTable[I].Size = Size;
  MapTable[I].Kind = Kind;

  // Stage the device buffer into the scratch copy if the host reads it.
  if (Kind & ArgType::OMP_HOST_RPC_ARG_COPY_TO) {
    __builtin_memcpy(MapTable[I].MappedBasePtr, BasePtr, Size);
    DP("getMappedPointer: copy %ld bytes memory from %p to %p.\n", Size,
       BasePtr, MapTable[I].MappedBasePtr);
  }

  return utils::advance(MapTable[I].MappedBasePtr, Offset);
}
|
||||
|
||||
/// After the host has serviced a request, copy every mapped buffer whose
/// kind includes COPY_FROM back from the scratch copy into the original
/// device buffer. A request with no pointer map is a no-op.
void copybackIfNeeded(Descriptor *D) {
  if (!D->ArgMap)
    return;

  auto *Entries = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
  for (int Idx = 0; Idx < D->NumArgs; ++Idx) {
    HostRPCPointerMapEntry &Entry = Entries[Idx];
    // Occupied rows are contiguous from the front; the first null BasePtr
    // ends the table.
    if (!Entry.BasePtr)
      break;
    if (!(Entry.Kind & ArgType::OMP_HOST_RPC_ARG_COPY_FROM))
      continue;
    __builtin_memcpy(Entry.BasePtr, Entry.MappedBasePtr, Entry.Size);
    DP("copybackIfNeeded: copy %ld bytes memory from %p to %p.\n", Entry.Size,
       Entry.MappedBasePtr, Entry.BasePtr);
  }
}
|
||||
|
||||
} // namespace
|
||||
|
||||
extern "C" {
|
||||
/// Begin a host-RPC request: acquire the (single) request descriptor, fill
/// in its bookkeeping fields, and allocate the argument array and pointer
/// map from the scratch buffer.
/// NOTE: this function ACQUIRES HostRPCLock and returns with it held; the
/// matching unlock happens in __kmpc_host_rpc_send_and_wait. Every
/// get_desc call must therefore be paired with a send_and_wait.
__attribute__((noinline, used)) void *
__kmpc_host_rpc_get_desc(int32_t CallId, int32_t NumArgs, void *ArgInfo) {
  assert(omptarget_hostrpc_descriptor && omptarget_hostrpc_futex &&
         "no host rpc pointer");

  DP("device: stdin=%p, stdout=%p, stderr=%p\n", stdin, stdout, stderr);
  DP("get desc for request (id=%d), NumArgs=%d, ArgInfo=%p.\n", CallId, NumArgs,
     ArgInfo);
#ifdef HOSTRPC_DEBUG
  {
    void **AIs = reinterpret_cast<void **>(ArgInfo);
    for (int I = 0; I < NumArgs; ++I)
      DP("ArgInfo[%d]=%p.\n", I, AIs[I]);
  }
#endif

  // Serialize requests: only one thread may use the descriptor/scratch
  // buffer at a time (released in __kmpc_host_rpc_send_and_wait).
  HostRPCLock.lock();

#ifdef HOSTRPC_PROFILING
  HostRPCId = CallId;
  GetDescStart = omp_get_wtime();
#endif

  // TODO: change it after we support a queue-like data structure.
  Descriptor *D = omptarget_hostrpc_descriptor;

  D->Id = CallId;
  D->ArgInfo = reinterpret_cast<void **>(ArgInfo);
  D->NumArgs = NumArgs;
  D->Status = EXEC_STAT_CREATED;
  D->ReturnValue = 0;
  // Per-argument value slots and pointer-map rows live in the scratch
  // buffer; both are released wholesale by HostRPCMemReset.
  D->Args =
      reinterpret_cast<Argument *>(HostRPCMemAlloc(sizeof(Argument) * NumArgs));
  D->ArgMap = HostRPCMemAlloc(sizeof(HostRPCPointerMapEntry) * NumArgs);

  assert(!NumArgs || (D->Args && D->ArgMap) && "out of host rpc memory!");

  // Reset the map table. A null BasePtr marks a row unused; occupied rows
  // grow contiguously from the front (see getMappedPointer).
  auto *ArgMap = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
  for (int I = 0; I < NumArgs; ++I)
    ArgMap[I].BasePtr = nullptr;

#ifdef HOSTRPC_PROFILING
  GetDescEnd = omp_get_wtime();
  AddArgStart = omp_get_wtime();
#endif

  return D;
}
|
||||
|
||||
/// Record argument ArgNum of a pending request, classifying the raw 64-bit
/// value into either a literal (passed through untouched) or a pointer that
/// must be translated into the host-visible scratch buffer. Classification
/// order: zero value → stdio streams → compiler-provided argument info
/// (scalar, literal pointer, interior pointer into a described buffer) →
/// dynamic allocation tracked by the device allocator → give up and pass
/// the raw value through.
__attribute__((noinline, used)) void
__kmpc_host_rpc_add_arg(void *Desc, int64_t ArgVal, int32_t ArgNum) {
  auto *D = reinterpret_cast<Descriptor *>(Desc);
  assert(ArgNum < D->NumArgs && "out-of-range arguments");
  Argument &ArgInDesc = D->Args[ArgNum];

  DP("add arg (no=%d), arg=%lx to request (id=%d).\n", ArgNum, ArgVal, D->Id);

  // This early branch can rule out nullptr and zero scalar value because it
  // doesn't matter whether it is a pointer or scalar value.
  if (ArgVal == 0) {
    ArgInDesc.Value = 0;
    ArgInDesc.ArgType = Type::ARG_LITERAL;

    DP("arg (no=%d) is null, done.\n", ArgNum);

    return;
  }

  void *ArgPtr = reinterpret_cast<void *>(ArgVal);

  // The stdio streams are shared tokens the host recognizes directly;
  // no mapping needed.
  if (ArgPtr == stdin || ArgPtr == stdout || ArgPtr == stderr) {
    ArgInDesc.Value = ArgVal;
    ArgInDesc.ArgType = Type::ARG_POINTER;

    DP("arg (no=%d) is stdin/stdout/stderr, done.\n", ArgNum);

    return;
  }

  const auto *AI = reinterpret_cast<HostRPCArgInfo *>(D->ArgInfo[ArgNum]);

  DP("try to find arg (no=%d) from args AI=%p\n", ArgNum, AI);

  if (AI) {
    // Let's first check if Arg is a scalar.
    if (AI->Type == ArgType::OMP_HOST_RPC_ARG_SCALAR) {
      assert(AI->BasePtr == ArgPtr && "invalid scalar argument info");
      assert(AI->Next == nullptr && "invalid scalar argument info");

      ArgInDesc.Value = ArgVal;
      ArgInDesc.ArgType = Type::ARG_LITERAL;

      DP("arg (no=%d) is scalar, done.\n", ArgNum);

      return;
    }

    // Then let's see if it is a literal pointer that we don't need copy.
    // (Exact type match: a pure pointer kind without copy flags.)
    auto *P = AI;
    while (P) {
      if (P->Type == ArgType::OMP_HOST_RPC_ARG_PTR && P->BasePtr == ArgPtr) {
        ArgInDesc.Value = ArgVal;
        ArgInDesc.ArgType = Type::ARG_POINTER;

        DP("arg (no=%d) is literal pointer, done.\n", ArgNum);

        return;
      }
      P = P->Next;
    }

    // Next we check if it is within the range of any buffer described in
    // argument info. An interior pointer translates to the same offset
    // into the scratch copy of that buffer.
    P = AI;
    while (P) {
      if ((P->Type & ArgType::OMP_HOST_RPC_ARG_PTR) && P->Size) {
        if (utils::isInRange(ArgPtr, P->BasePtr, P->Size)) {
          auto Size = P->Size;
          auto Offset = utils::getPtrDiff(ArgPtr, P->BasePtr);

          ArgInDesc.Value = utils::ptrtoint(
              getMappedPointer(D, P->BasePtr, Size, Offset, P->Type));
          ArgInDesc.ArgType = Type::ARG_POINTER;

          DP("found a match for arg (no=%d). done.\n", ArgNum);

          return;
        }
      }
      P = P->Next;
    }
  }

  // Now we can't find a match from argument info, then we assume it is from
  // dynamic allocation. Heap allocations have no static description, so
  // conservatively map the whole allocation copy-to-and-from.
  memory::MemoryAllocationInfo MAI = memory::getMemoryAllocationInfo(ArgPtr);
  if (MAI.isValid()) {
    auto Size = MAI.Size;
    auto Offset = utils::getPtrDiff(ArgPtr, MAI.BasePtr);

    ArgInDesc.Value = utils::ptrtoint(
        getMappedPointer(D, MAI.BasePtr, Size, Offset,
                         /* Kind */ ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM));
    ArgInDesc.ArgType = Type::ARG_POINTER;

    DP("arg (no=%d) is from malloc. done.\n", ArgNum);

    return;
  }

  // Last resort: pass the raw value through and warn. The host will see a
  // device pointer it may not be able to dereference (unless USM).
  printf("request (id=%d) arg (no=%d, val=%p) is unknown. send it to host "
         "directly.\n",
         D->Id, ArgNum, ArgPtr);

  ArgInDesc.Value = ArgVal;
  ArgInDesc.ArgType = Type::ARG_POINTER;
}
|
||||
|
||||
/// Publish a fully-populated request to the host, spin until the host
/// signals completion, copy modified buffers back, reset the scratch
/// buffer, and RELEASE HostRPCLock (acquired in __kmpc_host_rpc_get_desc).
/// Returns the host-side call's return value.
__attribute__((noinline, used)) int64_t
__kmpc_host_rpc_send_and_wait(void *Desc) {
  auto *D = reinterpret_cast<Descriptor *>(Desc);
  int32_t Id = D->Id;

#ifdef HOSTRPC_PROFILING
  AddArgEnd = omp_get_wtime();
  IssueAndWaitStart = omp_get_wtime();
#endif

  // Signal the host by bumping the futex word; the host resets it to zero
  // once the request has been serviced.
  atomic::add(omptarget_hostrpc_futex, 1U, atomic::acq_rel);

  // A system fence is required to make sure futex on the host is also
  // updated if USM is supported.
  fence::system(atomic::seq_cst);

  DP("sent request (id=%d) to host. waiting for finish.\n", Id);

  unsigned NS = 8;

  // Spin until the host zeroes the futex. The add-of-zero is a system-scope
  // atomic read; nanosleep.u32 (NVPTX, sm_70+) backs off ~NS nanoseconds
  // per iteration to reduce memory traffic.
  while (atomic::addSys(omptarget_hostrpc_futex, 0)) {
    asm volatile("nanosleep.u32 %0;" : : "r"(NS));
    // if (NS < 64)
    //   NS *= 2;
    // fence::system(atomic::seq_cst);
  }

#ifdef HOSTRPC_PROFILING
  IssueAndWaitEnd = omp_get_wtime();
#endif

  DP("finish waiting for request (id=%d).\n", Id);

  // Read the result before the descriptor/scratch buffer are recycled.
  int64_t Ret = D->ReturnValue;

  assert(!D->NumArgs || D->ArgMap && "arg map should not be nullptr");

#ifdef HOSTRPC_PROFILING
  CopyBackStart = omp_get_wtime();
#endif

  // Copy host-modified buffers back into their device originals.
  if (D->ArgMap) {
    DP("copy memory back for request (id=%d).\n", Id);
    copybackIfNeeded(D);
    DP("finish copy memory back for request (id=%d).\n", Id);
  }

#ifdef HOSTRPC_PROFILING
  CopyBackEnd = omp_get_wtime();
#endif

  // All scratch allocations of this request are dead now.
  HostRPCMemReset();

  // We can unlock now as we already get all temporary part.
  // TODO: If we have a queue, we don't need this step.
  HostRPCLock.unlock();

  DP("request (id=%d) is done with return code=%lx.\n", Id, Ret);

#ifdef HOSTRPC_PROFILING
  printf("[host-rpc-profiling-device] id=%d, init=%lf, add_arg=%lf, wait=%lf, "
         "copy=%lf.\n",
         HostRPCId, GetDescEnd - GetDescStart, AddArgEnd - AddArgStart,
         IssueAndWaitEnd - IssueAndWaitStart, CopyBackEnd - CopyBackStart);
#endif

  return Ret;
}
|
||||
|
||||
/// Forward a __kmpc_parallel_51-style launch to the host via host RPC.
/// Builds the six-argument info array on the stack (kernel name string,
/// three scalar ints, the outlined-args array, and nargs), stages the args
/// array in scratch memory because the caller's `args` may live on the
/// device stack, then issues the request and waits for completion.
__attribute__((noinline, used)) void
__kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
                                 int32_t if_expr, int32_t num_threads,
                                 void **args, int64_t nargs) {
  constexpr const int64_t NumArgs = 6;
  HostRPCArgInfo ArgInfoArray[NumArgs];
  void *ArgInfo[NumArgs];
  // Default every slot to a scalar with no buffer; refined below.
  for (unsigned I = 0; I < NumArgs; ++I) {
    ArgInfoArray[I].BasePtr = 0;
    ArgInfoArray[I].Type = ArgType::OMP_HOST_RPC_ARG_SCALAR;
    ArgInfoArray[I].Size = 0;
    ArgInfoArray[I].Next = nullptr;
    ArgInfo[I] = &ArgInfoArray[I];
  }
  // Acquires HostRPCLock; released by __kmpc_host_rpc_send_and_wait below.
  // The real call id is patched into D->Id after the arg info is final.
  auto *D = (Descriptor *)__kmpc_host_rpc_get_desc(0, NumArgs, (void *)ArgInfo);

  // Set up arg info struct. Arg 0 is the NUL-terminated kernel name, copied
  // to the host; args 1-3 and 5 stay scalars whose BasePtr holds the value
  // itself (matching the SCALAR convention checked in add_arg).
  ArgInfoArray[0].BasePtr = const_cast<char *>(name);
  ArgInfoArray[0].Type = ArgType::OMP_HOST_RPC_ARG_COPY_TO;
  ArgInfoArray[0].Size = strlen(name) + 1;
  ArgInfoArray[1].BasePtr =
      reinterpret_cast<void *>(static_cast<int64_t>(gtid));
  ArgInfoArray[2].BasePtr =
      reinterpret_cast<void *>(static_cast<int64_t>(if_expr));
  ArgInfoArray[3].BasePtr =
      reinterpret_cast<void *>(static_cast<int64_t>(num_threads));
  ArgInfoArray[5].BasePtr = reinterpret_cast<void *>(nargs);

  // We need to treat args in a little bit different way because nargs might be
  // on the stack.
  ArgInfoArray[4].Size = sizeof(void *) * nargs;
  void *Args = nullptr;
  if (nargs) {
    // Snapshot the args array into scratch memory so the host sees a stable
    // copy (lock is already held, so HostRPCMemAlloc is safe here).
    Args = HostRPCMemAlloc(ArgInfoArray[4].Size);
    __builtin_memcpy(Args, args, ArgInfoArray[4].Size);
  }
  ArgInfoArray[4].BasePtr = Args;

  D->Id = CALLID___kmpc_launch_parallel_51_kernel;

  __kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(name), 0);
  __kmpc_host_rpc_add_arg(D, gtid, 1);
  __kmpc_host_rpc_add_arg(D, if_expr, 2);
  __kmpc_host_rpc_add_arg(D, num_threads, 3);
  __kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(Args), 4);
  __kmpc_host_rpc_add_arg(D, nargs, 5);

  // Fire the request; the launch returns nothing, so the result is dropped.
  (void)__kmpc_host_rpc_send_and_wait(D);
}
|
||||
}
|
||||
|
||||
#pragma omp end declare target
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "Debug.h"
|
||||
#include "Interface.h"
|
||||
#include "Mapping.h"
|
||||
#include "Memory.h"
|
||||
#include "State.h"
|
||||
#include "Synchronization.h"
|
||||
#include "Types.h"
|
||||
|
||||
@@ -7,6 +7,12 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "LibC.h"
|
||||
#include "Debug.h"
|
||||
#include "Memory.h"
|
||||
#include "Synchronization.h"
|
||||
#include "Utils.h"
|
||||
|
||||
using namespace ompx;
|
||||
|
||||
#pragma omp begin declare target device_type(nohost)
|
||||
|
||||
@@ -32,8 +38,324 @@ int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
|
||||
return -1;
|
||||
}
|
||||
} // namespace impl
|
||||
|
||||
#pragma omp end declare variant
|
||||
|
||||
// Dummy variable for stdio. The device has no real FILE streams; these ints
// only provide three distinct, stable addresses so that stdin/stdout/stderr
// compare unequal to each other and can be recognized as sentinel tokens by
// the host-RPC layer (see __kmpc_host_rpc_add_arg).
int StdInDummyVar;
int StdOutDummyVar;
int StdErrDummyVar;

// FILE stays an incomplete type on the device; only the pointers matter.
// `used, retain, weak` keeps the symbols through device-code optimization
// and lets the host plugin substitute real stream handles.
struct FILE;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdin =
    (FILE *)&StdInDummyVar;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdout =
    (FILE *)&StdOutDummyVar;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stderr =
    (FILE *)&StdErrDummyVar;

// qsort-style comparators: the plain two-argument form and the
// glibc-internal three-argument form (extra closure pointer) used by
// _quicksort below.
typedef int (*__compar_fn_t)(const void *, const void *);
typedef int (*__compar_d_fn_t)(const void *, const void *, void *);
|
||||
namespace {
|
||||
const int32_t ToLowerMapTable[] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6,
|
||||
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
|
||||
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
|
||||
37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
|
||||
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 'a', 'b',
|
||||
'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
|
||||
'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 91, 92, 93, 94, 95, 96,
|
||||
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
|
||||
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 123, 124, 125, 126,
|
||||
127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
const int32_t *ToLowerPTable = ToLowerMapTable + 128;
|
||||
|
||||
#define X(x) (((x) / 256 | (x)*256) % 65536)
|
||||
|
||||
static const unsigned short BLocTable[] = {
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
|
||||
X(0x200), X(0x200), X(0x200), X(0x200), X(0x320), X(0x220), X(0x220),
|
||||
X(0x220), X(0x220), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
|
||||
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
|
||||
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x160),
|
||||
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
|
||||
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
|
||||
X(0x4c0), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8),
|
||||
X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x4c0), X(0x4c0), X(0x4c0),
|
||||
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x8d5), X(0x8d5), X(0x8d5),
|
||||
X(0x8d5), X(0x8d5), X(0x8d5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
|
||||
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
|
||||
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
|
||||
X(0x8c5), X(0x8c5), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
|
||||
X(0x4c0), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6),
|
||||
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
|
||||
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
|
||||
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x4c0),
|
||||
X(0x4c0), X(0x4c0), X(0x4c0), X(0x200), 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
|
||||
#undef X
|
||||
|
||||
static const unsigned short *BLocTablePTable = BLocTable + 128;
|
||||
|
||||
int ErrNo __attribute__((used, retain));
|
||||
|
||||
static unsigned long int RandomNext = 1;
|
||||
|
||||
#define SWAP(a, b, size) \
|
||||
do { \
|
||||
size_t __size = (size); \
|
||||
char *__a = (a), *__b = (b); \
|
||||
do { \
|
||||
char __tmp = *__a; \
|
||||
*__a++ = *__b; \
|
||||
*__b++ = __tmp; \
|
||||
} while (--__size > 0); \
|
||||
} while (0)
|
||||
|
||||
/* Discontinue quicksort algorithm when partition gets below this size.
|
||||
This particular magic number was chosen to work best on a Sun 4/260. */
|
||||
#define MAX_THRESH 4
|
||||
|
||||
/* Stack node declarations used to store unfulfilled partition obligations. */
|
||||
typedef struct {
|
||||
char *lo;
|
||||
char *hi;
|
||||
} stack_node;
|
||||
|
||||
/* The next 4 #defines implement a very fast in-line stack abstraction. */
|
||||
/* The stack needs log (total_elements) entries (we could even subtract
|
||||
log(MAX_THRESH)). Since total_elements has type size_t, we get as
|
||||
upper bound for log (total_elements):
|
||||
bits per byte (CHAR_BIT) * sizeof(size_t). */
|
||||
#define CHAR_BIT 8
|
||||
#define STACK_SIZE (CHAR_BIT * sizeof(size_t))
|
||||
#define PUSH(low, high) ((void)((top->lo = (low)), (top->hi = (high)), ++top))
|
||||
#define POP(low, high) ((void)(--top, (low = top->lo), (high = top->hi)))
|
||||
#define STACK_NOT_EMPTY (stack < top)
|
||||
|
||||
/* Order size using quicksort. This implementation incorporates
|
||||
four optimizations discussed in Sedgewick:
|
||||
1. Non-recursive, using an explicit stack of pointer that store the
|
||||
next array partition to sort. To save time, this maximum amount
|
||||
of space required to store an array of SIZE_MAX is allocated on the
|
||||
stack. Assuming a 32-bit (64 bit) integer for size_t, this needs
|
||||
only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes).
|
||||
Pretty cheap, actually.
|
||||
2. Chose the pivot element using a median-of-three decision tree.
|
||||
This reduces the probability of selecting a bad pivot value and
|
||||
eliminates certain extraneous comparisons.
|
||||
3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
|
||||
insertion sort to order the MAX_THRESH items within each partition.
|
||||
This is a big win, since insertion sort is faster for small, mostly
|
||||
sorted array segments.
|
||||
4. The larger of the two sub-partitions is always pushed onto the
|
||||
stack first, with the algorithm then concentrating on the
|
||||
smaller partition. This *guarantees* no more than log (total_elems)
|
||||
stack size is needed (actually O(1) in this case)! */
|
||||
|
||||
/// glibc-style in-place quicksort over an array of `total_elems` elements of
/// `size` bytes each, ordered by the three-argument comparator `cmp` (the
/// extra `arg` is passed through untouched). Element data is moved
/// byte-by-byte via SWAP, so the routine works for any element size. The
/// quicksort phase leaves partitions of at most MAX_THRESH elements
/// unsorted; a final insertion-sort pass finishes the job.
void _quicksort(void *const pbase, size_t total_elems, size_t size,
                __compar_d_fn_t cmp, void *arg) {
  char *base_ptr = (char *)pbase;

  const size_t max_thresh = MAX_THRESH * size;

  if (total_elems == 0)
    /* Avoid lossage with unsigned arithmetic below. */
    return;

  if (total_elems > MAX_THRESH) {
    char *lo = base_ptr;
    char *hi = &lo[size * (total_elems - 1)];
    // Explicit pending-partition stack; STACK_SIZE bounds the depth because
    // the larger partition is always pushed and the smaller one processed.
    stack_node stack[STACK_SIZE];
    stack_node *top = stack;

    // Sentinel entry: popping it terminates the outer loop.
    PUSH(nullptr, nullptr);

    while (STACK_NOT_EMPTY) {
      char *left_ptr;
      char *right_ptr;

      /* Select median value from among LO, MID, and HI. Rearrange
         LO and HI so the three values are sorted. This lowers the
         probability of picking a pathological pivot value and
         skips a comparison for both the LEFT_PTR and RIGHT_PTR in
         the while loops. */

      char *mid = lo + size * ((hi - lo) / size >> 1);

      if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
        SWAP(mid, lo, size);
      if ((*cmp)((void *)hi, (void *)mid, arg) < 0)
        SWAP(mid, hi, size);
      else
        goto jump_over;
      if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
        SWAP(mid, lo, size);
    jump_over:;

      left_ptr = lo + size;
      right_ptr = hi - size;

      /* Here's the famous ``collapse the walls'' section of quicksort.
         Gotta like those tight inner loops! They are the main reason
         that this algorithm runs much faster than others. */
      do {
        while ((*cmp)((void *)left_ptr, (void *)mid, arg) < 0)
          left_ptr += size;

        while ((*cmp)((void *)mid, (void *)right_ptr, arg) < 0)
          right_ptr -= size;

        if (left_ptr < right_ptr) {
          SWAP(left_ptr, right_ptr, size);
          // The pivot may itself have been swapped; track its new home.
          if (mid == left_ptr)
            mid = right_ptr;
          else if (mid == right_ptr)
            mid = left_ptr;
          left_ptr += size;
          right_ptr -= size;
        } else if (left_ptr == right_ptr) {
          left_ptr += size;
          right_ptr -= size;
          break;
        }
      } while (left_ptr <= right_ptr);

      /* Set up pointers for next iteration. First determine whether
         left and right partitions are below the threshold size. If so,
         ignore one or both. Otherwise, push the larger partition's
         bounds on the stack and continue sorting the smaller one. */

      if ((size_t)(right_ptr - lo) <= max_thresh) {
        if ((size_t)(hi - left_ptr) <= max_thresh)
          /* Ignore both small partitions. */
          POP(lo, hi);
        else
          /* Ignore small left partition. */
          lo = left_ptr;
      } else if ((size_t)(hi - left_ptr) <= max_thresh)
        /* Ignore small right partition. */
        hi = right_ptr;
      else if ((right_ptr - lo) > (hi - left_ptr)) {
        /* Push larger left partition indices. */
        PUSH(lo, right_ptr);
        lo = left_ptr;
      } else {
        /* Push larger right partition indices. */
        PUSH(left_ptr, hi);
        hi = right_ptr;
      }
    }
  }

  /* Once the BASE_PTR array is partially sorted by quicksort the rest
     is completely sorted using insertion sort, since this is efficient
     for partitions below MAX_THRESH size. BASE_PTR points to the beginning
     of the array to sort, and END_PTR points at the very last element in
     the array (*not* one beyond it!). */

#define min(x, y) ((x) < (y) ? (x) : (y))

  {
    char *const end_ptr = &base_ptr[size * (total_elems - 1)];
    char *tmp_ptr = base_ptr;
    char *thresh = min(end_ptr, base_ptr + max_thresh);
    char *run_ptr;

    /* Find smallest element in first threshold and place it at the
       array's beginning. This is the smallest array element,
       and the operation speeds up insertion sort's inner loop. */

    for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
      if ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
        tmp_ptr = run_ptr;

    if (tmp_ptr != base_ptr)
      SWAP(tmp_ptr, base_ptr, size);

    /* Insertion sort, running from left-hand-side up to right-hand-side.
       The sentinel minimum at base_ptr lets the inner scan below omit a
       lower-bound check. */

    run_ptr = base_ptr + size;
    while ((run_ptr += size) <= end_ptr) {
      tmp_ptr = run_ptr - size;
      while ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
        tmp_ptr -= size;

      tmp_ptr += size;
      if (tmp_ptr != run_ptr) {
        char *trav;

        // Rotate the element at run_ptr into position tmp_ptr one byte at
        // a time, shifting the in-between elements up by one slot.
        trav = run_ptr + size;
        while (--trav >= run_ptr) {
          char c = *trav;
          char *hi, *lo;

          for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
            *hi = *lo;
          *hi = c;
        }
      }
    }
  }
}
|
||||
|
||||
} // namespace
|
||||
|
||||
extern "C" {
|
||||
|
||||
int memcmp(const void *lhs, const void *rhs, size_t count) {
|
||||
@@ -47,16 +369,419 @@ int memcmp(const void *lhs, const void *rhs, size_t count) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
void memset(void *dst, int C, size_t count) {
|
||||
auto *dstc = reinterpret_cast<char *>(dst);
|
||||
for (size_t I = 0; I < count; ++I)
|
||||
dstc[I] = C;
|
||||
}
|
||||
|
||||
/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf.
/// \p Arguments presumably points at a packed buffer of the variadic
/// arguments with \p Size bytes (NOTE(review): confirm against CGGPUBuiltin);
/// the call is forwarded to the target-specific impl::omp_vprintf.
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
  return impl::omp_vprintf(Format, Arguments, Size);
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
#ifndef ULONG_MAX
|
||||
#define ULONG_MAX ((unsigned long)(~0L)) /* 0xFFFFFFFF */
|
||||
#endif
|
||||
|
||||
#ifndef LONG_MAX
|
||||
#define LONG_MAX ((long)(ULONG_MAX >> 1)) /* 0x7FFFFFFF */
|
||||
#endif
|
||||
|
||||
#ifndef LONG_MIN
|
||||
#define LONG_MIN ((long)(~LONG_MAX)) /* 0x80000000 */
|
||||
#endif
|
||||
|
||||
#ifndef ISDIGIT
|
||||
#define ISDIGIT(C) ((C) >= '0' && (C) <= '9')
|
||||
#endif
|
||||
|
||||
#ifndef ISSPACE
|
||||
#define ISSPACE(C) ((C) == ' ')
|
||||
#endif
|
||||
|
||||
#ifndef ISALPHA
|
||||
#define ISALPHA(C) (((C) >= 'a' && (C) <= 'z') || ((C) >= 'A' && (C) <= 'Z'))
|
||||
#endif
|
||||
|
||||
#ifndef ISUPPER
|
||||
#define ISUPPER(C) ((C) >= 'A' && (C) <= 'Z')
|
||||
#endif
|
||||
|
||||
/// BSD-derived strtol: convert the initial portion of \p nptr to a long in
/// the given \p base (0 selects octal/decimal/hex automatically from the
/// prefix). On overflow sets errno to ERANGE and returns LONG_MIN/LONG_MAX.
/// When \p endptr is non-null it receives a pointer past the last consumed
/// character, or \p nptr when no digits were consumed at all.
/// Note: ISSPACE in this file only matches ' ', so tabs/newlines stop the
/// scan.
long strtol(const char *nptr, char **endptr, int base) {
  const char *s = nptr;
  unsigned long acc;
  int c;
  unsigned long cutoff;
  int neg = 0, any, cutlim;

  /*
   * Skip white space and pick up leading +/- sign if any.
   * If base is 0, allow 0x for hex and 0 for octal, else
   * assume decimal; if base is already 16, allow 0x.
   */
  do {
    c = *s++;
  } while (ISSPACE(c));
  if (c == '-') {
    neg = 1;
    c = *s++;
  } else if (c == '+')
    c = *s++;
  if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X')) {
    c = s[1];
    s += 2;
    base = 16;
  }
  if (base == 0)
    base = c == '0' ? 8 : 10;

  /*
   * Compute the cutoff value between legal numbers and illegal
   * numbers.  That is the largest legal value, divided by the
   * base.  An input number that is greater than this value, if
   * followed by a legal input character, is too big.  One that
   * is equal to this value may be valid or not; the limit
   * between valid and invalid numbers is then based on the last
   * digit.  For instance, if the range for longs is
   * [-2147483648..2147483647] and the input base is 10,
   * cutoff will be set to 214748364 and cutlim to either
   * 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
   * a value > 214748364, or equal but the next digit is > 7 (or 8),
   * the number is too big, and we will return a range error.
   *
   * Set any if any `digits' consumed; make it negative to indicate
   * overflow.
   */
  cutoff = neg ? -(unsigned long)LONG_MIN : LONG_MAX;
  cutlim = cutoff % (unsigned long)base;
  cutoff /= (unsigned long)base;
  for (acc = 0, any = 0;; c = *s++) {
    // Map '0'-'9' and 'a'-'z'/'A'-'Z' onto digit values 0..35; anything else
    // terminates the scan.
    if (ISDIGIT(c))
      c -= '0';
    else if (ISALPHA(c))
      c -= ISUPPER(c) ? 'A' - 10 : 'a' - 10;
    else
      break;
    if (c >= base)
      break;
    // Once `any` goes negative we are saturated; keep consuming digits so
    // endptr still ends up past the whole number.
    if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
      any = -1;
    else {
      any = 1;
      acc *= base;
      acc += c;
    }
  }
  if (any < 0) {
    acc = neg ? LONG_MIN : LONG_MAX;
    errno = ERANGE;
  } else if (neg)
    acc = -acc;
  // `s` has advanced one past the terminating character, hence s - 1.
  if (endptr != 0)
    *endptr = const_cast<char *>(any ? s - 1 : nptr);
  return (acc);
}
|
||||
|
||||
/// Compare two NUL-terminated strings lexicographically (C standard strcmp).
/// Returns <0, 0, or >0 when \p lhs sorts before, equal to, or after \p rhs.
/// The previous version returned from inside the loop on every iteration, so
/// it never examined more than the first character pair, and it returned -1
/// instead of 0 for equal strings.
int strcmp(const char *lhs, const char *rhs) {
  // The C standard requires comparison as unsigned char.
  const unsigned char *L = reinterpret_cast<const unsigned char *>(lhs);
  const unsigned char *R = reinterpret_cast<const unsigned char *>(rhs);
  while (*L != '\0' && *L == *R) {
    ++L;
    ++R;
  }
  return *L - *R;
}
|
||||
|
||||
/// Allocate zero-initialized storage for \p num objects of \p size bytes.
/// Fixes over the previous version:
///  - the word-at-a-time zeroing loop checked `q - p < bits` before storing a
///    whole int, so it could write up to sizeof(int) - 1 bytes past the end
///    of the allocation (and the byte cleanup loop after it was dead code);
///  - num * size is now checked for overflow instead of silently allocating
///    a too-small buffer.
void *calloc(size_t num, size_t size) {
  // Mirror malloc's behavior for zero-sized requests.
  if (num == 0 || size == 0)
    return malloc(0);
  size_t Bytes = num * size;
  if (Bytes / size != num) // multiplication overflowed
    return nullptr;
  char *P = (char *)malloc(Bytes);
  if (!P)
    return P;
  // Zero word-sized chunks while a full int still fits, then finish per byte.
  char *Q = P;
  while (Bytes - (size_t)(Q - P) >= sizeof(int)) {
    *(int *)Q = 0;
    Q += sizeof(int);
  }
  while ((size_t)(Q - P) < Bytes) {
    *Q = 0;
    ++Q;
  }
  return P;
}
|
||||
|
||||
// Terminate the calling device thread via the inline-asm "exit" instruction.
// The exit_code argument is ignored; the instruction carries no status.
void exit(int exit_code) { asm volatile("exit;"); }
|
||||
|
||||
/// Return the number of characters before the terminating NUL (C strlen).
/// The previous version skipped leading spaces before counting, so e.g.
/// strlen(" a") returned 1 instead of 2.
size_t strlen(const char *str) {
  size_t Len = 0;
  while (str[Len] != '\0')
    ++Len;
  return Len;
}
|
||||
|
||||
char *strcpy(char *dest, const char *src) {
|
||||
char *pd = dest;
|
||||
const char *ps = src;
|
||||
|
||||
while (*ps != '\0')
|
||||
*(pd++) = *(ps++);
|
||||
|
||||
*pd = '\0';
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
// glibc hook backing the `errno` macro: errno expands to
// (*__errno_location()). Returns the device-side ErrNo variable (defined
// elsewhere in this file).
int *__errno_location() { return &ErrNo; }
|
||||
|
||||
char *strcat(char *dest, const char *src) {
|
||||
char *pd = dest;
|
||||
const char *ps = src;
|
||||
|
||||
while (*pd != '\0')
|
||||
++pd;
|
||||
|
||||
while (*ps != '\0')
|
||||
*(pd++) = *(ps++);
|
||||
|
||||
*pd = '\0';
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
// Minimal perror: prints \p s via printf. Unlike the host version it does not
// append the errno description or a trailing newline.
void perror(const char *s) { printf("%s", s); }
|
||||
|
||||
int strncmp(const char *lhs, const char *rhs, size_t count) {
|
||||
size_t c = 0;
|
||||
while (*lhs != '\0' && *rhs != '\0' && c < count) {
|
||||
if (*lhs == *rhs) {
|
||||
++lhs;
|
||||
++rhs;
|
||||
++c;
|
||||
} else
|
||||
return *lhs - *rhs;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
char *strncpy(char *dest, const char *src, size_t count) {
|
||||
char *pd = dest;
|
||||
const char *ps = src;
|
||||
size_t c = 0;
|
||||
|
||||
while (*ps != '\0' && c < count) {
|
||||
*(pd++) = *(ps++);
|
||||
++c;
|
||||
}
|
||||
|
||||
if (c < count)
|
||||
*pd = '\0';
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
/// Locate the first occurrence of \p c in \p s. The terminating NUL counts as
/// part of the string, so strchr(s, '\0') returns a pointer to the
/// terminator. Returns nullptr when \p c does not occur.
char *strchr(const char *s, int c) {
  for (const char *P = s;; ++P) {
    if (*P == c)
      return const_cast<char *>(P);
    if (*P == '\0')
      return nullptr;
  }
}
|
||||
|
||||
/// Split \p str into tokens separated by any character in \p delim (C
/// strtok). Passing nullptr for \p str continues tokenizing the string from
/// the previous call. The scan position is kept in a function-local static,
/// so this is NOT thread-safe; concurrent callers will corrupt each other's
/// state.
char *strtok(char *str, const char *delim) {
  // Resume position across calls; nullptr means no string is in progress.
  static char *s = nullptr;
  char *tok;
  if (str == nullptr) {
    if (s == nullptr)
      return nullptr;
  } else
    s = str;
  // Skip leading delimiter characters: the inner loop leaves delim[i] == '\0'
  // when *s matched none of the delimiters, i.e. *s starts a token.
  for (size_t i; (*s != '\0'); s++) {
    for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
      ;
    if (delim[i] == '\0')
      break;
  }
  // Nothing but delimiters left: reset the saved state and report the end.
  if (*s == '\0')
    return s = nullptr;
  tok = s++;
  // Advance to the first delimiter after the token (delim[i] != '\0' means
  // *s matched a delimiter).
  for (size_t i; (*s != '\0'); s++) {
    for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
      ;
    if (delim[i] != '\0')
      break;
  }
  // Terminate the token in place and remember where to resume next call.
  if (*s != '\0') {
    *s = '\0';
    s++;
  }
  return tok;
}
|
||||
|
||||
// Seed the linear congruential generator used by rand() (state in
// RandomNext, defined elsewhere in this file).
void srand(unsigned seed) { RandomNext = seed; }
|
||||
|
||||
// Pseudo-random number generator: a linear congruential generator with the
// classic constants a = 1103515245, c = 12345. Discards the low 16 bits
// (they have short periods in an LCG) and reduces modulo 2^31 - 1.
int rand() {
  RandomNext = RandomNext * 1103515245 + 12345;
  return static_cast<unsigned int>((RandomNext / 65536)) % 2147483647;
}
|
||||
|
||||
/// Absolute value of \p n (undefined for INT_MIN, as with C abs).
int abs(int n) { return n < 0 ? -n : n; }
|
||||
|
||||
// Forward to the compiler builtin, which lowers to an optimized inline copy;
// returns \p dest per the C standard.
void *memcpy(void *dest, const void *src, size_t count) {
  __builtin_memcpy(dest, src, count);
  return dest;
}
|
||||
|
||||
/// Minimal strtoul: parse an unsigned decimal integer from \p str.
/// Fixes over the previous version:
///  - `if (*str_end)` dereferenced \p str_end, which the C standard allows to
///    be null; the intent was clearly `if (str_end)`;
///  - spaces were skipped anywhere in the input (so "1 2" parsed as 12);
///    only leading spaces are skipped now, matching the standard.
/// TODO: \p base is ignored (decimal only) and overflow is not detected, as
/// in the original implementation.
unsigned long strtoul(const char *str, char **str_end, int base) {
  while (*str == ' ')
    ++str;
  unsigned long res = 0;
  while (*str >= '0' && *str <= '9') {
    res = res * 10 + (unsigned long)(*str - '0');
    ++str;
  }
  if (str_end)
    *str_end = const_cast<char *>(str);
  return res;
}
|
||||
|
||||
// Convert the initial portion of \p s to double via strtod; no end-pointer
// reporting.
double atof(const char *s) { return strtod(s, nullptr); }
|
||||
|
||||
// Convert the initial decimal integer in \p str to int via strtol.
int atoi(const char *str) { return (int)strtol(str, nullptr, 10); }
|
||||
|
||||
/// Convert the initial portion of \p str to double (C strtod): optional
/// leading spaces and sign, then INF/INFINITY, NAN or NAN(...), or a decimal
/// number with optional fraction and exponent. When \p ptr is non-null it
/// receives a pointer one past the consumed token; if no conversion is
/// possible the result is 0.0 and *\p ptr is \p str.
///
/// The previous version only scanned the token and delegated the value to
/// atof() — but atof() in this file is itself implemented on top of strtod(),
/// so the pair mutually recursed forever and never produced a number. The
/// conversion is now performed directly. Accumulation is done in double, so
/// results may differ from a correctly-rounded strtod in the last ulp.
double strtod(const char *str, char **ptr) {
  const char *p = str;

  // Only ' ' counts as whitespace in this minimal libc (see ISSPACE).
  while (*p == ' ')
    ++p;

  double Sign = 1.0;
  if (*p == '+' || *p == '-') {
    if (*p == '-')
      Sign = -1.0;
    ++p;
  }

  /* INF or INFINITY. */
  if ((p[0] == 'i' || p[0] == 'I') && (p[1] == 'n' || p[1] == 'N') &&
      (p[2] == 'f' || p[2] == 'F')) {
    const char *End = p + 3;
    if ((p[3] == 'i' || p[3] == 'I') && (p[4] == 'n' || p[4] == 'N') &&
        (p[5] == 'i' || p[5] == 'I') && (p[6] == 't' || p[6] == 'T') &&
        (p[7] == 'y' || p[7] == 'Y'))
      End = p + 8;
    if (ptr)
      *ptr = const_cast<char *>(End);
    return Sign * __builtin_huge_val();
  }

  /* NAN or NAN(foo). */
  if ((p[0] == 'n' || p[0] == 'N') && (p[1] == 'a' || p[1] == 'A') &&
      (p[2] == 'n' || p[2] == 'N')) {
    p += 3;
    if (*p == '(') {
      ++p;
      while (*p != '\0' && *p != ')')
        ++p;
      if (*p == ')')
        ++p;
    }
    if (ptr)
      *ptr = const_cast<char *>(p);
    return __builtin_nan("");
  }

  /* Digits, with 0 or 1 periods in them. */
  bool AnyDigits = false;
  double Value = 0.0;
  while (*p >= '0' && *p <= '9') {
    Value = Value * 10.0 + (*p - '0');
    ++p;
    AnyDigits = true;
  }
  if (*p == '.') {
    ++p;
    double Scale = 0.1;
    while (*p >= '0' && *p <= '9') {
      Value += (*p - '0') * Scale;
      Scale *= 0.1;
      ++p;
      AnyDigits = true;
    }
  }

  /* Didn't find any digits.  Doesn't look like a number. */
  if (!AnyDigits) {
    if (ptr)
      *ptr = const_cast<char *>(str);
    return 0.0;
  }

  /* Exponent. */
  if (*p == 'e' || *p == 'E') {
    const char *ExpStart = p;
    ++p;
    int ExpSign = 1;
    if (*p == '+' || *p == '-') {
      if (*p == '-')
        ExpSign = -1;
      ++p;
    }
    if (*p >= '0' && *p <= '9') {
      int Exp = 0;
      while (*p >= '0' && *p <= '9') {
        Exp = Exp * 10 + (*p - '0');
        ++p;
      }
      double Factor = 1.0;
      while (Exp-- > 0)
        Factor *= 10.0;
      if (ExpSign < 0)
        Value /= Factor;
      else
        Value *= Factor;
    } else {
      // 'e' not followed by digits is not part of the number.
      p = ExpStart;
    }
  }

  if (ptr)
    *ptr = const_cast<char *>(p);
  return Sign * Value;
}
|
||||
|
||||
void *memset(void *dest, int ch, size_t count) {
|
||||
auto *P = reinterpret_cast<unsigned char *>(dest);
|
||||
for (size_t I = 0; I < count; ++I, ++P)
|
||||
*P = (unsigned char)ch;
|
||||
return dest;
|
||||
}
|
||||
|
||||
/// ASCII-only tolower: map 'A'..'Z' to 'a'..'z'; every other value is
/// returned unchanged.
int tolower(int ch) {
  const bool IsUpper = ch >= 'A' && ch <= 'Z';
  return IsUpper ? ch - 'A' + 'a' : ch;
}
|
||||
|
||||
/// Find the first occurrence of the substring \p s2 in \p s1 (C strstr).
/// Fixes over the previous version:
///  - an empty needle now returns \p s1, as the standard requires (it used to
///    fall through to the not-found path when \p s1 was empty);
///  - matching stops at either string's terminator instead of calling
///    memcmp(s1, s2, strlen(s2)), which could read past the end of \p s1
///    near its terminator.
char *strstr(const char *s1, const char *s2) {
  if (*s2 == '\0')
    return const_cast<char *>(s1);
  for (; *s1 != '\0'; ++s1) {
    const char *A = s1;
    const char *B = s2;
    while (*A != '\0' && *B != '\0' && *A == *B) {
      ++A;
      ++B;
    }
    if (*B == '\0') // consumed the whole needle: match at s1
      return const_cast<char *>(s1);
  }
  return nullptr;
}
|
||||
|
||||
/// POSIX basename: return the component after the final '/'. The previous
/// stub returned the whole path unchanged. This version never modifies its
/// input and therefore (unlike POSIX basename) does not strip trailing
/// slashes: "a/b/" yields the empty string.
char *__xpg_basename(const char *path) {
  const char *Base = path;
  for (const char *P = path; *P != '\0'; ++P)
    if (*P == '/')
      Base = P + 1;
  return const_cast<char *>(Base);
}
|
||||
|
||||
// glibc ctype hook: returns the pointer to the character-classification
// table used by the isalpha/isdigit macro family (table defined elsewhere).
const unsigned short **__ctype_b_loc(void) { return &BLocTablePTable; }
|
||||
|
||||
// glibc ctype hook: returns the pointer to the tolower translation table
// (table defined elsewhere).
const int32_t **__ctype_tolower_loc(void) { return &ToLowerPTable; }
|
||||
|
||||
void qsort(void *b, size_t n, size_t s, __compar_fn_t cmp) {
|
||||
return _quicksort(b, n, s, (__compar_d_fn_t)cmp, nullptr);
|
||||
}
|
||||
|
||||
/// Case-insensitive string comparison: returns <0, 0, or >0 as \p s1 sorts
/// before, equal to, or after \p s2 when both are folded through tolower.
int strcasecmp(const char *s1, const char *s2) {
  const unsigned char *A = (const unsigned char *)s1;
  const unsigned char *B = (const unsigned char *)s2;

  // Identical pointers compare equal without scanning.
  if (A == B)
    return 0;

  for (;;) {
    const int Diff = tolower(*A) - tolower(*B);
    if (Diff != 0 || *A == '\0')
      return Diff;
    ++A;
    ++B;
  }
}
|
||||
}
|
||||
|
||||
#pragma omp end declare target
|
||||
|
||||
@@ -282,9 +282,15 @@ uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
|
||||
|
||||
uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
|
||||
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
|
||||
// If we are in SPMD mode, remove one warp.
|
||||
return BlockSize - (!IsSPMD * impl::getWarpSize());
|
||||
if (IsSPMD)
|
||||
return BlockSize;
|
||||
uint32_t WarpSize = impl::getWarpSize();
|
||||
bool IsFullLastWarp = !(BlockSize % WarpSize);
|
||||
if (OMP_LIKELY(IsFullLastWarp))
|
||||
return BlockSize - WarpSize;
|
||||
return BlockSize - BlockSize % WarpSize;
|
||||
}
|
||||
|
||||
uint32_t mapping::getMaxTeamThreads() {
|
||||
return mapping::getMaxTeamThreads(mapping::isSPMDMode());
|
||||
}
|
||||
|
||||
@@ -47,6 +47,9 @@ using namespace ompx;
|
||||
namespace {
|
||||
|
||||
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
|
||||
if (NumThreadsClause != -1)
|
||||
return NumThreadsClause;
|
||||
|
||||
uint32_t NThreadsICV =
|
||||
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
|
||||
uint32_t NumThreads = mapping::getMaxTeamThreads();
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "LibC.h"
|
||||
#include "Mapping.h"
|
||||
#include "State.h"
|
||||
#include "Memory.h"
|
||||
#include "Synchronization.h"
|
||||
#include "Types.h"
|
||||
#include "Utils.h"
|
||||
@@ -83,6 +84,7 @@ struct SharedMemorySmartStackTy {
|
||||
void *push(uint64_t Bytes);
|
||||
|
||||
/// Deallocate the last allocation made by the encountering thread and pointed
|
||||
//
|
||||
/// to by \p Ptr from the stack. Each thread can call this function.
|
||||
void pop(void *Ptr, uint32_t Bytes);
|
||||
|
||||
@@ -274,7 +276,8 @@ void state::enterDataEnvironment(IdentTy *Ident) {
|
||||
uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
|
||||
if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
|
||||
uint32_t Bytes =
|
||||
sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
|
||||
sizeof(ThreadStates[0]) *
|
||||
(mapping::getNumberOfThreadsInBlock() + (mapping::isSPMDMode() ? 0 : 1));
|
||||
void *ThreadStatesPtr =
|
||||
memory::allocGlobal(Bytes, "Thread state array allocation");
|
||||
memset(ThreadStatesPtr, 0, Bytes);
|
||||
@@ -371,6 +374,15 @@ int omp_get_thread_num(void) {
|
||||
return omp_get_ancestor_thread_num(omp_get_level());
|
||||
}
|
||||
|
||||
int omp_get_bulk_thread_num() {
|
||||
ASSERT(mapping::isSPMDMode(), "Not SPMD");
|
||||
int BId = mapping::getBlockIdInKernel();
|
||||
int BSize = mapping::getMaxTeamThreads();
|
||||
int TId = mapping::getThreadIdInBlock();
|
||||
int Id = BId * BSize + TId;
|
||||
return returnValIfLevelIsActive(omp_get_level(), Id, 0);
|
||||
}
|
||||
|
||||
int omp_get_team_size(int Level) {
|
||||
return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
|
||||
}
|
||||
@@ -379,6 +391,11 @@ int omp_get_num_threads(void) {
|
||||
return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
|
||||
}
|
||||
|
||||
int omp_get_bulk_num_threads(void) {
|
||||
ASSERT(mapping::isSPMDMode(), "Not SPMD");
|
||||
return mapping::getNumberOfThreadsInKernel();
|
||||
}
|
||||
|
||||
int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
|
||||
|
||||
int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
|
||||
|
||||
@@ -122,6 +122,8 @@ uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
|
||||
}
|
||||
///}
|
||||
|
||||
int32_t atomicAddSys(int32_t *Address, uint32_t Val);
|
||||
|
||||
// Forward declarations defined to be defined for AMDGCN and NVPTX.
|
||||
uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
|
||||
atomic::MemScopeTy MemScope);
|
||||
@@ -336,6 +338,10 @@ uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
|
||||
return __nvvm_atom_inc_gen_ui(Address, Val);
|
||||
}
|
||||
|
||||
int32_t atomicAddSys(int32_t *Address, int32_t Val) {
|
||||
return __nvvm_atom_sys_add_gen_i(Address, Val);
|
||||
}
|
||||
|
||||
void namedBarrierInit() {}
|
||||
|
||||
void namedBarrier() {
|
||||
@@ -509,6 +515,22 @@ void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
|
||||
|
||||
void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
|
||||
|
||||
int32_t atomic::addSys(int32_t *Addr, int32_t Val) {
|
||||
return impl::atomicAddSys(Addr, Val);
|
||||
}
|
||||
|
||||
void mutex::TicketLock::lock() {
|
||||
uint64_t MyTicket = atomic::add(&NextTicket, 1, atomic::seq_cst);
|
||||
while (atomic::load(&NowServing, atomic::aquire) != MyTicket)
|
||||
;
|
||||
fence::kernel(atomic::aquire);
|
||||
}
|
||||
|
||||
void mutex::TicketLock::unlock() {
|
||||
fence::kernel(atomic::release);
|
||||
atomic::add(&NowServing, 1, atomic::seq_cst);
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}
|
||||
|
||||
|
||||
304
openmp/libomptarget/DeviceRTL/src/TeamAllocator.cpp
Normal file
304
openmp/libomptarget/DeviceRTL/src/TeamAllocator.cpp
Normal file
@@ -0,0 +1,304 @@
|
||||
//===------- WarpAllocator.cpp - Warp memory allocator ------- C++ -*-========//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma omp begin declare target device_type(nohost)
|
||||
|
||||
#include "Debug.h"
|
||||
#include "Mapping.h"
|
||||
#include "Memory.h"
|
||||
#include "State.h"
|
||||
#include "Synchronization.h"
|
||||
#include "Types.h"
|
||||
#include "Utils.h"
|
||||
|
||||
using namespace ompx;
|
||||
|
||||
char *CONSTANT(omptarget_device_heap_buffer)
|
||||
__attribute__((used, retain, weak, visibility("protected")));
|
||||
|
||||
size_t CONSTANT(omptarget_device_heap_size)
|
||||
__attribute__((used, retain, weak, visibility("protected")));
|
||||
|
||||
namespace {
|
||||
constexpr const size_t Alignment = 16;
|
||||
constexpr const size_t FirstThreadRatio = 40;
|
||||
constexpr const size_t SplitThreadhold = Alignment * 4;
|
||||
|
||||
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
|
||||
|
||||
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator;
|
||||
|
||||
/// Header prepended to every allocation made by WarpAllocator. Entries form
/// an implicit doubly-linked list over the heap buffer: the next entry
/// starts at `this + getSize()` and the previous one at `this - PrevSize`.
class WarpAllocatorEntry {
  template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> friend struct WarpAllocator;

  /// Total entry size in bytes (header plus user data), with the in-use flag
  /// encoded in the sign. If Size is less than 0, the entry is allocated (in
  /// use); 0 means uninitialized.
  int64_t Size = 0;
  /// PrevSize is also supposed to be greater than or equal to 0. When it is 0,
  /// it is the first entry of the buffer.
  int64_t PrevSize = 0;

public:
  /// True for the first entry of a block (no predecessor recorded).
  bool isFirst() const { return !PrevSize; }

  /// Total size regardless of the in-use flag encoded in the sign.
  size_t getSize() const { return abs(Size); }
  void setSize(size_t V) { Size = V; }

  /// Record the predecessor's size, or 0 when there is none.
  void setPrevSize(WarpAllocatorEntry *Prev) {
    PrevSize = Prev ? Prev->getSize() : 0;
  }

  /// Bytes available to the user: total size minus this header.
  size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }

  // Note: isUsed can not be !isUnused or other way around because when Size is
  // 0, it is uninitialized.
  bool isUsed() const { return Size < 0; }
  bool isUnused() const { return Size > 0; }

  /// Flip the sign of Size to mark the entry allocated.
  void setUsed() {
    assert(isUnused() && "the entry is in use");
    Size *= -1;
  }
  /// Flip the sign of Size back to mark the entry free.
  void setUnused() {
    assert(isUsed() && "the entry is not in use");
    Size *= -1;
  }

  /// First byte of the user region, immediately after this header.
  char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
  /// One past the last user byte, which is also the next entry's header.
  char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }

  WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
  WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }

  /// Recover the header from the pointer handed out to the user.
  static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }

  static WarpAllocatorEntry *fromPtr(void *Ptr) {
    return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
  }
};
|
||||
|
||||
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
|
||||
|
||||
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
|
||||
void init() {
|
||||
if (mapping::isSPMDMode() &&
|
||||
(mapping::getThreadIdInBlock() || mapping::getBlockIdInKernel()))
|
||||
return;
|
||||
|
||||
size_t HeapSize = omptarget_device_heap_size;
|
||||
|
||||
FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
|
||||
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
|
||||
size_t OtherThreadHeapSize =
|
||||
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
|
||||
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
|
||||
|
||||
size_t TeamHeapSize = FirstThreadHeapSize / TEAM_SIZE;
|
||||
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
|
||||
FirstTeamSize = TeamHeapSize;
|
||||
|
||||
char *LastLimit = omptarget_device_heap_buffer;
|
||||
for (int I = 0; I < WARP_SIZE; ++I) {
|
||||
for (int J = 0; J < TEAM_SIZE; ++J) {
|
||||
Entries[I][J] = nullptr;
|
||||
Limits[I][J] = LastLimit + TeamHeapSize * (J + 1);
|
||||
}
|
||||
LastLimit += I ? OtherThreadHeapSize : FirstThreadHeapSize;
|
||||
Limits[I][TEAM_SIZE - 1] = LastLimit;
|
||||
TeamHeapSize = OtherThreadHeapSize / TEAM_SIZE;
|
||||
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
|
||||
}
|
||||
}
|
||||
|
||||
void *allocate(size_t Size) {
|
||||
int32_t TeamSlot = getTeamSlot();
|
||||
int32_t TIdInWarp = mapping::getThreadIdInWarp();
|
||||
|
||||
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
|
||||
|
||||
// Error our early if the requested size is larger than the entire block.
|
||||
if (Size > getBlockSize(TIdInWarp, TeamSlot))
|
||||
return nullptr;
|
||||
|
||||
WarpAllocatorEntry *E = nullptr;
|
||||
{
|
||||
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
|
||||
|
||||
auto *LastEntry = Entries[TIdInWarp][TeamSlot];
|
||||
auto *NewWatermark = (LastEntry ? LastEntry->getEndPtr()
|
||||
: getBlockBegin(TIdInWarp, TeamSlot)) +
|
||||
Size;
|
||||
if (NewWatermark >= Limits[TIdInWarp][TeamSlot]) {
|
||||
E = findMemorySlow(Size, TIdInWarp, TeamSlot);
|
||||
} else {
|
||||
E = LastEntry ? LastEntry->getNext()
|
||||
: WarpAllocatorEntry::fromPtr(
|
||||
getBlockBegin(TIdInWarp, TeamSlot));
|
||||
E->setSize(Size);
|
||||
E->setPrevSize(LastEntry);
|
||||
Entries[TIdInWarp][TeamSlot] = E;
|
||||
}
|
||||
|
||||
if (!E)
|
||||
return nullptr;
|
||||
|
||||
E->setUsed();
|
||||
}
|
||||
|
||||
return E->getUserPtr();
|
||||
}
|
||||
|
||||
void deallocate(void *Ptr) {
|
||||
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
|
||||
|
||||
auto TeamSlot = getTeamSlot();
|
||||
auto TIdInWarp = mapping::getThreadIdInWarp();
|
||||
|
||||
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
|
||||
E->setUnused();
|
||||
// Is last entry?
|
||||
if (E == Entries[TIdInWarp][TeamSlot]) {
|
||||
do {
|
||||
E = E->getPrev();
|
||||
} while (!E->isFirst() && !E->isUsed());
|
||||
Entries[TIdInWarp][TeamSlot] = E;
|
||||
}
|
||||
}
|
||||
|
||||
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
|
||||
if (!utils::isInRange(P, omptarget_device_heap_buffer,
|
||||
omptarget_device_heap_size))
|
||||
return {};
|
||||
|
||||
auto TeamSlot = getTeamSlot();
|
||||
auto TIdInWarp = mapping::getThreadIdInWarp();
|
||||
for (int I = TIdInWarp; I < TIdInWarp + WARP_SIZE; ++I) {
|
||||
int TId = I % WARP_SIZE;
|
||||
for (int J = TeamSlot; J < TeamSlot + TEAM_SIZE; ++J) {
|
||||
int SId = J % TEAM_SIZE;
|
||||
if (P < getBlockBegin(TId, SId) || P >= getBlockEnd(TId, SId))
|
||||
continue;
|
||||
|
||||
mutex::LockGuard LG(Locks[I][SId]);
|
||||
WarpAllocatorEntry *E = Entries[I][SId];
|
||||
if (!E)
|
||||
return {};
|
||||
if (E->getEndPtr() <= P)
|
||||
return {};
|
||||
do {
|
||||
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
|
||||
if (!E->isUsed())
|
||||
return {};
|
||||
return {E->getUserPtr(), E->getUserSize()};
|
||||
}
|
||||
E = E->getPrev();
|
||||
} while (!E->isFirst());
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
private:
|
||||
char *getBlockBegin(int32_t TIdInWarp, int32_t TeamSlot) const {
|
||||
if (TeamSlot)
|
||||
return Limits[TIdInWarp][TeamSlot - 1];
|
||||
if (TIdInWarp)
|
||||
return Limits[TIdInWarp - 1][TEAM_SIZE - 1];
|
||||
return omptarget_device_heap_buffer;
|
||||
}
|
||||
char *getBlockEnd(int32_t TIdInWarp, int32_t TeamSlot) const {
|
||||
return Limits[TIdInWarp][TeamSlot];
|
||||
}
|
||||
|
||||
size_t getBlockSize(int32_t TIdInWarp, int32_t TeamSlot) const {
|
||||
return getBlockEnd(TIdInWarp, TeamSlot) -
|
||||
getBlockBegin(TIdInWarp, TeamSlot);
|
||||
}
|
||||
|
||||
static int32_t getTeamSlot() { return mapping::getBlockIdInKernel() % TEAM_SIZE; }
|
||||
|
||||
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp,
|
||||
int32_t TeamSlot) {
|
||||
char *Ptr = getBlockBegin(TIdInWarp, TeamSlot);
|
||||
char *Limit = getBlockEnd(TIdInWarp, TeamSlot);
|
||||
|
||||
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
|
||||
do {
|
||||
if (!E->isUsed() && E->getSize() >= Size)
|
||||
break;
|
||||
E = E->getNext();
|
||||
if (reinterpret_cast<char *>(E) + Size > Limit)
|
||||
return nullptr;
|
||||
} while (1);
|
||||
|
||||
size_t OldSize = E->getSize();
|
||||
if (OldSize - Size >= SplitThreadhold) {
|
||||
auto *OldNext = E->getNext();
|
||||
E->setSize(Size);
|
||||
auto *LeftOverE = E->getNext();
|
||||
LeftOverE->setPrevSize(E);
|
||||
LeftOverE->setSize(OldSize - Size);
|
||||
OldNext->setPrevSize(LeftOverE);
|
||||
}
|
||||
|
||||
return E;
|
||||
}
|
||||
|
||||
WarpAllocatorEntry *Entries[WARP_SIZE][TEAM_SIZE];
|
||||
char *Limits[WARP_SIZE][TEAM_SIZE];
|
||||
mutex::TicketLock Locks[WARP_SIZE][TEAM_SIZE];
|
||||
size_t FirstThreadHeapSize;
|
||||
size_t FirstTeamSize;
|
||||
};
|
||||
|
||||
WarpAllocator<32, 16> Allocator;
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace ompx {
|
||||
namespace memory {
|
||||
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
|
||||
return Allocator.getMemoryAllocationInfo(P);
|
||||
}
|
||||
} // namespace memory
|
||||
} // namespace ompx
|
||||
|
||||
extern "C" {
|
||||
|
||||
/// Allocate \p Size bytes from the per-warp device heap. Returns storage
/// aligned to `Alignment`, or nullptr for Size == 0 (a permitted C behavior).
void *malloc(size_t Size) {
  if (!Size)
    return nullptr;
  void *P = Allocator.allocate(Size);
  // The device heap is fixed-size; treat exhaustion and misalignment as
  // fatal in asserting builds. Non-asserting builds return P as-is.
  assert(P && "allocator out of memory");
  assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
         "misaligned address");
  return P;
}
|
||||
|
||||
/// Return \p P to the per-warp device heap. free(nullptr) is a no-op, per
/// the C standard.
void free(void *P) {
  if (!P)
    return;
  Allocator.deallocate(P);
}
|
||||
|
||||
void *realloc(void *ptr, size_t new_size) {
|
||||
void *NewPtr = malloc(new_size);
|
||||
if (!NewPtr)
|
||||
return nullptr;
|
||||
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
|
||||
__builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
|
||||
free(ptr);
|
||||
return NewPtr;
|
||||
}
|
||||
|
||||
void __kmpc_target_init_allocator() { Allocator.init(); }
|
||||
}
|
||||
|
||||
#pragma omp end declare target
|
||||
275
openmp/libomptarget/DeviceRTL/src/WarpAllocator.cpp
Normal file
275
openmp/libomptarget/DeviceRTL/src/WarpAllocator.cpp
Normal file
@@ -0,0 +1,275 @@
|
||||
//===------- WarpAllocator.cpp - Warp memory allocator ------- C++ -*-========//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma omp begin declare target device_type(nohost)
|
||||
|
||||
#include "Debug.h"
|
||||
#include "Mapping.h"
|
||||
#include "Memory.h"
|
||||
#include "State.h"
|
||||
#include "Synchronization.h"
|
||||
#include "Types.h"
|
||||
#include "Utils.h"
|
||||
|
||||
using namespace ompx;
|
||||
|
||||
char *CONSTANT(omptarget_device_heap_buffer)
|
||||
__attribute__((used, retain, weak, visibility("protected")));
|
||||
|
||||
size_t CONSTANT(omptarget_device_heap_size)
|
||||
__attribute__((used, retain, weak, visibility("protected")));
|
||||
|
||||
namespace {
|
||||
constexpr const size_t Alignment = 16;
|
||||
constexpr const size_t FirstThreadRatio = 40;
|
||||
constexpr const size_t SplitThreadhold = Alignment * 16;
|
||||
|
||||
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
|
||||
|
||||
template <int32_t WARP_SIZE> struct WarpAllocator;
|
||||
|
||||
/// Header prepended to every allocation made by WarpAllocator. Entries form
/// an implicit doubly-linked list over the heap buffer: the next entry
/// starts at `this + getSize()` and the previous one at `this - PrevSize`.
class WarpAllocatorEntry {
  template <int32_t WARP_SIZE> friend struct WarpAllocator;

  /// Total entry size in bytes (header plus user data), with the in-use flag
  /// encoded in the sign. If Size is less than 0, the entry is allocated (in
  /// use); 0 means uninitialized.
  int64_t Size = 0;
  /// PrevSize is also supposed to be greater than or equal to 0. When it is 0,
  /// it is the first entry of the buffer.
  int64_t PrevSize = 0;

public:
  /// True for the first entry of a block (no predecessor recorded).
  bool isFirst() const { return !PrevSize; }

  /// Total size regardless of the in-use flag encoded in the sign.
  size_t getSize() const { return abs(Size); }
  void setSize(size_t V) { Size = V; }

  /// Record the predecessor's size, or 0 when there is none.
  void setPrevSize(WarpAllocatorEntry *Prev) {
    PrevSize = Prev ? Prev->getSize() : 0;
  }

  /// Bytes available to the user: total size minus this header.
  size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }

  // Note: isUsed can not be !isUnused or other way around because when Size is
  // 0, it is uninitialized.
  bool isUsed() const { return Size < 0; }
  bool isUnused() const { return Size > 0; }

  /// Flip the sign of Size to mark the entry allocated.
  void setUsed() {
    assert(isUnused() && "the entry is in use");
    Size *= -1;
  }
  /// Flip the sign of Size back to mark the entry free.
  void setUnused() {
    assert(isUsed() && "the entry is not in use");
    Size *= -1;
  }

  /// First byte of the user region, immediately after this header.
  char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
  /// One past the last user byte, which is also the next entry's header.
  char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }

  WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
  WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }

  /// Recover the header from the pointer handed out to the user.
  static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }

  static WarpAllocatorEntry *fromPtr(void *Ptr) {
    return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
  }
};
|
||||
|
||||
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
|
||||
|
||||
template <int32_t WARP_SIZE> struct WarpAllocator {
|
||||
void init() {
|
||||
if (mapping::isSPMDMode() &&
|
||||
(mapping::getThreadIdInBlock() || mapping::getBlockId()))
|
||||
return;
|
||||
|
||||
size_t HeapSize = omptarget_device_heap_size;
|
||||
size_t FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
|
||||
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
|
||||
size_t OtherThreadHeapSize =
|
||||
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
|
||||
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
|
||||
|
||||
for (int I = 0; I < WARP_SIZE; ++I) {
|
||||
Entries[I] = nullptr;
|
||||
size_t PrivateOffset = OtherThreadHeapSize * I + FirstThreadHeapSize;
|
||||
Limits[I] = omptarget_device_heap_buffer + PrivateOffset;
|
||||
}
|
||||
}
|
||||
|
||||
void *allocate(size_t Size) {
|
||||
int32_t TIdInWarp = mapping::getThreadIdInWarp();
|
||||
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
|
||||
|
||||
// Error our early if the requested size is larger than the entire block.
|
||||
if (Size > getBlockSize(TIdInWarp))
|
||||
return nullptr;
|
||||
|
||||
WarpAllocatorEntry *E = nullptr;
|
||||
{
|
||||
mutex::LockGuard LG(Locks[TIdInWarp]);
|
||||
|
||||
auto *LastEntry = Entries[TIdInWarp];
|
||||
auto *NewWatermark =
|
||||
(LastEntry ? LastEntry->getEndPtr() : getBlockBegin(TIdInWarp)) +
|
||||
Size;
|
||||
if (NewWatermark >= Limits[TIdInWarp]) {
|
||||
E = findMemorySlow(Size, TIdInWarp);
|
||||
} else {
|
||||
E = LastEntry ? LastEntry->getNext()
|
||||
: WarpAllocatorEntry::fromPtr(getBlockBegin(TIdInWarp));
|
||||
E->setSize(Size);
|
||||
E->setPrevSize(LastEntry);
|
||||
Entries[TIdInWarp] = E;
|
||||
}
|
||||
}
|
||||
|
||||
if (!E)
|
||||
return nullptr;
|
||||
|
||||
assert(E->isUnused() && "entry is not set to use properly");
|
||||
|
||||
E->setUsed();
|
||||
|
||||
return E->getUserPtr();
|
||||
}
|
||||
|
||||
void deallocate(void *Ptr) {
|
||||
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
|
||||
|
||||
auto TIdInWarp = mapping::getThreadIdInWarp();
|
||||
|
||||
mutex::LockGuard LG(Locks[TIdInWarp]);
|
||||
E->setUnused();
|
||||
// Is last entry?
|
||||
if (E == Entries[TIdInWarp]) {
|
||||
do {
|
||||
E = E->getPrev();
|
||||
} while (!E->isFirst() && !E->isUsed());
|
||||
Entries[TIdInWarp] = E;
|
||||
}
|
||||
}
|
||||
|
||||
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
|
||||
if (!utils::isInRange(P, omptarget_device_heap_buffer,
|
||||
omptarget_device_heap_size))
|
||||
return {};
|
||||
|
||||
auto TIdInWarp = mapping::getThreadIdInWarp();
|
||||
for (int I = TIdInWarp; I < TIdInWarp + mapping::getWarpSize(); ++I) {
|
||||
int TId = I % mapping::getWarpSize();
|
||||
if (P < getBlockBegin(TId) || P >= getBlockEnd(TId))
|
||||
continue;
|
||||
|
||||
mutex::LockGuard LG(Locks[I]);
|
||||
WarpAllocatorEntry *E = Entries[I];
|
||||
if (!E)
|
||||
return {};
|
||||
if (E->getEndPtr() <= P)
|
||||
return {};
|
||||
do {
|
||||
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
|
||||
if (!E->isUsed())
|
||||
return {};
|
||||
return {E->getUserPtr(), E->getUserSize()};
|
||||
}
|
||||
E = E->getPrev();
|
||||
} while (!E->isFirst());
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
private:
|
||||
char *getBlockBegin(int32_t TIdInWarp) const {
|
||||
return TIdInWarp ? Limits[TIdInWarp - 1] : omptarget_device_heap_buffer;
|
||||
}
|
||||
char *getBlockEnd(int32_t TIdInWarp) const { return Limits[TIdInWarp]; }
|
||||
|
||||
size_t getBlockSize(int32_t TIdInWarp) const {
|
||||
return getBlockEnd(TIdInWarp) - getBlockBegin(TIdInWarp);
|
||||
}
|
||||
|
||||
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp) {
|
||||
char *Ptr = getBlockBegin(TIdInWarp);
|
||||
char *Limit = getBlockEnd(TIdInWarp);
|
||||
|
||||
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
|
||||
do {
|
||||
if (!E->isUsed() && E->getSize() >= Size)
|
||||
break;
|
||||
E = E->getNext();
|
||||
if (reinterpret_cast<char *>(E) + Size > Limit)
|
||||
return nullptr;
|
||||
} while (1);
|
||||
|
||||
size_t OldSize = E->getSize();
|
||||
if (OldSize - Size >= SplitThreadhold) {
|
||||
auto *OldNext = E->getNext();
|
||||
E->setSize(Size);
|
||||
auto *LeftOverE = E->getNext();
|
||||
LeftOverE->setPrevSize(E);
|
||||
LeftOverE->setSize(OldSize - Size);
|
||||
OldNext->setPrevSize(LeftOverE);
|
||||
}
|
||||
|
||||
return E;
|
||||
}
|
||||
|
||||
WarpAllocatorEntry *Entries[WARP_SIZE];
|
||||
char *Limits[WARP_SIZE];
|
||||
mutex::TicketLock Locks[WARP_SIZE];
|
||||
};
|
||||
|
||||
WarpAllocator<32> Allocator;
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace ompx {
|
||||
namespace memory {
|
||||
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
|
||||
return Allocator.getMemoryAllocationInfo(P);
|
||||
}
|
||||
} // namespace memory
|
||||
} // namespace ompx
|
||||
|
||||
extern "C" {
|
||||
|
||||
void *malloc(size_t Size) {
|
||||
if (!Size)
|
||||
return nullptr;
|
||||
void *P = Allocator.allocate(Size);
|
||||
assert(P && "allocator out of memory");
|
||||
assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
|
||||
"misaligned address");
|
||||
return P;
|
||||
}
|
||||
|
||||
/// Device-side free; freeing a null pointer is a no-op, matching C.
void free(void *P) {
  if (P == nullptr)
    return;
  Allocator.deallocate(P);
}
|
||||
|
||||
void *realloc(void *ptr, size_t new_size) {
|
||||
void *NewPtr = malloc(new_size);
|
||||
if (!NewPtr)
|
||||
return nullptr;
|
||||
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
|
||||
__builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
|
||||
free(ptr);
|
||||
return NewPtr;
|
||||
}
|
||||
|
||||
// Set up the device-side warp allocator; must run before any device malloc.
void __kmpc_target_init_allocator() { Allocator.init(); }
|
||||
}
|
||||
|
||||
#pragma omp end declare target
|
||||
@@ -111,6 +111,38 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
|
||||
stride = loopSize; // make sure we only do 1 chunk per warp
|
||||
}
|
||||
|
||||
static void ForStaticNoLoopNoTrunk(int &last, T &lb, T &ub, ST &stride,
|
||||
ST &chunk, T entityId,
|
||||
T numberOfEntities) {
|
||||
T loopSize = ub - lb + 1;
|
||||
chunk = loopSize / numberOfEntities;
|
||||
|
||||
// We have more entities than iterations.
|
||||
if (chunk == 0) {
|
||||
chunk = 1;
|
||||
lb = lb + entityId * chunk;
|
||||
T inputUb = ub;
|
||||
ub = lb + chunk - 1;
|
||||
last = lb <= inputUb && inputUb <= ub;
|
||||
stride = loopSize; // make sure we only do 1 chunk per warp
|
||||
return;
|
||||
}
|
||||
|
||||
T leftOver = loopSize - chunk * numberOfEntities;
|
||||
|
||||
if (entityId < leftOver) {
|
||||
chunk++;
|
||||
lb = lb + entityId * chunk;
|
||||
} else {
|
||||
lb = lb + entityId * chunk + leftOver;
|
||||
}
|
||||
|
||||
T inputUb = ub;
|
||||
ub = lb + chunk - 1; // Clang uses i <= ub
|
||||
last = lb <= inputUb && inputUb <= ub;
|
||||
stride = loopSize; // make sure we only do 1 chunk per warp
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Support for Static Init
|
||||
|
||||
@@ -184,6 +216,19 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
|
||||
omp_get_num_teams() * numberOfActiveOMPThreads);
|
||||
break;
|
||||
}
|
||||
case kmp_sched_distr_parallel_static_noloop_chunk: {
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk,
|
||||
numberOfActiveOMPThreads * omp_get_team_num() + gtid,
|
||||
omp_get_num_teams() * numberOfActiveOMPThreads);
|
||||
break;
|
||||
}
|
||||
case kmp_sched_distr_parallel_static_noloop_nochunk: {
|
||||
ForStaticNoLoopNoTrunk(lastiter, lb, ub, stride, chunk,
|
||||
numberOfActiveOMPThreads * omp_get_team_num() +
|
||||
gtid,
|
||||
omp_get_num_teams() * numberOfActiveOMPThreads);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
// ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
|
||||
|
||||
@@ -11,8 +11,62 @@ _ZN4ompx*
|
||||
|
||||
IsSPMDMode
|
||||
|
||||
malloc
|
||||
free
|
||||
memcmp
|
||||
printf
|
||||
__assert_fail
|
||||
__ctype_b_loc
|
||||
__ctype_tolower_loc
|
||||
__xpg_basename
|
||||
abs
|
||||
atof
|
||||
atoi
|
||||
calloc
|
||||
clock
|
||||
clock_gettime
|
||||
exit
|
||||
fclose
|
||||
feof
|
||||
fflush
|
||||
fgets
|
||||
fopen
|
||||
fprintf
|
||||
fputs
|
||||
fread
|
||||
free
|
||||
fscanf
|
||||
fseek
|
||||
ftell
|
||||
fwrite
|
||||
getc
|
||||
gettimeofday
|
||||
gmtime
|
||||
malloc
|
||||
memcmp
|
||||
memcpy
|
||||
memset
|
||||
pclose
|
||||
perror
|
||||
popen
|
||||
printf
|
||||
qsort
|
||||
rand
|
||||
realloc
|
||||
rewind
|
||||
sprintf
|
||||
srand
|
||||
sscanf
|
||||
stat
|
||||
strcat
|
||||
strchr
|
||||
strcmp
|
||||
strcpy
|
||||
strftime
|
||||
strlen
|
||||
strncmp
|
||||
strncpy
|
||||
strstr
|
||||
strtod
|
||||
strtok
|
||||
strtol
|
||||
strtoul
|
||||
time
|
||||
tolower
|
||||
strcasecmp
|
||||
|
||||
12
openmp/libomptarget/DirectGPUCompilation/clang-gpu
Executable file
12
openmp/libomptarget/DirectGPUCompilation/clang-gpu
Executable file
@@ -0,0 +1,12 @@
|
||||
#!/usr/bin/env python3

from driver.wrapper import run


def main():
    """Entry point of the C-mode GPU compiler wrapper."""
    run(is_cpp=False)


if __name__ == "__main__":
    main()
|
||||
12
openmp/libomptarget/DirectGPUCompilation/clang-gpu++
Executable file
12
openmp/libomptarget/DirectGPUCompilation/clang-gpu++
Executable file
@@ -0,0 +1,12 @@
|
||||
#!/usr/bin/env python3

from driver.wrapper import run


def main():
    """Entry point of the C++-mode GPU compiler wrapper."""
    run(is_cpp=True)


if __name__ == "__main__":
    main()
|
||||
40
openmp/libomptarget/DirectGPUCompilation/driver/Main.c
Normal file
40
openmp/libomptarget/DirectGPUCompilation/driver/Main.c
Normal file
@@ -0,0 +1,40 @@
|
||||
//===------- Main.c - Direct compilation program start point ------ C -----===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <string.h>
|
||||
extern int __user_main(int, char *[]);
|
||||
extern void __kmpc_target_init_allocator(void);
|
||||
|
||||
#ifdef SINGLE_THREAD_EXECUTION
|
||||
#define THREAD_LIMIT 1
|
||||
#else
|
||||
#define THREAD_LIMIT 1024
|
||||
#endif
|
||||
|
||||
// Host entry point for direct GPU compilation: stage argv on the device,
// initialize the device allocator, run the user's main on the device, and
// copy its exit code back.
int main(int argc, char *argv[]) {
#pragma omp target enter data map(to: argv[:argc])

  // Each argument string has to be mapped individually.
  for (int Idx = 0; Idx < argc; ++Idx) {
#pragma omp target enter data map(to: argv[Idx][:strlen(argv[Idx])])
  }

  int Ret = 0;
#pragma omp target enter data map(to: Ret)

  // Single-thread kernel: set up the device-side heap before user code runs.
#pragma omp target teams num_teams(1) thread_limit(1)
  { __kmpc_target_init_allocator(); }

#pragma omp target teams num_teams(1) thread_limit(THREAD_LIMIT)
  { Ret = __user_main(argc, argv); }

#pragma omp target exit data map(from: Ret)

  return Ret;
}
|
||||
@@ -0,0 +1,14 @@
|
||||
//===------- UserWrapper.h - User code wrapper --------------- C++ --------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
|
||||
#define OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
|
||||
|
||||
#pragma omp begin declare target device_type(nohost)
|
||||
|
||||
#endif
|
||||
0
openmp/libomptarget/DirectGPUCompilation/driver/__init__.py
Executable file
0
openmp/libomptarget/DirectGPUCompilation/driver/__init__.py
Executable file
141
openmp/libomptarget/DirectGPUCompilation/driver/wrapper.py
Executable file
141
openmp/libomptarget/DirectGPUCompilation/driver/wrapper.py
Executable file
@@ -0,0 +1,141 @@
|
||||
import argparse
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
|
||||
cwd = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
source_file_suffix = [".c", ".cpp", ".cu", ".hip", ".cc", ".cxx"]
|
||||
|
||||
|
||||
def print_version():
    """Run `clang --version`, forwarding a failing exit code."""
    proc = subprocess.run(["clang", "--version"])
    if proc.returncode != 0:
        sys.exit(proc.returncode)
||||
|
||||
|
||||
def compile_loader(loader_name, targets, verbose, dry_run):
    """Compile the loader object (Main.c) that bootstraps __user_main.

    loader_name -- output path of the object file
    targets     -- offload architectures; falls back to 'native' when empty
    verbose     -- echo the clang command line to stderr
    dry_run     -- only print the command line, do not execute it
    """
    cmd = [
        "clang",
        "-c",
        "-fopenmp",
        "-foffload-lto",
        "-fopenmp-offload-mandatory",
        "-o",
        loader_name,
    ]
    if targets:
        for arch in targets:
            cmd.append("--offload-arch={}".format(arch))
    else:
        cmd.append("--offload-arch=native")
    cmd.append(os.path.join(cwd, "Main.c"))
    # Echo the command exactly once; the original printed it twice when both
    # -v and -### were given.
    if verbose or dry_run:
        print(" ".join(cmd), file=sys.stderr)
    if dry_run:
        return
    cp = subprocess.run(cmd)
    if cp.returncode != 0:
        sys.exit(cp.returncode)
|
||||
|
||||
|
||||
def invoke_clang(is_cpp, args, targets, verbose, dry_run):
    """Invoke clang/clang++ with the direct-GPU-compilation flag set.

    is_cpp  -- use clang++ instead of clang
    args    -- extra command-line arguments forwarded to the compiler
    targets -- offload architectures; falls back to 'native' when empty
    verbose -- echo the command line to stderr
    dry_run -- only print the command line, do not execute it
    """
    cmd = [
        "clang++" if is_cpp else "clang",
        "-fopenmp",
        "-foffload-lto",
        "-fopenmp-offload-mandatory",
        "-fopenmp-globalize-to-global-space",
        "-include",
        os.path.join(cwd, "UserWrapper.h"),
        "--save-temps",
        "-rdynamic",
        "-mllvm",
        "-enable-host-rpc",
        "-mllvm",
        "-openmp-opt-disable-state-machine-rewrite",
        "-mllvm",
        "-enable-canonicalize-main-function",
        "-mllvm",
        "-canonical-main-function-name=__user_main",
    ]
    if targets:
        for arch in targets:
            cmd.append("--offload-arch={}".format(arch))
    else:
        cmd.append("--offload-arch=native")
    cmd += args
    # Echo once, to stderr -- the original wrote to stdout (inconsistent with
    # compile_loader) and printed twice when both -v and -### were given.
    if verbose or dry_run:
        print(" ".join(cmd), file=sys.stderr)
    if dry_run:
        return
    cp = subprocess.run(cmd)
    if cp.returncode != 0:
        sys.exit(cp.returncode)
||||
|
||||
|
||||
def run(is_cpp=False):
    """Parse the wrapper's own options and drive the GPU compilation.

    Unrecognized options are forwarded verbatim to clang.
    """
    parser = argparse.ArgumentParser(
        prog="clang-gpu", description="clang LLVM GPU compiler"
    )
    parser.add_argument(
        "-c",
        action="store_true",
        help="Only run preprocess, compile, and assemble steps",
    )
    parser.add_argument(
        "-v", action="store_true", help="Show commands to run and use verbose output"
    )
    parser.add_argument(
        "--version", action="store_true", help="Print version information"
    )
    parser.add_argument(
        "-###",
        action="store_true",
        help="Print (but do not run) the commands to run for this compilation",
    )
    parser.add_argument(
        "--offload-arch",
        type=str,
        help=(
            "Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g."
            " sm_35). If 'native' is used the compiler will detect locally installed"
            " architectures. For HIP offloading, the device architecture can be"
            " followed by target ID features delimited by a colon (e.g."
            " gfx908:xnack+:sramecc-). May be specified more than once."
        ),
        nargs="*",
    )

    args, fwd_args = parser.parse_known_args()

    if args.version:
        print_version()
        return

    if args.c:
        fwd_args.append("-c")
    if args.v:
        fwd_args.append("-v")

    dry_run = vars(args)["###"]
    temp_files = []

    # A full link additionally needs the loader object calling __user_main.
    if not args.c:
        tmp = tempfile.NamedTemporaryFile()
        loader_name = "{}.o".format(tmp.name)
        compile_loader(loader_name, args.offload_arch, args.v, dry_run)
        fwd_args.append(loader_name)
        temp_files.append(loader_name)

    invoke_clang(is_cpp, fwd_args, args.offload_arch, args.v, dry_run)

    # Remove intermediate objects we created ourselves.
    for f in temp_files:
        if os.path.isfile(f):
            os.unlink(f)
|
||||
67
openmp/libomptarget/include/HostRPC.h
Normal file
67
openmp/libomptarget/include/HostRPC.h
Normal file
@@ -0,0 +1,67 @@
|
||||
//===------- HostRPC.h - Host RPC ---------------------------- C++ --------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
|
||||
#define OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
|
||||
|
||||
#ifdef OMPTARGET_DEVICE_RUNTIME
|
||||
#include "Types.h"
|
||||
#else
|
||||
#include <cstdint>
|
||||
#endif
|
||||
|
||||
namespace hostrpc {

/// RPC call identifier. Note: negative value only. Non-negative values are for
/// compiler generated functions.
enum CallId {
  CALLID___kmpc_launch_parallel_51_kernel = -1,
  // Sentinel: the smallest int32 value, never a valid call id.
  CALLID_invalid = -2147483648,
};

/// Execution status.
enum ExecutionStatus {
  EXEC_STAT_CREATED = 0,
  EXEC_STAT_DONE = 1,
};

// Classification of an RPC argument's payload (stored in Argument::ArgType).
enum Type {
  ARG_LITERAL = 0,
  ARG_POINTER = 1,
};

// One RPC argument: the raw value plus its Type classification.
struct Argument {
  intptr_t Value;
  int64_t ArgType;
};

// Shared descriptor of a single RPC invocation.
struct Descriptor {
  // The following member will be used by both host and device.
  int32_t Id;
  struct Argument *Args;
  int64_t NumArgs;
  volatile int64_t Status;
  volatile int64_t ReturnValue;

  // The following members will only be used by device.
  void **ArgInfo;
  void *ArgMap;
};

/// A wrapper of HostRPCDescriptor that will only be used between plugins and
/// libomptarget. It contains the three stdio global variables.
struct DescriptorWrapper {
  Descriptor D;
  void *StdIn = nullptr;
  void *StdOut = nullptr;
  void *StdErr = nullptr;
};

} // namespace hostrpc
|
||||
|
||||
#endif
|
||||
@@ -424,6 +424,10 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
|
||||
void *VAddr, bool IsRecord, bool SaveOutput,
|
||||
uint64_t &ReqPtrArgOffset);
|
||||
|
||||
// Host RPC support
|
||||
int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNo);
|
||||
void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -69,7 +69,8 @@ VERS1.0 {
|
||||
__tgt_interop_init;
|
||||
__tgt_interop_use;
|
||||
__tgt_interop_destroy;
|
||||
ompt_libomptarget_connect;
|
||||
__kmpc_host_rpc_get_arg;
|
||||
__kmpc_host_rpc_set_ret_val;
|
||||
local:
|
||||
*;
|
||||
};
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
#include "PluginManager.h"
|
||||
#include "private.h"
|
||||
|
||||
#include "HostRPC.h"
|
||||
|
||||
#include "Shared/EnvironmentVar.h"
|
||||
#include "Shared/Profile.h"
|
||||
|
||||
@@ -504,3 +506,33 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
|
||||
delete AsyncInfo;
|
||||
*AsyncHandle = nullptr;
|
||||
}
|
||||
|
||||
// Host RPC support functions.
|
||||
EXTERN int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNum) {
|
||||
auto *W = reinterpret_cast<hostrpc::DescriptorWrapper *>(Wrapper);
|
||||
auto &SD = W->D;
|
||||
|
||||
assert(ArgNum < SD.NumArgs && "out-of-range argument");
|
||||
|
||||
int64_t ArgVal = SD.Args[ArgNum].Value;
|
||||
void *ArgPtr = reinterpret_cast<void *>(ArgVal);
|
||||
|
||||
DP("[host-rpc] get argno=%d arg=%lx...\n", ArgNum, ArgVal);
|
||||
|
||||
if (W->StdIn && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
|
||||
ArgPtr == W->StdIn)
|
||||
return (int64_t)stdin;
|
||||
if (W->StdOut && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
|
||||
ArgPtr == W->StdOut)
|
||||
return (int64_t)stdout;
|
||||
if (W->StdErr && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
|
||||
ArgPtr == W->StdErr)
|
||||
return (int64_t)stderr;
|
||||
|
||||
return ArgVal;
|
||||
}
|
||||
|
||||
/// Store the host-side return value of the RPC call described by \p Wrapper.
EXTERN void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal) {
  auto *W = reinterpret_cast<hostrpc::DescriptorWrapper *>(Wrapper);
  W->D.ReturnValue = RetVal;
}
|
||||
|
||||
Reference in New Issue
Block a user