Compare commits
4 Commits
mangling
...
llvm-test-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b19051093e | ||
|
|
de7beadb39 | ||
|
|
25d5970544 | ||
|
|
20f98716b0 |
65
auto-host-rpc/device.ll
Normal file
65
auto-host-rpc/device.ll
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
; ModuleID = 'test-openmp-nvptx64-nvidia-cuda-sm_75.bc'
|
||||||
|
source_filename = "/home/shiltian/Documents/vscode/llvm-project/auto-host-rpc/test.c"
|
||||||
|
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||||
|
target triple = "nvptx64-nvidia-cuda"
|
||||||
|
|
||||||
|
%struct.ddd = type { i32, i32, float }
|
||||||
|
|
||||||
|
@__omp_rtl_debug_kind = weak_odr hidden local_unnamed_addr constant i32 0
|
||||||
|
@__omp_rtl_assume_teams_oversubscription = weak_odr hidden local_unnamed_addr constant i32 0
|
||||||
|
@__omp_rtl_assume_threads_oversubscription = weak_odr hidden local_unnamed_addr constant i32 0
|
||||||
|
@__omp_rtl_assume_no_thread_state = weak_odr hidden local_unnamed_addr constant i32 0
|
||||||
|
@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden local_unnamed_addr constant i32 0
|
||||||
|
@.str = private unnamed_addr constant [9 x i8] c"main.cpp\00", align 1
|
||||||
|
@.str1 = private unnamed_addr constant [2 x i8] c"r\00", align 1
|
||||||
|
@.str2 = private unnamed_addr constant [3 x i8] c"%d\00", align 1
|
||||||
|
@.str3 = private unnamed_addr constant [7 x i8] c"%f%d%s\00", align 1
|
||||||
|
@.str4 = private unnamed_addr constant [6 x i8] c"hello\00", align 1
|
||||||
|
|
||||||
|
; Function Attrs: convergent nounwind
|
||||||
|
define hidden void @foo() local_unnamed_addr #0 {
|
||||||
|
entry:
|
||||||
|
%d = tail call align 16 dereferenceable_or_null(12) ptr @__kmpc_alloc_shared(i64 12) #4
|
||||||
|
%call = tail call noalias ptr @fopen(ptr noundef nonnull @.str, ptr noundef nonnull @.str1) #5
|
||||||
|
%call1 = tail call i32 (ptr, ptr, ...) @fprintf(ptr noundef %call, ptr noundef nonnull @.str2, i32 noundef 6) #5
|
||||||
|
%call2 = tail call i32 (ptr, ptr, ...) @fprintf(ptr noundef %call, ptr noundef nonnull @.str3, double noundef 6.000000e+00, i32 noundef 1, ptr noundef nonnull @.str4) #5
|
||||||
|
%a = getelementptr inbounds %struct.ddd, ptr %d, i64 0, i32 1
|
||||||
|
%call3 = tail call i32 (ptr, ptr, ...) @fscanf(ptr noundef %call, ptr noundef nonnull @.str2, ptr noundef nonnull %a) #5
|
||||||
|
tail call void @__kmpc_free_shared(ptr %d, i64 12)
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nofree nosync nounwind allocsize(0)
|
||||||
|
declare ptr @__kmpc_alloc_shared(i64) local_unnamed_addr #1
|
||||||
|
|
||||||
|
; Function Attrs: convergent
|
||||||
|
declare noalias ptr @fopen(ptr noundef, ptr noundef) local_unnamed_addr #2
|
||||||
|
|
||||||
|
; Function Attrs: convergent
|
||||||
|
declare i32 @fprintf(ptr noundef, ptr noundef, ...) local_unnamed_addr #2
|
||||||
|
|
||||||
|
; Function Attrs: convergent
|
||||||
|
declare i32 @fscanf(ptr noundef, ptr noundef, ...) local_unnamed_addr #2
|
||||||
|
|
||||||
|
; Function Attrs: nosync nounwind
|
||||||
|
declare void @__kmpc_free_shared(ptr allocptr nocapture, i64) local_unnamed_addr #3
|
||||||
|
|
||||||
|
attributes #0 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx77,+sm_75" }
|
||||||
|
attributes #1 = { nofree nosync nounwind allocsize(0) }
|
||||||
|
attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx77,+sm_75" }
|
||||||
|
attributes #3 = { nosync nounwind }
|
||||||
|
attributes #4 = { nounwind }
|
||||||
|
attributes #5 = { convergent nounwind }
|
||||||
|
|
||||||
|
!llvm.module.flags = !{!0, !1, !2, !3, !4, !5}
|
||||||
|
!llvm.ident = !{!6, !7}
|
||||||
|
!nvvm.annotations = !{}
|
||||||
|
|
||||||
|
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 7]}
|
||||||
|
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||||
|
!2 = !{i32 7, !"openmp", i32 50}
|
||||||
|
!3 = !{i32 7, !"openmp-device", i32 50}
|
||||||
|
!4 = !{i32 8, !"PIC Level", i32 2}
|
||||||
|
!5 = !{i32 7, !"frame-pointer", i32 2}
|
||||||
|
!6 = !{!"clang version 16.0.0"}
|
||||||
|
!7 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
||||||
560
auto-host-rpc/main.cpp
Normal file
560
auto-host-rpc/main.cpp
Normal file
@@ -0,0 +1,560 @@
|
|||||||
|
#include "llvm/ADT/EnumeratedArray.h"
|
||||||
|
#include "llvm/ADT/Triple.h"
|
||||||
|
#include "llvm/Analysis/ValueTracking.h"
|
||||||
|
#include "llvm/CodeGen/CommandFlags.h"
|
||||||
|
#include "llvm/IR/BasicBlock.h"
|
||||||
|
#include "llvm/IR/IRBuilder.h"
|
||||||
|
#include "llvm/IR/Instructions.h"
|
||||||
|
#include "llvm/IR/LLVMContext.h"
|
||||||
|
#include "llvm/IR/Module.h"
|
||||||
|
#include "llvm/IRReader/IRReader.h"
|
||||||
|
#include "llvm/Support/SourceMgr.h"
|
||||||
|
#include "llvm/Support/TargetSelect.h"
|
||||||
|
#include "llvm/Target/TargetOptions.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
static LLVMContext Context;
|
||||||
|
static codegen::RegisterCodeGenFlags RCGF;
|
||||||
|
static constexpr const char *InternalPrefix[] = {"__kmp", "llvm.", "nvm.",
|
||||||
|
"omp_"};
|
||||||
|
|
||||||
|
std::string typeToString(Type *T) {
|
||||||
|
if (T->is16bitFPTy())
|
||||||
|
return "f16";
|
||||||
|
if (T->isFloatTy())
|
||||||
|
return "f32";
|
||||||
|
if (T->isDoubleTy())
|
||||||
|
return "f64";
|
||||||
|
if (T->isPointerTy())
|
||||||
|
return "ptr";
|
||||||
|
if (T->isStructTy())
|
||||||
|
return std::string(T->getStructName());
|
||||||
|
if (T->isIntegerTy())
|
||||||
|
return "i" + std::to_string(T->getIntegerBitWidth());
|
||||||
|
|
||||||
|
llvm_unreachable("unknown type");
|
||||||
|
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
namespace llvm {
|
||||||
|
|
||||||
|
enum class HostRPCRuntimeFunction {
|
||||||
|
#define __OMPRTL_HOST_RPC(_ENUM) OMPRTL_##_ENUM
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc),
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg),
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg),
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait),
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_ret_val),
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val),
|
||||||
|
__OMPRTL_HOST_RPC(__last),
|
||||||
|
#undef __OMPRTL_HOST_RPC
|
||||||
|
};
|
||||||
|
|
||||||
|
#define __OMPRTL_HOST_RPC(_ENUM) \
|
||||||
|
auto OMPRTL_##_ENUM = HostRPCRuntimeFunction::OMPRTL_##_ENUM;
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc)
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg)
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg)
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait)
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_ret_val)
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val)
|
||||||
|
#undef __OMPRTL_HOST_RPC
|
||||||
|
|
||||||
|
enum OMPHostRPCArgType {
|
||||||
|
// No need to copy.
|
||||||
|
OMP_HOST_RPC_ARG_SCALAR = 0,
|
||||||
|
OMP_HOST_RPC_ARG_PTR = 1,
|
||||||
|
// Copy to host.
|
||||||
|
OMP_HOST_RPC_ARG_PTR_COPY_TO = 2,
|
||||||
|
// Copy to device
|
||||||
|
OMP_HOST_RPC_ARG_PTR_COPY_FROM = 3,
|
||||||
|
// TODO: Do we have a tofrom pointer?
|
||||||
|
OMP_HOST_RPC_ARG_PTR_COPY_TOFROM = 4,
|
||||||
|
};
|
||||||
|
|
||||||
|
// struct HostRPCArgInfo {
|
||||||
|
// // OMPHostRPCArgType
|
||||||
|
// int64_t Type;
|
||||||
|
// int64_t Size;
|
||||||
|
// };
|
||||||
|
|
||||||
|
class AutoHostRPC {
|
||||||
|
LLVMContext &Context;
|
||||||
|
// Device module
|
||||||
|
Module &DM;
|
||||||
|
// Host module
|
||||||
|
Module &HM;
|
||||||
|
// Types
|
||||||
|
Type *Int8PtrTy;
|
||||||
|
Type *VoidTy;
|
||||||
|
Type *Int32Ty;
|
||||||
|
Type *Int64Ty;
|
||||||
|
StructType *ArgInfoTy;
|
||||||
|
|
||||||
|
struct CallSiteInfo {
|
||||||
|
CallInst *CI = nullptr;
|
||||||
|
SmallVector<Type *> Params;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct HostRPCArgInfo {
|
||||||
|
// OMPHostRPCArgType
|
||||||
|
Constant *Type;
|
||||||
|
Value *Size;
|
||||||
|
};
|
||||||
|
|
||||||
|
//
|
||||||
|
SmallVector<Function *> HostEntryTable;
|
||||||
|
|
||||||
|
EnumeratedArray<Function *, HostRPCRuntimeFunction,
|
||||||
|
HostRPCRuntimeFunction::OMPRTL___last>
|
||||||
|
RFIs;
|
||||||
|
|
||||||
|
static std::string getWrapperFunctionName(Function *F, CallSiteInfo &CSI) {
|
||||||
|
std::string Name = "__kmpc_host_rpc_wrapper_" + std::string(F->getName());
|
||||||
|
if (!F->isVarArg())
|
||||||
|
return Name;
|
||||||
|
|
||||||
|
for (unsigned I = F->getFunctionType()->getNumParams();
|
||||||
|
I < CSI.Params.size(); ++I) {
|
||||||
|
Name.push_back('_');
|
||||||
|
Name.append(typeToString(CSI.Params[I]));
|
||||||
|
}
|
||||||
|
|
||||||
|
return Name;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool isInternalFunction(Function &F) {
|
||||||
|
auto Name = F.getName();
|
||||||
|
|
||||||
|
for (auto P : InternalPrefix)
|
||||||
|
if (Name.startswith(P))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
Value *convertToInt64Ty(IRBuilder<> &Builder, Value *V);
|
||||||
|
|
||||||
|
Value *convertFromInt64TyTo(IRBuilder<> &Builder, Value *V, Type *TargetTy);
|
||||||
|
|
||||||
|
// int device_wrapper(call_no, arg_info, ...) {
|
||||||
|
// void *desc = __kmpc_host_rpc_get_desc(call_no, num_args, arg_info);
|
||||||
|
// __kmpc_host_rpc_add_arg(desc, arg1, sizeof(arg1));
|
||||||
|
// __kmpc_host_rpc_add_arg(desc, arg2, sizeof(arg2));
|
||||||
|
// ...
|
||||||
|
// __kmpc_host_rpc_send_and_wait(desc);
|
||||||
|
// int r = (int)__kmpc_host_rpc_get_ret_val(desc);
|
||||||
|
// return r;
|
||||||
|
// }
|
||||||
|
Function *getDeviceWrapperFunction(StringRef WrapperName, Function *F,
|
||||||
|
CallSiteInfo &CSI);
|
||||||
|
|
||||||
|
// void host_wrapper(desc) {
|
||||||
|
// int arg1 = (int)__kmpc_host_rpc_get_arg(desc, 0);
|
||||||
|
// float arg2 = (float)__kmpc_host_rpc_get_arg(desc, 1);
|
||||||
|
// char *arg3 = (char *)__kmpc_host_rpc_get_arg(desc, 2);
|
||||||
|
// ...
|
||||||
|
// int r = actual_call(arg1, arg2, arg3, ...);
|
||||||
|
// __kmpc_host_rpc_set_ret_val(ptr(desc, (int64_t)r);
|
||||||
|
// }
|
||||||
|
Function *getHostWrapperFunction(StringRef WrapperName, Function *F,
|
||||||
|
CallSiteInfo &CSI);
|
||||||
|
|
||||||
|
bool rewriteWithHostRPC(Function *F);
|
||||||
|
|
||||||
|
public:
|
||||||
|
AutoHostRPC(Module &DeviceModule, Module &HostModule)
|
||||||
|
: Context(DeviceModule.getContext()), DM(DeviceModule), HM(HostModule) {
|
||||||
|
assert(&DeviceModule.getContext() == &HostModule.getContext() &&
|
||||||
|
"device and host modules have different context");
|
||||||
|
|
||||||
|
#define __OMP_TYPE(TYPE) TYPE = Type::get##TYPE(Context)
|
||||||
|
__OMP_TYPE(Int8PtrTy);
|
||||||
|
__OMP_TYPE(VoidTy);
|
||||||
|
__OMP_TYPE(Int32Ty);
|
||||||
|
__OMP_TYPE(Int64Ty);
|
||||||
|
#undef __OMP_TYPE
|
||||||
|
|
||||||
|
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...) \
|
||||||
|
{ \
|
||||||
|
SmallVector<Type *> Params{__VA_ARGS__}; \
|
||||||
|
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
|
||||||
|
RFIs[OMPRTL_##_ENUM] = Function::Create( \
|
||||||
|
FT, GlobalValue::LinkageTypes::InternalLinkage, #_ENUM, MOD); \
|
||||||
|
}
|
||||||
|
__OMP_RTL(__kmpc_host_rpc_get_desc, DM, false, Int8PtrTy, Int32Ty, Int32Ty,
|
||||||
|
Int8PtrTy)
|
||||||
|
__OMP_RTL(__kmpc_host_rpc_add_arg, DM, false, VoidTy, Int8PtrTy, Int64Ty,
|
||||||
|
Int64Ty)
|
||||||
|
__OMP_RTL(__kmpc_host_rpc_send_and_wait, DM, false, VoidTy, Int8PtrTy)
|
||||||
|
__OMP_RTL(__kmpc_host_rpc_get_ret_val, DM, false, Int64Ty, Int8PtrTy)
|
||||||
|
__OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
|
||||||
|
__OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
|
||||||
|
Int64Ty)
|
||||||
|
#undef __OMP_RTL
|
||||||
|
|
||||||
|
ArgInfoTy = StructType::create({Int64Ty, Int64Ty}, "struct.arg_info_t");
|
||||||
|
}
|
||||||
|
|
||||||
|
bool run();
|
||||||
|
};
|
||||||
|
|
||||||
|
Value *AutoHostRPC::convertToInt64Ty(IRBuilder<> &Builder, Value *V) {
|
||||||
|
Type *T = V->getType();
|
||||||
|
|
||||||
|
if (T == Int64Ty)
|
||||||
|
return V;
|
||||||
|
|
||||||
|
if (T->isPointerTy())
|
||||||
|
return Builder.CreatePtrToInt(V, Int64Ty);
|
||||||
|
|
||||||
|
if (T->isIntegerTy())
|
||||||
|
return Builder.CreateIntCast(V, Int64Ty, false);
|
||||||
|
|
||||||
|
if (T->isFloatingPointTy()) {
|
||||||
|
if (T->isFloatTy())
|
||||||
|
V = Builder.CreateFPToSI(V, Int32Ty);
|
||||||
|
return Builder.CreateFPToSI(V, Int64Ty);
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm_unreachable("unknown cast to int64_t");
|
||||||
|
}
|
||||||
|
|
||||||
|
Value *AutoHostRPC::convertFromInt64TyTo(IRBuilder<> &Builder, Value *V,
|
||||||
|
Type *T) {
|
||||||
|
if (T == Int64Ty)
|
||||||
|
return V;
|
||||||
|
|
||||||
|
if (T->isPointerTy())
|
||||||
|
return Builder.CreateIntToPtr(V, T);
|
||||||
|
|
||||||
|
if (T->isIntegerTy())
|
||||||
|
return Builder.CreateIntCast(V, T, /* isSigned */ true);
|
||||||
|
|
||||||
|
if (T->isFloatingPointTy()) {
|
||||||
|
if (T->isFloatTy())
|
||||||
|
V = Builder.CreateIntCast(V, Int32Ty, /* isSigned */ true);
|
||||||
|
V = Builder.CreateSIToFP(V, T);
|
||||||
|
return V;
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm_unreachable("unknown cast from int64_t");
|
||||||
|
}
|
||||||
|
|
||||||
|
bool AutoHostRPC::run() {
|
||||||
|
bool Changed = false;
|
||||||
|
|
||||||
|
SmallVector<Function *> WorkList;
|
||||||
|
|
||||||
|
for (Function &F : DM) {
|
||||||
|
// If the function is already defined, it definitely does not require RPC.
|
||||||
|
if (!F.isDeclaration())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// If it is an internal function, skip it as well.
|
||||||
|
if (isInternalFunction(F))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// If there is no use of the function, skip it.
|
||||||
|
if (F.use_empty())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
WorkList.push_back(&F);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (WorkList.empty())
|
||||||
|
return Changed;
|
||||||
|
|
||||||
|
for (Function *F : WorkList)
|
||||||
|
Changed |= rewriteWithHostRPC(F);
|
||||||
|
|
||||||
|
return Changed;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool AutoHostRPC::rewriteWithHostRPC(Function *F) {
|
||||||
|
bool Changed = false;
|
||||||
|
|
||||||
|
SmallVector<CallInst *> WorkList;
|
||||||
|
|
||||||
|
for (User *U : F->users()) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(U);
|
||||||
|
if (!CI)
|
||||||
|
continue;
|
||||||
|
WorkList.push_back(CI);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (WorkList.empty())
|
||||||
|
return Changed;
|
||||||
|
|
||||||
|
for (CallInst *CI : WorkList) {
|
||||||
|
CallSiteInfo CSI;
|
||||||
|
CSI.CI = CI;
|
||||||
|
unsigned NumArgs = CI->arg_size();
|
||||||
|
for (unsigned I = 0; I < NumArgs; ++I)
|
||||||
|
CSI.Params.push_back(CI->getArgOperand(I)->getType());
|
||||||
|
|
||||||
|
std::string WrapperName = getWrapperFunctionName(F, CSI);
|
||||||
|
Function *DeviceWrapperFn = getDeviceWrapperFunction(WrapperName, F, CSI);
|
||||||
|
Function *HostWrapperFn = getHostWrapperFunction(WrapperName, F, CSI);
|
||||||
|
|
||||||
|
int32_t WrapperNumber = -1;
|
||||||
|
for (unsigned I = 0; I < HostEntryTable.size(); ++I) {
|
||||||
|
if (HostEntryTable[I] == HostWrapperFn) {
|
||||||
|
WrapperNumber = I;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (WrapperNumber == -1) {
|
||||||
|
WrapperNumber = HostEntryTable.size();
|
||||||
|
HostEntryTable.push_back(HostWrapperFn);
|
||||||
|
}
|
||||||
|
|
||||||
|
DataLayout DL = DM.getDataLayout();
|
||||||
|
IRBuilder<> Builder(CI);
|
||||||
|
|
||||||
|
auto CheckIfIdentifierPtr = [](const Value *V) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(V);
|
||||||
|
if (!CI)
|
||||||
|
return false;
|
||||||
|
Function *Callee = CI->getCalledFunction();
|
||||||
|
return Callee->getName().startswith("__kmpc_host_rpc_wrapper_");
|
||||||
|
};
|
||||||
|
|
||||||
|
auto CheckIfAlloca = [](const Value *V) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(V);
|
||||||
|
if (!CI)
|
||||||
|
return false;
|
||||||
|
Function *Callee = CI->getCalledFunction();
|
||||||
|
return Callee->getName() == "__kmpc_alloc_shared" ||
|
||||||
|
Callee->getName() == "malloc";
|
||||||
|
};
|
||||||
|
|
||||||
|
SmallVector<HostRPCArgInfo> ArgInfos;
|
||||||
|
bool IsConstantArgInfo = true;
|
||||||
|
|
||||||
|
for (Value *Op : CI->args()) {
|
||||||
|
if (!Op->getType()->isPointerTy()) {
|
||||||
|
HostRPCArgInfo AI{
|
||||||
|
ConstantInt::get(Int64Ty,
|
||||||
|
OMPHostRPCArgType::OMP_HOST_RPC_ARG_SCALAR),
|
||||||
|
ConstantInt::getNullValue(Int64Ty)};
|
||||||
|
ArgInfos.push_back(std::move(AI));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Value *SizeVal = nullptr;
|
||||||
|
OMPHostRPCArgType ArgType = OMP_HOST_RPC_ARG_PTR_COPY_TOFROM;
|
||||||
|
|
||||||
|
SmallVector<const Value *> Objects;
|
||||||
|
getUnderlyingObjects(Op, Objects);
|
||||||
|
|
||||||
|
// TODO: Handle phi node
|
||||||
|
if (Objects.size() != 1)
|
||||||
|
llvm_unreachable("we can't handle phi node yet");
|
||||||
|
|
||||||
|
auto *Obj = Objects.front();
|
||||||
|
if (CheckIfIdentifierPtr(Obj)) {
|
||||||
|
ArgType = OMP_HOST_RPC_ARG_SCALAR;
|
||||||
|
SizeVal = ConstantInt::getNullValue(Int64Ty);
|
||||||
|
} else if (CheckIfAlloca(Obj)) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(Obj);
|
||||||
|
SizeVal = CI->getOperand(0);
|
||||||
|
if (!isa<Constant>(SizeVal))
|
||||||
|
IsConstantArgInfo = false;
|
||||||
|
} else {
|
||||||
|
if (auto *GV = dyn_cast<GlobalVariable>(Obj)) {
|
||||||
|
SizeVal = ConstantInt::get(Int64Ty,
|
||||||
|
DL.getTypeStoreSize(GV->getValueType()));
|
||||||
|
if (GV->isConstant())
|
||||||
|
ArgType = OMP_HOST_RPC_ARG_PTR_COPY_TO;
|
||||||
|
if (GV->isConstant() && GV->hasInitializer()) {
|
||||||
|
// TODO: If the global variable is contant, we can do some
|
||||||
|
// optimization.
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// TODO: fix that when it occurs
|
||||||
|
llvm_unreachable("cannot handle unknown type");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
HostRPCArgInfo AI{ConstantInt::get(Int64Ty, ArgType), SizeVal};
|
||||||
|
ArgInfos.push_back(std::move(AI));
|
||||||
|
}
|
||||||
|
|
||||||
|
Value *ArgInfo = nullptr;
|
||||||
|
|
||||||
|
if (!IsConstantArgInfo) {
|
||||||
|
ArgInfo = Builder.CreateAlloca(
|
||||||
|
ArgInfoTy, ConstantInt::get(Int64Ty, NumArgs), "arg_info");
|
||||||
|
for (unsigned I = 0; I < NumArgs; ++I) {
|
||||||
|
Value *AII = GetElementPtrInst::Create(
|
||||||
|
ArrayType::get(ArgInfoTy, NumArgs), ArgInfo,
|
||||||
|
{ConstantInt::getNullValue(Int64Ty), ConstantInt::get(Int64Ty, I)});
|
||||||
|
Value *AIIType = GetElementPtrInst::Create(
|
||||||
|
ArgInfoTy, AII, {ConstantInt::get(Int64Ty, 0)});
|
||||||
|
Value *AIISize = GetElementPtrInst::Create(
|
||||||
|
ArgInfoTy, AII, {ConstantInt::get(Int64Ty, 1)});
|
||||||
|
Builder.Insert(AII);
|
||||||
|
Builder.Insert(AIIType);
|
||||||
|
Builder.Insert(AIISize);
|
||||||
|
Builder.CreateStore(ArgInfos[I].Type, AIIType);
|
||||||
|
Builder.CreateStore(ArgInfos[I].Size, AIISize);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
SmallVector<Constant *> ArgInfoInitVar;
|
||||||
|
for (auto &AI : ArgInfos) {
|
||||||
|
auto *CS =
|
||||||
|
ConstantStruct::get(ArgInfoTy, {AI.Type, cast<Constant>(AI.Size)});
|
||||||
|
ArgInfoInitVar.push_back(CS);
|
||||||
|
}
|
||||||
|
Constant *ArgInfoInit = ConstantArray::get(
|
||||||
|
ArrayType::get(ArgInfoTy, NumArgs), ArgInfoInitVar);
|
||||||
|
ArgInfo = new GlobalVariable(
|
||||||
|
DM, ArrayType::get(ArgInfoTy, NumArgs), /* isConstant */ true,
|
||||||
|
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info");
|
||||||
|
}
|
||||||
|
|
||||||
|
SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
|
||||||
|
ArgInfo};
|
||||||
|
for (Value *Op : CI->args())
|
||||||
|
Args.push_back(Op);
|
||||||
|
|
||||||
|
CallInst *NewCall = Builder.CreateCall(DeviceWrapperFn, Args);
|
||||||
|
|
||||||
|
CI->replaceAllUsesWith(NewCall);
|
||||||
|
CI->eraseFromParent();
|
||||||
|
}
|
||||||
|
|
||||||
|
F->eraseFromParent();
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
Function *AutoHostRPC::getDeviceWrapperFunction(StringRef WrapperName,
|
||||||
|
Function *F,
|
||||||
|
CallSiteInfo &CSI) {
|
||||||
|
Function *WrapperFn = DM.getFunction(WrapperName);
|
||||||
|
if (WrapperFn)
|
||||||
|
return WrapperFn;
|
||||||
|
|
||||||
|
// return_type device_wrapper(int32_t call_no, void *arg_info, ...)
|
||||||
|
SmallVector<Type *> Params{Int32Ty, Int8PtrTy};
|
||||||
|
Params.append(CSI.Params);
|
||||||
|
|
||||||
|
Type *RetTy = F->getReturnType();
|
||||||
|
|
||||||
|
FunctionType *FT = FunctionType::get(RetTy, Params, /*isVarArg*/ false);
|
||||||
|
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::WeakODRLinkage,
|
||||||
|
WrapperName, DM);
|
||||||
|
|
||||||
|
// Emit the body of the device wrapper
|
||||||
|
IRBuilder<> Builder(Context);
|
||||||
|
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
|
||||||
|
Builder.SetInsertPoint(EntryBB);
|
||||||
|
|
||||||
|
// skip call_no and arg_info.
|
||||||
|
constexpr const unsigned NumArgSkipped = 2;
|
||||||
|
|
||||||
|
Value *Desc = nullptr;
|
||||||
|
{
|
||||||
|
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
|
||||||
|
Desc = Builder.CreateCall(
|
||||||
|
Fn,
|
||||||
|
{WrapperFn->getArg(0),
|
||||||
|
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
|
||||||
|
WrapperFn->getArg(1)},
|
||||||
|
"desc");
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
|
||||||
|
for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
|
||||||
|
Value *V = convertToInt64Ty(Builder, WrapperFn->getArg(I));
|
||||||
|
Builder.CreateCall(Fn, {Desc, V, ConstantInt::getNullValue(Int64Ty)});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});
|
||||||
|
|
||||||
|
if (RetTy->isVoidTy()) {
|
||||||
|
Builder.CreateRetVoid();
|
||||||
|
return WrapperFn;
|
||||||
|
}
|
||||||
|
|
||||||
|
Value *RetVal =
|
||||||
|
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_ret_val], {Desc});
|
||||||
|
if (RetTy != RetVal->getType())
|
||||||
|
RetVal = convertFromInt64TyTo(Builder, RetVal, RetTy);
|
||||||
|
|
||||||
|
Builder.CreateRet(RetVal);
|
||||||
|
|
||||||
|
return WrapperFn;
|
||||||
|
}
|
||||||
|
|
||||||
|
Function *AutoHostRPC::getHostWrapperFunction(StringRef WrapperName,
|
||||||
|
Function *F, CallSiteInfo &CSI) {
|
||||||
|
Function *WrapperFn = HM.getFunction(WrapperName);
|
||||||
|
if (WrapperFn)
|
||||||
|
return WrapperFn;
|
||||||
|
|
||||||
|
SmallVector<Type *> Params{Int8PtrTy};
|
||||||
|
FunctionType *FT = FunctionType::get(VoidTy, Params, /* isVarArg */ false);
|
||||||
|
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage,
|
||||||
|
WrapperName, HM);
|
||||||
|
|
||||||
|
Value *Desc = WrapperFn->getArg(0);
|
||||||
|
|
||||||
|
// Emit the body of the host wrapper
|
||||||
|
IRBuilder<> Builder(Context);
|
||||||
|
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
|
||||||
|
Builder.SetInsertPoint(EntryBB);
|
||||||
|
|
||||||
|
SmallVector<Value *> Args;
|
||||||
|
for (unsigned I = 0; I < CSI.CI->arg_size(); ++I) {
|
||||||
|
Value *V = Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_arg],
|
||||||
|
{Desc, ConstantInt::get(Int32Ty, I)});
|
||||||
|
Args.push_back(convertFromInt64TyTo(Builder, V, CSI.Params[I]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// The host callee that will be called eventually by the host wrapper.
|
||||||
|
Function *HostCallee = HM.getFunction(F->getName());
|
||||||
|
if (!HostCallee)
|
||||||
|
HostCallee = Function::Create(F->getFunctionType(), F->getLinkage(),
|
||||||
|
F->getName(), HM);
|
||||||
|
|
||||||
|
Value *RetVal = Builder.CreateCall(HostCallee, Args);
|
||||||
|
RetVal = convertToInt64Ty(Builder, RetVal);
|
||||||
|
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_set_ret_val], {Desc, RetVal});
|
||||||
|
Builder.CreateRetVoid();
|
||||||
|
|
||||||
|
return WrapperFn;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace llvm
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
InitializeAllTargets();
|
||||||
|
InitializeAllTargetMCs();
|
||||||
|
InitializeAllAsmPrinters();
|
||||||
|
InitializeAllAsmParsers();
|
||||||
|
|
||||||
|
SMDiagnostic Err;
|
||||||
|
std::unique_ptr<Module> DM = parseIRFile("device.ll", Err, Context);
|
||||||
|
if (!DM)
|
||||||
|
return 1;
|
||||||
|
Module HM("host-rpc.bc", Context);
|
||||||
|
// get the right target triple
|
||||||
|
HM.setTargetTriple(Triple::normalize("x86-64"));
|
||||||
|
|
||||||
|
AutoHostRPC RPC(*DM, HM);
|
||||||
|
(void)RPC.run();
|
||||||
|
|
||||||
|
DM->dump();
|
||||||
|
// HM.dump();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
19
auto-host-rpc/test.c
Normal file
19
auto-host-rpc/test.c
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#pragma omp begin declare target device_type(nohost)
|
||||||
|
|
||||||
|
struct ddd {
|
||||||
|
int num;
|
||||||
|
int a;
|
||||||
|
float b;
|
||||||
|
};
|
||||||
|
|
||||||
|
void foo() {
|
||||||
|
FILE *fp = fopen("main.cpp", "r");
|
||||||
|
struct ddd d;
|
||||||
|
fprintf(fp, "%d", 6);
|
||||||
|
fprintf(fp, "%f%d%s", 6.0f, 1, "hello");
|
||||||
|
fscanf(fp, "%d", &d.a);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma omp end declare target
|
||||||
@@ -258,6 +258,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
|
|||||||
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
|
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
|
||||||
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
|
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
|
||||||
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
|
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
|
||||||
|
LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Globalize to global space for the globalized variables")
|
||||||
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
|
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
|
||||||
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
|
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
|
||||||
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
|
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
|
||||||
|
|||||||
@@ -3497,6 +3497,10 @@ def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested
|
|||||||
HelpText<"Assert no nested parallel regions in the GPU">,
|
HelpText<"Assert no nested parallel regions in the GPU">,
|
||||||
MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
|
MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
|
||||||
|
|
||||||
|
def fopenmp_globalize_to_global_space : Flag<["-"], "fopenmp-globalize-to-global-space">,
|
||||||
|
HelpText<"Globalize to global space for the globalized variables">,
|
||||||
|
MarshallingInfoFlag<LangOpts<"OpenMPGlobalizeToGlobalSpace">>;
|
||||||
|
|
||||||
} // let Group = f_Group
|
} // let Group = f_Group
|
||||||
} // let Visibility = [ClangOption, CC1Option, FC1Option]
|
} // let Visibility = [ClangOption, CC1Option, FC1Option]
|
||||||
} // let Flags = [NoArgumentUnused, HelpHidden]
|
} // let Flags = [NoArgumentUnused, HelpHidden]
|
||||||
|
|||||||
@@ -2530,48 +2530,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
|
|||||||
(IPD->getParameterKind() == ImplicitParamKind::ThreadPrivateVar);
|
(IPD->getParameterKind() == ImplicitParamKind::ThreadPrivateVar);
|
||||||
}
|
}
|
||||||
|
|
||||||
Address DeclPtr = Address::invalid();
|
auto PushCleanupIfNeeded = [this, Ty, &D](Address DeclPtr) {
|
||||||
Address AllocaPtr = Address::invalid();
|
|
||||||
bool DoStore = false;
|
|
||||||
bool IsScalar = hasScalarEvaluationKind(Ty);
|
|
||||||
bool UseIndirectDebugAddress = false;
|
|
||||||
|
|
||||||
// If we already have a pointer to the argument, reuse the input pointer.
|
|
||||||
if (Arg.isIndirect()) {
|
|
||||||
DeclPtr = Arg.getIndirectAddress();
|
|
||||||
DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
|
|
||||||
// Indirect argument is in alloca address space, which may be different
|
|
||||||
// from the default address space.
|
|
||||||
auto AllocaAS = CGM.getASTAllocaAddressSpace();
|
|
||||||
auto *V = DeclPtr.getPointer();
|
|
||||||
AllocaPtr = DeclPtr;
|
|
||||||
|
|
||||||
// For truly ABI indirect arguments -- those that are not `byval` -- store
|
|
||||||
// the address of the argument on the stack to preserve debug information.
|
|
||||||
ABIArgInfo ArgInfo = CurFnInfo->arguments()[ArgNo - 1].info;
|
|
||||||
if (ArgInfo.isIndirect())
|
|
||||||
UseIndirectDebugAddress = !ArgInfo.getIndirectByVal();
|
|
||||||
if (UseIndirectDebugAddress) {
|
|
||||||
auto PtrTy = getContext().getPointerType(Ty);
|
|
||||||
AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy),
|
|
||||||
D.getName() + ".indirect_addr");
|
|
||||||
EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy);
|
|
||||||
}
|
|
||||||
|
|
||||||
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
|
|
||||||
auto DestLangAS =
|
|
||||||
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
|
|
||||||
if (SrcLangAS != DestLangAS) {
|
|
||||||
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
|
|
||||||
CGM.getDataLayout().getAllocaAddrSpace());
|
|
||||||
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
|
|
||||||
auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
|
|
||||||
DeclPtr =
|
|
||||||
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
|
|
||||||
*this, V, SrcLangAS, DestLangAS, T, true),
|
|
||||||
DeclPtr.isKnownNonNull());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Push a destructor cleanup for this parameter if the ABI requires it.
|
// Push a destructor cleanup for this parameter if the ABI requires it.
|
||||||
// Don't push a cleanup in a thunk for a method that will also emit a
|
// Don't push a cleanup in a thunk for a method that will also emit a
|
||||||
// cleanup.
|
// cleanup.
|
||||||
@@ -2587,20 +2546,56 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
|
|||||||
EHStack.stable_begin();
|
EHStack.stable_begin();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
};
|
||||||
// Check if the parameter address is controlled by OpenMP runtime.
|
|
||||||
|
Address DeclPtr = Address::invalid();
|
||||||
|
Address AllocaPtr = Address::invalid();
|
||||||
Address OpenMPLocalAddr =
|
Address OpenMPLocalAddr =
|
||||||
getLangOpts().OpenMP
|
getLangOpts().OpenMP
|
||||||
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
|
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
|
||||||
: Address::invalid();
|
: Address::invalid();
|
||||||
if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
|
bool DoStore = false;
|
||||||
|
bool IsScalar = hasScalarEvaluationKind(Ty);
|
||||||
|
bool UseIndirectDebugAddress = false;
|
||||||
|
if (OpenMPLocalAddr.isValid()) {
|
||||||
DeclPtr = OpenMPLocalAddr;
|
DeclPtr = OpenMPLocalAddr;
|
||||||
AllocaPtr = DeclPtr;
|
AllocaPtr = DeclPtr;
|
||||||
|
LValue Dst = MakeAddrLValue(DeclPtr, Ty);
|
||||||
|
if (Arg.isIndirect()) {
|
||||||
|
LValue Src = MakeAddrLValue(Arg.getIndirectAddress(), Ty);
|
||||||
|
callCStructCopyConstructor(Dst, Src);
|
||||||
|
PushCleanupIfNeeded(Arg.getIndirectAddress());
|
||||||
} else {
|
} else {
|
||||||
// Otherwise, create a temporary to hold the value.
|
EmitStoreOfScalar(Arg.getDirectValue(), Dst, /* isInitialization */ true);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// If we already have a pointer to the argument, reuse the input pointer.
|
||||||
|
if (Arg.isIndirect()) {
|
||||||
|
DeclPtr = Arg.getIndirectAddress();
|
||||||
|
DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
|
||||||
|
// Indirect argument is in alloca address space, which may be different
|
||||||
|
// from the default address space.
|
||||||
|
auto AllocaAS = CGM.getASTAllocaAddressSpace();
|
||||||
|
auto *V = DeclPtr.getPointer();
|
||||||
|
AllocaPtr = DeclPtr;
|
||||||
|
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
|
||||||
|
auto DestLangAS =
|
||||||
|
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
|
||||||
|
if (SrcLangAS != DestLangAS) {
|
||||||
|
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
|
||||||
|
CGM.getDataLayout().getAllocaAddrSpace());
|
||||||
|
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
|
||||||
|
auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
|
||||||
|
DeclPtr =
|
||||||
|
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
|
||||||
|
*this, V, SrcLangAS, DestLangAS, T, true),
|
||||||
|
DeclPtr.isKnownNonNull());
|
||||||
|
}
|
||||||
|
PushCleanupIfNeeded(DeclPtr);
|
||||||
|
} else {
|
||||||
|
// Create a temporary to hold the value.
|
||||||
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
|
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
|
||||||
D.getName() + ".addr", &AllocaPtr);
|
D.getName() + ".addr", &AllocaPtr);
|
||||||
}
|
|
||||||
DoStore = true;
|
DoStore = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2638,8 +2633,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
|
|||||||
EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
|
EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
|
||||||
EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
|
EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
|
||||||
DoStore = false;
|
DoStore = false;
|
||||||
}
|
} else
|
||||||
else
|
|
||||||
// Don't use objc_retainBlock for block pointers, because we
|
// Don't use objc_retainBlock for block pointers, because we
|
||||||
// don't want to Block_copy something just because we got it
|
// don't want to Block_copy something just because we got it
|
||||||
// as a parameter.
|
// as a parameter.
|
||||||
@@ -2648,10 +2642,11 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
|
|||||||
} else {
|
} else {
|
||||||
// Push the cleanup for a consumed parameter.
|
// Push the cleanup for a consumed parameter.
|
||||||
if (isConsumed) {
|
if (isConsumed) {
|
||||||
ARCPreciseLifetime_t precise = (D.hasAttr<ObjCPreciseLifetimeAttr>()
|
ARCPreciseLifetime_t precise =
|
||||||
? ARCPreciseLifetime : ARCImpreciseLifetime);
|
(D.hasAttr<ObjCPreciseLifetimeAttr>() ? ARCPreciseLifetime
|
||||||
EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(), ArgVal,
|
: ARCImpreciseLifetime);
|
||||||
precise);
|
EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(),
|
||||||
|
ArgVal, precise);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lt == Qualifiers::OCL_Weak) {
|
if (lt == Qualifiers::OCL_Weak) {
|
||||||
@@ -2668,6 +2663,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
|
|||||||
// Store the initial value into the alloca.
|
// Store the initial value into the alloca.
|
||||||
if (DoStore)
|
if (DoStore)
|
||||||
EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
|
EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
|
||||||
|
}
|
||||||
|
|
||||||
setAddrOfLocalVar(&D, DeclPtr);
|
setAddrOfLocalVar(&D, DeclPtr);
|
||||||
|
|
||||||
|
|||||||
@@ -1083,9 +1083,11 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
|
|||||||
|
|
||||||
// Allocate space for the variable to be globalized
|
// Allocate space for the variable to be globalized
|
||||||
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
|
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
|
||||||
llvm::CallBase *VoidPtr =
|
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
|
||||||
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
|
OMPBuilder.getOrCreateRuntimeFunction(
|
||||||
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
|
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
|
||||||
|
? OMPRTL_malloc
|
||||||
|
: OMPRTL___kmpc_alloc_shared),
|
||||||
AllocArgs, VD->getName());
|
AllocArgs, VD->getName());
|
||||||
// FIXME: We should use the variables actual alignment as an argument.
|
// FIXME: We should use the variables actual alignment as an argument.
|
||||||
VoidPtr->addRetAttr(llvm::Attribute::get(
|
VoidPtr->addRetAttr(llvm::Attribute::get(
|
||||||
@@ -1116,12 +1118,14 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
|
|||||||
I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
|
I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
|
||||||
LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
|
LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
|
||||||
CGM.getContext().getDeclAlign(VD),
|
CGM.getContext().getDeclAlign(VD),
|
||||||
|
|
||||||
AlignmentSource::Decl);
|
AlignmentSource::Decl);
|
||||||
I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));
|
I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));
|
||||||
}
|
}
|
||||||
I->getSecond().MappedParams->apply(CGF);
|
I->getSecond().MappedParams->apply(CGF);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
|
bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
|
||||||
const VarDecl *VD) const {
|
const VarDecl *VD) const {
|
||||||
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
|
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
|
||||||
@@ -1149,9 +1153,11 @@ CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
|
|||||||
|
|
||||||
// Allocate space for this VLA object to be globalized.
|
// Allocate space for this VLA object to be globalized.
|
||||||
llvm::Value *AllocArgs[] = {Size};
|
llvm::Value *AllocArgs[] = {Size};
|
||||||
llvm::CallBase *VoidPtr =
|
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
|
||||||
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
|
OMPBuilder.getOrCreateRuntimeFunction(
|
||||||
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
|
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
|
||||||
|
? OMPRTL_malloc
|
||||||
|
: OMPRTL___kmpc_alloc_shared),
|
||||||
AllocArgs, VD->getName());
|
AllocArgs, VD->getName());
|
||||||
VoidPtr->addRetAttr(llvm::Attribute::get(
|
VoidPtr->addRetAttr(llvm::Attribute::get(
|
||||||
CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
|
CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
|
||||||
@@ -1178,6 +1184,11 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
|
|||||||
// globalized in the prolog (i.e. emitGenericVarsProlog).
|
// globalized in the prolog (i.e. emitGenericVarsProlog).
|
||||||
for (const auto &AddrSizePair :
|
for (const auto &AddrSizePair :
|
||||||
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
|
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
|
||||||
|
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
|
||||||
|
CGF.EmitRuntimeCall(
|
||||||
|
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
|
||||||
|
{AddrSizePair.first});
|
||||||
|
else
|
||||||
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
|
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
|
||||||
CGM.getModule(), OMPRTL___kmpc_free_shared),
|
CGM.getModule(), OMPRTL___kmpc_free_shared),
|
||||||
{AddrSizePair.first, AddrSizePair.second});
|
{AddrSizePair.first, AddrSizePair.second});
|
||||||
@@ -1187,11 +1198,15 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
|
|||||||
const auto *VD = cast<VarDecl>(Rec.first);
|
const auto *VD = cast<VarDecl>(Rec.first);
|
||||||
I->getSecond().MappedParams->restore(CGF);
|
I->getSecond().MappedParams->restore(CGF);
|
||||||
|
|
||||||
llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
|
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
|
||||||
CGF.getTypeSize(VD->getType())};
|
CGF.EmitRuntimeCall(
|
||||||
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
|
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
|
||||||
CGM.getModule(), OMPRTL___kmpc_free_shared),
|
{Rec.second.GlobalizedVal});
|
||||||
FreeArgs);
|
else
|
||||||
|
CGF.EmitRuntimeCall(
|
||||||
|
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
|
||||||
|
OMPRTL___kmpc_free_shared),
|
||||||
|
{Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6502,6 +6502,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
|
|||||||
CmdArgs.push_back("-fopenmp-offload-mandatory");
|
CmdArgs.push_back("-fopenmp-offload-mandatory");
|
||||||
if (Args.hasArg(options::OPT_fopenmp_force_usm))
|
if (Args.hasArg(options::OPT_fopenmp_force_usm))
|
||||||
CmdArgs.push_back("-fopenmp-force-usm");
|
CmdArgs.push_back("-fopenmp-force-usm");
|
||||||
|
if (Args.hasArg(options::OPT_fopenmp_globalize_to_global_space))
|
||||||
|
CmdArgs.push_back("-fopenmp-globalize-to-global-space");
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
// By default, if Clang doesn't know how to generate useful OpenMP code
|
// By default, if Clang doesn't know how to generate useful OpenMP code
|
||||||
|
|||||||
33
clang/test/OpenMP/globalization_byval_struct.c
Normal file
33
clang/test/OpenMP/globalization_byval_struct.c
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||||
|
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
|
||||||
|
// expected-no-diagnostics
|
||||||
|
|
||||||
|
extern int printf(const char *, ...);
|
||||||
|
|
||||||
|
struct S {
|
||||||
|
int a;
|
||||||
|
float b;
|
||||||
|
};
|
||||||
|
|
||||||
|
// CHECK: define{{.*}}void @test(%struct.S* noundef byval(%struct.S) align {{[0-9]+}} [[arg:%[0-9a-zA-Z]+]])
|
||||||
|
// CHECK: [[g:%[0-9a-zA-Z]+]] = call align {{[0-9]+}} i8* @__kmpc_alloc_shared
|
||||||
|
// CHECK: bitcast i8* [[g]] to %struct.S*
|
||||||
|
// CHECK: bitcast %struct.S* [[arg]] to i8**
|
||||||
|
// CHECK: call void [[cc:@__copy_constructor[_0-9a-zA-Z]+]]
|
||||||
|
// CHECK: void [[cc]]
|
||||||
|
void test(struct S s) {
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (int i = 0; i < s.a; ++i) {
|
||||||
|
printf("%i : %i : %f\n", i, s.a, s.b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void foo() {
|
||||||
|
#pragma omp target teams num_teams(1)
|
||||||
|
{
|
||||||
|
struct S s;
|
||||||
|
s.a = 7;
|
||||||
|
s.b = 11;
|
||||||
|
test(s);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -137,6 +137,13 @@ static constexpr OptTable::Info InfoTable[] = {
|
|||||||
#undef OPTION
|
#undef OPTION
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Host RPC module that will be shared to the corresponding pass.
|
||||||
|
Module *HostModule = nullptr;
|
||||||
|
/// We only need to generate the host RPC module once.
|
||||||
|
bool IsHostModuleGenerated = false;
|
||||||
|
/// Host RPC object file.
|
||||||
|
StringRef HostRPCObjFile;
|
||||||
|
|
||||||
class WrapperOptTable : public opt::GenericOptTable {
|
class WrapperOptTable : public opt::GenericOptTable {
|
||||||
public:
|
public:
|
||||||
WrapperOptTable() : opt::GenericOptTable(InfoTable) {}
|
WrapperOptTable() : opt::GenericOptTable(InfoTable) {}
|
||||||
@@ -613,10 +620,12 @@ std::vector<std::string> getTargetFeatures(ArrayRef<OffloadFile> InputFiles) {
|
|||||||
return UnifiedFeatures;
|
return UnifiedFeatures;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename ModuleHook = function_ref<bool(size_t, const Module &)>>
|
template <typename PreHookTy = function_ref<bool(size_t, const Module &)>,
|
||||||
|
typename PostHookTy = function_ref<bool(size_t, const Module &)>>
|
||||||
std::unique_ptr<lto::LTO> createLTO(
|
std::unique_ptr<lto::LTO> createLTO(
|
||||||
const ArgList &Args, const std::vector<std::string> &Features,
|
const ArgList &Args, const std::vector<std::string> &Features,
|
||||||
ModuleHook Hook = [](size_t, const Module &) { return true; }) {
|
PreHookTy PreHook = [](size_t, const Module &) { return true; },
|
||||||
|
PostHookTy PostHook = [](size_t, const Module &) { return true; }) {
|
||||||
const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
|
const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
|
||||||
// We need to remove AMD's target-id from the processor if present.
|
// We need to remove AMD's target-id from the processor if present.
|
||||||
StringRef Arch = Args.getLastArgValue(OPT_arch_EQ).split(":").first;
|
StringRef Arch = Args.getLastArgValue(OPT_arch_EQ).split(":").first;
|
||||||
@@ -671,7 +680,9 @@ std::unique_ptr<lto::LTO> createLTO(
|
|||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
Conf.PostOptModuleHook = Hook;
|
|
||||||
|
Conf.PreOptModuleHook = PreHook;
|
||||||
|
Conf.PostOptModuleHook = PostHook;
|
||||||
Conf.CGFileType = (Triple.isNVPTX() || SaveTemps)
|
Conf.CGFileType = (Triple.isNVPTX() || SaveTemps)
|
||||||
? CodeGenFileType::AssemblyFile
|
? CodeGenFileType::AssemblyFile
|
||||||
: CodeGenFileType::ObjectFile;
|
: CodeGenFileType::ObjectFile;
|
||||||
@@ -690,6 +701,58 @@ bool isValidCIdentifier(StringRef S) {
|
|||||||
[](char C) { return C == '_' || isAlnum(C); });
|
[](char C) { return C == '_' || isAlnum(C); });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool writeHostModule(std::string &FileName) {
|
||||||
|
if (!HostModule)
|
||||||
|
return false;
|
||||||
|
if (HostModule->getFunctionList().empty())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
auto HostTriple = HostModule->getTargetTriple();
|
||||||
|
FileName =
|
||||||
|
sys::path::filename(ExecutableName).str() + "-host-rpc-" + HostTriple;
|
||||||
|
auto TempFileOrErr = createOutputFile(FileName, "bc");
|
||||||
|
if (!TempFileOrErr)
|
||||||
|
reportError(TempFileOrErr.takeError());
|
||||||
|
int FD = -1;
|
||||||
|
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
|
||||||
|
reportError(errorCodeToError(EC));
|
||||||
|
|
||||||
|
auto Out = std::make_unique<llvm::raw_fd_ostream>(FD, true);
|
||||||
|
WriteBitcodeToFile(*HostModule, *Out);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<lto::LTO> createHostRPCLTO(StringRef HostTriple) {
|
||||||
|
const llvm::Triple Triple(HostTriple);
|
||||||
|
lto::Config Conf;
|
||||||
|
lto::ThinBackend Backend;
|
||||||
|
Backend =
|
||||||
|
lto::createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
|
||||||
|
|
||||||
|
// TODO: host arch?
|
||||||
|
// Conf.CPU = Arch.str();
|
||||||
|
Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple);
|
||||||
|
|
||||||
|
// TODO: host features?
|
||||||
|
// Conf.MAttrs = Features;
|
||||||
|
Conf.CGOptLevel = *CodeGenOpt::getLevel(3);
|
||||||
|
Conf.OptLevel = 3;
|
||||||
|
Conf.UseDefaultPipeline = true;
|
||||||
|
Conf.DefaultTriple = Triple.getTriple();
|
||||||
|
|
||||||
|
LTOError = false;
|
||||||
|
Conf.DiagHandler = diagnosticHandler;
|
||||||
|
|
||||||
|
Conf.PTO.LoopVectorization = Conf.OptLevel > 1;
|
||||||
|
Conf.PTO.SLPVectorization = Conf.OptLevel > 1;
|
||||||
|
Conf.CGFileType = CodeGenFileType::ObjectFile;
|
||||||
|
|
||||||
|
Conf.HasWholeProgramVisibility = false;
|
||||||
|
|
||||||
|
return std::make_unique<lto::LTO>(std::move(Conf), Backend);
|
||||||
|
}
|
||||||
|
|
||||||
Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
|
Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
|
||||||
SmallVectorImpl<StringRef> &OutputFiles,
|
SmallVectorImpl<StringRef> &OutputFiles,
|
||||||
const ArgList &Args) {
|
const ArgList &Args) {
|
||||||
@@ -775,14 +838,52 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
|
|||||||
BitcodeOutput.push_back(*TempFileOrErr);
|
BitcodeOutput.push_back(*TempFileOrErr);
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
|
auto AddHostModuleAddr = [&](size_t, const Module &M) {
|
||||||
|
if (!HostModule)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
Module &CM = const_cast<Module &>(M);
|
||||||
|
auto *MD = CM.getOrInsertNamedMetadata("llvm.hostrpc.hostmodule");
|
||||||
|
MD->clearOperands();
|
||||||
|
MD->addOperand(MDTuple::get(
|
||||||
|
CM.getContext(), {ConstantAsMetadata::get(ConstantInt::get(
|
||||||
|
Type::getInt64Ty(CM.getContext()),
|
||||||
|
reinterpret_cast<uintptr_t>(HostModule)))}));
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
// We assume visibility of the whole program if every input file was bitcode.
|
// We assume visibility of the whole program if every input file was bitcode.
|
||||||
auto Features = getTargetFeatures(BitcodeInputFiles);
|
auto Features = getTargetFeatures(BitcodeInputFiles);
|
||||||
auto LTOBackend = Args.hasArg(OPT_embed_bitcode) ||
|
auto LTOBackend = Args.hasArg(OPT_embed_bitcode) ||
|
||||||
Args.hasArg(OPT_builtin_bitcode_EQ) ||
|
Args.hasArg(OPT_builtin_bitcode_EQ) ||
|
||||||
Args.hasArg(OPT_clang_backend)
|
Args.hasArg(OPT_clang_backend)
|
||||||
? createLTO(Args, Features, OutputBitcode)
|
? createLTO(Args, Features, AddHostModuleAddr, OutputBitcode)
|
||||||
: createLTO(Args, Features);
|
: createLTO(Args, Features, AddHostModuleAddr);
|
||||||
|
|
||||||
|
LLVMContext &Ctx = LTOBackend->getContext();
|
||||||
|
StringRef HostTriple =
|
||||||
|
Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple());
|
||||||
|
std::unique_ptr<Module> HostModulePtr;
|
||||||
|
if (!IsHostModuleGenerated) {
|
||||||
|
HostModulePtr = std::make_unique<Module>(
|
||||||
|
sys::path::filename(ExecutableName).str() + "-host-rpc.bc", Ctx);
|
||||||
|
HostModule = HostModulePtr.get();
|
||||||
|
HostModulePtr->setTargetTriple(HostTriple);
|
||||||
|
|
||||||
|
std::string Msg;
|
||||||
|
const Target *T =
|
||||||
|
TargetRegistry::lookupTarget(HostModule->getTargetTriple(), Msg);
|
||||||
|
if (!T)
|
||||||
|
return createStringError(inconvertibleErrorCode(), Msg);
|
||||||
|
auto Options =
|
||||||
|
codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple(HostTriple));
|
||||||
|
StringRef CPU = "";
|
||||||
|
StringRef Features = "";
|
||||||
|
std::unique_ptr<TargetMachine> TM(
|
||||||
|
T->createTargetMachine(HostTriple, CPU, Features, Options, Reloc::PIC_,
|
||||||
|
HostModule->getCodeModel()));
|
||||||
|
HostModule->setDataLayout(TM->createDataLayout());
|
||||||
|
}
|
||||||
|
|
||||||
// We need to resolve the symbols so the LTO backend knows which symbols need
|
// We need to resolve the symbols so the LTO backend knows which symbols need
|
||||||
// to be kept or can be internalized. This is a simplified symbol resolution
|
// to be kept or can be internalized. This is a simplified symbol resolution
|
||||||
@@ -806,6 +907,7 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
|
|||||||
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
|
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
|
||||||
size_t Idx = 0;
|
size_t Idx = 0;
|
||||||
for (auto &Sym : Symbols) {
|
for (auto &Sym : Symbols) {
|
||||||
|
|
||||||
lto::SymbolResolution &Res = Resolutions[Idx++];
|
lto::SymbolResolution &Res = Resolutions[Idx++];
|
||||||
|
|
||||||
// We will use this as the prevailing symbol definition in LTO unless
|
// We will use this as the prevailing symbol definition in LTO unless
|
||||||
@@ -876,6 +978,57 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
|
|||||||
if (Error Err = LTOBackend->run(AddStream))
|
if (Error Err = LTOBackend->run(AddStream))
|
||||||
return Err;
|
return Err;
|
||||||
|
|
||||||
|
std::string HostModuleTempFile;
|
||||||
|
bool ValidHostModule = writeHostModule(HostModuleTempFile);
|
||||||
|
// Reset the HostModule pointer.
|
||||||
|
HostModulePtr.reset();
|
||||||
|
HostModule = nullptr;
|
||||||
|
// TODO: this is really redundant code.
|
||||||
|
if (ValidHostModule) {
|
||||||
|
auto HostLTO = createHostRPCLTO(HostTriple);
|
||||||
|
|
||||||
|
std::string HostBitCodeFile = HostModuleTempFile + ".bc";
|
||||||
|
auto BufferOrError = MemoryBuffer::getFile(HostBitCodeFile);
|
||||||
|
if (!BufferOrError)
|
||||||
|
reportError(createFileError(HostBitCodeFile, BufferOrError.getError()));
|
||||||
|
Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr =
|
||||||
|
llvm::lto::InputFile::create(*BufferOrError.get());
|
||||||
|
if (!BitcodeFileOrErr)
|
||||||
|
return BitcodeFileOrErr.takeError();
|
||||||
|
|
||||||
|
const auto Symbols = (*BitcodeFileOrErr)->symbols();
|
||||||
|
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
|
||||||
|
size_t Idx = 0;
|
||||||
|
for (auto &Sym : Symbols) {
|
||||||
|
(void)Sym;
|
||||||
|
lto::SymbolResolution &Res = Resolutions[Idx++];
|
||||||
|
Res.ExportDynamic = true;
|
||||||
|
Res.VisibleToRegularObj = true;
|
||||||
|
Res.LinkerRedefined = false;
|
||||||
|
Res.Prevailing = true;
|
||||||
|
}
|
||||||
|
if (Error Err = HostLTO->add(std::move(*BitcodeFileOrErr), Resolutions))
|
||||||
|
return Err;
|
||||||
|
|
||||||
|
auto RPCAddStream =
|
||||||
|
[&](size_t Task,
|
||||||
|
const Twine &ModuleName) -> std::unique_ptr<CachedFileStream> {
|
||||||
|
int FD = -1;
|
||||||
|
auto TempFileOrErr = createOutputFile(
|
||||||
|
sys::path::filename(ExecutableName) + "-host-rpc-" + HostTriple, "o");
|
||||||
|
if (!TempFileOrErr)
|
||||||
|
reportError(TempFileOrErr.takeError());
|
||||||
|
HostRPCObjFile = *TempFileOrErr;
|
||||||
|
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
|
||||||
|
reportError(errorCodeToError(EC));
|
||||||
|
return std::make_unique<CachedFileStream>(
|
||||||
|
std::make_unique<llvm::raw_fd_ostream>(FD, true));
|
||||||
|
};
|
||||||
|
|
||||||
|
if (Error Err = HostLTO->run(RPCAddStream))
|
||||||
|
return Err;
|
||||||
|
}
|
||||||
|
|
||||||
if (LTOError)
|
if (LTOError)
|
||||||
return createStringError(inconvertibleErrorCode(),
|
return createStringError(inconvertibleErrorCode(),
|
||||||
"Errors encountered inside the LTO pipeline.");
|
"Errors encountered inside the LTO pipeline.");
|
||||||
@@ -1244,6 +1397,9 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
|
|||||||
WrappedOutput.push_back(*OutputOrErr);
|
WrappedOutput.push_back(*OutputOrErr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!HostRPCObjFile.empty())
|
||||||
|
WrappedOutput.push_back(HostRPCObjFile);
|
||||||
|
|
||||||
return WrappedOutput;
|
return WrappedOutput;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -181,6 +181,10 @@ set(TARGET_LIBC_ENTRYPOINTS
|
|||||||
|
|
||||||
# gpu/rpc.h entrypoints
|
# gpu/rpc.h entrypoints
|
||||||
libc.src.gpu.rpc_host_call
|
libc.src.gpu.rpc_host_call
|
||||||
|
libc.src.gpu.rpc_open_port
|
||||||
|
libc.src.gpu.rpc_send_n
|
||||||
|
libc.src.gpu.rpc_recv_n
|
||||||
|
libc.src.gpu.rpc_close_port
|
||||||
)
|
)
|
||||||
|
|
||||||
set(TARGET_LIBM_ENTRYPOINTS
|
set(TARGET_LIBM_ENTRYPOINTS
|
||||||
|
|||||||
@@ -604,6 +604,7 @@ if(LIBC_TARGET_OS_IS_GPU)
|
|||||||
DEPENDS
|
DEPENDS
|
||||||
.llvm_libc_common_h
|
.llvm_libc_common_h
|
||||||
.llvm-libc-types.rpc_opcodes_t
|
.llvm-libc-types.rpc_opcodes_t
|
||||||
|
.llvm-libc-types.rpc_port_t
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,10 @@
|
|||||||
|
|
||||||
#include <__llvm-libc-common.h>
|
#include <__llvm-libc-common.h>
|
||||||
|
|
||||||
|
#include <llvm-libc-types/size_t.h>
|
||||||
|
|
||||||
#include <llvm-libc-types/rpc_opcodes_t.h>
|
#include <llvm-libc-types/rpc_opcodes_t.h>
|
||||||
|
#include <llvm-libc-types/rpc_port_t.h>
|
||||||
|
|
||||||
%%public_api()
|
%%public_api()
|
||||||
|
|
||||||
|
|||||||
@@ -93,6 +93,7 @@ add_header(socklen_t HDR socklen_t.h)
|
|||||||
add_header(struct_sockaddr_un HDR struct_sockaddr_un.h)
|
add_header(struct_sockaddr_un HDR struct_sockaddr_un.h)
|
||||||
add_header(struct_sockaddr HDR struct_sockaddr.h)
|
add_header(struct_sockaddr HDR struct_sockaddr.h)
|
||||||
add_header(rpc_opcodes_t HDR rpc_opcodes_t.h)
|
add_header(rpc_opcodes_t HDR rpc_opcodes_t.h)
|
||||||
|
add_header(rpc_port_t HDR rpc_port_t.h)
|
||||||
add_header(ACTION HDR ACTION.h)
|
add_header(ACTION HDR ACTION.h)
|
||||||
add_header(ENTRY HDR ENTRY.h)
|
add_header(ENTRY HDR ENTRY.h)
|
||||||
add_header(struct_hsearch_data HDR struct_hsearch_data.h)
|
add_header(struct_hsearch_data HDR struct_hsearch_data.h)
|
||||||
|
|||||||
16
libc/include/llvm-libc-types/rpc_port_t.h
Normal file
16
libc/include/llvm-libc-types/rpc_port_t.h
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
//===-- Definition of type rpc_port_t -------------------------------------===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef __LLVM_LIBC_TYPES_RPC_PORT_T_H__
|
||||||
|
#define __LLVM_LIBC_TYPES_RPC_PORT_T_H__
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
__UINT8_TYPE__ reserved[32];
|
||||||
|
} rpc_port_t;
|
||||||
|
|
||||||
|
#endif // __LLVM_LIBC_TYPES_RPC_PORT_T_H__
|
||||||
@@ -15,6 +15,7 @@ typedef enum : unsigned short {
|
|||||||
RPC_TEST_NOOP = 1 << 15,
|
RPC_TEST_NOOP = 1 << 15,
|
||||||
RPC_TEST_INCREMENT,
|
RPC_TEST_INCREMENT,
|
||||||
RPC_TEST_INTERFACE,
|
RPC_TEST_INTERFACE,
|
||||||
|
RPC_TEST_EXTERNAL,
|
||||||
RPC_TEST_STREAM,
|
RPC_TEST_STREAM,
|
||||||
} rpc_test_opcode_t;
|
} rpc_test_opcode_t;
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,11 @@
|
|||||||
|
def RPCPortT : NamedType<"rpc_port_t">;
|
||||||
|
def RPCPortPtrT : PtrType<RPCPortT>;
|
||||||
|
|
||||||
def GPUExtensions : StandardSpec<"GPUExtensions"> {
|
def GPUExtensions : StandardSpec<"GPUExtensions"> {
|
||||||
HeaderSpec RPC = HeaderSpec<
|
HeaderSpec RPC = HeaderSpec<
|
||||||
"gpu/rpc.h",
|
"gpu/rpc.h",
|
||||||
[], // Macros
|
[], // Macros
|
||||||
[], // Types
|
[RPCPortT, SizeTType], // Types
|
||||||
[], // Enumerations
|
[], // Enumerations
|
||||||
[
|
[
|
||||||
FunctionSpec<
|
FunctionSpec<
|
||||||
@@ -10,6 +13,26 @@ def GPUExtensions : StandardSpec<"GPUExtensions"> {
|
|||||||
RetValSpec<VoidType>,
|
RetValSpec<VoidType>,
|
||||||
[ArgSpec<VoidPtr>, ArgSpec<VoidPtr>, ArgSpec<SizeTType>]
|
[ArgSpec<VoidPtr>, ArgSpec<VoidPtr>, ArgSpec<SizeTType>]
|
||||||
>,
|
>,
|
||||||
|
FunctionSpec<
|
||||||
|
"rpc_open_port",
|
||||||
|
RetValSpec<RPCPortT>,
|
||||||
|
[ArgSpec<UnsignedIntType>]
|
||||||
|
>,
|
||||||
|
FunctionSpec<
|
||||||
|
"rpc_send_n",
|
||||||
|
RetValSpec<VoidType>,
|
||||||
|
[ArgSpec<RPCPortPtrT>, ArgSpec<VoidPtr>, ArgSpec<SizeTType>]
|
||||||
|
>,
|
||||||
|
FunctionSpec<
|
||||||
|
"rpc_recv_n",
|
||||||
|
RetValSpec<VoidType>,
|
||||||
|
[ArgSpec<RPCPortPtrT>, ArgSpec<VoidPtr>, ArgSpec<SizeTPtr>]
|
||||||
|
>,
|
||||||
|
FunctionSpec<
|
||||||
|
"rpc_close_port",
|
||||||
|
RetValSpec<VoidType>,
|
||||||
|
[ArgSpec<RPCPortPtrT>]
|
||||||
|
>,
|
||||||
]
|
]
|
||||||
>;
|
>;
|
||||||
let Headers = [
|
let Headers = [
|
||||||
|
|||||||
@@ -25,12 +25,14 @@ namespace LIBC_NAMESPACE::cpp {
|
|||||||
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
|
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// This implementation of bit_cast requires trivially-constructible To, to avoid
|
|
||||||
// UB in the implementation.
|
|
||||||
template <typename To, typename From>
|
template <typename To, typename From>
|
||||||
LIBC_INLINE constexpr cpp::enable_if_t<
|
LIBC_INLINE constexpr cpp::enable_if_t<
|
||||||
(sizeof(To) == sizeof(From)) &&
|
(sizeof(To) == sizeof(From)) &&
|
||||||
|
// Implementation of bit_cast that cannot use the compiler builtin must be
|
||||||
|
// trivially-constructible To, to avoid UB in the implementation.
|
||||||
|
#if !LIBC_HAS_BUILTIN(__builtin_bit_cast)
|
||||||
cpp::is_trivially_constructible<To>::value &&
|
cpp::is_trivially_constructible<To>::value &&
|
||||||
|
#endif
|
||||||
cpp::is_trivially_copyable<To>::value &&
|
cpp::is_trivially_copyable<To>::value &&
|
||||||
cpp::is_trivially_copyable<From>::value,
|
cpp::is_trivially_copyable<From>::value,
|
||||||
To>
|
To>
|
||||||
|
|||||||
@@ -323,6 +323,7 @@ public:
|
|||||||
LIBC_INLINE void send_n(const void *src, uint64_t size);
|
LIBC_INLINE void send_n(const void *src, uint64_t size);
|
||||||
template <typename A>
|
template <typename A>
|
||||||
LIBC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
|
LIBC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
|
||||||
|
LIBC_INLINE void recv_n(void *dst, uint64_t *size);
|
||||||
|
|
||||||
LIBC_INLINE uint16_t get_opcode() const {
|
LIBC_INLINE uint16_t get_opcode() const {
|
||||||
return process.header[index].opcode;
|
return process.header[index].opcode;
|
||||||
@@ -359,7 +360,9 @@ struct Client {
|
|||||||
: process(port_count, buffer) {}
|
: process(port_count, buffer) {}
|
||||||
|
|
||||||
using Port = rpc::Port<false>;
|
using Port = rpc::Port<false>;
|
||||||
template <uint16_t opcode> LIBC_INLINE Port open();
|
|
||||||
|
template <uint16_t opcode> LIBC_INLINE Port open() { return open(opcode); }
|
||||||
|
LIBC_INLINE Port open(uint16_t opcode);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Process<false> process;
|
Process<false> process;
|
||||||
@@ -484,6 +487,14 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Helper routine to simplify the interface when recieving from the GPU using
|
||||||
|
/// thread private pointers to the underly value and the destination pointer
|
||||||
|
/// contains enough data to recieve the values.
|
||||||
|
template <bool T> LIBC_INLINE void Port<T>::recv_n(void *dst, uint64_t *size) {
|
||||||
|
void **dst_ptr = &dst;
|
||||||
|
recv_n(dst_ptr, size, [=](uint64_t) { return dst; });
|
||||||
|
}
|
||||||
|
|
||||||
/// Receives an arbitrarily sized data buffer across the shared channel in
|
/// Receives an arbitrarily sized data buffer across the shared channel in
|
||||||
/// multiples of the packet length. The \p alloc function is called with the
|
/// multiples of the packet length. The \p alloc function is called with the
|
||||||
/// size of the data so that we can initialize the size of the \p dst buffer.
|
/// size of the data so that we can initialize the size of the \p dst buffer.
|
||||||
@@ -522,9 +533,9 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
|
|||||||
/// is, there are send operations pending that haven't been serviced on this
|
/// is, there are send operations pending that haven't been serviced on this
|
||||||
/// port. Each port instance uses an associated \p opcode to tell the server
|
/// port. Each port instance uses an associated \p opcode to tell the server
|
||||||
/// what to do. The Client interface provides the appropriate lane size to the
|
/// what to do. The Client interface provides the appropriate lane size to the
|
||||||
/// port using the platform's returned value.
|
/// port using the platform's returned value. It is required that \p opcode is
|
||||||
template <uint16_t opcode>
|
/// uniform between all the lanes for this to work.
|
||||||
[[clang::convergent]] LIBC_INLINE Client::Port Client::open() {
|
[[clang::convergent]] LIBC_INLINE Client::Port Client::open(uint16_t opcode) {
|
||||||
// Repeatedly perform a naive linear scan for a port that can be opened to
|
// Repeatedly perform a naive linear scan for a port that can be opened to
|
||||||
// send data.
|
// send data.
|
||||||
for (uint32_t index = gpu::get_cluster_id();; ++index) {
|
for (uint32_t index = gpu::get_cluster_id();; ++index) {
|
||||||
|
|||||||
@@ -995,24 +995,13 @@ LIBC_INLINE_VAR constexpr bool is_big_int_v = is_big_int<T>::value;
|
|||||||
|
|
||||||
namespace cpp {
|
namespace cpp {
|
||||||
|
|
||||||
// Specialization of cpp::bit_cast ('bit.h') from T to BigInt.
|
|
||||||
template <typename To, typename From>
|
|
||||||
LIBC_INLINE constexpr cpp::enable_if_t<
|
|
||||||
(sizeof(To) == sizeof(From)) && cpp::is_trivially_copyable<To>::value &&
|
|
||||||
cpp::is_trivially_copyable<From>::value && is_big_int<To>::value,
|
|
||||||
To>
|
|
||||||
bit_cast(const From &from) {
|
|
||||||
To out;
|
|
||||||
using Storage = decltype(out.val);
|
|
||||||
out.val = cpp::bit_cast<Storage>(from);
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Specialization of cpp::bit_cast ('bit.h') from BigInt to T.
|
// Specialization of cpp::bit_cast ('bit.h') from BigInt to T.
|
||||||
template <typename To, size_t Bits>
|
template <typename To, size_t Bits>
|
||||||
LIBC_INLINE constexpr cpp::enable_if_t<
|
LIBC_INLINE constexpr cpp::enable_if_t<
|
||||||
sizeof(To) == sizeof(UInt<Bits>) &&
|
sizeof(To) == sizeof(UInt<Bits>) &&
|
||||||
|
#if !LIBC_HAS_BUILTIN(__builtin_bit_cast)
|
||||||
cpp::is_trivially_constructible<To>::value &&
|
cpp::is_trivially_constructible<To>::value &&
|
||||||
|
#endif
|
||||||
cpp::is_trivially_copyable<To>::value &&
|
cpp::is_trivially_copyable<To>::value &&
|
||||||
cpp::is_trivially_copyable<UInt<Bits>>::value,
|
cpp::is_trivially_copyable<UInt<Bits>>::value,
|
||||||
To>
|
To>
|
||||||
|
|||||||
@@ -8,3 +8,58 @@ add_entrypoint_object(
|
|||||||
libc.src.__support.RPC.rpc_client
|
libc.src.__support.RPC.rpc_client
|
||||||
libc.src.__support.GPU.utils
|
libc.src.__support.GPU.utils
|
||||||
)
|
)
|
||||||
|
|
||||||
|
add_entrypoint_object(
|
||||||
|
rpc_open_port
|
||||||
|
SRCS
|
||||||
|
rpc_open_port.cpp
|
||||||
|
HDRS
|
||||||
|
rpc_open_port.h
|
||||||
|
DEPENDS
|
||||||
|
libc.src.__support.RPC.rpc_client
|
||||||
|
libc.src.__support.GPU.utils
|
||||||
|
)
|
||||||
|
|
||||||
|
add_entrypoint_object(
|
||||||
|
rpc_close_port
|
||||||
|
SRCS
|
||||||
|
rpc_close_port.cpp
|
||||||
|
HDRS
|
||||||
|
rpc_close_port.h
|
||||||
|
DEPENDS
|
||||||
|
libc.src.__support.RPC.rpc_client
|
||||||
|
libc.src.__support.GPU.utils
|
||||||
|
)
|
||||||
|
|
||||||
|
add_entrypoint_object(
|
||||||
|
rpc_get_buffer
|
||||||
|
SRCS
|
||||||
|
rpc_get_buffer.cpp
|
||||||
|
HDRS
|
||||||
|
rpc_get_buffer.h
|
||||||
|
DEPENDS
|
||||||
|
libc.src.__support.RPC.rpc_client
|
||||||
|
libc.src.__support.GPU.utils
|
||||||
|
)
|
||||||
|
|
||||||
|
add_entrypoint_object(
|
||||||
|
rpc_send_n
|
||||||
|
SRCS
|
||||||
|
rpc_send_n.cpp
|
||||||
|
HDRS
|
||||||
|
rpc_send_n.h
|
||||||
|
DEPENDS
|
||||||
|
libc.src.__support.RPC.rpc_client
|
||||||
|
libc.src.__support.GPU.utils
|
||||||
|
)
|
||||||
|
|
||||||
|
add_entrypoint_object(
|
||||||
|
rpc_recv_n
|
||||||
|
SRCS
|
||||||
|
rpc_recv_n.cpp
|
||||||
|
HDRS
|
||||||
|
rpc_recv_n.h
|
||||||
|
DEPENDS
|
||||||
|
libc.src.__support.RPC.rpc_client
|
||||||
|
libc.src.__support.GPU.utils
|
||||||
|
)
|
||||||
|
|||||||
25
libc/src/gpu/rpc_close_port.cpp
Normal file
25
libc/src/gpu/rpc_close_port.cpp
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
//===---------- GPU implementation of the external RPC port interface -----===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "src/gpu/rpc_close_port.h"
|
||||||
|
|
||||||
|
#include "src/__support/GPU/utils.h"
|
||||||
|
#include "src/__support/RPC/rpc_client.h"
|
||||||
|
#include "src/__support/common.h"
|
||||||
|
|
||||||
|
namespace LIBC_NAMESPACE {
|
||||||
|
|
||||||
|
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
|
||||||
|
|
||||||
|
LLVM_LIBC_FUNCTION(void, rpc_close_port, (rpc_port_t * handle)) {
|
||||||
|
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
|
||||||
|
port.close();
|
||||||
|
*handle = cpp::bit_cast<rpc_port_t>(port);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace LIBC_NAMESPACE
|
||||||
20
libc/src/gpu/rpc_close_port.h
Normal file
20
libc/src/gpu/rpc_close_port.h
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H
|
||||||
|
#define LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H
|
||||||
|
|
||||||
|
#include <gpu/rpc.h>
|
||||||
|
|
||||||
|
namespace LIBC_NAMESPACE {
|
||||||
|
|
||||||
|
void rpc_close_port(rpc_port_t *handle);
|
||||||
|
|
||||||
|
} // namespace LIBC_NAMESPACE
|
||||||
|
|
||||||
|
#endif // LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H
|
||||||
26
libc/src/gpu/rpc_open_port.cpp
Normal file
26
libc/src/gpu/rpc_open_port.cpp
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
//===---------- GPU implementation of the external RPC port interface -----===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "src/gpu/rpc_open_port.h"
|
||||||
|
|
||||||
|
#include "src/__support/CPP/bit.h"
|
||||||
|
#include "src/__support/GPU/utils.h"
|
||||||
|
#include "src/__support/RPC/rpc_client.h"
|
||||||
|
#include "src/__support/common.h"
|
||||||
|
|
||||||
|
namespace LIBC_NAMESPACE {
|
||||||
|
|
||||||
|
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
|
||||||
|
|
||||||
|
LLVM_LIBC_FUNCTION(rpc_port_t, rpc_open_port, (unsigned opcode)) {
|
||||||
|
uint32_t uniform = gpu::broadcast_value(gpu::get_lane_mask(), opcode);
|
||||||
|
rpc::Client::Port port = rpc::client.open(static_cast<uint16_t>(uniform));
|
||||||
|
return cpp::bit_cast<rpc_port_t>(port);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace LIBC_NAMESPACE
|
||||||
20
libc/src/gpu/rpc_open_port.h
Normal file
20
libc/src/gpu/rpc_open_port.h
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H
|
||||||
|
#define LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H
|
||||||
|
|
||||||
|
#include <gpu/rpc.h>
|
||||||
|
|
||||||
|
namespace LIBC_NAMESPACE {
|
||||||
|
|
||||||
|
rpc_port_t rpc_open_port(unsigned opcode);
|
||||||
|
|
||||||
|
} // namespace LIBC_NAMESPACE
|
||||||
|
|
||||||
|
#endif // LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H
|
||||||
26
libc/src/gpu/rpc_recv_n.cpp
Normal file
26
libc/src/gpu/rpc_recv_n.cpp
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
//===---------- GPU implementation of the external RPC port interface -----===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "src/gpu/rpc_recv_n.h"
|
||||||
|
|
||||||
|
#include "src/__support/GPU/utils.h"
|
||||||
|
#include "src/__support/RPC/rpc_client.h"
|
||||||
|
#include "src/__support/common.h"
|
||||||
|
|
||||||
|
namespace LIBC_NAMESPACE {
|
||||||
|
|
||||||
|
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
|
||||||
|
|
||||||
|
LLVM_LIBC_FUNCTION(void, rpc_recv_n,
|
||||||
|
(rpc_port_t * handle, void *dst, size_t *size)) {
|
||||||
|
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
|
||||||
|
port.recv_n(dst, reinterpret_cast<uint64_t *>(size));
|
||||||
|
*handle = cpp::bit_cast<rpc_port_t>(port);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace LIBC_NAMESPACE
|
||||||
20
libc/src/gpu/rpc_recv_n.h
Normal file
20
libc/src/gpu/rpc_recv_n.h
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef LLVM_LIBC_SRC_GPU_RPC_RECV_H
|
||||||
|
#define LLVM_LIBC_SRC_GPU_RPC_RECV_H
|
||||||
|
|
||||||
|
#include <gpu/rpc.h>
|
||||||
|
|
||||||
|
namespace LIBC_NAMESPACE {
|
||||||
|
|
||||||
|
void rpc_recv_n(rpc_port_t *handle, void *dst, size_t *size);
|
||||||
|
|
||||||
|
} // namespace LIBC_NAMESPACE
|
||||||
|
|
||||||
|
#endif // LLVM_LIBC_SRC_GPU_RPC_RECV_H
|
||||||
26
libc/src/gpu/rpc_send_n.cpp
Normal file
26
libc/src/gpu/rpc_send_n.cpp
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
//===---------- GPU implementation of the external RPC port interface -----===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "src/gpu/rpc_send_n.h"
|
||||||
|
|
||||||
|
#include "src/__support/GPU/utils.h"
|
||||||
|
#include "src/__support/RPC/rpc_client.h"
|
||||||
|
#include "src/__support/common.h"
|
||||||
|
|
||||||
|
namespace LIBC_NAMESPACE {
|
||||||
|
|
||||||
|
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
|
||||||
|
|
||||||
|
LLVM_LIBC_FUNCTION(void, rpc_send_n,
|
||||||
|
(rpc_port_t * handle, const void *src, size_t size)) {
|
||||||
|
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
|
||||||
|
port.send_n(src, size);
|
||||||
|
*handle = cpp::bit_cast<rpc_port_t>(port);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace LIBC_NAMESPACE
|
||||||
20
libc/src/gpu/rpc_send_n.h
Normal file
20
libc/src/gpu/rpc_send_n.h
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef LLVM_LIBC_SRC_GPU_RPC_SEND_H
|
||||||
|
#define LLVM_LIBC_SRC_GPU_RPC_SEND_H
|
||||||
|
|
||||||
|
#include <gpu/rpc.h>
|
||||||
|
|
||||||
|
namespace LIBC_NAMESPACE {
|
||||||
|
|
||||||
|
void rpc_send_n(rpc_port_t *handle, const void *src, size_t size);
|
||||||
|
|
||||||
|
} // namespace LIBC_NAMESPACE
|
||||||
|
|
||||||
|
#endif // LLVM_LIBC_SRC_GPU_RPC_SEND_H
|
||||||
@@ -44,6 +44,21 @@ add_integration_test(
|
|||||||
rpc_interface_test.cpp
|
rpc_interface_test.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
|
add_integration_test(
|
||||||
|
startup_rpc_external_interface_test
|
||||||
|
SUITE libc-startup-tests
|
||||||
|
SRCS
|
||||||
|
rpc_external_interface_test.cpp
|
||||||
|
DEPENDS
|
||||||
|
libc.src.gpu.rpc_open_port
|
||||||
|
libc.src.gpu.rpc_send_n
|
||||||
|
libc.src.gpu.rpc_recv_n
|
||||||
|
libc.src.gpu.rpc_close_port
|
||||||
|
LOADER_ARGS
|
||||||
|
--threads 32
|
||||||
|
--blocks 8
|
||||||
|
)
|
||||||
|
|
||||||
add_integration_test(
|
add_integration_test(
|
||||||
startup_rpc_stream_test
|
startup_rpc_stream_test
|
||||||
SUITE libc-startup-tests
|
SUITE libc-startup-tests
|
||||||
|
|||||||
@@ -0,0 +1,43 @@
|
|||||||
|
//===-- Loader test to check the external RPC interface with the loader ---===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include <gpu/rpc.h>
|
||||||
|
|
||||||
|
#include "src/gpu/rpc_close_port.h"
|
||||||
|
#include "src/gpu/rpc_open_port.h"
|
||||||
|
#include "src/gpu/rpc_recv_n.h"
|
||||||
|
#include "src/gpu/rpc_send_n.h"
|
||||||
|
|
||||||
|
#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
|
||||||
|
#include "src/__support/GPU/utils.h"
|
||||||
|
#include "src/__support/RPC/rpc_client.h"
|
||||||
|
#include "test/IntegrationTest/test.h"
|
||||||
|
|
||||||
|
using namespace LIBC_NAMESPACE;
|
||||||
|
|
||||||
|
static void test_interface() {
|
||||||
|
uint32_t num_additions =
|
||||||
|
10 + 10 * gpu::get_thread_id() + 10 * gpu::get_block_id();
|
||||||
|
uint64_t cnt = 0;
|
||||||
|
for (uint32_t i = 0; i < num_additions; ++i) {
|
||||||
|
size_t size = sizeof(uint64_t);
|
||||||
|
rpc_port_t port = LIBC_NAMESPACE::rpc_open_port(RPC_TEST_EXTERNAL);
|
||||||
|
LIBC_NAMESPACE::rpc_send_n(&port, &cnt, size);
|
||||||
|
LIBC_NAMESPACE::rpc_recv_n(&port, &cnt, &size);
|
||||||
|
LIBC_NAMESPACE::rpc_close_port(&port);
|
||||||
|
ASSERT_TRUE(size == sizeof(uint64_t));
|
||||||
|
}
|
||||||
|
|
||||||
|
ASSERT_TRUE(cnt == num_additions && "Invalid number of increments");
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_MAIN(int argc, char **argv, char **envp) {
|
||||||
|
test_interface();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
@@ -11,8 +11,8 @@
|
|||||||
|
|
||||||
#include "utils/gpu/server/llvmlibc_rpc_server.h"
|
#include "utils/gpu/server/llvmlibc_rpc_server.h"
|
||||||
|
|
||||||
#include "llvm-libc-types/rpc_opcodes_t.h"
|
|
||||||
#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
|
#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
|
||||||
|
#include "llvm-libc-types/rpc_opcodes_t.h"
|
||||||
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
@@ -222,6 +222,28 @@ inline void register_rpc_callbacks(uint32_t device_id) {
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
nullptr);
|
nullptr);
|
||||||
|
|
||||||
|
// Register the stream test handler.
|
||||||
|
rpc_register_callback(
|
||||||
|
device_id, static_cast<rpc_opcode_t>(RPC_TEST_EXTERNAL),
|
||||||
|
[](rpc_port_t port, void *data) {
|
||||||
|
uint64_t sizes[lane_size] = {0};
|
||||||
|
void *dst[lane_size] = {nullptr};
|
||||||
|
rpc_recv_n(
|
||||||
|
port, dst, sizes,
|
||||||
|
[](uint64_t size, void *) -> void * { return new char[size]; },
|
||||||
|
nullptr);
|
||||||
|
for (uint64_t i = 0; i < lane_size; ++i) {
|
||||||
|
if (dst[i])
|
||||||
|
*reinterpret_cast<uint64_t *>(dst[i]) += 1;
|
||||||
|
}
|
||||||
|
rpc_send_n(port, dst, sizes);
|
||||||
|
for (uint64_t i = 0; i < lane_size; ++i) {
|
||||||
|
if (dst[i])
|
||||||
|
delete[] reinterpret_cast<uint8_t *>(dst[i]);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -25,6 +25,19 @@ enum OMPTgtExecModeFlags : unsigned char {
|
|||||||
OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
|
OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum OMPTgtHostRPCArgType {
|
||||||
|
// No need to copy.
|
||||||
|
OMP_HOST_RPC_ARG_SCALAR = 0,
|
||||||
|
OMP_HOST_RPC_ARG_PTR = 1,
|
||||||
|
// Copy to device.
|
||||||
|
OMP_HOST_RPC_ARG_COPY_TO = OMP_HOST_RPC_ARG_PTR | (1 << 1),
|
||||||
|
// Copy to device.
|
||||||
|
OMP_HOST_RPC_ARG_COPY_FROM = OMP_HOST_RPC_ARG_PTR | (1 << 2),
|
||||||
|
// Copy to and from device.
|
||||||
|
OMP_HOST_RPC_ARG_COPY_TOFROM =
|
||||||
|
OMP_HOST_RPC_ARG_COPY_TO | OMP_HOST_RPC_ARG_COPY_FROM,
|
||||||
|
};
|
||||||
|
|
||||||
} // end namespace omp
|
} // end namespace omp
|
||||||
} // end namespace llvm
|
} // end namespace llvm
|
||||||
|
|
||||||
|
|||||||
@@ -227,7 +227,9 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
|
|||||||
__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
|
__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
|
||||||
|
|
||||||
__OMP_RTL(omp_get_thread_num, false, Int32, )
|
__OMP_RTL(omp_get_thread_num, false, Int32, )
|
||||||
|
__OMP_RTL(omp_get_bulk_thread_num, false, Int32, )
|
||||||
__OMP_RTL(omp_get_num_threads, false, Int32, )
|
__OMP_RTL(omp_get_num_threads, false, Int32, )
|
||||||
|
__OMP_RTL(omp_get_bulk_num_threads, false, Int32, )
|
||||||
__OMP_RTL(omp_get_max_threads, false, Int32, )
|
__OMP_RTL(omp_get_max_threads, false, Int32, )
|
||||||
__OMP_RTL(omp_in_parallel, false, Int32, )
|
__OMP_RTL(omp_in_parallel, false, Int32, )
|
||||||
__OMP_RTL(omp_get_dynamic, false, Int32, )
|
__OMP_RTL(omp_get_dynamic, false, Int32, )
|
||||||
@@ -490,6 +492,8 @@ __OMP_RTL(__kmpc_reduction_get_fixed_buffer, false, VoidPtr, )
|
|||||||
|
|
||||||
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
|
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
|
||||||
|
|
||||||
|
__OMP_RTL(malloc, false, VoidPtr, SizeTy)
|
||||||
|
__OMP_RTL(free, false, Void, VoidPtr)
|
||||||
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
|
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
|
||||||
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
|
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
|
||||||
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
|
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
|
||||||
@@ -503,6 +507,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
|
|||||||
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
|
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
|
||||||
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
|
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
|
||||||
|
|
||||||
|
__OMP_RTL(__kmpc_launch_parallel_51_kernel, false, Void, Int8Ptr, Int32, Int32,
|
||||||
|
Int32, VoidPtrPtr, Int64)
|
||||||
|
|
||||||
__OMP_RTL(__last, false, Void, )
|
__OMP_RTL(__last, false, Void, )
|
||||||
|
|
||||||
#undef __OMP_RTL
|
#undef __OMP_RTL
|
||||||
@@ -710,6 +717,8 @@ __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
|
|||||||
|
|
||||||
__OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
|
__OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
|
||||||
__OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
|
__OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
|
||||||
|
__OMP_RTL_ATTRS(omp_get_bulk_thread_num, GetterAttrs, SExt, ParamAttrs())
|
||||||
|
__OMP_RTL_ATTRS(omp_get_bulk_num_threads, GetterAttrs, SExt, ParamAttrs())
|
||||||
__OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
|
__OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
|
||||||
__OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
|
__OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
|
||||||
__OMP_RTL_ATTRS(omp_get_dynamic, GetterAttrs, SExt, ParamAttrs())
|
__OMP_RTL_ATTRS(omp_get_dynamic, GetterAttrs, SExt, ParamAttrs())
|
||||||
|
|||||||
@@ -60,6 +60,9 @@ struct Config {
|
|||||||
bool VerifyEach = false;
|
bool VerifyEach = false;
|
||||||
bool DisableVerify = false;
|
bool DisableVerify = false;
|
||||||
|
|
||||||
|
/// Use the standard optimization pipeline.
|
||||||
|
bool UseDefaultPipeline = false;
|
||||||
|
|
||||||
/// Flag to indicate that the optimizer should not assume builtins are present
|
/// Flag to indicate that the optimizer should not assume builtins are present
|
||||||
/// on the target.
|
/// on the target.
|
||||||
bool Freestanding = false;
|
bool Freestanding = false;
|
||||||
|
|||||||
@@ -303,6 +303,9 @@ public:
|
|||||||
/// by LTO but might not be visible from bitcode symbol table.
|
/// by LTO but might not be visible from bitcode symbol table.
|
||||||
static ArrayRef<const char*> getRuntimeLibcallSymbols();
|
static ArrayRef<const char*> getRuntimeLibcallSymbols();
|
||||||
|
|
||||||
|
/// Returns the context.
|
||||||
|
LLVMContext &getContext() { return RegularLTO.Ctx; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Config Conf;
|
Config Conf;
|
||||||
|
|
||||||
|
|||||||
31
llvm/include/llvm/Transforms/IPO/HostRPC.h
Normal file
31
llvm/include/llvm/Transforms/IPO/HostRPC.h
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
//===- Transform/IPO/HostRPC.h - Code of automatic host rpc -----*- C++ -*-===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef LLVM_TRANSFORMS_IPO_HOSTRPC_H
|
||||||
|
#define LLVM_TRANSFORMS_IPO_HOSTRPC_H
|
||||||
|
|
||||||
|
#include "llvm/Analysis/CGSCCPassManager.h"
|
||||||
|
#include "llvm/IR/PassManager.h"
|
||||||
|
|
||||||
|
namespace llvm {
|
||||||
|
class HostRPCPass : public PassInfoMixin<HostRPCPass> {
|
||||||
|
public:
|
||||||
|
HostRPCPass() : LTOPhase(ThinOrFullLTOPhase::None) {}
|
||||||
|
HostRPCPass(ThinOrFullLTOPhase LTOPhase) : LTOPhase(LTOPhase) {}
|
||||||
|
|
||||||
|
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
|
||||||
|
|
||||||
|
private:
|
||||||
|
const ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None;
|
||||||
|
};
|
||||||
|
} // namespace llvm
|
||||||
|
|
||||||
|
#endif // LLVM_TRANSFORMS_IPO_HOSTRPC_H
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
//===--------------- CanonicalizeMainFunction.h -----------------*- C++ -*-===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
// Utility function to canonicalize main function.
|
||||||
|
// The canonical main function is defined as: int main(int argc, char *argv[]);
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
|
||||||
|
#define LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
|
||||||
|
|
||||||
|
#include "llvm/IR/PassManager.h"
|
||||||
|
|
||||||
|
namespace llvm {
|
||||||
|
|
||||||
|
/// A pass that canonicalizes main function in a module.
|
||||||
|
class CanonicalizeMainFunctionPass
|
||||||
|
: public PassInfoMixin<CanonicalizeMainFunctionPass> {
|
||||||
|
public:
|
||||||
|
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
|
||||||
|
};
|
||||||
|
|
||||||
|
} // end namespace llvm
|
||||||
|
|
||||||
|
#endif // LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
|
||||||
@@ -330,6 +330,8 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
|
|||||||
report_fatal_error(Twine("unable to parse pass pipeline description '") +
|
report_fatal_error(Twine("unable to parse pass pipeline description '") +
|
||||||
Conf.OptPipeline + "': " + toString(std::move(Err)));
|
Conf.OptPipeline + "': " + toString(std::move(Err)));
|
||||||
}
|
}
|
||||||
|
} else if (Conf.UseDefaultPipeline) {
|
||||||
|
MPM.addPass(PB.buildPerModuleDefaultPipeline(OL));
|
||||||
} else if (IsThinLTO) {
|
} else if (IsThinLTO) {
|
||||||
MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary));
|
MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary));
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -141,6 +141,7 @@
|
|||||||
#include "llvm/Transforms/IPO/GlobalDCE.h"
|
#include "llvm/Transforms/IPO/GlobalDCE.h"
|
||||||
#include "llvm/Transforms/IPO/GlobalOpt.h"
|
#include "llvm/Transforms/IPO/GlobalOpt.h"
|
||||||
#include "llvm/Transforms/IPO/GlobalSplit.h"
|
#include "llvm/Transforms/IPO/GlobalSplit.h"
|
||||||
|
#include "llvm/Transforms/IPO/HostRPC.h"
|
||||||
#include "llvm/Transforms/IPO/HotColdSplitting.h"
|
#include "llvm/Transforms/IPO/HotColdSplitting.h"
|
||||||
#include "llvm/Transforms/IPO/IROutliner.h"
|
#include "llvm/Transforms/IPO/IROutliner.h"
|
||||||
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
|
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
|
||||||
@@ -264,6 +265,7 @@
|
|||||||
#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
|
#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
|
||||||
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
|
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
|
||||||
#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
|
#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
|
||||||
|
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
|
||||||
#include "llvm/Transforms/Utils/CountVisits.h"
|
#include "llvm/Transforms/Utils/CountVisits.h"
|
||||||
#include "llvm/Transforms/Utils/DXILUpgrade.h"
|
#include "llvm/Transforms/Utils/DXILUpgrade.h"
|
||||||
#include "llvm/Transforms/Utils/Debugify.h"
|
#include "llvm/Transforms/Utils/Debugify.h"
|
||||||
|
|||||||
@@ -53,6 +53,7 @@
|
|||||||
#include "llvm/Transforms/IPO/GlobalDCE.h"
|
#include "llvm/Transforms/IPO/GlobalDCE.h"
|
||||||
#include "llvm/Transforms/IPO/GlobalOpt.h"
|
#include "llvm/Transforms/IPO/GlobalOpt.h"
|
||||||
#include "llvm/Transforms/IPO/GlobalSplit.h"
|
#include "llvm/Transforms/IPO/GlobalSplit.h"
|
||||||
|
#include "llvm/Transforms/IPO/HostRPC.h"
|
||||||
#include "llvm/Transforms/IPO/HotColdSplitting.h"
|
#include "llvm/Transforms/IPO/HotColdSplitting.h"
|
||||||
#include "llvm/Transforms/IPO/IROutliner.h"
|
#include "llvm/Transforms/IPO/IROutliner.h"
|
||||||
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
|
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
|
||||||
@@ -126,6 +127,7 @@
|
|||||||
#include "llvm/Transforms/Utils/AddDiscriminators.h"
|
#include "llvm/Transforms/Utils/AddDiscriminators.h"
|
||||||
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
|
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
|
||||||
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
|
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
|
||||||
|
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
|
||||||
#include "llvm/Transforms/Utils/CountVisits.h"
|
#include "llvm/Transforms/Utils/CountVisits.h"
|
||||||
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
|
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
|
||||||
#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
|
#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
|
||||||
@@ -182,8 +184,8 @@ static cl::opt<bool> EnablePostPGOLoopRotation(
|
|||||||
"enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden,
|
"enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden,
|
||||||
cl::desc("Run the loop rotation transformation after PGO instrumentation"));
|
cl::desc("Run the loop rotation transformation after PGO instrumentation"));
|
||||||
|
|
||||||
static cl::opt<bool> EnableGlobalAnalyses(
|
static cl::opt<bool>
|
||||||
"enable-global-analyses", cl::init(true), cl::Hidden,
|
EnableGlobalAnalyses("enable-global-analyses", cl::init(true), cl::Hidden,
|
||||||
cl::desc("Enable inter-procedural analyses"));
|
cl::desc("Enable inter-procedural analyses"));
|
||||||
|
|
||||||
static cl::opt<bool>
|
static cl::opt<bool>
|
||||||
@@ -306,6 +308,13 @@ cl::opt<bool> EnableMemProfContextDisambiguation(
|
|||||||
extern cl::opt<bool> EnableInferAlignmentPass;
|
extern cl::opt<bool> EnableInferAlignmentPass;
|
||||||
} // namespace llvm
|
} // namespace llvm
|
||||||
|
|
||||||
|
static cl::opt<bool> EnableHostRPC("enable-host-rpc", cl::init(false),
|
||||||
|
cl::Hidden, cl::desc("Enable HostRPC pass"));
|
||||||
|
|
||||||
|
static cl::opt<bool> EnableCanonicalizeMainFunction(
|
||||||
|
"enable-canonicalize-main-function", cl::init(false), cl::Hidden,
|
||||||
|
cl::desc("Enable CanonicalizeMainFunction pass"));
|
||||||
|
|
||||||
PipelineTuningOptions::PipelineTuningOptions() {
|
PipelineTuningOptions::PipelineTuningOptions() {
|
||||||
LoopInterleaving = true;
|
LoopInterleaving = true;
|
||||||
LoopVectorization = true;
|
LoopVectorization = true;
|
||||||
@@ -1091,6 +1100,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
|
|||||||
PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
|
PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (EnableCanonicalizeMainFunction)
|
||||||
|
MPM.addPass(CanonicalizeMainFunctionPass());
|
||||||
|
|
||||||
// Try to perform OpenMP specific optimizations on the module. This is a
|
// Try to perform OpenMP specific optimizations on the module. This is a
|
||||||
// (quick!) no-op if there are no OpenMP runtime calls present in the module.
|
// (quick!) no-op if there are no OpenMP runtime calls present in the module.
|
||||||
MPM.addPass(OpenMPOptPass());
|
MPM.addPass(OpenMPOptPass());
|
||||||
@@ -1110,8 +1122,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
|
|||||||
// and prior to optimizing globals.
|
// and prior to optimizing globals.
|
||||||
// FIXME: This position in the pipeline hasn't been carefully considered in
|
// FIXME: This position in the pipeline hasn't been carefully considered in
|
||||||
// years, it should be re-analyzed.
|
// years, it should be re-analyzed.
|
||||||
MPM.addPass(IPSCCPPass(
|
MPM.addPass(IPSCCPPass(IPSCCPOptions(/*AllowFuncSpec=*/
|
||||||
IPSCCPOptions(/*AllowFuncSpec=*/
|
|
||||||
Level != OptimizationLevel::Os &&
|
Level != OptimizationLevel::Os &&
|
||||||
Level != OptimizationLevel::Oz &&
|
Level != OptimizationLevel::Oz &&
|
||||||
!isLTOPreLink(Phase))));
|
!isLTOPreLink(Phase))));
|
||||||
@@ -1588,7 +1599,7 @@ PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO,
|
|||||||
ModulePassManager
|
ModulePassManager
|
||||||
PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
|
PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
|
||||||
if (Level == OptimizationLevel::O0)
|
if (Level == OptimizationLevel::O0)
|
||||||
return buildO0DefaultPipeline(Level, /*LTOPreLink*/true);
|
return buildO0DefaultPipeline(Level, /*LTOPreLink*/ true);
|
||||||
|
|
||||||
ModulePassManager MPM;
|
ModulePassManager MPM;
|
||||||
|
|
||||||
@@ -1711,6 +1722,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
|
|||||||
// in the current module.
|
// in the current module.
|
||||||
MPM.addPass(CrossDSOCFIPass());
|
MPM.addPass(CrossDSOCFIPass());
|
||||||
|
|
||||||
|
if (EnableCanonicalizeMainFunction)
|
||||||
|
MPM.addPass(CanonicalizeMainFunctionPass());
|
||||||
|
|
||||||
if (Level == OptimizationLevel::O0) {
|
if (Level == OptimizationLevel::O0) {
|
||||||
// The WPD and LowerTypeTest passes need to run at -O0 to lower type
|
// The WPD and LowerTypeTest passes need to run at -O0 to lower type
|
||||||
// metadata and intrinsics.
|
// metadata and intrinsics.
|
||||||
@@ -1741,6 +1755,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
|
|||||||
// Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
|
// Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
|
||||||
MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));
|
MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));
|
||||||
|
|
||||||
|
if (EnableHostRPC)
|
||||||
|
MPM.addPass(HostRPCPass(ThinOrFullLTOPhase::FullLTOPostLink));
|
||||||
|
|
||||||
// Remove unused virtual tables to improve the quality of code generated by
|
// Remove unused virtual tables to improve the quality of code generated by
|
||||||
// whole-program devirtualization and bitset lowering.
|
// whole-program devirtualization and bitset lowering.
|
||||||
MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));
|
MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));
|
||||||
@@ -2115,6 +2132,9 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
|
|||||||
|
|
||||||
MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
|
MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
|
||||||
|
|
||||||
|
if (EnableCanonicalizeMainFunction)
|
||||||
|
MPM.addPass(CanonicalizeMainFunctionPass());
|
||||||
|
|
||||||
return MPM;
|
return MPM;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ MODULE_PASS("attributor", AttributorPass())
|
|||||||
MODULE_PASS("attributor-light", AttributorLightPass())
|
MODULE_PASS("attributor-light", AttributorLightPass())
|
||||||
MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
|
MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
|
||||||
MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass())
|
MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass())
|
||||||
|
MODULE_PASS("canonicalize-main-function", CanonicalizeMainFunctionPass())
|
||||||
MODULE_PASS("check-debugify", NewPMCheckDebugifyPass())
|
MODULE_PASS("check-debugify", NewPMCheckDebugifyPass())
|
||||||
MODULE_PASS("constmerge", ConstantMergePass())
|
MODULE_PASS("constmerge", ConstantMergePass())
|
||||||
MODULE_PASS("coro-cleanup", CoroCleanupPass())
|
MODULE_PASS("coro-cleanup", CoroCleanupPass())
|
||||||
@@ -141,6 +142,7 @@ MODULE_PASS("tsan-module", ModuleThreadSanitizerPass())
|
|||||||
MODULE_PASS("verify", VerifierPass())
|
MODULE_PASS("verify", VerifierPass())
|
||||||
MODULE_PASS("view-callgraph", CallGraphViewerPass())
|
MODULE_PASS("view-callgraph", CallGraphViewerPass())
|
||||||
MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
|
MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
|
||||||
|
MODULE_PASS("host-rpc", HostRPCPass())
|
||||||
#undef MODULE_PASS
|
#undef MODULE_PASS
|
||||||
|
|
||||||
#ifndef MODULE_PASS_WITH_PARAMS
|
#ifndef MODULE_PASS_WITH_PARAMS
|
||||||
|
|||||||
@@ -397,13 +397,26 @@ static bool getPotentialCopiesOfMemoryValue(
|
|||||||
dbgs() << "Underlying object is a valid nullptr, giving up.\n";);
|
dbgs() << "Underlying object is a valid nullptr, giving up.\n";);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// TODO: Use assumed noalias return.
|
if (!isa<AllocaInst>(&Obj) && !isa<GlobalVariable>(&Obj)) {
|
||||||
if (!isa<AllocaInst>(&Obj) && !isa<GlobalVariable>(&Obj) &&
|
auto *CB = dyn_cast<CallBase>(&Obj);
|
||||||
!(IsLoad ? isAllocationFn(&Obj, TLI) : isNoAliasCall(&Obj))) {
|
bool R = false;
|
||||||
|
if (CB) {
|
||||||
|
auto &AAN =
|
||||||
|
*A.getOrCreateAAFor<AANoAlias>(IRPosition::callsite_returned(*CB),
|
||||||
|
&QueryingAA, DepClassTy::OPTIONAL);
|
||||||
|
if (AAN.isValidState())
|
||||||
|
R = AAN.isAssumedNoAlias();
|
||||||
|
else
|
||||||
|
R = isNoAliasCall(&Obj);
|
||||||
|
if (!R)
|
||||||
|
R = isAllocationFn(&Obj, TLI);
|
||||||
|
}
|
||||||
|
if (!R) {
|
||||||
LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << Obj
|
LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << Obj
|
||||||
<< "\n";);
|
<< "\n";);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
if (auto *GV = dyn_cast<GlobalVariable>(&Obj))
|
if (auto *GV = dyn_cast<GlobalVariable>(&Obj))
|
||||||
if (!GV->hasLocalLinkage() &&
|
if (!GV->hasLocalLinkage() &&
|
||||||
!(GV->isConstant() && GV->hasInitializer())) {
|
!(GV->isConstant() && GV->hasInitializer())) {
|
||||||
@@ -2226,7 +2239,8 @@ void Attributor::runTillFixpoint() {
|
|||||||
|
|
||||||
LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: "
|
LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: "
|
||||||
<< IterationCounter << "/" << MaxIterations
|
<< IterationCounter << "/" << MaxIterations
|
||||||
<< " iterations\n");
|
<< " iterations. #Changed AAs: " << ChangedAAs.size()
|
||||||
|
<< "\n");
|
||||||
|
|
||||||
// Reset abstract arguments not settled in a sound fixpoint by now. This
|
// Reset abstract arguments not settled in a sound fixpoint by now. This
|
||||||
// happens when we stopped the fixpoint iteration early. Note that only the
|
// happens when we stopped the fixpoint iteration early. Note that only the
|
||||||
@@ -2241,13 +2255,21 @@ void Attributor::runTillFixpoint() {
|
|||||||
|
|
||||||
AbstractState &State = ChangedAA->getState();
|
AbstractState &State = ChangedAA->getState();
|
||||||
if (!State.isAtFixpoint()) {
|
if (!State.isAtFixpoint()) {
|
||||||
|
LLVM_DEBUG(
|
||||||
|
dbgs() << "\n[Attributor] Invalidate AA " << *ChangedAA
|
||||||
|
<< " as it is not at fix point after max iterations.\n");
|
||||||
|
|
||||||
State.indicatePessimisticFixpoint();
|
State.indicatePessimisticFixpoint();
|
||||||
|
|
||||||
NumAttributesTimedOut++;
|
NumAttributesTimedOut++;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto &DepIt : ChangedAA->Deps)
|
for (auto &DepIt : ChangedAA->Deps) {
|
||||||
|
LLVM_DEBUG(dbgs() << "\n[Attributor] Invalidate dependent AA "
|
||||||
|
<< *(DepIt.getPointer()) << " of AA " << *ChangedAA
|
||||||
|
<< "\n");
|
||||||
ChangedAAs.push_back(cast<AbstractAttribute>(DepIt.getPointer()));
|
ChangedAAs.push_back(cast<AbstractAttribute>(DepIt.getPointer()));
|
||||||
|
}
|
||||||
ChangedAA->Deps.clear();
|
ChangedAA->Deps.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1669,6 +1669,8 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
|
|||||||
// might change while we iterate through a loop. For now, we give up if
|
// might change while we iterate through a loop. For now, we give up if
|
||||||
// the PHI is not invariant.
|
// the PHI is not invariant.
|
||||||
if (isa<PHINode>(Usr)) {
|
if (isa<PHINode>(Usr)) {
|
||||||
|
if (!Usr->getType()->isPointerTy())
|
||||||
|
return false;
|
||||||
// Note the order here, the Usr access might change the map, CurPtr is
|
// Note the order here, the Usr access might change the map, CurPtr is
|
||||||
// already in it though.
|
// already in it though.
|
||||||
bool IsFirstPHIUser = !OffsetInfoMap.count(Usr);
|
bool IsFirstPHIUser = !OffsetInfoMap.count(Usr);
|
||||||
@@ -1899,6 +1901,31 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (auto *II = dyn_cast<ICmpInst>(Usr)) {
|
||||||
|
auto CheckIfUsedAsPred = [&](const Use &U, bool &Follow) {
|
||||||
|
const auto *UU = U.getUser();
|
||||||
|
if (isa<SelectInst>(UU))
|
||||||
|
return true;
|
||||||
|
if (isa<BranchInst>(UU))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
LLVM_DEBUG(dbgs() << "[AAPointerInfo] ICmpInst user not handled " << *UU
|
||||||
|
<< "\n");
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
if (!A.checkForAllUses(CheckIfUsedAsPred, *this, *II,
|
||||||
|
/* CheckBBLivenessOnly */ true,
|
||||||
|
DepClassTy::OPTIONAL,
|
||||||
|
/* IgnoreDroppableUses */ true)) {
|
||||||
|
LLVM_DEBUG(
|
||||||
|
dbgs() << "[AAPointerInfo] Check for all uses failed for ICmpInst "
|
||||||
|
<< *II << "\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
LLVM_DEBUG(dbgs() << "[AAPointerInfo] User not handled " << *Usr << "\n");
|
LLVM_DEBUG(dbgs() << "[AAPointerInfo] User not handled " << *Usr << "\n");
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
@@ -2040,6 +2067,7 @@ struct AAPointerInfoCallSiteReturned final : AAPointerInfoFloating {
|
|||||||
};
|
};
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
|
||||||
/// -----------------------NoUnwind Function Attribute--------------------------
|
/// -----------------------NoUnwind Function Attribute--------------------------
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
@@ -11899,10 +11927,35 @@ struct AAUnderlyingObjectsImpl
|
|||||||
/// See AbstractAttribute::trackStatistics()
|
/// See AbstractAttribute::trackStatistics()
|
||||||
void trackStatistics() const override {}
|
void trackStatistics() const override {}
|
||||||
|
|
||||||
|
// TODO: This is a temporary solution for terminals.
|
||||||
|
bool checkIfTerminals(Attributor &A, Value *V) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(V);
|
||||||
|
if (!CI)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
Function *Scope = CI->getFunction();
|
||||||
|
const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*Scope);
|
||||||
|
LibFunc TLIFn;
|
||||||
|
if (TLI && TLI->getLibFunc(*CI, TLIFn)) {
|
||||||
|
if (TLIFn == LibFunc::LibFunc_malloc ||
|
||||||
|
TLIFn == LibFunc::LibFunc___kmpc_alloc_shared)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/// See AbstractAttribute::updateImpl(...).
|
/// See AbstractAttribute::updateImpl(...).
|
||||||
ChangeStatus updateImpl(Attributor &A) override {
|
ChangeStatus updateImpl(Attributor &A) override {
|
||||||
auto &Ptr = getAssociatedValue();
|
auto &Ptr = getAssociatedValue();
|
||||||
|
|
||||||
|
if (checkIfTerminals(A, &Ptr)) {
|
||||||
|
bool Changed = false;
|
||||||
|
Changed |= InterAssumedUnderlyingObjects.insert(&Ptr);
|
||||||
|
Changed |= IntraAssumedUnderlyingObjects.insert(&Ptr);
|
||||||
|
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
|
||||||
|
}
|
||||||
|
|
||||||
auto DoUpdate = [&](SmallSetVector<Value *, 8> &UnderlyingObjects,
|
auto DoUpdate = [&](SmallSetVector<Value *, 8> &UnderlyingObjects,
|
||||||
AA::ValueScope Scope) {
|
AA::ValueScope Scope) {
|
||||||
bool UsedAssumedInformation = false;
|
bool UsedAssumedInformation = false;
|
||||||
@@ -11918,6 +11971,10 @@ struct AAUnderlyingObjectsImpl
|
|||||||
for (unsigned I = 0; I < Values.size(); ++I) {
|
for (unsigned I = 0; I < Values.size(); ++I) {
|
||||||
auto &VAC = Values[I];
|
auto &VAC = Values[I];
|
||||||
auto *Obj = VAC.getValue();
|
auto *Obj = VAC.getValue();
|
||||||
|
if (checkIfTerminals(A, Obj)) {
|
||||||
|
Changed |= UnderlyingObjects.insert(Obj);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
Value *UO = getUnderlyingObject(Obj);
|
Value *UO = getUnderlyingObject(Obj);
|
||||||
if (UO && UO != VAC.getValue() && SeenObjects.insert(UO).second) {
|
if (UO && UO != VAC.getValue() && SeenObjects.insert(UO).second) {
|
||||||
const auto *OtherAA = A.getAAFor<AAUnderlyingObjects>(
|
const auto *OtherAA = A.getAAFor<AAUnderlyingObjects>(
|
||||||
@@ -11948,6 +12005,16 @@ struct AAUnderlyingObjectsImpl
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (auto *LI = dyn_cast<LoadInst>(Obj)) {
|
||||||
|
LLVM_DEBUG({
|
||||||
|
dbgs() << "[AAUnderlyingObjects] for CtxI ";
|
||||||
|
getCtxI()->print(dbgs());
|
||||||
|
dbgs() << " at position " << getIRPosition() << " has LoadInst ";
|
||||||
|
LI->print(dbgs());
|
||||||
|
dbgs() << '\n';
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
Changed |= UnderlyingObjects.insert(Obj);
|
Changed |= UnderlyingObjects.insert(Obj);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -11958,6 +12025,8 @@ struct AAUnderlyingObjectsImpl
|
|||||||
Changed |= DoUpdate(IntraAssumedUnderlyingObjects, AA::Intraprocedural);
|
Changed |= DoUpdate(IntraAssumedUnderlyingObjects, AA::Intraprocedural);
|
||||||
Changed |= DoUpdate(InterAssumedUnderlyingObjects, AA::Interprocedural);
|
Changed |= DoUpdate(InterAssumedUnderlyingObjects, AA::Interprocedural);
|
||||||
|
|
||||||
|
LLVM_DEBUG(dumpState(dbgs()));
|
||||||
|
|
||||||
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
|
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -11996,6 +12065,19 @@ private:
|
|||||||
return Changed;
|
return Changed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void dumpState(raw_ostream &O) {
|
||||||
|
O << "Underlying objects:\nintra procedureal:\n";
|
||||||
|
for (auto *Obj : IntraAssumedUnderlyingObjects) {
|
||||||
|
Obj->print(O);
|
||||||
|
O << '\n';
|
||||||
|
}
|
||||||
|
O << "inter procedureal:\n";
|
||||||
|
for (auto *Obj : InterAssumedUnderlyingObjects) {
|
||||||
|
Obj->print(O);
|
||||||
|
O << '\n';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// All the underlying objects collected so far via intra procedural scope.
|
/// All the underlying objects collected so far via intra procedural scope.
|
||||||
SmallSetVector<Value *, 8> IntraAssumedUnderlyingObjects;
|
SmallSetVector<Value *, 8> IntraAssumedUnderlyingObjects;
|
||||||
/// All the underlying objects collected so far via inter procedural scope.
|
/// All the underlying objects collected so far via inter procedural scope.
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ add_llvm_component_library(LLVMipo
|
|||||||
GlobalDCE.cpp
|
GlobalDCE.cpp
|
||||||
GlobalOpt.cpp
|
GlobalOpt.cpp
|
||||||
GlobalSplit.cpp
|
GlobalSplit.cpp
|
||||||
|
HostRPC.cpp
|
||||||
HotColdSplitting.cpp
|
HotColdSplitting.cpp
|
||||||
IPO.cpp
|
IPO.cpp
|
||||||
IROutliner.cpp
|
IROutliner.cpp
|
||||||
|
|||||||
890
llvm/lib/Transforms/IPO/HostRPC.cpp
Normal file
890
llvm/lib/Transforms/IPO/HostRPC.cpp
Normal file
@@ -0,0 +1,890 @@
|
|||||||
|
//===- Transform/IPO/HostRPC.h - Code of automatic host rpc -----*- C++ -*-===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "llvm/Transforms/IPO/HostRPC.h"
|
||||||
|
|
||||||
|
#include "llvm/ADT/EnumeratedArray.h"
|
||||||
|
#include "llvm/CodeGen/CommandFlags.h"
|
||||||
|
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
|
||||||
|
#include "llvm/IR/BasicBlock.h"
|
||||||
|
#include "llvm/IR/Constant.h"
|
||||||
|
#include "llvm/IR/Constants.h"
|
||||||
|
#include "llvm/IR/IRBuilder.h"
|
||||||
|
#include "llvm/IR/InstrTypes.h"
|
||||||
|
#include "llvm/IR/Instructions.h"
|
||||||
|
#include "llvm/IR/LLVMContext.h"
|
||||||
|
#include "llvm/IR/Module.h"
|
||||||
|
#include "llvm/IRReader/IRReader.h"
|
||||||
|
#include "llvm/Support/Casting.h"
|
||||||
|
#include "llvm/Support/Debug.h"
|
||||||
|
#include "llvm/Support/ErrorHandling.h"
|
||||||
|
#include "llvm/Support/SourceMgr.h"
|
||||||
|
#include "llvm/Support/TargetSelect.h"
|
||||||
|
#include "llvm/Target/TargetOptions.h"
|
||||||
|
#include "llvm/Transforms/IPO/Attributor.h"
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
|
||||||
|
#define DEBUG_TYPE "host-rpc"
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
|
||||||
|
|
||||||
|
static cl::opt<bool>
|
||||||
|
UseDummyHostModule("host-rpc-use-dummy-host-module", cl::init(false),
|
||||||
|
cl::Hidden,
|
||||||
|
cl::desc("Use dummy host module if there no host module "
|
||||||
|
"attached to the device module"));
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
enum class HostRPCRuntimeFunction {
|
||||||
|
#define __OMPRTL_HOST_RPC(_ENUM) OMPRTL_##_ENUM
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc),
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg),
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg),
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait),
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val),
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper),
|
||||||
|
__OMPRTL_HOST_RPC(__last),
|
||||||
|
#undef __OMPRTL_HOST_RPC
|
||||||
|
};
|
||||||
|
|
||||||
|
#define __OMPRTL_HOST_RPC(_ENUM) \
|
||||||
|
auto OMPRTL_##_ENUM = HostRPCRuntimeFunction::OMPRTL_##_ENUM;
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc)
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg)
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg)
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait)
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val)
|
||||||
|
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper)
|
||||||
|
#undef __OMPRTL_HOST_RPC
|
||||||
|
|
||||||
|
// TODO: Remove those functions implemented in device runtime.
|
||||||
|
static constexpr const char *InternalPrefix[] = {
|
||||||
|
"__kmp", "llvm.", "nvm.",
|
||||||
|
"omp_", "vprintf", "malloc",
|
||||||
|
"free", "__keep_alive", "__llvm_omp_vprintf"};
|
||||||
|
|
||||||
|
bool isInternalFunction(Function &F) {
|
||||||
|
auto Name = F.getName();
|
||||||
|
|
||||||
|
for (auto *P : InternalPrefix)
|
||||||
|
if (Name.starts_with(P))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string typeToString(Type *T) {
|
||||||
|
if (T->is16bitFPTy())
|
||||||
|
return "f16";
|
||||||
|
if (T->isFloatTy())
|
||||||
|
return "f32";
|
||||||
|
if (T->isDoubleTy())
|
||||||
|
return "f64";
|
||||||
|
if (T->isPointerTy())
|
||||||
|
return "ptr";
|
||||||
|
if (T->isStructTy())
|
||||||
|
return std::string(T->getStructName());
|
||||||
|
if (T->isIntegerTy())
|
||||||
|
return "i" + std::to_string(T->getIntegerBitWidth());
|
||||||
|
|
||||||
|
LLVM_DEBUG(dbgs() << "[HostRPC] unknown type " << *T
|
||||||
|
<< " for typeToString.\n";);
|
||||||
|
|
||||||
|
llvm_unreachable("unknown type");
|
||||||
|
}
|
||||||
|
|
||||||
|
class HostRPC {
|
||||||
|
/// LLVM context instance
|
||||||
|
LLVMContext &Context;
|
||||||
|
|
||||||
|
/// Device module.
|
||||||
|
Module &M;
|
||||||
|
/// Host module
|
||||||
|
Module &HM;
|
||||||
|
|
||||||
|
/// Data layout of the device module.
|
||||||
|
DataLayout DL;
|
||||||
|
|
||||||
|
IRBuilder<> Builder;
|
||||||
|
|
||||||
|
/// External functions we are operating on.
|
||||||
|
SmallSetVector<Function *, 8> FunctionWorkList;
|
||||||
|
|
||||||
|
/// Attributor instance.
|
||||||
|
Attributor &A;
|
||||||
|
|
||||||
|
// Types
|
||||||
|
Type *Int8PtrTy;
|
||||||
|
Type *VoidTy;
|
||||||
|
Type *Int32Ty;
|
||||||
|
Type *Int64Ty;
|
||||||
|
StructType *ArgInfoTy;
|
||||||
|
// Values
|
||||||
|
Constant *NullPtr;
|
||||||
|
Constant *NullInt64;
|
||||||
|
|
||||||
|
struct CallSiteInfo {
|
||||||
|
CallInst *CI = nullptr;
|
||||||
|
SmallVector<Type *> Params;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct HostRPCArgInfo {
|
||||||
|
Value *BasePtr = nullptr;
|
||||||
|
Constant *Type = nullptr;
|
||||||
|
Value *Size = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
///
|
||||||
|
SmallVector<Function *> HostEntryTable;
|
||||||
|
|
||||||
|
EnumeratedArray<Function *, HostRPCRuntimeFunction,
|
||||||
|
HostRPCRuntimeFunction::OMPRTL___last>
|
||||||
|
RFIs;
|
||||||
|
|
||||||
|
SmallVector<std::pair<CallInst *, CallInst *>> CallInstMap;
|
||||||
|
|
||||||
|
Constant *getConstantInt64(uint64_t Val) {
|
||||||
|
return ConstantInt::get(Int64Ty, Val);
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string getWrapperFunctionName(Function *F, CallSiteInfo &CSI) {
|
||||||
|
std::string Name = "__kmpc_host_rpc_wrapper_" + std::string(F->getName());
|
||||||
|
if (!F->isVarArg())
|
||||||
|
return Name;
|
||||||
|
|
||||||
|
for (unsigned I = F->getFunctionType()->getNumParams();
|
||||||
|
I < CSI.Params.size(); ++I) {
|
||||||
|
Name.push_back('_');
|
||||||
|
Name.append(typeToString(CSI.Params[I]));
|
||||||
|
}
|
||||||
|
|
||||||
|
return Name;
|
||||||
|
}
|
||||||
|
|
||||||
|
void registerAAs();
|
||||||
|
|
||||||
|
Value *convertToInt64Ty(Value *V);
|
||||||
|
|
||||||
|
Value *convertFromInt64TyTo(Value *V, Type *TargetTy);
|
||||||
|
|
||||||
|
Constant *convertToInt64Ty(Constant *C);
|
||||||
|
|
||||||
|
Constant *convertFromInt64TyTo(Constant *C, Type *T);
|
||||||
|
|
||||||
|
// int device_wrapper(call_no, arg_info, ...) {
|
||||||
|
// void *desc = __kmpc_host_rpc_get_desc(call_no, num_args, arg_info);
|
||||||
|
// __kmpc_host_rpc_add_arg(desc, arg1, sizeof(arg1));
|
||||||
|
// __kmpc_host_rpc_add_arg(desc, arg2, sizeof(arg2));
|
||||||
|
// ...
|
||||||
|
// int r = (int)__kmpc_host_rpc_send_and_wait(desc);
|
||||||
|
// return r;
|
||||||
|
// }
|
||||||
|
Function *getDeviceWrapperFunction(StringRef WrapperName, Function *F,
|
||||||
|
CallSiteInfo &CSI);
|
||||||
|
|
||||||
|
// void host_wrapper(desc) {
|
||||||
|
// int arg1 = (int)__kmpc_host_rpc_get_arg(desc, 0);
|
||||||
|
// float arg2 = (float)__kmpc_host_rpc_get_arg(desc, 1);
|
||||||
|
// char *arg3 = (char *)__kmpc_host_rpc_get_arg(desc, 2);
|
||||||
|
// ...
|
||||||
|
// int r = actual_call(arg1, arg2, arg3, ...);
|
||||||
|
// __kmpc_host_rpc_set_ret_val(ptr(desc, (int64_t)r);
|
||||||
|
// }
|
||||||
|
Function *getHostWrapperFunction(StringRef WrapperName, Function *F,
|
||||||
|
CallSiteInfo &CSI);
|
||||||
|
|
||||||
|
bool rewriteWithHostRPC(Function *F);
|
||||||
|
|
||||||
|
void emitHostWrapperInvoker();
|
||||||
|
|
||||||
|
bool recollectInformation();
|
||||||
|
|
||||||
|
public:
|
||||||
|
HostRPC(Module &DeviceModule, Module &HostModule, Attributor &A)
|
||||||
|
: Context(DeviceModule.getContext()), M(DeviceModule), HM(HostModule),
|
||||||
|
DL(M.getDataLayout()), Builder(Context), A(A) {
|
||||||
|
assert(&M.getContext() == &HM.getContext() &&
|
||||||
|
"device and host modules have different context");
|
||||||
|
|
||||||
|
Int8PtrTy = PointerType::getUnqual(Context);
|
||||||
|
VoidTy = Type::getVoidTy(Context);
|
||||||
|
Int32Ty = Type::getInt32Ty(Context);
|
||||||
|
Int64Ty = Type::getInt64Ty(Context);
|
||||||
|
NullPtr = ConstantInt::getNullValue(Int8PtrTy);
|
||||||
|
NullInt64 = ConstantInt::getNullValue(Int64Ty);
|
||||||
|
|
||||||
|
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...) \
|
||||||
|
{ \
|
||||||
|
SmallVector<Type *> Params{__VA_ARGS__}; \
|
||||||
|
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
|
||||||
|
Function *F = (MOD).getFunction(#_ENUM); \
|
||||||
|
if (!F) \
|
||||||
|
F = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage, \
|
||||||
|
#_ENUM, (MOD)); \
|
||||||
|
RFIs[OMPRTL_##_ENUM] = F; \
|
||||||
|
}
|
||||||
|
__OMP_RTL(__kmpc_host_rpc_get_desc, M, false, Int8PtrTy, Int32Ty, Int32Ty,
|
||||||
|
Int8PtrTy)
|
||||||
|
__OMP_RTL(__kmpc_host_rpc_add_arg, M, false, VoidTy, Int8PtrTy, Int64Ty,
|
||||||
|
Int32Ty)
|
||||||
|
__OMP_RTL(__kmpc_host_rpc_send_and_wait, M, false, Int64Ty, Int8PtrTy)
|
||||||
|
__OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
|
||||||
|
__OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
|
||||||
|
Int64Ty)
|
||||||
|
__OMP_RTL(__kmpc_host_rpc_invoke_host_wrapper, HM, false, VoidTy, Int32Ty,
|
||||||
|
Int8PtrTy)
|
||||||
|
#undef __OMP_RTL
|
||||||
|
|
||||||
|
ArgInfoTy = StructType::create({Int64Ty, Int64Ty, Int64Ty, Int8PtrTy},
|
||||||
|
"struct.arg_info_t");
|
||||||
|
}
|
||||||
|
|
||||||
|
bool run();
|
||||||
|
};
|
||||||
|
|
||||||
|
Value *HostRPC::convertToInt64Ty(Value *V) {
|
||||||
|
if (auto *C = dyn_cast<Constant>(V))
|
||||||
|
return convertToInt64Ty(C);
|
||||||
|
|
||||||
|
Type *T = V->getType();
|
||||||
|
|
||||||
|
if (T == Int64Ty)
|
||||||
|
return V;
|
||||||
|
|
||||||
|
if (T->isPointerTy())
|
||||||
|
return Builder.CreatePtrToInt(V, Int64Ty);
|
||||||
|
|
||||||
|
if (T->isIntegerTy())
|
||||||
|
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
|
||||||
|
|
||||||
|
if (T->isFloatingPointTy()) {
|
||||||
|
V = Builder.CreateBitCast(
|
||||||
|
V, Type::getIntNTy(V->getContext(), T->getScalarSizeInBits()));
|
||||||
|
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm_unreachable("unknown cast to int64_t");
|
||||||
|
}
|
||||||
|
|
||||||
|
Value *HostRPC::convertFromInt64TyTo(Value *V, Type *T) {
|
||||||
|
if (auto *C = dyn_cast<Constant>(V))
|
||||||
|
return convertFromInt64TyTo(C, T);
|
||||||
|
|
||||||
|
if (T == Int64Ty)
|
||||||
|
return V;
|
||||||
|
|
||||||
|
if (T->isPointerTy())
|
||||||
|
return Builder.CreateIntToPtr(V, T);
|
||||||
|
|
||||||
|
if (T->isIntegerTy())
|
||||||
|
return Builder.CreateIntCast(V, T, /* isSigned */ true);
|
||||||
|
|
||||||
|
if (T->isFloatingPointTy()) {
|
||||||
|
V = Builder.CreateIntCast(
|
||||||
|
V, Type::getIntNTy(V->getContext(), T->getScalarSizeInBits()),
|
||||||
|
/* isSigned */ true);
|
||||||
|
return Builder.CreateBitCast(V, T);
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm_unreachable("unknown cast from int64_t");
|
||||||
|
}
|
||||||
|
|
||||||
|
Constant *HostRPC::convertToInt64Ty(Constant *C) {
|
||||||
|
Type *T = C->getType();
|
||||||
|
|
||||||
|
if (T == Int64Ty)
|
||||||
|
return C;
|
||||||
|
|
||||||
|
if (T->isPointerTy())
|
||||||
|
return ConstantExpr::getPtrToInt(C, Int64Ty);
|
||||||
|
|
||||||
|
if (T->isIntegerTy())
|
||||||
|
llvm_unreachable("I don't know how to fixe this");
|
||||||
|
//return ConstantExpr::getIntegerCast(C, Int64Ty, /* isSigned */ true);
|
||||||
|
|
||||||
|
if (T->isFloatingPointTy()) {
|
||||||
|
// TODO: FIXEME getIntegerCast is hard to implement with new version of ConstExpr
|
||||||
|
//C = ConstantExpr::getBitCast(
|
||||||
|
// C, Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()));
|
||||||
|
//return ConstantExpr::getIntegerCast(C, Int64Ty, /* isSigned */ true);
|
||||||
|
llvm_unreachable("unsuported cast from float to int64_t");
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm_unreachable("unknown cast to int64_t");
|
||||||
|
}
|
||||||
|
|
||||||
|
Constant *HostRPC::convertFromInt64TyTo(Constant *C, Type *T) {
|
||||||
|
if (T == Int64Ty)
|
||||||
|
return C;
|
||||||
|
|
||||||
|
if (T->isPointerTy())
|
||||||
|
return ConstantExpr::getIntToPtr(C, T);
|
||||||
|
|
||||||
|
if (T->isIntegerTy())
|
||||||
|
llvm_unreachable("I don't know how to fixe this");
|
||||||
|
//return ConstantExpr::getIntegerCast(C, T, /* isSigned */ true);
|
||||||
|
|
||||||
|
if (T->isFloatingPointTy()) {
|
||||||
|
// TODO: FIXEME getIntegerCast is hard to implement with new version of ConstExpr
|
||||||
|
//C = ConstantExpr::getIntegerCast(
|
||||||
|
// C, Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()),
|
||||||
|
// /* isSigned */ true);
|
||||||
|
//return ConstantExpr::getBitCast(C, T);
|
||||||
|
llvm_unreachable("unsuported cast from int64_t to float");
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm_unreachable("unknown cast from int64_t");
|
||||||
|
}
|
||||||
|
|
||||||
|
void HostRPC::registerAAs() {
|
||||||
|
for (auto *F : FunctionWorkList)
|
||||||
|
for (User *U : F->users()) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(U);
|
||||||
|
if (!CI)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
for (unsigned I = 0; I < CI->arg_size(); ++I) {
|
||||||
|
Value *Operand = CI->getArgOperand(I);
|
||||||
|
if (!Operand->getType()->isPointerTy())
|
||||||
|
continue;
|
||||||
|
A.getOrCreateAAFor<AAUnderlyingObjects>(
|
||||||
|
IRPosition::callsite_argument(*CI, I),
|
||||||
|
/* QueryingAA */ nullptr, DepClassTy::NONE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool HostRPC::recollectInformation() {
|
||||||
|
FunctionWorkList.clear();
|
||||||
|
|
||||||
|
for (Function &F : M) {
|
||||||
|
// If the function is already defined, it definitely does not require RPC.
|
||||||
|
if (!F.isDeclaration())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// If it is an internal function, skip it as well.
|
||||||
|
if (isInternalFunction(F))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// If there is no use of the function, skip it.
|
||||||
|
if (F.use_empty())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
FunctionWorkList.insert(&F);
|
||||||
|
}
|
||||||
|
|
||||||
|
return !FunctionWorkList.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool HostRPC::run() {
|
||||||
|
bool Changed = false;
|
||||||
|
|
||||||
|
if (!recollectInformation())
|
||||||
|
return Changed;
|
||||||
|
|
||||||
|
Changed = true;
|
||||||
|
|
||||||
|
// We add a couple of assumptions to those RPC functions such that AAs will
|
||||||
|
// not error out because of unknown implementation of those functions.
|
||||||
|
for (Function &F : M) {
|
||||||
|
if (!F.isDeclaration())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
F.addFnAttr(Attribute::NoRecurse);
|
||||||
|
|
||||||
|
for (auto &Arg : F.args())
|
||||||
|
if (Arg.getType()->isPointerTy())
|
||||||
|
Arg.addAttr(Attribute::NoCapture);
|
||||||
|
|
||||||
|
if (!F.isVarArg())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
for (User *U : F.users()) {
|
||||||
|
auto *CB = dyn_cast<CallBase>(U);
|
||||||
|
if (!CB)
|
||||||
|
continue;
|
||||||
|
for (unsigned I = F.getFunctionType()->getNumParams(); I < CB->arg_size();
|
||||||
|
++I) {
|
||||||
|
Value *Arg = CB->getArgOperand(I);
|
||||||
|
if (Arg->getType()->isPointerTy())
|
||||||
|
CB->addParamAttr(I, Attribute::NoCapture);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LLVM_DEBUG(M.dump());
|
||||||
|
|
||||||
|
registerAAs();
|
||||||
|
|
||||||
|
ChangeStatus Status = A.run();
|
||||||
|
if (!recollectInformation())
|
||||||
|
return Status == ChangeStatus::CHANGED;
|
||||||
|
|
||||||
|
for (Function *F : FunctionWorkList)
|
||||||
|
Changed |= rewriteWithHostRPC(F);
|
||||||
|
|
||||||
|
if (!Changed)
|
||||||
|
return Changed;
|
||||||
|
|
||||||
|
for (auto Itr = CallInstMap.rbegin(); Itr != CallInstMap.rend(); ++Itr) {
|
||||||
|
auto *CI = Itr->first;
|
||||||
|
auto *NewCI = Itr->second;
|
||||||
|
CI->replaceAllUsesWith(NewCI);
|
||||||
|
CI->eraseFromParent();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (Function *F : FunctionWorkList)
|
||||||
|
if (F->user_empty())
|
||||||
|
F->eraseFromParent();
|
||||||
|
|
||||||
|
emitHostWrapperInvoker();
|
||||||
|
|
||||||
|
return Changed;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool HostRPC::rewriteWithHostRPC(Function *F) {
|
||||||
|
SmallVector<CallInst *> WorkList;
|
||||||
|
|
||||||
|
for (User *U : F->users()) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(U);
|
||||||
|
if (!CI)
|
||||||
|
continue;
|
||||||
|
WorkList.push_back(CI);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (WorkList.empty())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
for (CallInst *CI : WorkList) {
|
||||||
|
CallSiteInfo CSI;
|
||||||
|
CSI.CI = CI;
|
||||||
|
unsigned NumArgs = CI->arg_size();
|
||||||
|
for (unsigned I = 0; I < NumArgs; ++I)
|
||||||
|
CSI.Params.push_back(CI->getArgOperand(I)->getType());
|
||||||
|
|
||||||
|
std::string WrapperName = getWrapperFunctionName(F, CSI);
|
||||||
|
Function *DeviceWrapperFn = getDeviceWrapperFunction(WrapperName, F, CSI);
|
||||||
|
Function *HostWrapperFn = getHostWrapperFunction(WrapperName, F, CSI);
|
||||||
|
|
||||||
|
int32_t WrapperNumber = -1;
|
||||||
|
for (unsigned I = 0; I < HostEntryTable.size(); ++I) {
|
||||||
|
if (HostEntryTable[I] == HostWrapperFn) {
|
||||||
|
WrapperNumber = I;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (WrapperNumber == -1) {
|
||||||
|
WrapperNumber = HostEntryTable.size();
|
||||||
|
HostEntryTable.push_back(HostWrapperFn);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto CheckIfIdentifierPtr = [this](const Value *V) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(V);
|
||||||
|
if (!CI)
|
||||||
|
return false;
|
||||||
|
Function *Callee = CI->getCalledFunction();
|
||||||
|
if (this->FunctionWorkList.count(Callee))
|
||||||
|
return true;
|
||||||
|
return Callee->getName().starts_with("__kmpc_host_rpc_wrapper_");
|
||||||
|
};
|
||||||
|
|
||||||
|
auto CheckIfDynAlloc = [](Value *V) -> CallInst * {
|
||||||
|
auto *CI = dyn_cast<CallInst>(V);
|
||||||
|
if (!CI)
|
||||||
|
return nullptr;
|
||||||
|
Function *Callee = CI->getCalledFunction();
|
||||||
|
auto Name = Callee->getName();
|
||||||
|
if (Name == "malloc" || Name == "__kmpc_alloc_shared")
|
||||||
|
return CI;
|
||||||
|
return nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto CheckIfStdIO = [](Value *V) -> GlobalVariable * {
|
||||||
|
auto *LI = dyn_cast<LoadInst>(V);
|
||||||
|
if (!LI)
|
||||||
|
return nullptr;
|
||||||
|
auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
|
||||||
|
if (!GV)
|
||||||
|
return nullptr;
|
||||||
|
auto Name = GV->getName();
|
||||||
|
if (Name == "stdout" || Name == "stderr" || Name == "stdin")
|
||||||
|
return GV;
|
||||||
|
return nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto CheckIfGlobalVariable = [](Value *V) {
|
||||||
|
if (auto *GV = dyn_cast<GlobalVariable>(V))
|
||||||
|
return GV;
|
||||||
|
if (auto *LI = dyn_cast<LoadInst>(V))
|
||||||
|
if (auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()))
|
||||||
|
return GV;
|
||||||
|
return static_cast<GlobalVariable *>(nullptr);
|
||||||
|
};
|
||||||
|
|
||||||
|
auto CheckIfNullPtr = [](Value *V) {
|
||||||
|
if (!V->getType()->isPointerTy())
|
||||||
|
return false;
|
||||||
|
return V == ConstantInt::getNullValue(V->getType());
|
||||||
|
};
|
||||||
|
|
||||||
|
auto HandleDirectUse = [&](Value *Ptr, HostRPCArgInfo &AI,
|
||||||
|
bool IsPointer = false) {
|
||||||
|
AI.BasePtr = Ptr;
|
||||||
|
AI.Type = getConstantInt64(IsPointer ? ArgType::OMP_HOST_RPC_ARG_PTR
|
||||||
|
: ArgType::OMP_HOST_RPC_ARG_SCALAR);
|
||||||
|
AI.Size = NullInt64;
|
||||||
|
};
|
||||||
|
|
||||||
|
SmallVector<SmallVector<HostRPCArgInfo>> ArgInfo;
|
||||||
|
bool IsConstantArgInfo = true;
|
||||||
|
|
||||||
|
for (unsigned I = 0; I < CI->arg_size(); ++I) {
|
||||||
|
ArgInfo.emplace_back();
|
||||||
|
auto &AII = ArgInfo.back();
|
||||||
|
|
||||||
|
Value *Operand = CI->getArgOperand(I);
|
||||||
|
|
||||||
|
// Check if scalar type.
|
||||||
|
if (!Operand->getType()->isPointerTy()) {
|
||||||
|
AII.emplace_back();
|
||||||
|
HandleDirectUse(Operand, AII.back());
|
||||||
|
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (CheckIfNullPtr(Operand))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
auto Pred = [&](Value &Obj) {
|
||||||
|
if (CheckIfNullPtr(&Obj))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
bool IsConstantArgument = false;
|
||||||
|
if (!F->isVarArg() &&
|
||||||
|
F->hasParamAttribute(I, Attribute::AttrKind::ReadOnly))
|
||||||
|
IsConstantArgument = true;
|
||||||
|
|
||||||
|
HostRPCArgInfo AI;
|
||||||
|
|
||||||
|
if (auto *IO = CheckIfStdIO(&Obj)) {
|
||||||
|
HandleDirectUse(IO, AI, /* IsPointer */ true);
|
||||||
|
} else if (CheckIfIdentifierPtr(&Obj)) {
|
||||||
|
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
|
||||||
|
HandleDirectUse(Operand, AI, /* IsPointer */ true);
|
||||||
|
} else if (auto *GV = CheckIfGlobalVariable(&Obj)) {
|
||||||
|
AI.BasePtr = GV;
|
||||||
|
AI.Size = getConstantInt64(DL.getTypeStoreSize(GV->getValueType()));
|
||||||
|
AI.Type =
|
||||||
|
getConstantInt64(GV->isConstant() || IsConstantArgument
|
||||||
|
? ArgType::OMP_HOST_RPC_ARG_COPY_TO
|
||||||
|
: ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM);
|
||||||
|
} else if (CheckIfDynAlloc(&Obj)) {
|
||||||
|
// We will handle this case at runtime so here we don't do anything.
|
||||||
|
return true;
|
||||||
|
} else if (isa<AllocaInst>(&Obj)) {
|
||||||
|
llvm_unreachable("alloca instruction needs to be handled!");
|
||||||
|
} else {
|
||||||
|
LLVM_DEBUG({
|
||||||
|
dbgs() << "[HostRPC] warning: call site " << *CI << ", operand "
|
||||||
|
<< *Operand << ", underlying object " << Obj
|
||||||
|
<< " cannot be handled.\n";
|
||||||
|
});
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
AII.push_back(std::move(AI));
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto &AAUO = *A.getOrCreateAAFor<AAUnderlyingObjects>(
|
||||||
|
IRPosition::callsite_argument(*CI, I), nullptr, DepClassTy::NONE);
|
||||||
|
if (!AAUO.forallUnderlyingObjects(Pred))
|
||||||
|
llvm_unreachable("internal error");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset the insert point to the call site.
|
||||||
|
Builder.SetInsertPoint(CI);
|
||||||
|
|
||||||
|
Value *ArgInfoVal = nullptr;
|
||||||
|
if (!IsConstantArgInfo) {
|
||||||
|
ArgInfoVal = Builder.CreateAlloca(Int8PtrTy, getConstantInt64(NumArgs),
|
||||||
|
"arg_info");
|
||||||
|
for (unsigned I = 0; I < NumArgs; ++I) {
|
||||||
|
auto &AII = ArgInfo[I];
|
||||||
|
Value *Next = NullPtr;
|
||||||
|
for (auto &AI : AII) {
|
||||||
|
Value *AIV = Builder.CreateAlloca(ArgInfoTy);
|
||||||
|
Value *AIIArg =
|
||||||
|
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(0)});
|
||||||
|
Builder.Insert(AIIArg);
|
||||||
|
Builder.CreateStore(convertToInt64Ty(AI.BasePtr), AIIArg);
|
||||||
|
Value *AIIType =
|
||||||
|
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(1)});
|
||||||
|
Builder.Insert(AIIType);
|
||||||
|
Builder.CreateStore(AI.Type, AIIType);
|
||||||
|
Value *AIISize =
|
||||||
|
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(2)});
|
||||||
|
Builder.Insert(AIISize);
|
||||||
|
Builder.CreateStore(AI.Size, AIISize);
|
||||||
|
Value *AIINext =
|
||||||
|
GetElementPtrInst::Create(Int8PtrTy, AIV, {getConstantInt64(3)});
|
||||||
|
Builder.Insert(AIINext);
|
||||||
|
Builder.CreateStore(Next, AIINext);
|
||||||
|
Next = AIV;
|
||||||
|
}
|
||||||
|
Value *AIIV = GetElementPtrInst::Create(Int8PtrTy, ArgInfoVal,
|
||||||
|
{getConstantInt64(I)});
|
||||||
|
Builder.Insert(AIIV);
|
||||||
|
Builder.CreateStore(Next, AIIV);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
SmallVector<Constant *> ArgInfoInitVar;
|
||||||
|
for (auto &AII : ArgInfo) {
|
||||||
|
Constant *Last = NullPtr;
|
||||||
|
for (auto &AI : AII) {
|
||||||
|
auto *Arg = cast<Constant>(AI.BasePtr);
|
||||||
|
auto *CS =
|
||||||
|
ConstantStruct::get(ArgInfoTy, {convertToInt64Ty(Arg), AI.Type,
|
||||||
|
cast<Constant>(AI.Size), Last});
|
||||||
|
auto *GV = new GlobalVariable(
|
||||||
|
M, ArgInfoTy, /* isConstant */ true,
|
||||||
|
GlobalValue::LinkageTypes::InternalLinkage, CS);
|
||||||
|
Last = GV;
|
||||||
|
}
|
||||||
|
ArgInfoInitVar.push_back(Last);
|
||||||
|
}
|
||||||
|
Constant *ArgInfoInit = ConstantArray::get(
|
||||||
|
ArrayType::get(Int8PtrTy, NumArgs), ArgInfoInitVar);
|
||||||
|
ArgInfoVal = new GlobalVariable(
|
||||||
|
M, ArrayType::get(Int8PtrTy, NumArgs), /* isConstant */ true,
|
||||||
|
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info");
|
||||||
|
}
|
||||||
|
|
||||||
|
SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
|
||||||
|
ArgInfoVal};
|
||||||
|
for (Value *Operand : CI->args())
|
||||||
|
Args.push_back(Operand);
|
||||||
|
|
||||||
|
CallInst *NewCall = Builder.CreateCall(DeviceWrapperFn, Args);
|
||||||
|
|
||||||
|
CallInstMap.emplace_back(CI, NewCall);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
Function *HostRPC::getDeviceWrapperFunction(StringRef WrapperName, Function *F,
|
||||||
|
CallSiteInfo &CSI) {
|
||||||
|
Function *WrapperFn = M.getFunction(WrapperName);
|
||||||
|
if (WrapperFn)
|
||||||
|
return WrapperFn;
|
||||||
|
|
||||||
|
// return_type device_wrapper(int32_t call_no, void *arg_info, ...)
|
||||||
|
SmallVector<Type *> Params{Int32Ty, Int8PtrTy};
|
||||||
|
Params.append(CSI.Params);
|
||||||
|
|
||||||
|
Type *RetTy = F->getReturnType();
|
||||||
|
|
||||||
|
FunctionType *FT = FunctionType::get(RetTy, Params, /*isVarArg*/ false);
|
||||||
|
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::InternalLinkage,
|
||||||
|
WrapperName, M);
|
||||||
|
|
||||||
|
// Emit the body of the device wrapper
|
||||||
|
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
|
||||||
|
Builder.SetInsertPoint(EntryBB);
|
||||||
|
|
||||||
|
// skip call_no and arg_info.
|
||||||
|
constexpr const unsigned NumArgSkipped = 2;
|
||||||
|
|
||||||
|
Value *Desc = nullptr;
|
||||||
|
{
|
||||||
|
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
|
||||||
|
Desc = Builder.CreateCall(
|
||||||
|
Fn,
|
||||||
|
{WrapperFn->getArg(0),
|
||||||
|
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
|
||||||
|
WrapperFn->getArg(1)},
|
||||||
|
"desc");
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
|
||||||
|
for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
|
||||||
|
Value *V = convertToInt64Ty(WrapperFn->getArg(I));
|
||||||
|
Builder.CreateCall(
|
||||||
|
Fn, {Desc, V, ConstantInt::get(Int32Ty, I - NumArgSkipped)});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Value *RetVal =
|
||||||
|
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});
|
||||||
|
|
||||||
|
if (RetTy->isVoidTy()) {
|
||||||
|
Builder.CreateRetVoid();
|
||||||
|
return WrapperFn;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (RetTy != RetVal->getType())
|
||||||
|
RetVal = convertFromInt64TyTo(RetVal, RetTy);
|
||||||
|
|
||||||
|
Builder.CreateRet(RetVal);
|
||||||
|
|
||||||
|
return WrapperFn;
|
||||||
|
}
|
||||||
|
|
||||||
|
Function *HostRPC::getHostWrapperFunction(StringRef WrapperName, Function *F,
|
||||||
|
CallSiteInfo &CSI) {
|
||||||
|
Function *WrapperFn = HM.getFunction(WrapperName);
|
||||||
|
if (WrapperFn)
|
||||||
|
return WrapperFn;
|
||||||
|
|
||||||
|
SmallVector<Type *> Params{Int8PtrTy};
|
||||||
|
FunctionType *FT = FunctionType::get(VoidTy, Params, /* isVarArg */ false);
|
||||||
|
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage,
|
||||||
|
WrapperName, HM);
|
||||||
|
|
||||||
|
Value *Desc = WrapperFn->getArg(0);
|
||||||
|
|
||||||
|
// Emit the body of the host wrapper
|
||||||
|
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
|
||||||
|
Builder.SetInsertPoint(EntryBB);
|
||||||
|
|
||||||
|
SmallVector<Value *> Args;
|
||||||
|
for (unsigned I = 0; I < CSI.CI->arg_size(); ++I) {
|
||||||
|
Value *V = Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_arg],
|
||||||
|
{Desc, ConstantInt::get(Int32Ty, I)});
|
||||||
|
Args.push_back(convertFromInt64TyTo(V, CSI.Params[I]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// The host callee that will be called eventually by the host wrapper.
|
||||||
|
Function *HostCallee = HM.getFunction(F->getName());
|
||||||
|
if (!HostCallee)
|
||||||
|
HostCallee = Function::Create(F->getFunctionType(), F->getLinkage(),
|
||||||
|
F->getName(), HM);
|
||||||
|
|
||||||
|
Value *RetVal = Builder.CreateCall(HostCallee, Args);
|
||||||
|
if (!RetVal->getType()->isVoidTy()) {
|
||||||
|
RetVal = convertToInt64Ty(RetVal);
|
||||||
|
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_set_ret_val],
|
||||||
|
{Desc, RetVal});
|
||||||
|
}
|
||||||
|
Builder.CreateRetVoid();
|
||||||
|
|
||||||
|
return WrapperFn;
|
||||||
|
}
|
||||||
|
|
||||||
|
void HostRPC::emitHostWrapperInvoker() {
|
||||||
|
IRBuilder<> Builder(Context);
|
||||||
|
unsigned NumEntries = HostEntryTable.size();
|
||||||
|
Function *F = RFIs[OMPRTL___kmpc_host_rpc_invoke_host_wrapper];
|
||||||
|
F->setDLLStorageClass(
|
||||||
|
GlobalValue::DLLStorageClassTypes::DLLExportStorageClass);
|
||||||
|
Value *CallNo = F->getArg(0);
|
||||||
|
Value *Desc = F->getArg(1);
|
||||||
|
|
||||||
|
SmallVector<BasicBlock *> SwitchBBs;
|
||||||
|
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", F);
|
||||||
|
BasicBlock *ReturnBB = BasicBlock::Create(Context, "return", F);
|
||||||
|
|
||||||
|
// Emit code for the return bb.
|
||||||
|
Builder.SetInsertPoint(ReturnBB);
|
||||||
|
Builder.CreateRetVoid();
|
||||||
|
|
||||||
|
// Create BB for each host entry and emit function call.
|
||||||
|
for (unsigned I = 0; I < NumEntries; ++I) {
|
||||||
|
BasicBlock *BB = BasicBlock::Create(Context, "invoke.bb", F, ReturnBB);
|
||||||
|
SwitchBBs.push_back(BB);
|
||||||
|
Builder.SetInsertPoint(BB);
|
||||||
|
Builder.CreateCall(HostEntryTable[I], {Desc});
|
||||||
|
Builder.CreateBr(ReturnBB);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Emit code for the entry BB.
|
||||||
|
Builder.SetInsertPoint(EntryBB);
|
||||||
|
SwitchInst *Switch = Builder.CreateSwitch(CallNo, ReturnBB, NumEntries);
|
||||||
|
for (unsigned I = 0; I < NumEntries; ++I)
|
||||||
|
Switch->addCase(ConstantInt::get(cast<IntegerType>(Int32Ty), I),
|
||||||
|
SwitchBBs[I]);
|
||||||
|
}
|
||||||
|
|
||||||
|
Module *getHostModule(Module &M) {
|
||||||
|
auto *MD = M.getNamedMetadata("llvm.hostrpc.hostmodule");
|
||||||
|
if (!MD || MD->getNumOperands() == 0)
|
||||||
|
return nullptr;
|
||||||
|
auto *Node = MD->getOperand(0);
|
||||||
|
assert(Node->getNumOperands() == 1 && "invliad named metadata");
|
||||||
|
auto *CAM = dyn_cast<ConstantAsMetadata>(Node->getOperand(0));
|
||||||
|
if (!CAM)
|
||||||
|
return nullptr;
|
||||||
|
auto *CI = cast<ConstantInt>(CAM->getValue());
|
||||||
|
Module *Mod = reinterpret_cast<Module *>(CI->getZExtValue());
|
||||||
|
M.eraseNamedMetadata(MD);
|
||||||
|
return Mod;
|
||||||
|
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
PreservedAnalyses HostRPCPass::run(Module &M, ModuleAnalysisManager &AM) {
|
||||||
|
std::unique_ptr<Module> DummyHostModule;
|
||||||
|
|
||||||
|
Module *HostModule = nullptr;
|
||||||
|
|
||||||
|
if (UseDummyHostModule) {
|
||||||
|
DummyHostModule =
|
||||||
|
std::make_unique<Module>("dummy-host-rpc.bc", M.getContext());
|
||||||
|
HostModule = DummyHostModule.get();
|
||||||
|
} else {
|
||||||
|
HostModule = getHostModule(M);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!HostModule)
|
||||||
|
return PreservedAnalyses::all();
|
||||||
|
|
||||||
|
bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
|
||||||
|
LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
|
||||||
|
|
||||||
|
// The pass will not run if it is not invoked directly or not invoked at link
|
||||||
|
// time.
|
||||||
|
if (!UseDummyHostModule && !PostLink)
|
||||||
|
return PreservedAnalyses::all();
|
||||||
|
|
||||||
|
FunctionAnalysisManager &FAM =
|
||||||
|
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
|
||||||
|
AnalysisGetter AG(FAM);
|
||||||
|
|
||||||
|
CallGraphUpdater CGUpdater;
|
||||||
|
BumpPtrAllocator Allocator;
|
||||||
|
|
||||||
|
AttributorConfig AC(CGUpdater);
|
||||||
|
AC.DefaultInitializeLiveInternals = false;
|
||||||
|
AC.RewriteSignatures = false;
|
||||||
|
AC.PassName = DEBUG_TYPE;
|
||||||
|
AC.MaxFixpointIterations = 1024;
|
||||||
|
|
||||||
|
InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
|
||||||
|
|
||||||
|
SetVector<Function *> Functions;
|
||||||
|
Attributor A(Functions, InfoCache, AC);
|
||||||
|
|
||||||
|
HostRPC RPC(M, *HostModule, A);
|
||||||
|
bool Changed = RPC.run();
|
||||||
|
|
||||||
|
LLVM_DEBUG({
|
||||||
|
if (Changed && UseDummyHostModule) {
|
||||||
|
M.dump();
|
||||||
|
HostModule->dump();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
|
||||||
|
}
|
||||||
@@ -56,6 +56,7 @@
|
|||||||
#include "llvm/Transforms/IPO/Attributor.h"
|
#include "llvm/Transforms/IPO/Attributor.h"
|
||||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||||
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
|
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
|
||||||
|
#include "llvm/Transforms/Utils/ModuleUtils.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <optional>
|
#include <optional>
|
||||||
@@ -138,6 +139,15 @@ static cl::opt<bool>
|
|||||||
cl::desc("Enables more verbose remarks."), cl::Hidden,
|
cl::desc("Enables more verbose remarks."), cl::Hidden,
|
||||||
cl::init(false));
|
cl::init(false));
|
||||||
|
|
||||||
|
static cl::opt<bool>
|
||||||
|
EnableParallel51Split("openmp-opt-enable-parallel-51-split",
|
||||||
|
cl::desc("Enable the kernel split at parallel_51."),
|
||||||
|
cl::Hidden, cl::init(false));
|
||||||
|
static cl::opt<bool> EnableParallel51SplitMultiTeams(
|
||||||
|
"openmp-opt-enable-parallel-51-split-multi-teams",
|
||||||
|
cl::desc("Enable multi-teams support for the parallel 51 kernel."),
|
||||||
|
cl::Hidden, cl::init(false));
|
||||||
|
|
||||||
static cl::opt<unsigned>
|
static cl::opt<unsigned>
|
||||||
SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
|
SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
|
||||||
cl::desc("Maximal number of attributor iterations."),
|
cl::desc("Maximal number of attributor iterations."),
|
||||||
@@ -917,6 +927,28 @@ private:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
bool canUseMultiTeam(Function *OutlinedFn, OMPInformationCache &OMPInfoCache) {
|
||||||
|
if (!EnableParallel51SplitMultiTeams)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void collectReachingKernels(Function *F,
|
||||||
|
SmallVector<Kernel> &ReachingKernels) {
|
||||||
|
if (omp::isOpenMPKernel(*F)) {
|
||||||
|
if (F->hasFnAttribute("omp_parallel_51_kernel"))
|
||||||
|
ReachingKernels.push_back(F);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (User *U : F->users()) {
|
||||||
|
auto *I = dyn_cast<Instruction>(U);
|
||||||
|
if (!I)
|
||||||
|
continue;
|
||||||
|
collectReachingKernels(I->getFunction(), ReachingKernels);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct OpenMPOpt {
|
struct OpenMPOpt {
|
||||||
|
|
||||||
using OptimizationRemarkGetter =
|
using OptimizationRemarkGetter =
|
||||||
@@ -945,6 +977,11 @@ struct OpenMPOpt {
|
|||||||
<< " functions\n");
|
<< " functions\n");
|
||||||
|
|
||||||
if (IsModulePass) {
|
if (IsModulePass) {
|
||||||
|
if (EnableParallel51Split)
|
||||||
|
Changed |= splitKernels();
|
||||||
|
|
||||||
|
OMPInfoCache.recollectUses();
|
||||||
|
|
||||||
Changed |= runAttributor(IsModulePass);
|
Changed |= runAttributor(IsModulePass);
|
||||||
|
|
||||||
// Recollect uses, in case Attributor deleted any.
|
// Recollect uses, in case Attributor deleted any.
|
||||||
@@ -1047,6 +1084,308 @@ struct OpenMPOpt {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
/// Create a new kernel for each function call to __kmpc_parallel_51.
|
||||||
|
bool splitKernels() {
|
||||||
|
bool Changed = false;
|
||||||
|
|
||||||
|
OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
|
||||||
|
OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
|
||||||
|
|
||||||
|
if (!Parallel51RFI.Declaration)
|
||||||
|
return Changed;
|
||||||
|
|
||||||
|
SmallVector<CallInst *> WorkItems;
|
||||||
|
for (User *U : Parallel51RFI.Declaration->users()) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(U);
|
||||||
|
if (!CI)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
Function *F = CI->getFunction();
|
||||||
|
// If the parent function is already from kernel split, skip it.
|
||||||
|
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
|
||||||
|
if (Attr.isValid())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
WorkItems.push_back(CI);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (WorkItems.empty())
|
||||||
|
return Changed;
|
||||||
|
|
||||||
|
Changed = true;
|
||||||
|
|
||||||
|
auto CreateNewKernel = [](LLVMContext &Ctx, Module &M,
|
||||||
|
OpenMPIRBuilder &IRBuilder, Function *Parallel51,
|
||||||
|
const CallInst *CI) {
|
||||||
|
SmallVector<Type *, 4> ArgTypes;
|
||||||
|
// int32_t gtid
|
||||||
|
ArgTypes.push_back(Type::getInt32Ty(Ctx));
|
||||||
|
// int32_t if_expr
|
||||||
|
ArgTypes.push_back(Type::getInt32Ty(Ctx));
|
||||||
|
// int32_t num_threads
|
||||||
|
ArgTypes.push_back(Type::getInt32Ty(Ctx));
|
||||||
|
// void **args
|
||||||
|
ArgTypes.push_back(PointerType::getUnqual(Ctx));
|
||||||
|
// int64_t nargs
|
||||||
|
ArgTypes.push_back(Type::getInt64Ty(Ctx));
|
||||||
|
|
||||||
|
FunctionType *FT =
|
||||||
|
FunctionType::get(Type::getVoidTy(Ctx), ArgTypes, false);
|
||||||
|
std::string KernelName = "__omp_offloading_parallel_51_from_";
|
||||||
|
KernelName += CI->getFunction()->getName();
|
||||||
|
// Sanitize the kernel name
|
||||||
|
{
|
||||||
|
size_t DotPos = KernelName.find('.');
|
||||||
|
while (DotPos != std::string::npos) {
|
||||||
|
KernelName[DotPos] = '_';
|
||||||
|
DotPos = KernelName.find('.');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Function *K =
|
||||||
|
Function::Create(FT, GlobalValue::WeakODRLinkage, KernelName, &M);
|
||||||
|
K->setVisibility(GlobalValue::ProtectedVisibility);
|
||||||
|
// exec mode global variable
|
||||||
|
GlobalVariable *ModeGV = new GlobalVariable(
|
||||||
|
M, Type::getInt8Ty(Ctx), /*isConstant=*/true,
|
||||||
|
GlobalValue::WeakAnyLinkage,
|
||||||
|
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD),
|
||||||
|
K->getName() + "_exec_mode");
|
||||||
|
appendToCompilerUsed(M, {ModeGV});
|
||||||
|
|
||||||
|
// Attach "kernel" metadata to the new kernel.
|
||||||
|
NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
|
||||||
|
Metadata *MDVals[] = {ConstantAsMetadata::get(K),
|
||||||
|
MDString::get(Ctx, "kernel"),
|
||||||
|
ConstantAsMetadata::get(ConstantInt::get(
|
||||||
|
llvm::Type::getInt32Ty(Ctx), 1))};
|
||||||
|
MD->addOperand(MDNode::get(Ctx, MDVals));
|
||||||
|
// Set kernel attributes.
|
||||||
|
K->setAttributes(
|
||||||
|
AttributeList::get(Ctx, AttributeList::FunctionIndex,
|
||||||
|
CI->getFunction()->getAttributes().getFnAttrs()));
|
||||||
|
K->removeFnAttr("omp_target_thread_limit");
|
||||||
|
K->removeFnAttr("omp_target_num_teams");
|
||||||
|
K->addFnAttr("omp_parallel_51_kernel");
|
||||||
|
K->addFnAttr("kernel");
|
||||||
|
K->getArg(3)->addAttr(Attribute::NoAlias);
|
||||||
|
|
||||||
|
auto *EntryBB = BasicBlock::Create(Ctx, "entry", K);
|
||||||
|
OpenMPIRBuilder::LocationDescription TargetInitLoc(
|
||||||
|
{EntryBB, EntryBB->end()});
|
||||||
|
auto IP = IRBuilder.createTargetInit(TargetInitLoc, /* IsSPMD */ true);
|
||||||
|
BasicBlock *UserCodeBB = IP.getBlock();
|
||||||
|
|
||||||
|
auto UseDirectIfPossible = [](Value *LHS, Value *RHS) -> Value * {
|
||||||
|
if (isa<Constant>(LHS))
|
||||||
|
return LHS;
|
||||||
|
return RHS;
|
||||||
|
};
|
||||||
|
|
||||||
|
Value *Ident = CI->getOperand(0);
|
||||||
|
Value *TId = UseDirectIfPossible(CI->getOperand(1), K->getArg(0));
|
||||||
|
Value *IfExpr = UseDirectIfPossible(CI->getOperand(2), K->getArg(1));
|
||||||
|
Value *NumThreads = UseDirectIfPossible(CI->getOperand(3), K->getArg(2));
|
||||||
|
Value *ProcBind = CI->getOperand(4);
|
||||||
|
Value *Fn = CI->getOperand(5);
|
||||||
|
Value *WrapperFn = CI->getOperand(6);
|
||||||
|
Value *Args = K->getArg(3);
|
||||||
|
Value *NArgs = UseDirectIfPossible(CI->getOperand(8), K->getArg(4));
|
||||||
|
|
||||||
|
(void)CallInst::Create(Parallel51,
|
||||||
|
{Ident, TId, IfExpr, NumThreads, ProcBind, Fn,
|
||||||
|
WrapperFn, Args, NArgs},
|
||||||
|
"", UserCodeBB);
|
||||||
|
|
||||||
|
OpenMPIRBuilder::LocationDescription TargetDeInitLoc(
|
||||||
|
{UserCodeBB, UserCodeBB->end()});
|
||||||
|
IRBuilder.createTargetDeinit(TargetDeInitLoc, /* IsSPMD */ true);
|
||||||
|
IRBuilder.Builder.CreateRetVoid();
|
||||||
|
|
||||||
|
return K;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto EnableMultiTeam = [](Function *OutlinedFn) {
|
||||||
|
SmallVector<CallInst *> WorkItems;
|
||||||
|
for (BasicBlock &BB : *OutlinedFn)
|
||||||
|
for (Instruction &I : BB) {
|
||||||
|
CallInst *C = dyn_cast<CallInst>(&I);
|
||||||
|
if (!C)
|
||||||
|
continue;
|
||||||
|
Function *Callee = C->getCalledFunction();
|
||||||
|
auto CalleeName = Callee->getName();
|
||||||
|
if (CalleeName == "__kmpc_for_static_init_4" ||
|
||||||
|
CalleeName == "__kmpc_for_static_init_4u" ||
|
||||||
|
CalleeName == "__kmpc_for_static_init_8" ||
|
||||||
|
CalleeName == "__kmpc_for_static_init_8u")
|
||||||
|
WorkItems.push_back(C);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (CallInst *C : WorkItems) {
|
||||||
|
constexpr const unsigned SchedTypeArgNum = 2;
|
||||||
|
Value *SchedTypeVal = C->getOperand(SchedTypeArgNum);
|
||||||
|
ConstantInt *SchedTypeCI = cast<ConstantInt>(SchedTypeVal);
|
||||||
|
int32_t SchedType = SchedTypeCI->getSExtValue();
|
||||||
|
Value *NewSchedType = SchedTypeVal;
|
||||||
|
if (SchedType == /*kmp_sched_static_chunk*/ 33)
|
||||||
|
NewSchedType = ConstantInt::get(SchedTypeCI->getType(), 100);
|
||||||
|
else if (SchedType == /*kmp_sched_static_chunk*/ 34)
|
||||||
|
NewSchedType = ConstantInt::get(SchedTypeCI->getType(), 101);
|
||||||
|
C->setOperand(SchedTypeArgNum, NewSchedType);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
auto &Ctx = M.getContext();
|
||||||
|
|
||||||
|
for (CallInst *CI : WorkItems) {
|
||||||
|
Function *K = CreateNewKernel(Ctx, M, OMPInfoCache.OMPBuilder,
|
||||||
|
Parallel51RFI.Declaration, CI);
|
||||||
|
|
||||||
|
constexpr const unsigned OutlinedFnArgNum = 5;
|
||||||
|
Function *OutlinedFn =
|
||||||
|
dyn_cast<Function>(CI->getOperand(OutlinedFnArgNum));
|
||||||
|
assert(OutlinedFn && "arg fn is not a function");
|
||||||
|
|
||||||
|
if (canUseMultiTeam(OutlinedFn, OMPInfoCache))
|
||||||
|
EnableMultiTeam(OutlinedFn);
|
||||||
|
|
||||||
|
auto &Ctx = M.getContext();
|
||||||
|
IRBuilder<> Builder(CI);
|
||||||
|
|
||||||
|
FunctionCallee Callee =
|
||||||
|
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
|
||||||
|
M, OMPRTL___kmpc_launch_parallel_51_kernel);
|
||||||
|
Constant *KernelName = ConstantDataArray::getString(Ctx, K->getName());
|
||||||
|
GlobalVariable *KernelNameVar =
|
||||||
|
new GlobalVariable(M, KernelName->getType(), /*isConstant=*/true,
|
||||||
|
GlobalValue::WeakAnyLinkage, KernelName);
|
||||||
|
Value *Args[6] = {KernelNameVar, CI->getArgOperand(1),
|
||||||
|
CI->getArgOperand(2), CI->getArgOperand(3),
|
||||||
|
CI->getArgOperand(7), CI->getArgOperand(8)};
|
||||||
|
|
||||||
|
CallInst *C = Builder.CreateCall(Callee, Args);
|
||||||
|
OMPInfoCache.setCallingConvention(Callee, C);
|
||||||
|
CI->eraseFromParent();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check all the use of __kmpc_parallel_51 again. If it is just used in
|
||||||
|
// parallel kernel, we can mark the main kernel as SPMD mode as well.
|
||||||
|
bool OnlyUsedByParallelKernel = true;
|
||||||
|
for (User *U : Parallel51RFI.Declaration->users()) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(U);
|
||||||
|
if (!CI)
|
||||||
|
continue;
|
||||||
|
Function *F = CI->getFunction();
|
||||||
|
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
|
||||||
|
if (Attr.isValid())
|
||||||
|
continue;
|
||||||
|
OnlyUsedByParallelKernel = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (OnlyUsedByParallelKernel) {
|
||||||
|
for (auto K : SCC) {
|
||||||
|
if (!omp::isOpenMPKernel(*K))
|
||||||
|
continue;
|
||||||
|
Attribute Attr = K->getFnAttribute("omp_parallel_51_kernel");
|
||||||
|
if (Attr.isValid())
|
||||||
|
continue;
|
||||||
|
GlobalVariable *ExecMode =
|
||||||
|
M.getGlobalVariable((K->getName() + "_exec_mode").str());
|
||||||
|
assert(ExecMode && "kernel without exec mode");
|
||||||
|
assert(ExecMode->getInitializer() &&
|
||||||
|
"ExecMode doesn't have initializer!");
|
||||||
|
assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
|
||||||
|
"ExecMode is not an integer!");
|
||||||
|
const int8_t ExecModeVal =
|
||||||
|
cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
|
||||||
|
// Kernel is already in SPMD mode, skip.
|
||||||
|
if (ExecModeVal & OMP_TGT_EXEC_MODE_SPMD)
|
||||||
|
continue;
|
||||||
|
auto *NewExecModeC =
|
||||||
|
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD);
|
||||||
|
ExecMode->setInitializer(NewExecModeC);
|
||||||
|
OMPInformationCache::RuntimeFunctionInfo &TargetInitRFI =
|
||||||
|
OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
|
||||||
|
OMPInformationCache::RuntimeFunctionInfo &TargetDeInitRFI =
|
||||||
|
OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
|
||||||
|
CallInst *InitCI = nullptr;
|
||||||
|
CallInst *DeInitCI = nullptr;
|
||||||
|
for (User *U : TargetInitRFI.Declaration->users()) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(U);
|
||||||
|
if (!CI)
|
||||||
|
continue;
|
||||||
|
if (CI->getFunction() == K) {
|
||||||
|
InitCI = CI;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (User *U : TargetDeInitRFI.Declaration->users()) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(U);
|
||||||
|
if (!CI)
|
||||||
|
continue;
|
||||||
|
if (CI->getFunction() == K) {
|
||||||
|
DeInitCI = CI;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert(InitCI && DeInitCI && "kernel without init and deinit");
|
||||||
|
InitCI->setArgOperand(1, NewExecModeC);
|
||||||
|
InitCI->setArgOperand(2,
|
||||||
|
ConstantInt::getNullValue(Type::getInt1Ty(Ctx)));
|
||||||
|
DeInitCI->setArgOperand(1, NewExecModeC);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check the use of omp_get_thread_num and omp_get_num_threads.
|
||||||
|
OMPInformationCache::RuntimeFunctionInfo &ThreadNumRFI =
|
||||||
|
OMPInfoCache.RFIs[OMPRTL_omp_get_thread_num];
|
||||||
|
OMPInformationCache::RuntimeFunctionInfo &NumThreadsRFI =
|
||||||
|
OMPInfoCache.RFIs[OMPRTL_omp_get_num_threads];
|
||||||
|
|
||||||
|
auto CheckIfOnlyUsedByParallelKernel = [&](CallInst *CI) {
|
||||||
|
SmallVector<Kernel> ReachingKernels;
|
||||||
|
collectReachingKernels(CI->getFunction(),
|
||||||
|
ReachingKernels);
|
||||||
|
return ReachingKernels.size() < 2;
|
||||||
|
};
|
||||||
|
|
||||||
|
SmallVector<CallInst *> WorkList;
|
||||||
|
|
||||||
|
for (User *U : ThreadNumRFI.Declaration->users()) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(U);
|
||||||
|
if (!CI)
|
||||||
|
continue;
|
||||||
|
if (!CheckIfOnlyUsedByParallelKernel(CI))
|
||||||
|
continue;
|
||||||
|
WorkList.push_back(CI);
|
||||||
|
}
|
||||||
|
for (User *U : NumThreadsRFI.Declaration->users()) {
|
||||||
|
auto *CI = dyn_cast<CallInst>(U);
|
||||||
|
if (!CI)
|
||||||
|
continue;
|
||||||
|
if (!CheckIfOnlyUsedByParallelKernel(CI))
|
||||||
|
continue;
|
||||||
|
WorkList.push_back(CI);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (CallInst *CI : WorkList) {
|
||||||
|
Function *Callee = nullptr;
|
||||||
|
if (CI->getCalledFunction() == ThreadNumRFI.Declaration)
|
||||||
|
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_thread_num].Declaration;
|
||||||
|
if (CI->getCalledFunction() == NumThreadsRFI.Declaration)
|
||||||
|
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_num_threads].Declaration;
|
||||||
|
assert(Callee && "unknown callee");
|
||||||
|
|
||||||
|
auto &Builder = OMPInfoCache.OMPBuilder.Builder;
|
||||||
|
Builder.SetInsertPoint(CI);
|
||||||
|
Value *C = Builder.CreateCall(Callee);
|
||||||
|
CI->replaceAllUsesWith(C);
|
||||||
|
CI->eraseFromParent();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Changed;
|
||||||
|
}
|
||||||
|
|
||||||
/// Merge parallel regions when it is safe.
|
/// Merge parallel regions when it is safe.
|
||||||
bool mergeParallelRegions() {
|
bool mergeParallelRegions() {
|
||||||
const unsigned CallbackCalleeOperand = 2;
|
const unsigned CallbackCalleeOperand = 2;
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMTransformUtils
|
|||||||
CallGraphUpdater.cpp
|
CallGraphUpdater.cpp
|
||||||
CanonicalizeAliases.cpp
|
CanonicalizeAliases.cpp
|
||||||
CanonicalizeFreezeInLoops.cpp
|
CanonicalizeFreezeInLoops.cpp
|
||||||
|
CanonicalizeMainFunction.cpp
|
||||||
CloneFunction.cpp
|
CloneFunction.cpp
|
||||||
CloneModule.cpp
|
CloneModule.cpp
|
||||||
CodeExtractor.cpp
|
CodeExtractor.cpp
|
||||||
|
|||||||
103
llvm/lib/Transforms/Utils/CanonicalizeMainFunction.cpp
Normal file
103
llvm/lib/Transforms/Utils/CanonicalizeMainFunction.cpp
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
//===-------------- CanonicalizeMainFunction.cpp ----------------*- C++ -*-===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
// Utility function to canonicalize main function.
|
||||||
|
// The canonical main function is defined as: int main(int argc, char *argv[]);
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
|
||||||
|
#include "llvm/ADT/SmallVector.h"
|
||||||
|
#include "llvm/IR/Constants.h"
|
||||||
|
#include "llvm/IR/Instructions.h"
|
||||||
|
#include "llvm/Support/CommandLine.h"
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
#define DEBUG_TYPE "canonicalize-main-function"
|
||||||
|
|
||||||
|
static cl::opt<std::string>
|
||||||
|
MainFunctionName("canonical-main-function-name",
|
||||||
|
cl::desc("New main function name"),
|
||||||
|
cl::value_desc("main function name"));
|
||||||
|
|
||||||
|
bool rewriteMainFunction(Function &F) {
|
||||||
|
if (F.arg_size() == 2 && F.getReturnType()->isIntegerTy(32))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
auto &Ctx = F.getContext();
|
||||||
|
auto &DL = F.getParent()->getDataLayout();
|
||||||
|
auto *Int32Ty = IntegerType::getInt32Ty(Ctx);
|
||||||
|
auto *PtrTy = PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace());
|
||||||
|
|
||||||
|
FunctionType *NewFnTy =
|
||||||
|
FunctionType::get(Int32Ty, {Int32Ty, PtrTy}, /* isVarArg */ false);
|
||||||
|
Function *NewFn =
|
||||||
|
Function::Create(NewFnTy, F.getLinkage(), F.getAddressSpace(), "");
|
||||||
|
F.getParent()->getFunctionList().insert(F.getIterator(), NewFn);
|
||||||
|
NewFn->takeName(&F);
|
||||||
|
NewFn->copyAttributesFrom(&F);
|
||||||
|
NewFn->setSubprogram(F.getSubprogram());
|
||||||
|
F.setSubprogram(nullptr);
|
||||||
|
NewFn->splice(NewFn->begin(), &F);
|
||||||
|
|
||||||
|
if (!F.getReturnType()->isIntegerTy(32)) {
|
||||||
|
SmallVector<ReturnInst *> WorkList;
|
||||||
|
for (BasicBlock &BB : *NewFn)
|
||||||
|
for (Instruction &I : BB) {
|
||||||
|
auto *RI = dyn_cast<ReturnInst>(&I);
|
||||||
|
if (!RI)
|
||||||
|
continue;
|
||||||
|
assert(RI->getReturnValue() == nullptr &&
|
||||||
|
"return value of a void main function is not nullptr");
|
||||||
|
WorkList.push_back(RI);
|
||||||
|
}
|
||||||
|
for (auto *RI : WorkList) {
|
||||||
|
(void)ReturnInst::Create(Ctx, ConstantInt::getNullValue(Int32Ty), RI);
|
||||||
|
RI->eraseFromParent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (F.arg_size() == NewFn->arg_size())
|
||||||
|
for (unsigned I = 0; I < NewFn->arg_size(); ++I) {
|
||||||
|
Argument *OldArg = F.getArg(I);
|
||||||
|
Argument *NewArg = NewFn->getArg(I);
|
||||||
|
NewArg->takeName(OldArg);
|
||||||
|
OldArg->replaceAllUsesWith(NewArg);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
PreservedAnalyses CanonicalizeMainFunctionPass::run(Module &M,
|
||||||
|
ModuleAnalysisManager &AM) {
|
||||||
|
Function *MainFunc = nullptr;
|
||||||
|
|
||||||
|
for (Function &F : M)
|
||||||
|
if (F.getName() == "main") {
|
||||||
|
assert(MainFunc == nullptr && "more than one main function");
|
||||||
|
MainFunc = &F;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (MainFunc == nullptr)
|
||||||
|
return PreservedAnalyses::all();
|
||||||
|
|
||||||
|
bool Changed = false;
|
||||||
|
|
||||||
|
if (!MainFunctionName.empty() && MainFunc->getName() != MainFunctionName) {
|
||||||
|
MainFunc->setName(MainFunctionName);
|
||||||
|
Changed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rewriteMainFunction(*MainFunc)) {
|
||||||
|
MainFunc->eraseFromParent();
|
||||||
|
Changed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
|
||||||
|
}
|
||||||
@@ -89,6 +89,7 @@ set(include_files
|
|||||||
${include_directory}/Interface.h
|
${include_directory}/Interface.h
|
||||||
${include_directory}/LibC.h
|
${include_directory}/LibC.h
|
||||||
${include_directory}/Mapping.h
|
${include_directory}/Mapping.h
|
||||||
|
${include_directory}/Memory.h
|
||||||
${include_directory}/State.h
|
${include_directory}/State.h
|
||||||
${include_directory}/Synchronization.h
|
${include_directory}/Synchronization.h
|
||||||
${include_directory}/Types.h
|
${include_directory}/Types.h
|
||||||
@@ -99,6 +100,7 @@ set(src_files
|
|||||||
${source_directory}/Allocator.cpp
|
${source_directory}/Allocator.cpp
|
||||||
${source_directory}/Configuration.cpp
|
${source_directory}/Configuration.cpp
|
||||||
${source_directory}/Debug.cpp
|
${source_directory}/Debug.cpp
|
||||||
|
${source_directory}/HostRPC.cpp
|
||||||
${source_directory}/Kernel.cpp
|
${source_directory}/Kernel.cpp
|
||||||
${source_directory}/LibC.cpp
|
${source_directory}/LibC.cpp
|
||||||
${source_directory}/Mapping.cpp
|
${source_directory}/Mapping.cpp
|
||||||
@@ -112,6 +114,14 @@ set(src_files
|
|||||||
${source_directory}/Workshare.cpp
|
${source_directory}/Workshare.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if (LIBOMPTARGET_DEVICE_BUILTIN_ALLOCATOR)
|
||||||
|
list(APPEND src_files ${source_directory}/BuiltinAllocator.cpp)
|
||||||
|
elseif (LIBOMPTARGET_GENERIC_ALLOCATOR)
|
||||||
|
list(APPEND src_files ${source_directory}/GenericAllocator.cpp)
|
||||||
|
else()
|
||||||
|
list(APPEND src_files ${source_directory}/TeamAllocator.cpp)
|
||||||
|
endif()
|
||||||
|
|
||||||
# We disable the slp vectorizer during the runtime optimization to avoid
|
# We disable the slp vectorizer during the runtime optimization to avoid
|
||||||
# vectorized accesses to the shared state. Generally, those are "good" but
|
# vectorized accesses to the shared state. Generally, those are "good" but
|
||||||
# the optimizer pipeline (esp. Attributor) does not fully support vectorized
|
# the optimizer pipeline (esp. Attributor) does not fully support vectorized
|
||||||
@@ -138,6 +148,14 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
|
|||||||
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
|
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_DEBUG)
|
||||||
|
list(APPEND bc_flags "-DHOSTRPC_DEBUG")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_PROFILING)
|
||||||
|
list(APPEND bc_flags "-DHOSTRPC_PROFILING")
|
||||||
|
endif()
|
||||||
|
|
||||||
# first create an object target
|
# first create an object target
|
||||||
add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
|
add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
|
||||||
function(compileDeviceRTLLibrary target_cpu target_name target_triple)
|
function(compileDeviceRTLLibrary target_cpu target_name target_triple)
|
||||||
|
|||||||
@@ -39,6 +39,12 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
|
|||||||
__builtin_trap(); \
|
__builtin_trap(); \
|
||||||
__builtin_unreachable();
|
__builtin_unreachable();
|
||||||
|
|
||||||
|
#define assert(expr) \
|
||||||
|
{ \
|
||||||
|
if (!(expr)) \
|
||||||
|
__assert_fail(#expr, __FILE__, __LINE__, __PRETTY_FUNCTION__); \
|
||||||
|
}
|
||||||
|
|
||||||
///}
|
///}
|
||||||
|
|
||||||
#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);
|
#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);
|
||||||
|
|||||||
@@ -360,6 +360,22 @@ int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal);
|
|||||||
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
|
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
|
||||||
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
|
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
|
||||||
///}
|
///}
|
||||||
|
|
||||||
|
/// Host RPC
|
||||||
|
///
|
||||||
|
/// {
|
||||||
|
void *__kmpc_host_rpc_get_desc(int32_t CallNo, int32_t NumArgs, void *ArgInfo);
|
||||||
|
void __kmpc_host_rpc_add_arg(void *Desc, int64_t Arg, int32_t ArgNum);
|
||||||
|
int64_t __kmpc_host_rpc_send_and_wait(void *Desc);
|
||||||
|
/// }
|
||||||
|
|
||||||
|
/// Launch parallel kernel
|
||||||
|
///
|
||||||
|
///{
|
||||||
|
void __kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
|
||||||
|
int32_t if_expr, int32_t num_threads,
|
||||||
|
void **args, int64_t nargs);
|
||||||
|
///}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -14,12 +14,116 @@
|
|||||||
|
|
||||||
#include "Types.h"
|
#include "Types.h"
|
||||||
|
|
||||||
|
struct FILE;
|
||||||
|
extern FILE *stdin;
|
||||||
|
extern FILE *stdout;
|
||||||
|
extern FILE *stderr;
|
||||||
|
|
||||||
|
#ifndef _ASM_GENERIC_ERRNO_BASE_H
|
||||||
|
#define _ASM_GENERIC_ERRNO_BASE_H
|
||||||
|
|
||||||
|
#define EPERM 1 /* Operation not permitted */
|
||||||
|
#define ENOENT 2 /* No such file or directory */
|
||||||
|
#define ESRCH 3 /* No such process */
|
||||||
|
#define EINTR 4 /* Interrupted system call */
|
||||||
|
#define EIO 5 /* I/O error */
|
||||||
|
#define ENXIO 6 /* No such device or address */
|
||||||
|
#define E2BIG 7 /* Argument list too long */
|
||||||
|
#define ENOEXEC 8 /* Exec format error */
|
||||||
|
#define EBADF 9 /* Bad file number */
|
||||||
|
#define ECHILD 10 /* No child processes */
|
||||||
|
#define EAGAIN 11 /* Try again */
|
||||||
|
#define ENOMEM 12 /* Out of memory */
|
||||||
|
#define EACCES 13 /* Permission denied */
|
||||||
|
#define EFAULT 14 /* Bad address */
|
||||||
|
#define ENOTBLK 15 /* Block device required */
|
||||||
|
#define EBUSY 16 /* Device or resource busy */
|
||||||
|
#define EEXIST 17 /* File exists */
|
||||||
|
#define EXDEV 18 /* Cross-device link */
|
||||||
|
#define ENODEV 19 /* No such device */
|
||||||
|
#define ENOTDIR 20 /* Not a directory */
|
||||||
|
#define EISDIR 21 /* Is a directory */
|
||||||
|
#define EINVAL 22 /* Invalid argument */
|
||||||
|
#define ENFILE 23 /* File table overflow */
|
||||||
|
#define EMFILE 24 /* Too many open files */
|
||||||
|
#define ENOTTY 25 /* Not a typewriter */
|
||||||
|
#define ETXTBSY 26 /* Text file busy */
|
||||||
|
#define EFBIG 27 /* File too large */
|
||||||
|
#define ENOSPC 28 /* No space left on device */
|
||||||
|
#define ESPIPE 29 /* Illegal seek */
|
||||||
|
#define EROFS 30 /* Read-only file system */
|
||||||
|
#define EMLINK 31 /* Too many links */
|
||||||
|
#define EPIPE 32 /* Broken pipe */
|
||||||
|
#define EDOM 33 /* Math argument out of domain of func */
|
||||||
|
#define ERANGE 34 /* Math result not representable */
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define errno (*__errno_location())
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
||||||
int memcmp(const void *lhs, const void *rhs, size_t count);
|
int memcmp(const void *lhs, const void *rhs, size_t count);
|
||||||
void memset(void *dst, int C, size_t count);
|
void *memset(void *dst, int C, size_t count);
|
||||||
|
|
||||||
int printf(const char *format, ...);
|
int printf(const char *format, ...);
|
||||||
|
|
||||||
|
long strtol(const char *str, char **str_end, int base);
|
||||||
|
|
||||||
|
int strcmp(const char *lhs, const char *rhs);
|
||||||
|
|
||||||
|
void *calloc(size_t num, size_t size);
|
||||||
|
|
||||||
|
int strcasecmp(const char *string1, const char *string2);
|
||||||
|
|
||||||
|
void exit(int exit_code);
|
||||||
|
|
||||||
|
size_t strlen(const char *str);
|
||||||
|
|
||||||
|
int atoi(const char *str);
|
||||||
|
|
||||||
|
char *strcpy(char *dest, const char *src);
|
||||||
|
|
||||||
|
int stat(const char *path, struct stat *buf);
|
||||||
|
|
||||||
|
int *__errno_location();
|
||||||
|
|
||||||
|
char *strcat(char *dest, const char *src);
|
||||||
|
|
||||||
|
void perror(const char *s);
|
||||||
|
|
||||||
|
int strncmp(const char *lhs, const char *rhs, size_t count);
|
||||||
|
|
||||||
|
char *strncpy(char *dest, const char *src, size_t count);
|
||||||
|
|
||||||
|
char *strchr(const char *str, int ch);
|
||||||
|
|
||||||
|
char *strtok(char *str, const char *delim);
|
||||||
|
|
||||||
|
const unsigned short **__ctype_b_loc(void);
|
||||||
|
|
||||||
|
void *realloc(void *ptr, size_t new_size);
|
||||||
|
|
||||||
|
void qsort(void *const pbase, size_t total_elems, size_t size,
|
||||||
|
int (*comp)(const void *, const void *));
|
||||||
|
|
||||||
|
int gettimeofday(struct timeval *tv, struct timezone *tz);
|
||||||
|
|
||||||
|
char *__xpg_basename(const char *path);
|
||||||
|
|
||||||
|
void srand(unsigned seed);
|
||||||
|
|
||||||
|
int rand();
|
||||||
|
|
||||||
|
int abs(int n);
|
||||||
|
|
||||||
|
void *memcpy(void *dest, const void *src, size_t count);
|
||||||
|
|
||||||
|
double atof(const char *str);
|
||||||
|
|
||||||
|
double strtod(const char *str, char **ptr);
|
||||||
|
|
||||||
|
long strtol(const char *nptr, char **endptr, int base);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
39
openmp/libomptarget/DeviceRTL/include/Memory.h
Normal file
39
openmp/libomptarget/DeviceRTL/include/Memory.h
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
//===--- Memory.h - OpenMP device runtime memory allocator -------- C++ -*-===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
//
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef OMPTARGET_MEMORY_H
|
||||||
|
#define OMPTARGET_MEMORY_H
|
||||||
|
|
||||||
|
#include "Types.h"
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
__attribute__((leaf)) void *malloc(size_t Size);
|
||||||
|
__attribute__((leaf)) void free(void *Ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace ompx {
|
||||||
|
namespace memory {
|
||||||
|
|
||||||
|
struct MemoryAllocationInfo {
|
||||||
|
void *BasePtr = nullptr;
|
||||||
|
size_t Size = 0;
|
||||||
|
|
||||||
|
bool isValid() const { return BasePtr; }
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Get the memory allocation information if pointer \p P is in the range of one
|
||||||
|
/// of the buffer allocated by \p malloc.
|
||||||
|
MemoryAllocationInfo getMemoryAllocationInfo(void *P);
|
||||||
|
} // namespace memory
|
||||||
|
} // namespace ompx
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -36,6 +36,9 @@ enum MemScopeTy {
|
|||||||
uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
|
uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
|
||||||
MemScopeTy MemScope = MemScopeTy::all);
|
MemScopeTy MemScope = MemScopeTy::all);
|
||||||
|
|
||||||
|
///
|
||||||
|
int32_t addSys(int32_t *Addr, int32_t V);
|
||||||
|
|
||||||
/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
|
/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
|
||||||
/// result is stored in \p *Addr;
|
/// result is stored in \p *Addr;
|
||||||
/// {
|
/// {
|
||||||
@@ -135,6 +138,60 @@ void system(atomic::OrderingTy Ordering);
|
|||||||
|
|
||||||
} // namespace fence
|
} // namespace fence
|
||||||
|
|
||||||
|
namespace atomic {
|
||||||
|
|
||||||
|
/// Atomically load \p Addr with \p Ordering semantics.
|
||||||
|
uint32_t load(uint32_t *Addr, int Ordering);
|
||||||
|
|
||||||
|
/// Atomically load \p Addr with \p Ordering semantics.
|
||||||
|
uint64_t load(uint64_t *Addr, int Ordering);
|
||||||
|
|
||||||
|
/// Atomically store \p V to \p Addr with \p Ordering semantics.
|
||||||
|
void store(uint32_t *Addr, uint32_t V, int Ordering);
|
||||||
|
|
||||||
|
void store(uint64_t *Addr, uint64_t V, int Ordering);
|
||||||
|
|
||||||
|
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
|
||||||
|
uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);
|
||||||
|
|
||||||
|
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
|
||||||
|
uint32_t add(uint32_t *Addr, uint32_t V, int Ordering);
|
||||||
|
|
||||||
|
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
|
||||||
|
uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);
|
||||||
|
|
||||||
|
} // namespace atomic
|
||||||
|
|
||||||
|
namespace mutex {
|
||||||
|
|
||||||
|
class TicketLock {
|
||||||
|
uint64_t NowServing = 0;
|
||||||
|
uint64_t NextTicket = 0;
|
||||||
|
|
||||||
|
public:
|
||||||
|
TicketLock() = default;
|
||||||
|
|
||||||
|
TicketLock(const TicketLock &) = delete;
|
||||||
|
|
||||||
|
TicketLock(TicketLock &&) = delete;
|
||||||
|
|
||||||
|
void lock();
|
||||||
|
|
||||||
|
void unlock();
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T> class LockGuard {
|
||||||
|
T &Lock;
|
||||||
|
|
||||||
|
public:
|
||||||
|
explicit LockGuard(T &L) : Lock(L) { Lock.lock(); }
|
||||||
|
|
||||||
|
~LockGuard() { Lock.unlock(); }
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace mutex
|
||||||
|
|
||||||
} // namespace ompx
|
} // namespace ompx
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ using size_t = decltype(sizeof(char));
|
|||||||
// TODO: Properly implement this
|
// TODO: Properly implement this
|
||||||
using intptr_t = int64_t;
|
using intptr_t = int64_t;
|
||||||
using uintptr_t = uint64_t;
|
using uintptr_t = uint64_t;
|
||||||
|
using ptrdiff_t = intptr_t;
|
||||||
|
|
||||||
static_assert(sizeof(int8_t) == 1, "type size mismatch");
|
static_assert(sizeof(int8_t) == 1, "type size mismatch");
|
||||||
static_assert(sizeof(uint8_t) == 1, "type size mismatch");
|
static_assert(sizeof(uint8_t) == 1, "type size mismatch");
|
||||||
@@ -83,6 +84,11 @@ enum kmp_sched_t {
|
|||||||
kmp_sched_distr_static_nochunk = 92,
|
kmp_sched_distr_static_nochunk = 92,
|
||||||
kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
|
kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
|
||||||
|
|
||||||
|
// The following two are for direct GPU compilation where we don't have outer
|
||||||
|
// loop to distribute workload among teams but we still need to.
|
||||||
|
kmp_sched_distr_parallel_static_noloop_chunk = 100,
|
||||||
|
kmp_sched_distr_parallel_static_noloop_nochunk = 101,
|
||||||
|
|
||||||
kmp_sched_default = kmp_sched_static_nochunk,
|
kmp_sched_default = kmp_sched_static_nochunk,
|
||||||
kmp_sched_unordered_first = kmp_sched_static_chunk,
|
kmp_sched_unordered_first = kmp_sched_static_chunk,
|
||||||
kmp_sched_unordered_last = kmp_sched_auto,
|
kmp_sched_unordered_last = kmp_sched_auto,
|
||||||
|
|||||||
@@ -16,6 +16,9 @@
|
|||||||
|
|
||||||
#pragma omp begin declare target device_type(nohost)
|
#pragma omp begin declare target device_type(nohost)
|
||||||
|
|
||||||
|
extern "C" double omp_get_wtime();
|
||||||
|
extern "C" int printf(const char *, ...);
|
||||||
|
|
||||||
namespace ompx {
|
namespace ompx {
|
||||||
namespace utils {
|
namespace utils {
|
||||||
|
|
||||||
@@ -37,8 +40,8 @@ template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Advance \p Ptr by \p Bytes bytes.
|
/// Advance \p Ptr by \p Bytes bytes.
|
||||||
template <typename Ty1, typename Ty2> inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) {
|
template <typename Ty1, typename Ty2> inline Ty1 advance(Ty1 Ptr, Ty2 Bytes) {
|
||||||
return reinterpret_cast<Ty1 *>(reinterpret_cast<char *>(Ptr) + Bytes);
|
return reinterpret_cast<Ty1>(reinterpret_cast<char *>(Ptr) + Bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the first bit set in \p V.
|
/// Return the first bit set in \p V.
|
||||||
@@ -82,12 +85,39 @@ template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
|
|||||||
return *((DstTy *)(&V));
|
return *((DstTy *)(&V));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename Ty = char>
|
||||||
|
ptrdiff_t getPtrDiff(const void *End, const void *Begin) {
|
||||||
|
return reinterpret_cast<const Ty *>(End) -
|
||||||
|
reinterpret_cast<const Ty *>(Begin);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool isInRange(void *Ptr, void *BasePtr, int64_t Offset) {
|
||||||
|
ptrdiff_t Diff = getPtrDiff(Ptr, BasePtr);
|
||||||
|
return Diff >= 0 && Diff < Offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline intptr_t ptrtoint(void *Ptr) { return reinterpret_cast<intptr_t>(Ptr); }
|
||||||
|
|
||||||
|
template <typename T> T min(T a, T b) { return a < b ? a : b; }
|
||||||
|
|
||||||
/// A pointer variable that has by design an `undef` value. Use with care.
|
/// A pointer variable that has by design an `undef` value. Use with care.
|
||||||
[[clang::loader_uninitialized]] static void *const UndefPtr;
|
[[clang::loader_uninitialized]] static void *const UndefPtr;
|
||||||
|
|
||||||
#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
|
#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
|
||||||
#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
|
#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
|
||||||
|
|
||||||
|
class SimpleProfiler {
|
||||||
|
const char *HeadLine = nullptr;
|
||||||
|
double Start;
|
||||||
|
|
||||||
|
public:
|
||||||
|
SimpleProfiler(const char *HL) : HeadLine(HL), Start(omp_get_wtime()) {}
|
||||||
|
~SimpleProfiler() {
|
||||||
|
double End = omp_get_wtime();
|
||||||
|
printf("%s --> %lf s.\n", HeadLine, End - Start);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace utils
|
} // namespace utils
|
||||||
} // namespace ompx
|
} // namespace ompx
|
||||||
|
|
||||||
|
|||||||
40
openmp/libomptarget/DeviceRTL/src/BuiltinAllocator.cpp
Normal file
40
openmp/libomptarget/DeviceRTL/src/BuiltinAllocator.cpp
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
//===------- BuiltinAllocator.cpp - Generic GPU memory allocator -- C++ -*-===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#pragma omp begin declare target device_type(nohost)
|
||||||
|
|
||||||
|
#include "Debug.h"
|
||||||
|
#include "Memory.h"
|
||||||
|
#include "Utils.h"
|
||||||
|
|
||||||
|
using namespace ompx;
|
||||||
|
|
||||||
|
namespace ompx {
|
||||||
|
namespace memory {
|
||||||
|
MemoryAllocationInfo getMemoryAllocationInfo(void *P) { return {}; }
|
||||||
|
|
||||||
|
void init() {}
|
||||||
|
} // namespace memory
|
||||||
|
} // namespace ompx
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
void *realloc(void *ptr, size_t new_size) {
|
||||||
|
void *NewPtr = malloc(new_size);
|
||||||
|
if (!NewPtr)
|
||||||
|
return nullptr;
|
||||||
|
__builtin_memcpy(NewPtr, ptr, new_size);
|
||||||
|
free(ptr);
|
||||||
|
return NewPtr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma omp end declare target
|
||||||
17
openmp/libomptarget/DeviceRTL/src/CMakeLists.txt
Normal file
17
openmp/libomptarget/DeviceRTL/src/CMakeLists.txt
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
target_sources(omptarget.devicertl PRIVATE
|
||||||
|
Configuration.cpp
|
||||||
|
Debug.cpp
|
||||||
|
HostRPC.cpp
|
||||||
|
GenericAllocator.cpp
|
||||||
|
Kernel.cpp
|
||||||
|
LibC.cpp
|
||||||
|
Mapping.cpp
|
||||||
|
Misc.cpp
|
||||||
|
Parallelism.cpp
|
||||||
|
Reduction.cpp
|
||||||
|
State.cpp
|
||||||
|
Synchronization.cpp
|
||||||
|
Tasking.cpp
|
||||||
|
Utils.cpp
|
||||||
|
WarpAllocator.cpp
|
||||||
|
Workshare.cpp)
|
||||||
310
openmp/libomptarget/DeviceRTL/src/GenericAllocator.cpp
Normal file
310
openmp/libomptarget/DeviceRTL/src/GenericAllocator.cpp
Normal file
@@ -0,0 +1,310 @@
|
|||||||
|
//===------- GenericAllocator.cpp - Generic GPU memory allocator -- C++ -*-===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#pragma omp begin declare target device_type(nohost)
|
||||||
|
|
||||||
|
#include "Debug.h"
|
||||||
|
#include "Memory.h"
|
||||||
|
#include "Synchronization.h"
|
||||||
|
#include "Utils.h"
|
||||||
|
|
||||||
|
using namespace ompx;
|
||||||
|
|
||||||
|
char *CONSTANT(omptarget_device_heap_buffer)
|
||||||
|
__attribute__((used, retain, weak, visibility("protected")));
|
||||||
|
|
||||||
|
size_t CONSTANT(omptarget_device_heap_size)
|
||||||
|
__attribute__((used, retain, weak, visibility("protected")));
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
size_t HeapCurPos = 0;
|
||||||
|
mutex::TicketLock HeapLock;
|
||||||
|
mutex::TicketLock AllocationListLock;
|
||||||
|
mutex::TicketLock FreeListLock;
|
||||||
|
constexpr const size_t Alignment = 16;
|
||||||
|
|
||||||
|
using intptr_t = int64_t;
|
||||||
|
|
||||||
|
struct SimpleLinkListNode {
|
||||||
|
template <typename T> friend struct SimpleLinkList;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
SimpleLinkListNode *Prev = nullptr;
|
||||||
|
SimpleLinkListNode *Next = nullptr;
|
||||||
|
|
||||||
|
bool operator>(const SimpleLinkListNode &RHS) const {
|
||||||
|
const auto This = reinterpret_cast<int64_t>(this);
|
||||||
|
const auto RHSThis = reinterpret_cast<int64_t>(&RHS);
|
||||||
|
return This > RHSThis;
|
||||||
|
}
|
||||||
|
|
||||||
|
SimpleLinkListNode() = default;
|
||||||
|
SimpleLinkListNode(const SimpleLinkListNode &) = delete;
|
||||||
|
SimpleLinkListNode(SimpleLinkListNode &&) = delete;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct AllocationMetadata final : SimpleLinkListNode {
|
||||||
|
public:
|
||||||
|
size_t getUserSize() const;
|
||||||
|
void *getUserAddr() const {
|
||||||
|
return const_cast<void *>(reinterpret_cast<const void *>(this + 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t getSize() const { return Size; }
|
||||||
|
void setSize(size_t V) { Size = V; }
|
||||||
|
|
||||||
|
bool isInRange(void *Ptr) {
|
||||||
|
return utils::isInRange(Ptr, this + 1, getUserSize());
|
||||||
|
}
|
||||||
|
|
||||||
|
static AllocationMetadata *getFromUserAddr(void *P) {
|
||||||
|
return getFromAddr(P) - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static AllocationMetadata *getFromAddr(void *P) {
|
||||||
|
return reinterpret_cast<AllocationMetadata *>(P);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
size_t Size = 0;
|
||||||
|
int64_t Reserved = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
constexpr const size_t AllocationMetadataSize = sizeof(AllocationMetadata);
|
||||||
|
static_assert(AllocationMetadataSize == 32,
|
||||||
|
"expect the metadata size to be 32");
|
||||||
|
|
||||||
|
size_t AllocationMetadata::getUserSize() const {
|
||||||
|
return Size - AllocationMetadataSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T> struct SimpleLinkList {
|
||||||
|
struct iterator {
|
||||||
|
friend SimpleLinkList;
|
||||||
|
|
||||||
|
T *operator->() { return reinterpret_cast<T *>(Node); }
|
||||||
|
T &operator*() { return reinterpret_cast<T &>(*Node); }
|
||||||
|
|
||||||
|
iterator operator++() {
|
||||||
|
Node = Node->Next;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool operator==(const iterator &RHS) const { return Node == RHS.Node; }
|
||||||
|
bool operator!=(const iterator &RHS) const { return !(*this == RHS); }
|
||||||
|
|
||||||
|
iterator next() const {
|
||||||
|
iterator Itr;
|
||||||
|
Itr.Node = Node->Next;
|
||||||
|
return Itr;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
SimpleLinkListNode *Node = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
iterator begin() {
|
||||||
|
iterator Itr;
|
||||||
|
Itr.Node = Head.Next;
|
||||||
|
return Itr;
|
||||||
|
}
|
||||||
|
|
||||||
|
iterator end() { return {}; }
|
||||||
|
|
||||||
|
void insert(SimpleLinkListNode *Node) { insertImpl(&Head, Node); }
|
||||||
|
|
||||||
|
void remove(iterator Itr) {
|
||||||
|
SimpleLinkListNode *Node = Itr.Node;
|
||||||
|
remove(Node);
|
||||||
|
}
|
||||||
|
|
||||||
|
void remove(SimpleLinkListNode *Node) { removeImpl(Node); }
|
||||||
|
|
||||||
|
bool empty() const { return Head.Next == nullptr; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
static void insertImpl(SimpleLinkListNode *Current,
|
||||||
|
SimpleLinkListNode *Node) {
|
||||||
|
SimpleLinkListNode *OldNext = Current->Next;
|
||||||
|
Node->Prev = Current;
|
||||||
|
Node->Next = OldNext;
|
||||||
|
if (OldNext)
|
||||||
|
OldNext->Prev = Node;
|
||||||
|
Current->Next = Node;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void removeImpl(SimpleLinkListNode *Node) {
|
||||||
|
SimpleLinkListNode *Prev = Node->Prev;
|
||||||
|
SimpleLinkListNode *Next = Node->Next;
|
||||||
|
Prev->Next = Next;
|
||||||
|
if (Next)
|
||||||
|
Next->Prev = Prev;
|
||||||
|
Node->Prev = nullptr;
|
||||||
|
Node->Next = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if the node is in the list.
|
||||||
|
bool checkExist(SimpleLinkListNode *Node) const {
|
||||||
|
SimpleLinkListNode *P = Head.Next;
|
||||||
|
while (P) {
|
||||||
|
if (P == Node)
|
||||||
|
return true;
|
||||||
|
P = P->Next;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool checkSanity(SimpleLinkListNode *Current,
|
||||||
|
SimpleLinkListNode *Next) {
|
||||||
|
return Current->Next == Next && (!Next || Next->Prev == Current);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool checkSanity(SimpleLinkListNode *Prev, SimpleLinkListNode *Current,
|
||||||
|
SimpleLinkListNode *Next) {
|
||||||
|
return Prev->Next == Current && Current->Next == Next &&
|
||||||
|
Current->Prev == Prev && (!Next || Next->Prev == Current);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool checkDangling(SimpleLinkListNode *Node) {
|
||||||
|
return Node->Next == nullptr && Node->Prev == nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
SimpleLinkListNode Head;
|
||||||
|
};
|
||||||
|
|
||||||
|
SimpleLinkList<AllocationMetadata> AllocationList;
|
||||||
|
SimpleLinkList<AllocationMetadata> FreeList;
|
||||||
|
|
||||||
|
constexpr const int SplitRatio = 5;
|
||||||
|
|
||||||
|
#ifndef LIBOMPTARGET_MEMORY_PROFILING
|
||||||
|
struct MemoryProfiler {
|
||||||
|
MemoryProfiler(const char *S) {}
|
||||||
|
~MemoryProfiler() {}
|
||||||
|
};
|
||||||
|
#else
|
||||||
|
struct MemoryProfiler : utils::SimpleProfiler {
|
||||||
|
MemoryProfiler(const char *S) : utils::SimpleProfiler(S) {}
|
||||||
|
~MemoryProfiler() { utils::SimpleProfiler::~SimpleProfiler(); }
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
namespace ompx {
|
||||||
|
namespace memory {
|
||||||
|
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
|
||||||
|
mutex::LockGuard ALG(AllocationListLock);
|
||||||
|
for (auto Itr = AllocationList.begin(); Itr != AllocationList.end(); ++Itr) {
|
||||||
|
AllocationMetadata *MD = &(*Itr);
|
||||||
|
if (MD->isInRange(P))
|
||||||
|
return {MD->getUserAddr(), MD->getUserSize()};
|
||||||
|
}
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
void init() {}
|
||||||
|
} // namespace memory
|
||||||
|
} // namespace ompx
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
void *memset(void *dest, int ch, size_t count);
|
||||||
|
|
||||||
|
void *malloc(size_t Size) {
|
||||||
|
MemoryProfiler Profiler(__FUNCTION__);
|
||||||
|
|
||||||
|
Size = utils::align_up(Size + AllocationMetadataSize, Alignment);
|
||||||
|
|
||||||
|
AllocationMetadata *MD = nullptr;
|
||||||
|
|
||||||
|
{
|
||||||
|
mutex::LockGuard FLG(FreeListLock);
|
||||||
|
|
||||||
|
auto Itr = FreeList.begin();
|
||||||
|
for (; Itr != FreeList.end(); ++Itr) {
|
||||||
|
if (Itr->getSize() >= Size)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool Found = Itr != FreeList.end() && (Itr->getSize() / Size < SplitRatio);
|
||||||
|
if (Found) {
|
||||||
|
MD = &(*Itr);
|
||||||
|
FreeList.remove(Itr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (MD) {
|
||||||
|
mutex::LockGuard ALG(AllocationListLock);
|
||||||
|
AllocationList.insert(MD);
|
||||||
|
return MD->getUserAddr();
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
mutex::LockGuard LG(HeapLock);
|
||||||
|
|
||||||
|
if (Size + HeapCurPos < omptarget_device_heap_size) {
|
||||||
|
void *R = omptarget_device_heap_buffer + HeapCurPos;
|
||||||
|
(void)atomic::add(&HeapCurPos, Size, atomic::acq_rel);
|
||||||
|
MD = AllocationMetadata::getFromAddr(R);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (MD) {
|
||||||
|
// We need to reset the head in case of any dirty data.
|
||||||
|
memset(MD, 0, AllocationMetadataSize);
|
||||||
|
MD->setSize(Size);
|
||||||
|
mutex::LockGuard ALG(AllocationListLock);
|
||||||
|
AllocationList.insert(MD);
|
||||||
|
return MD->getUserAddr();
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("out of heap memory! size=%lu, cur=%lu.\n", Size, HeapCurPos);
|
||||||
|
|
||||||
|
printf("%s:%d\n", __FILE__, __LINE__);
|
||||||
|
__builtin_trap();
|
||||||
|
}
|
||||||
|
|
||||||
|
void free(void *P) {
|
||||||
|
MemoryProfiler Profiler(__FUNCTION__);
|
||||||
|
|
||||||
|
if (!P)
|
||||||
|
return;
|
||||||
|
|
||||||
|
auto *MD = AllocationMetadata::getFromUserAddr(P);
|
||||||
|
|
||||||
|
{
|
||||||
|
mutex::LockGuard ALG(AllocationListLock);
|
||||||
|
AllocationList.remove(MD);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
mutex::LockGuard FLG(FreeListLock);
|
||||||
|
FreeList.insert(MD);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void *realloc(void *ptr, size_t new_size) {
|
||||||
|
MemoryProfiler Profiler(__FUNCTION__);
|
||||||
|
|
||||||
|
void *NewPtr = malloc(new_size);
|
||||||
|
if (!NewPtr)
|
||||||
|
return nullptr;
|
||||||
|
auto *OldMD = AllocationMetadata::getFromUserAddr(ptr);
|
||||||
|
assert(ptr == OldMD->getUserAddr());
|
||||||
|
__builtin_memcpy(NewPtr, ptr, utils::min(OldMD->getUserSize(), new_size));
|
||||||
|
free(ptr);
|
||||||
|
return NewPtr;
|
||||||
|
}
|
||||||
|
|
||||||
|
void __kmpc_target_init_allocator() {}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma omp end declare target
|
||||||
420
openmp/libomptarget/DeviceRTL/src/HostRPC.cpp
Normal file
420
openmp/libomptarget/DeviceRTL/src/HostRPC.cpp
Normal file
@@ -0,0 +1,420 @@
|
|||||||
|
//===------- HostRPC.cpp - Implementation of host RPC ------------- C++ ---===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#pragma omp begin declare target device_type(nohost)
|
||||||
|
|
||||||
|
#include "HostRPC.h"
|
||||||
|
|
||||||
|
#include "Debug.h"
|
||||||
|
#include "LibC.h"
|
||||||
|
#include "Memory.h"
|
||||||
|
#include "Synchronization.h"
|
||||||
|
#include "Types.h"
|
||||||
|
#include "Utils.h"
|
||||||
|
|
||||||
|
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
|
||||||
|
|
||||||
|
#ifdef HOSTRPC_DEBUG
|
||||||
|
#define DEBUG_PREFIX "host-rpc-device"
|
||||||
|
#define DP(FMT, ...) \
|
||||||
|
{ printf("%s --> " FMT, DEBUG_PREFIX, __VA_ARGS__); }
|
||||||
|
#else
|
||||||
|
#define DP(FMT, ...)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
using namespace ompx;
|
||||||
|
using namespace hostrpc;
|
||||||
|
|
||||||
|
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
|
||||||
|
|
||||||
|
Descriptor *omptarget_hostrpc_descriptor
|
||||||
|
__attribute__((used, retain, weak, visibility("protected")));
|
||||||
|
int32_t *omptarget_hostrpc_futex
|
||||||
|
__attribute__((used, retain, weak, visibility("protected")));
|
||||||
|
char *omptarget_hostrpc_memory_buffer
|
||||||
|
__attribute__((used, retain, weak, visibility("protected")));
|
||||||
|
size_t omptarget_hostrpc_memory_buffer_size
|
||||||
|
__attribute__((used, retain, weak, visibility("protected")));
|
||||||
|
|
||||||
|
#ifdef HOSTRPC_PROFILING
|
||||||
|
int32_t HostRPCId;
|
||||||
|
double GetDescStart;
|
||||||
|
double GetDescEnd;
|
||||||
|
double AddArgStart;
|
||||||
|
double AddArgEnd;
|
||||||
|
double IssueAndWaitStart;
|
||||||
|
double IssueAndWaitEnd;
|
||||||
|
double CopyBackStart;
|
||||||
|
double CopyBackEnd;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
size_t HostRPCMemoryBufferCurrentPosition = 0;
|
||||||
|
constexpr const size_t Alignment = 16;
|
||||||
|
|
||||||
|
// FIXME: For now we only allow one thread requesting host RPC.
|
||||||
|
mutex::TicketLock HostRPCLock;
|
||||||
|
|
||||||
|
void *HostRPCMemAlloc(size_t Size) {
|
||||||
|
Size = utils::align_up(Size, Alignment);
|
||||||
|
|
||||||
|
if (Size + HostRPCMemoryBufferCurrentPosition <
|
||||||
|
omptarget_hostrpc_memory_buffer_size) {
|
||||||
|
void *R =
|
||||||
|
omptarget_hostrpc_memory_buffer + HostRPCMemoryBufferCurrentPosition;
|
||||||
|
atomic::add(&HostRPCMemoryBufferCurrentPosition, Size, atomic::acq_rel);
|
||||||
|
return R;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s:%d\n", __FILE__, __LINE__);
|
||||||
|
__builtin_trap();
|
||||||
|
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
// For now we just reset the buffer.
|
||||||
|
void HostRPCMemReset() { HostRPCMemoryBufferCurrentPosition = 0; }
|
||||||
|
|
||||||
|
static_assert(sizeof(intptr_t) == sizeof(int64_t), "pointer size not match");
|
||||||
|
|
||||||
|
struct HostRPCArgInfo {
|
||||||
|
void *BasePtr;
|
||||||
|
int64_t Type;
|
||||||
|
int64_t Size;
|
||||||
|
HostRPCArgInfo *Next;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct HostRPCPointerMapEntry {
|
||||||
|
void *BasePtr;
|
||||||
|
void *MappedBasePtr;
|
||||||
|
int64_t Size;
|
||||||
|
int64_t Kind;
|
||||||
|
};
|
||||||
|
|
||||||
|
void *getMappedPointer(Descriptor *D, void *BasePtr, int64_t Size,
|
||||||
|
int64_t Offset, int64_t Kind) {
|
||||||
|
assert(D->ArgMap && "ArgMap should not be nullptr");
|
||||||
|
|
||||||
|
HostRPCPointerMapEntry *MapTable =
|
||||||
|
reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
|
||||||
|
int I = 0;
|
||||||
|
for (; I < D->NumArgs && MapTable[I].BasePtr; ++I)
|
||||||
|
if (MapTable[I].BasePtr == BasePtr)
|
||||||
|
return utils::advance(MapTable[I].MappedBasePtr, Offset);
|
||||||
|
|
||||||
|
MapTable[I].BasePtr = BasePtr;
|
||||||
|
MapTable[I].MappedBasePtr = HostRPCMemAlloc(Size);
|
||||||
|
MapTable[I].Size = Size;
|
||||||
|
MapTable[I].Kind = Kind;
|
||||||
|
|
||||||
|
if (Kind & ArgType::OMP_HOST_RPC_ARG_COPY_TO) {
|
||||||
|
__builtin_memcpy(MapTable[I].MappedBasePtr, BasePtr, Size);
|
||||||
|
DP("getMappedPointer: copy %ld bytes memory from %p to %p.\n", Size,
|
||||||
|
BasePtr, MapTable[I].MappedBasePtr);
|
||||||
|
}
|
||||||
|
|
||||||
|
return utils::advance(MapTable[I].MappedBasePtr, Offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
void copybackIfNeeded(Descriptor *D) {
|
||||||
|
if (!D->ArgMap)
|
||||||
|
return;
|
||||||
|
|
||||||
|
auto *MapTable = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
|
||||||
|
for (int I = 0; I < D->NumArgs && MapTable[I].BasePtr; ++I)
|
||||||
|
if (MapTable[I].Kind & ArgType::OMP_HOST_RPC_ARG_COPY_FROM) {
|
||||||
|
__builtin_memcpy(MapTable[I].BasePtr, MapTable[I].MappedBasePtr,
|
||||||
|
MapTable[I].Size);
|
||||||
|
DP("copybackIfNeeded: copy %ld bytes memory from %p to %p.\n",
|
||||||
|
MapTable[I].Size, MapTable[I].MappedBasePtr, MapTable[I].BasePtr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
__attribute__((noinline, used)) void *
|
||||||
|
__kmpc_host_rpc_get_desc(int32_t CallId, int32_t NumArgs, void *ArgInfo) {
|
||||||
|
assert(omptarget_hostrpc_descriptor && omptarget_hostrpc_futex &&
|
||||||
|
"no host rpc pointer");
|
||||||
|
|
||||||
|
DP("device: stdin=%p, stdout=%p, stderr=%p\n", stdin, stdout, stderr);
|
||||||
|
DP("get desc for request (id=%d), NumArgs=%d, ArgInfo=%p.\n", CallId, NumArgs,
|
||||||
|
ArgInfo);
|
||||||
|
#ifdef HOSTRPC_DEBUG
|
||||||
|
{
|
||||||
|
void **AIs = reinterpret_cast<void **>(ArgInfo);
|
||||||
|
for (int I = 0; I < NumArgs; ++I)
|
||||||
|
DP("ArgInfo[%d]=%p.\n", I, AIs[I]);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
HostRPCLock.lock();
|
||||||
|
|
||||||
|
#ifdef HOSTRPC_PROFILING
|
||||||
|
HostRPCId = CallId;
|
||||||
|
GetDescStart = omp_get_wtime();
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// TODO: change it after we support a queue-like data structure.
|
||||||
|
Descriptor *D = omptarget_hostrpc_descriptor;
|
||||||
|
|
||||||
|
D->Id = CallId;
|
||||||
|
D->ArgInfo = reinterpret_cast<void **>(ArgInfo);
|
||||||
|
D->NumArgs = NumArgs;
|
||||||
|
D->Status = EXEC_STAT_CREATED;
|
||||||
|
D->ReturnValue = 0;
|
||||||
|
D->Args =
|
||||||
|
reinterpret_cast<Argument *>(HostRPCMemAlloc(sizeof(Argument) * NumArgs));
|
||||||
|
D->ArgMap = HostRPCMemAlloc(sizeof(HostRPCPointerMapEntry) * NumArgs);
|
||||||
|
|
||||||
|
assert(!NumArgs || (D->Args && D->ArgMap) && "out of host rpc memory!");
|
||||||
|
|
||||||
|
// Reset the map table.
|
||||||
|
auto *ArgMap = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
|
||||||
|
for (int I = 0; I < NumArgs; ++I)
|
||||||
|
ArgMap[I].BasePtr = nullptr;
|
||||||
|
|
||||||
|
#ifdef HOSTRPC_PROFILING
|
||||||
|
GetDescEnd = omp_get_wtime();
|
||||||
|
AddArgStart = omp_get_wtime();
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return D;
|
||||||
|
}
|
||||||
|
|
||||||
|
__attribute__((noinline, used)) void
|
||||||
|
__kmpc_host_rpc_add_arg(void *Desc, int64_t ArgVal, int32_t ArgNum) {
|
||||||
|
auto *D = reinterpret_cast<Descriptor *>(Desc);
|
||||||
|
assert(ArgNum < D->NumArgs && "out-of-range arguments");
|
||||||
|
Argument &ArgInDesc = D->Args[ArgNum];
|
||||||
|
|
||||||
|
DP("add arg (no=%d), arg=%lx to request (id=%d).\n", ArgNum, ArgVal, D->Id);
|
||||||
|
|
||||||
|
// This early branch can rule out nullptr and zero scalar value because it
|
||||||
|
// doesn't matter whether it is a pointer or scalar value.
|
||||||
|
if (ArgVal == 0) {
|
||||||
|
ArgInDesc.Value = 0;
|
||||||
|
ArgInDesc.ArgType = Type::ARG_LITERAL;
|
||||||
|
|
||||||
|
DP("arg (no=%d) is null, done.\n", ArgNum);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
void *ArgPtr = reinterpret_cast<void *>(ArgVal);
|
||||||
|
|
||||||
|
if (ArgPtr == stdin || ArgPtr == stdout || ArgPtr == stderr) {
|
||||||
|
ArgInDesc.Value = ArgVal;
|
||||||
|
ArgInDesc.ArgType = Type::ARG_POINTER;
|
||||||
|
|
||||||
|
DP("arg (no=%d) is stdin/stdout/stderr, done.\n", ArgNum);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto *AI = reinterpret_cast<HostRPCArgInfo *>(D->ArgInfo[ArgNum]);
|
||||||
|
|
||||||
|
DP("try to find arg (no=%d) from args AI=%p\n", ArgNum, AI);
|
||||||
|
|
||||||
|
if (AI) {
|
||||||
|
// Let's first check if Arg is a scalar.
|
||||||
|
if (AI->Type == ArgType::OMP_HOST_RPC_ARG_SCALAR) {
|
||||||
|
assert(AI->BasePtr == ArgPtr && "invalid scalar argument info");
|
||||||
|
assert(AI->Next == nullptr && "invalid scalar argument info");
|
||||||
|
|
||||||
|
ArgInDesc.Value = ArgVal;
|
||||||
|
ArgInDesc.ArgType = Type::ARG_LITERAL;
|
||||||
|
|
||||||
|
DP("arg (no=%d) is scalar, done.\n", ArgNum);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Then let's see if it is a literal pointer that we don't need copy.
|
||||||
|
auto *P = AI;
|
||||||
|
while (P) {
|
||||||
|
if (P->Type == ArgType::OMP_HOST_RPC_ARG_PTR && P->BasePtr == ArgPtr) {
|
||||||
|
ArgInDesc.Value = ArgVal;
|
||||||
|
ArgInDesc.ArgType = Type::ARG_POINTER;
|
||||||
|
|
||||||
|
DP("arg (no=%d) is literal pointer, done.\n", ArgNum);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
P = P->Next;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Next we check if it is within the range of any buffer described in
|
||||||
|
// argument info.
|
||||||
|
P = AI;
|
||||||
|
while (P) {
|
||||||
|
if ((P->Type & ArgType::OMP_HOST_RPC_ARG_PTR) && P->Size) {
|
||||||
|
if (utils::isInRange(ArgPtr, P->BasePtr, P->Size)) {
|
||||||
|
auto Size = P->Size;
|
||||||
|
auto Offset = utils::getPtrDiff(ArgPtr, P->BasePtr);
|
||||||
|
|
||||||
|
ArgInDesc.Value = utils::ptrtoint(
|
||||||
|
getMappedPointer(D, P->BasePtr, Size, Offset, P->Type));
|
||||||
|
ArgInDesc.ArgType = Type::ARG_POINTER;
|
||||||
|
|
||||||
|
DP("found a match for arg (no=%d). done.\n", ArgNum);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
P = P->Next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now we can't find a match from argument info, then we assume it is from
|
||||||
|
// dynamic allocation.
|
||||||
|
memory::MemoryAllocationInfo MAI = memory::getMemoryAllocationInfo(ArgPtr);
|
||||||
|
if (MAI.isValid()) {
|
||||||
|
auto Size = MAI.Size;
|
||||||
|
auto Offset = utils::getPtrDiff(ArgPtr, MAI.BasePtr);
|
||||||
|
|
||||||
|
ArgInDesc.Value = utils::ptrtoint(
|
||||||
|
getMappedPointer(D, MAI.BasePtr, Size, Offset,
|
||||||
|
/* Kind */ ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM));
|
||||||
|
ArgInDesc.ArgType = Type::ARG_POINTER;
|
||||||
|
|
||||||
|
DP("arg (no=%d) is from malloc. done.\n", ArgNum);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("request (id=%d) arg (no=%d, val=%p) is unknown. send it to host "
|
||||||
|
"directly.\n",
|
||||||
|
D->Id, ArgNum, ArgPtr);
|
||||||
|
|
||||||
|
ArgInDesc.Value = ArgVal;
|
||||||
|
ArgInDesc.ArgType = Type::ARG_POINTER;
|
||||||
|
}
|
||||||
|
|
||||||
|
__attribute__((noinline, used)) int64_t
|
||||||
|
__kmpc_host_rpc_send_and_wait(void *Desc) {
|
||||||
|
auto *D = reinterpret_cast<Descriptor *>(Desc);
|
||||||
|
int32_t Id = D->Id;
|
||||||
|
|
||||||
|
#ifdef HOSTRPC_PROFILING
|
||||||
|
AddArgEnd = omp_get_wtime();
|
||||||
|
IssueAndWaitStart = omp_get_wtime();
|
||||||
|
#endif
|
||||||
|
|
||||||
|
atomic::add(omptarget_hostrpc_futex, 1U, atomic::acq_rel);
|
||||||
|
|
||||||
|
// A system fence is required to make sure futex on the host is also
|
||||||
|
// updated if USM is supported.
|
||||||
|
fence::system(atomic::seq_cst);
|
||||||
|
|
||||||
|
DP("sent request (id=%d) to host. waiting for finish.\n", Id);
|
||||||
|
|
||||||
|
unsigned NS = 8;
|
||||||
|
|
||||||
|
while (atomic::addSys(omptarget_hostrpc_futex, 0)) {
|
||||||
|
asm volatile("nanosleep.u32 %0;" : : "r"(NS));
|
||||||
|
// if (NS < 64)
|
||||||
|
// NS *= 2;
|
||||||
|
// fence::system(atomic::seq_cst);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef HOSTRPC_PROFILING
|
||||||
|
IssueAndWaitEnd = omp_get_wtime();
|
||||||
|
#endif
|
||||||
|
|
||||||
|
DP("finish waiting for request (id=%d).\n", Id);
|
||||||
|
|
||||||
|
int64_t Ret = D->ReturnValue;
|
||||||
|
|
||||||
|
assert(!D->NumArgs || D->ArgMap && "arg map should not be nullptr");
|
||||||
|
|
||||||
|
#ifdef HOSTRPC_PROFILING
|
||||||
|
CopyBackStart = omp_get_wtime();
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (D->ArgMap) {
|
||||||
|
DP("copy memory back for request (id=%d).\n", Id);
|
||||||
|
copybackIfNeeded(D);
|
||||||
|
DP("finish copy memory back for request (id=%d).\n", Id);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef HOSTRPC_PROFILING
|
||||||
|
CopyBackEnd = omp_get_wtime();
|
||||||
|
#endif
|
||||||
|
|
||||||
|
HostRPCMemReset();
|
||||||
|
|
||||||
|
// We can unlock now as we already get all temporary part.
|
||||||
|
// TODO: If we have a queue, we don't need this step.
|
||||||
|
HostRPCLock.unlock();
|
||||||
|
|
||||||
|
DP("request (id=%d) is done with return code=%lx.\n", Id, Ret);
|
||||||
|
|
||||||
|
#ifdef HOSTRPC_PROFILING
|
||||||
|
printf("[host-rpc-profiling-device] id=%d, init=%lf, add_arg=%lf, wait=%lf, "
|
||||||
|
"copy=%lf.\n",
|
||||||
|
HostRPCId, GetDescEnd - GetDescStart, AddArgEnd - AddArgStart,
|
||||||
|
IssueAndWaitEnd - IssueAndWaitStart, CopyBackEnd - CopyBackStart);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return Ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
__attribute__((noinline, used)) void
|
||||||
|
__kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
|
||||||
|
int32_t if_expr, int32_t num_threads,
|
||||||
|
void **args, int64_t nargs) {
|
||||||
|
constexpr const int64_t NumArgs = 6;
|
||||||
|
HostRPCArgInfo ArgInfoArray[NumArgs];
|
||||||
|
void *ArgInfo[NumArgs];
|
||||||
|
for (unsigned I = 0; I < NumArgs; ++I) {
|
||||||
|
ArgInfoArray[I].BasePtr = 0;
|
||||||
|
ArgInfoArray[I].Type = ArgType::OMP_HOST_RPC_ARG_SCALAR;
|
||||||
|
ArgInfoArray[I].Size = 0;
|
||||||
|
ArgInfoArray[I].Next = nullptr;
|
||||||
|
ArgInfo[I] = &ArgInfoArray[I];
|
||||||
|
}
|
||||||
|
auto *D = (Descriptor *)__kmpc_host_rpc_get_desc(0, NumArgs, (void *)ArgInfo);
|
||||||
|
|
||||||
|
// Set up arg info struct.
|
||||||
|
ArgInfoArray[0].BasePtr = const_cast<char *>(name);
|
||||||
|
ArgInfoArray[0].Type = ArgType::OMP_HOST_RPC_ARG_COPY_TO;
|
||||||
|
ArgInfoArray[0].Size = strlen(name) + 1;
|
||||||
|
ArgInfoArray[1].BasePtr =
|
||||||
|
reinterpret_cast<void *>(static_cast<int64_t>(gtid));
|
||||||
|
ArgInfoArray[2].BasePtr =
|
||||||
|
reinterpret_cast<void *>(static_cast<int64_t>(if_expr));
|
||||||
|
ArgInfoArray[3].BasePtr =
|
||||||
|
reinterpret_cast<void *>(static_cast<int64_t>(num_threads));
|
||||||
|
ArgInfoArray[5].BasePtr = reinterpret_cast<void *>(nargs);
|
||||||
|
|
||||||
|
// We need to treat args in a little bit different way because nargs might be
|
||||||
|
// on the stack.
|
||||||
|
ArgInfoArray[4].Size = sizeof(void *) * nargs;
|
||||||
|
void *Args = nullptr;
|
||||||
|
if (nargs) {
|
||||||
|
Args = HostRPCMemAlloc(ArgInfoArray[4].Size);
|
||||||
|
__builtin_memcpy(Args, args, ArgInfoArray[4].Size);
|
||||||
|
}
|
||||||
|
ArgInfoArray[4].BasePtr = Args;
|
||||||
|
|
||||||
|
D->Id = CALLID___kmpc_launch_parallel_51_kernel;
|
||||||
|
|
||||||
|
__kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(name), 0);
|
||||||
|
__kmpc_host_rpc_add_arg(D, gtid, 1);
|
||||||
|
__kmpc_host_rpc_add_arg(D, if_expr, 2);
|
||||||
|
__kmpc_host_rpc_add_arg(D, num_threads, 3);
|
||||||
|
__kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(Args), 4);
|
||||||
|
__kmpc_host_rpc_add_arg(D, nargs, 5);
|
||||||
|
|
||||||
|
(void)__kmpc_host_rpc_send_and_wait(D);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma omp end declare target
|
||||||
@@ -16,6 +16,7 @@
|
|||||||
#include "Debug.h"
|
#include "Debug.h"
|
||||||
#include "Interface.h"
|
#include "Interface.h"
|
||||||
#include "Mapping.h"
|
#include "Mapping.h"
|
||||||
|
#include "Memory.h"
|
||||||
#include "State.h"
|
#include "State.h"
|
||||||
#include "Synchronization.h"
|
#include "Synchronization.h"
|
||||||
#include "Types.h"
|
#include "Types.h"
|
||||||
|
|||||||
@@ -7,6 +7,12 @@
|
|||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
#include "LibC.h"
|
#include "LibC.h"
|
||||||
|
#include "Debug.h"
|
||||||
|
#include "Memory.h"
|
||||||
|
#include "Synchronization.h"
|
||||||
|
#include "Utils.h"
|
||||||
|
|
||||||
|
using namespace ompx;
|
||||||
|
|
||||||
#pragma omp begin declare target device_type(nohost)
|
#pragma omp begin declare target device_type(nohost)
|
||||||
|
|
||||||
@@ -32,8 +38,324 @@ int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
} // namespace impl
|
} // namespace impl
|
||||||
|
|
||||||
#pragma omp end declare variant
|
#pragma omp end declare variant
|
||||||
|
|
||||||
|
// Dummy variable for stdio.
|
||||||
|
int StdInDummyVar;
|
||||||
|
int StdOutDummyVar;
|
||||||
|
int StdErrDummyVar;
|
||||||
|
|
||||||
|
struct FILE;
|
||||||
|
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdin =
|
||||||
|
(FILE *)&StdInDummyVar;
|
||||||
|
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdout =
|
||||||
|
(FILE *)&StdOutDummyVar;
|
||||||
|
__attribute__((used, retain, weak, visibility("protected"))) FILE *stderr =
|
||||||
|
(FILE *)&StdErrDummyVar;
|
||||||
|
|
||||||
|
typedef int (*__compar_fn_t)(const void *, const void *);
|
||||||
|
typedef int (*__compar_d_fn_t)(const void *, const void *, void *);
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
const int32_t ToLowerMapTable[] = {
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6,
|
||||||
|
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
|
||||||
|
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
|
||||||
|
37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
|
||||||
|
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 'a', 'b',
|
||||||
|
'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
|
||||||
|
'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 91, 92, 93, 94, 95, 96,
|
||||||
|
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
|
||||||
|
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 123, 124, 125, 126,
|
||||||
|
127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
};
|
||||||
|
const int32_t *ToLowerPTable = ToLowerMapTable + 128;
|
||||||
|
|
||||||
|
#define X(x) (((x) / 256 | (x)*256) % 65536)
|
||||||
|
|
||||||
|
static const unsigned short BLocTable[] = {
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
|
||||||
|
X(0x200), X(0x200), X(0x200), X(0x200), X(0x320), X(0x220), X(0x220),
|
||||||
|
X(0x220), X(0x220), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
|
||||||
|
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
|
||||||
|
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x160),
|
||||||
|
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
|
||||||
|
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
|
||||||
|
X(0x4c0), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8),
|
||||||
|
X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x4c0), X(0x4c0), X(0x4c0),
|
||||||
|
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x8d5), X(0x8d5), X(0x8d5),
|
||||||
|
X(0x8d5), X(0x8d5), X(0x8d5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
|
||||||
|
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
|
||||||
|
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
|
||||||
|
X(0x8c5), X(0x8c5), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
|
||||||
|
X(0x4c0), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6),
|
||||||
|
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
|
||||||
|
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
|
||||||
|
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x4c0),
|
||||||
|
X(0x4c0), X(0x4c0), X(0x4c0), X(0x200), 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
#undef X
|
||||||
|
|
||||||
|
static const unsigned short *BLocTablePTable = BLocTable + 128;
|
||||||
|
|
||||||
|
int ErrNo __attribute__((used, retain));
|
||||||
|
|
||||||
|
static unsigned long int RandomNext = 1;
|
||||||
|
|
||||||
|
#define SWAP(a, b, size) \
|
||||||
|
do { \
|
||||||
|
size_t __size = (size); \
|
||||||
|
char *__a = (a), *__b = (b); \
|
||||||
|
do { \
|
||||||
|
char __tmp = *__a; \
|
||||||
|
*__a++ = *__b; \
|
||||||
|
*__b++ = __tmp; \
|
||||||
|
} while (--__size > 0); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
/* Discontinue quicksort algorithm when partition gets below this size.
|
||||||
|
This particular magic number was chosen to work best on a Sun 4/260. */
|
||||||
|
#define MAX_THRESH 4
|
||||||
|
|
||||||
|
/* Stack node declarations used to store unfulfilled partition obligations. */
|
||||||
|
typedef struct {
|
||||||
|
char *lo;
|
||||||
|
char *hi;
|
||||||
|
} stack_node;
|
||||||
|
|
||||||
|
/* The next 4 #defines implement a very fast in-line stack abstraction. */
|
||||||
|
/* The stack needs log (total_elements) entries (we could even subtract
|
||||||
|
log(MAX_THRESH)). Since total_elements has type size_t, we get as
|
||||||
|
upper bound for log (total_elements):
|
||||||
|
bits per byte (CHAR_BIT) * sizeof(size_t). */
|
||||||
|
#define CHAR_BIT 8
|
||||||
|
#define STACK_SIZE (CHAR_BIT * sizeof(size_t))
|
||||||
|
#define PUSH(low, high) ((void)((top->lo = (low)), (top->hi = (high)), ++top))
|
||||||
|
#define POP(low, high) ((void)(--top, (low = top->lo), (high = top->hi)))
|
||||||
|
#define STACK_NOT_EMPTY (stack < top)
|
||||||
|
|
||||||
|
/* Order size using quicksort. This implementation incorporates
|
||||||
|
four optimizations discussed in Sedgewick:
|
||||||
|
1. Non-recursive, using an explicit stack of pointer that store the
|
||||||
|
next array partition to sort. To save time, this maximum amount
|
||||||
|
of space required to store an array of SIZE_MAX is allocated on the
|
||||||
|
stack. Assuming a 32-bit (64 bit) integer for size_t, this needs
|
||||||
|
only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes).
|
||||||
|
Pretty cheap, actually.
|
||||||
|
2. Chose the pivot element using a median-of-three decision tree.
|
||||||
|
This reduces the probability of selecting a bad pivot value and
|
||||||
|
eliminates certain extraneous comparisons.
|
||||||
|
3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
|
||||||
|
insertion sort to order the MAX_THRESH items within each partition.
|
||||||
|
This is a big win, since insertion sort is faster for small, mostly
|
||||||
|
sorted array segments.
|
||||||
|
4. The larger of the two sub-partitions is always pushed onto the
|
||||||
|
stack first, with the algorithm then concentrating on the
|
||||||
|
smaller partition. This *guarantees* no more than log (total_elems)
|
||||||
|
stack size is needed (actually O(1) in this case)! */
|
||||||
|
|
||||||
|
void _quicksort(void *const pbase, size_t total_elems, size_t size,
|
||||||
|
__compar_d_fn_t cmp, void *arg) {
|
||||||
|
char *base_ptr = (char *)pbase;
|
||||||
|
|
||||||
|
const size_t max_thresh = MAX_THRESH * size;
|
||||||
|
|
||||||
|
if (total_elems == 0)
|
||||||
|
/* Avoid lossage with unsigned arithmetic below. */
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (total_elems > MAX_THRESH) {
|
||||||
|
char *lo = base_ptr;
|
||||||
|
char *hi = &lo[size * (total_elems - 1)];
|
||||||
|
stack_node stack[STACK_SIZE];
|
||||||
|
stack_node *top = stack;
|
||||||
|
|
||||||
|
PUSH(nullptr, nullptr);
|
||||||
|
|
||||||
|
while (STACK_NOT_EMPTY) {
|
||||||
|
char *left_ptr;
|
||||||
|
char *right_ptr;
|
||||||
|
|
||||||
|
/* Select median value from among LO, MID, and HI. Rearrange
|
||||||
|
LO and HI so the three values are sorted. This lowers the
|
||||||
|
probability of picking a pathological pivot value and
|
||||||
|
skips a comparison for both the LEFT_PTR and RIGHT_PTR in
|
||||||
|
the while loops. */
|
||||||
|
|
||||||
|
char *mid = lo + size * ((hi - lo) / size >> 1);
|
||||||
|
|
||||||
|
if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
|
||||||
|
SWAP(mid, lo, size);
|
||||||
|
if ((*cmp)((void *)hi, (void *)mid, arg) < 0)
|
||||||
|
SWAP(mid, hi, size);
|
||||||
|
else
|
||||||
|
goto jump_over;
|
||||||
|
if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
|
||||||
|
SWAP(mid, lo, size);
|
||||||
|
jump_over:;
|
||||||
|
|
||||||
|
left_ptr = lo + size;
|
||||||
|
right_ptr = hi - size;
|
||||||
|
|
||||||
|
/* Here's the famous ``collapse the walls'' section of quicksort.
|
||||||
|
Gotta like those tight inner loops! They are the main reason
|
||||||
|
that this algorithm runs much faster than others. */
|
||||||
|
do {
|
||||||
|
while ((*cmp)((void *)left_ptr, (void *)mid, arg) < 0)
|
||||||
|
left_ptr += size;
|
||||||
|
|
||||||
|
while ((*cmp)((void *)mid, (void *)right_ptr, arg) < 0)
|
||||||
|
right_ptr -= size;
|
||||||
|
|
||||||
|
if (left_ptr < right_ptr) {
|
||||||
|
SWAP(left_ptr, right_ptr, size);
|
||||||
|
if (mid == left_ptr)
|
||||||
|
mid = right_ptr;
|
||||||
|
else if (mid == right_ptr)
|
||||||
|
mid = left_ptr;
|
||||||
|
left_ptr += size;
|
||||||
|
right_ptr -= size;
|
||||||
|
} else if (left_ptr == right_ptr) {
|
||||||
|
left_ptr += size;
|
||||||
|
right_ptr -= size;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} while (left_ptr <= right_ptr);
|
||||||
|
|
||||||
|
/* Set up pointers for next iteration. First determine whether
|
||||||
|
left and right partitions are below the threshold size. If so,
|
||||||
|
ignore one or both. Otherwise, push the larger partition's
|
||||||
|
bounds on the stack and continue sorting the smaller one. */
|
||||||
|
|
||||||
|
if ((size_t)(right_ptr - lo) <= max_thresh) {
|
||||||
|
if ((size_t)(hi - left_ptr) <= max_thresh)
|
||||||
|
/* Ignore both small partitions. */
|
||||||
|
POP(lo, hi);
|
||||||
|
else
|
||||||
|
/* Ignore small left partition. */
|
||||||
|
lo = left_ptr;
|
||||||
|
} else if ((size_t)(hi - left_ptr) <= max_thresh)
|
||||||
|
/* Ignore small right partition. */
|
||||||
|
hi = right_ptr;
|
||||||
|
else if ((right_ptr - lo) > (hi - left_ptr)) {
|
||||||
|
/* Push larger left partition indices. */
|
||||||
|
PUSH(lo, right_ptr);
|
||||||
|
lo = left_ptr;
|
||||||
|
} else {
|
||||||
|
/* Push larger right partition indices. */
|
||||||
|
PUSH(left_ptr, hi);
|
||||||
|
hi = right_ptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Once the BASE_PTR array is partially sorted by quicksort the rest
|
||||||
|
is completely sorted using insertion sort, since this is efficient
|
||||||
|
for partitions below MAX_THRESH size. BASE_PTR points to the beginning
|
||||||
|
of the array to sort, and END_PTR points at the very last element in
|
||||||
|
the array (*not* one beyond it!). */
|
||||||
|
|
||||||
|
#define min(x, y) ((x) < (y) ? (x) : (y))
|
||||||
|
|
||||||
|
{
|
||||||
|
char *const end_ptr = &base_ptr[size * (total_elems - 1)];
|
||||||
|
char *tmp_ptr = base_ptr;
|
||||||
|
char *thresh = min(end_ptr, base_ptr + max_thresh);
|
||||||
|
char *run_ptr;
|
||||||
|
|
||||||
|
/* Find smallest element in first threshold and place it at the
|
||||||
|
array's beginning. This is the smallest array element,
|
||||||
|
and the operation speeds up insertion sort's inner loop. */
|
||||||
|
|
||||||
|
for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
|
||||||
|
if ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
|
||||||
|
tmp_ptr = run_ptr;
|
||||||
|
|
||||||
|
if (tmp_ptr != base_ptr)
|
||||||
|
SWAP(tmp_ptr, base_ptr, size);
|
||||||
|
|
||||||
|
/* Insertion sort, running from left-hand-side up to right-hand-side. */
|
||||||
|
|
||||||
|
run_ptr = base_ptr + size;
|
||||||
|
while ((run_ptr += size) <= end_ptr) {
|
||||||
|
tmp_ptr = run_ptr - size;
|
||||||
|
while ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
|
||||||
|
tmp_ptr -= size;
|
||||||
|
|
||||||
|
tmp_ptr += size;
|
||||||
|
if (tmp_ptr != run_ptr) {
|
||||||
|
char *trav;
|
||||||
|
|
||||||
|
trav = run_ptr + size;
|
||||||
|
while (--trav >= run_ptr) {
|
||||||
|
char c = *trav;
|
||||||
|
char *hi, *lo;
|
||||||
|
|
||||||
|
for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
|
||||||
|
*hi = *lo;
|
||||||
|
*hi = c;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
||||||
int memcmp(const void *lhs, const void *rhs, size_t count) {
|
int memcmp(const void *lhs, const void *rhs, size_t count) {
|
||||||
@@ -47,16 +369,419 @@ int memcmp(const void *lhs, const void *rhs, size_t count) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void memset(void *dst, int C, size_t count) {
|
|
||||||
auto *dstc = reinterpret_cast<char *>(dst);
|
|
||||||
for (size_t I = 0; I < count; ++I)
|
|
||||||
dstc[I] = C;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf
|
/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf
|
||||||
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
|
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
|
||||||
return impl::omp_vprintf(Format, Arguments, Size);
|
return impl::omp_vprintf(Format, Arguments, Size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#ifndef ULONG_MAX
|
||||||
|
#define ULONG_MAX ((unsigned long)(~0L)) /* 0xFFFFFFFF */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef LONG_MAX
|
||||||
|
#define LONG_MAX ((long)(ULONG_MAX >> 1)) /* 0x7FFFFFFF */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef LONG_MIN
|
||||||
|
#define LONG_MIN ((long)(~LONG_MAX)) /* 0x80000000 */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef ISDIGIT
|
||||||
|
#define ISDIGIT(C) ((C) >= '0' && (C) <= '9')
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef ISSPACE
|
||||||
|
#define ISSPACE(C) ((C) == ' ')
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef ISALPHA
|
||||||
|
#define ISALPHA(C) (((C) >= 'a' && (C) <= 'z') || ((C) >= 'A' && (C) <= 'Z'))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef ISUPPER
|
||||||
|
#define ISUPPER(C) ((C) >= 'A' && (C) <= 'Z')
|
||||||
|
#endif
|
||||||
|
|
||||||
|
long strtol(const char *nptr, char **endptr, int base) {
|
||||||
|
const char *s = nptr;
|
||||||
|
unsigned long acc;
|
||||||
|
int c;
|
||||||
|
unsigned long cutoff;
|
||||||
|
int neg = 0, any, cutlim;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Skip white space and pick up leading +/- sign if any.
|
||||||
|
* If base is 0, allow 0x for hex and 0 for octal, else
|
||||||
|
* assume decimal; if base is already 16, allow 0x.
|
||||||
|
*/
|
||||||
|
do {
|
||||||
|
c = *s++;
|
||||||
|
} while (ISSPACE(c));
|
||||||
|
if (c == '-') {
|
||||||
|
neg = 1;
|
||||||
|
c = *s++;
|
||||||
|
} else if (c == '+')
|
||||||
|
c = *s++;
|
||||||
|
if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X')) {
|
||||||
|
c = s[1];
|
||||||
|
s += 2;
|
||||||
|
base = 16;
|
||||||
|
}
|
||||||
|
if (base == 0)
|
||||||
|
base = c == '0' ? 8 : 10;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Compute the cutoff value between legal numbers and illegal
|
||||||
|
* numbers. That is the largest legal value, divided by the
|
||||||
|
* base. An input number that is greater than this value, if
|
||||||
|
* followed by a legal input character, is too big. One that
|
||||||
|
* is equal to this value may be valid or not; the limit
|
||||||
|
* between valid and invalid numbers is then based on the last
|
||||||
|
* digit. For instance, if the range for longs is
|
||||||
|
* [-2147483648..2147483647] and the input base is 10,
|
||||||
|
* cutoff will be set to 214748364 and cutlim to either
|
||||||
|
* 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
|
||||||
|
* a value > 214748364, or equal but the next digit is > 7 (or 8),
|
||||||
|
* the number is too big, and we will return a range error.
|
||||||
|
*
|
||||||
|
* Set any if any `digits' consumed; make it negative to indicate
|
||||||
|
* overflow.
|
||||||
|
*/
|
||||||
|
cutoff = neg ? -(unsigned long)LONG_MIN : LONG_MAX;
|
||||||
|
cutlim = cutoff % (unsigned long)base;
|
||||||
|
cutoff /= (unsigned long)base;
|
||||||
|
for (acc = 0, any = 0;; c = *s++) {
|
||||||
|
if (ISDIGIT(c))
|
||||||
|
c -= '0';
|
||||||
|
else if (ISALPHA(c))
|
||||||
|
c -= ISUPPER(c) ? 'A' - 10 : 'a' - 10;
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
if (c >= base)
|
||||||
|
break;
|
||||||
|
if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
|
||||||
|
any = -1;
|
||||||
|
else {
|
||||||
|
any = 1;
|
||||||
|
acc *= base;
|
||||||
|
acc += c;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (any < 0) {
|
||||||
|
acc = neg ? LONG_MIN : LONG_MAX;
|
||||||
|
errno = ERANGE;
|
||||||
|
} else if (neg)
|
||||||
|
acc = -acc;
|
||||||
|
if (endptr != 0)
|
||||||
|
*endptr = const_cast<char *>(any ? s - 1 : nptr);
|
||||||
|
return (acc);
|
||||||
|
}
|
||||||
|
|
||||||
|
int strcmp(const char *lhs, const char *rhs) {
|
||||||
|
while (*lhs != '\0' && *rhs != '\0') {
|
||||||
|
if (*lhs == *rhs) {
|
||||||
|
++lhs;
|
||||||
|
++rhs;
|
||||||
|
}
|
||||||
|
return *lhs - *rhs;
|
||||||
|
}
|
||||||
|
if (*lhs != '\0')
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void *calloc(size_t num, size_t size) {
|
||||||
|
size_t bits = num * size;
|
||||||
|
char *p = (char *)malloc(bits);
|
||||||
|
if (!p)
|
||||||
|
return p;
|
||||||
|
char *q = (char *)p;
|
||||||
|
while (q - p < bits) {
|
||||||
|
*(int *)q = 0;
|
||||||
|
q += sizeof(int);
|
||||||
|
}
|
||||||
|
while (q - p < bits) {
|
||||||
|
*q = 0;
|
||||||
|
q++;
|
||||||
|
}
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
void exit(int exit_code) { asm volatile("exit;"); }
|
||||||
|
|
||||||
|
size_t strlen(const char *str) {
|
||||||
|
size_t r = 0;
|
||||||
|
while (*str == ' ')
|
||||||
|
++str;
|
||||||
|
|
||||||
|
while (*str != '\0') {
|
||||||
|
++r;
|
||||||
|
++str;
|
||||||
|
}
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *strcpy(char *dest, const char *src) {
|
||||||
|
char *pd = dest;
|
||||||
|
const char *ps = src;
|
||||||
|
|
||||||
|
while (*ps != '\0')
|
||||||
|
*(pd++) = *(ps++);
|
||||||
|
|
||||||
|
*pd = '\0';
|
||||||
|
|
||||||
|
return dest;
|
||||||
|
}
|
||||||
|
|
||||||
|
int *__errno_location() { return &ErrNo; }
|
||||||
|
|
||||||
|
char *strcat(char *dest, const char *src) {
|
||||||
|
char *pd = dest;
|
||||||
|
const char *ps = src;
|
||||||
|
|
||||||
|
while (*pd != '\0')
|
||||||
|
++pd;
|
||||||
|
|
||||||
|
while (*ps != '\0')
|
||||||
|
*(pd++) = *(ps++);
|
||||||
|
|
||||||
|
*pd = '\0';
|
||||||
|
|
||||||
|
return dest;
|
||||||
|
}
|
||||||
|
|
||||||
|
void perror(const char *s) { printf("%s", s); }
|
||||||
|
|
||||||
|
int strncmp(const char *lhs, const char *rhs, size_t count) {
|
||||||
|
size_t c = 0;
|
||||||
|
while (*lhs != '\0' && *rhs != '\0' && c < count) {
|
||||||
|
if (*lhs == *rhs) {
|
||||||
|
++lhs;
|
||||||
|
++rhs;
|
||||||
|
++c;
|
||||||
|
} else
|
||||||
|
return *lhs - *rhs;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *strncpy(char *dest, const char *src, size_t count) {
|
||||||
|
char *pd = dest;
|
||||||
|
const char *ps = src;
|
||||||
|
size_t c = 0;
|
||||||
|
|
||||||
|
while (*ps != '\0' && c < count) {
|
||||||
|
*(pd++) = *(ps++);
|
||||||
|
++c;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (c < count)
|
||||||
|
*pd = '\0';
|
||||||
|
|
||||||
|
return dest;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *strchr(const char *s, int c) {
|
||||||
|
do {
|
||||||
|
if (*s == c)
|
||||||
|
return const_cast<char *>(s);
|
||||||
|
} while (*s++);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *strtok(char *str, const char *delim) {
|
||||||
|
static char *s = nullptr;
|
||||||
|
char *tok;
|
||||||
|
if (str == nullptr) {
|
||||||
|
if (s == nullptr)
|
||||||
|
return nullptr;
|
||||||
|
} else
|
||||||
|
s = str;
|
||||||
|
for (size_t i; (*s != '\0'); s++) {
|
||||||
|
for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
|
||||||
|
;
|
||||||
|
if (delim[i] == '\0')
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (*s == '\0')
|
||||||
|
return s = nullptr;
|
||||||
|
tok = s++;
|
||||||
|
for (size_t i; (*s != '\0'); s++) {
|
||||||
|
for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
|
||||||
|
;
|
||||||
|
if (delim[i] != '\0')
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (*s != '\0') {
|
||||||
|
*s = '\0';
|
||||||
|
s++;
|
||||||
|
}
|
||||||
|
return tok;
|
||||||
|
}
|
||||||
|
|
||||||
|
void srand(unsigned seed) { RandomNext = seed; }
|
||||||
|
|
||||||
|
int rand() {
|
||||||
|
RandomNext = RandomNext * 1103515245 + 12345;
|
||||||
|
return static_cast<unsigned int>((RandomNext / 65536)) % 2147483647;
|
||||||
|
}
|
||||||
|
|
||||||
|
int abs(int n) { return n > 0 ? n : -n; }
|
||||||
|
|
||||||
|
void *memcpy(void *dest, const void *src, size_t count) {
|
||||||
|
__builtin_memcpy(dest, src, count);
|
||||||
|
return dest;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned long strtoul(const char *str, char **str_end, int base) {
|
||||||
|
unsigned long res = 0;
|
||||||
|
while (*str != '\0') {
|
||||||
|
if (*str == ' ') {
|
||||||
|
++str;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (*str >= '0' && *str <= '9') {
|
||||||
|
res = res * 10 + *str - '0';
|
||||||
|
++str;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (*str_end)
|
||||||
|
*str_end = const_cast<char *>(str);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
double atof(const char *s) { return strtod(s, nullptr); }
|
||||||
|
|
||||||
|
int atoi(const char *str) { return (int)strtol(str, nullptr, 10); }
|
||||||
|
|
||||||
|
double strtod(const char *str, char **ptr) {
|
||||||
|
char *p;
|
||||||
|
|
||||||
|
if (ptr == (char **)0)
|
||||||
|
return atof(str);
|
||||||
|
|
||||||
|
p = const_cast<char *>(str);
|
||||||
|
|
||||||
|
while (ISSPACE(*p))
|
||||||
|
++p;
|
||||||
|
|
||||||
|
if (*p == '+' || *p == '-')
|
||||||
|
++p;
|
||||||
|
|
||||||
|
/* INF or INFINITY. */
|
||||||
|
if ((p[0] == 'i' || p[0] == 'I') && (p[1] == 'n' || p[1] == 'N') &&
|
||||||
|
(p[2] == 'f' || p[2] == 'F')) {
|
||||||
|
if ((p[3] == 'i' || p[3] == 'I') && (p[4] == 'n' || p[4] == 'N') &&
|
||||||
|
(p[5] == 'i' || p[5] == 'I') && (p[6] == 't' || p[6] == 'T') &&
|
||||||
|
(p[7] == 'y' || p[7] == 'Y')) {
|
||||||
|
*ptr = p + 8;
|
||||||
|
return atof(str);
|
||||||
|
} else {
|
||||||
|
*ptr = p + 3;
|
||||||
|
return atof(str);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* NAN or NAN(foo). */
|
||||||
|
if ((p[0] == 'n' || p[0] == 'N') && (p[1] == 'a' || p[1] == 'A') &&
|
||||||
|
(p[2] == 'n' || p[2] == 'N')) {
|
||||||
|
p += 3;
|
||||||
|
if (*p == '(') {
|
||||||
|
++p;
|
||||||
|
while (*p != '\0' && *p != ')')
|
||||||
|
++p;
|
||||||
|
if (*p == ')')
|
||||||
|
++p;
|
||||||
|
}
|
||||||
|
*ptr = p;
|
||||||
|
return atof(str);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* digits, with 0 or 1 periods in it. */
|
||||||
|
if (ISDIGIT(*p) || *p == '.') {
|
||||||
|
int got_dot = 0;
|
||||||
|
while (ISDIGIT(*p) || (!got_dot && *p == '.')) {
|
||||||
|
if (*p == '.')
|
||||||
|
got_dot = 1;
|
||||||
|
++p;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Exponent. */
|
||||||
|
if (*p == 'e' || *p == 'E') {
|
||||||
|
int i;
|
||||||
|
i = 1;
|
||||||
|
if (p[i] == '+' || p[i] == '-')
|
||||||
|
++i;
|
||||||
|
if (ISDIGIT(p[i])) {
|
||||||
|
while (ISDIGIT(p[i]))
|
||||||
|
++i;
|
||||||
|
*ptr = p + i;
|
||||||
|
return atof(str);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*ptr = p;
|
||||||
|
return atof(str);
|
||||||
|
}
|
||||||
|
/* Didn't find any digits. Doesn't look like a number. */
|
||||||
|
*ptr = const_cast<char *>(str);
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void *memset(void *dest, int ch, size_t count) {
|
||||||
|
auto *P = reinterpret_cast<unsigned char *>(dest);
|
||||||
|
for (size_t I = 0; I < count; ++I, ++P)
|
||||||
|
*P = (unsigned char)ch;
|
||||||
|
return dest;
|
||||||
|
}
|
||||||
|
|
||||||
|
int tolower(int ch) {
|
||||||
|
if (ch >= 'A' && ch <= 'Z')
|
||||||
|
return ch + 32;
|
||||||
|
return ch;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *strstr(const char *s1, const char *s2) {
|
||||||
|
const size_t len = strlen(s2);
|
||||||
|
while (*s1) {
|
||||||
|
if (!memcmp(s1, s2, len))
|
||||||
|
return const_cast<char *>(s1);
|
||||||
|
++s1;
|
||||||
|
}
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
char *__xpg_basename(const char *path) { return const_cast<char *>(path); }
|
||||||
|
|
||||||
|
const unsigned short **__ctype_b_loc(void) { return &BLocTablePTable; }
|
||||||
|
|
||||||
|
const int32_t **__ctype_tolower_loc(void) { return &ToLowerPTable; }
|
||||||
|
|
||||||
|
void qsort(void *b, size_t n, size_t s, __compar_fn_t cmp) {
|
||||||
|
return _quicksort(b, n, s, (__compar_d_fn_t)cmp, nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
int strcasecmp(const char *s1, const char *s2) {
|
||||||
|
const unsigned char *p1 = (const unsigned char *)s1;
|
||||||
|
const unsigned char *p2 = (const unsigned char *)s2;
|
||||||
|
int result;
|
||||||
|
|
||||||
|
if (p1 == p2)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
while ((result = tolower(*p1) - tolower(*p2++)) == 0)
|
||||||
|
if (*p1++ == '\0')
|
||||||
|
break;
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp end declare target
|
#pragma omp end declare target
|
||||||
|
|||||||
@@ -282,9 +282,15 @@ uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
|
|||||||
|
|
||||||
uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
|
uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
|
||||||
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
|
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
|
||||||
// If we are in SPMD mode, remove one warp.
|
if (IsSPMD)
|
||||||
return BlockSize - (!IsSPMD * impl::getWarpSize());
|
return BlockSize;
|
||||||
|
uint32_t WarpSize = impl::getWarpSize();
|
||||||
|
bool IsFullLastWarp = !(BlockSize % WarpSize);
|
||||||
|
if (OMP_LIKELY(IsFullLastWarp))
|
||||||
|
return BlockSize - WarpSize;
|
||||||
|
return BlockSize - BlockSize % WarpSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t mapping::getMaxTeamThreads() {
|
uint32_t mapping::getMaxTeamThreads() {
|
||||||
return mapping::getMaxTeamThreads(mapping::isSPMDMode());
|
return mapping::getMaxTeamThreads(mapping::isSPMDMode());
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -47,6 +47,9 @@ using namespace ompx;
|
|||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
|
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
|
||||||
|
if (NumThreadsClause != -1)
|
||||||
|
return NumThreadsClause;
|
||||||
|
|
||||||
uint32_t NThreadsICV =
|
uint32_t NThreadsICV =
|
||||||
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
|
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
|
||||||
uint32_t NumThreads = mapping::getMaxTeamThreads();
|
uint32_t NumThreads = mapping::getMaxTeamThreads();
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
#include "LibC.h"
|
#include "LibC.h"
|
||||||
#include "Mapping.h"
|
#include "Mapping.h"
|
||||||
#include "State.h"
|
#include "State.h"
|
||||||
|
#include "Memory.h"
|
||||||
#include "Synchronization.h"
|
#include "Synchronization.h"
|
||||||
#include "Types.h"
|
#include "Types.h"
|
||||||
#include "Utils.h"
|
#include "Utils.h"
|
||||||
@@ -83,6 +84,7 @@ struct SharedMemorySmartStackTy {
|
|||||||
void *push(uint64_t Bytes);
|
void *push(uint64_t Bytes);
|
||||||
|
|
||||||
/// Deallocate the last allocation made by the encountering thread and pointed
|
/// Deallocate the last allocation made by the encountering thread and pointed
|
||||||
|
//
|
||||||
/// to by \p Ptr from the stack. Each thread can call this function.
|
/// to by \p Ptr from the stack. Each thread can call this function.
|
||||||
void pop(void *Ptr, uint32_t Bytes);
|
void pop(void *Ptr, uint32_t Bytes);
|
||||||
|
|
||||||
@@ -274,7 +276,8 @@ void state::enterDataEnvironment(IdentTy *Ident) {
|
|||||||
uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
|
uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
|
||||||
if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
|
if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
|
||||||
uint32_t Bytes =
|
uint32_t Bytes =
|
||||||
sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
|
sizeof(ThreadStates[0]) *
|
||||||
|
(mapping::getNumberOfThreadsInBlock() + (mapping::isSPMDMode() ? 0 : 1));
|
||||||
void *ThreadStatesPtr =
|
void *ThreadStatesPtr =
|
||||||
memory::allocGlobal(Bytes, "Thread state array allocation");
|
memory::allocGlobal(Bytes, "Thread state array allocation");
|
||||||
memset(ThreadStatesPtr, 0, Bytes);
|
memset(ThreadStatesPtr, 0, Bytes);
|
||||||
@@ -371,6 +374,15 @@ int omp_get_thread_num(void) {
|
|||||||
return omp_get_ancestor_thread_num(omp_get_level());
|
return omp_get_ancestor_thread_num(omp_get_level());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int omp_get_bulk_thread_num() {
|
||||||
|
ASSERT(mapping::isSPMDMode(), "Not SPMD");
|
||||||
|
int BId = mapping::getBlockIdInKernel();
|
||||||
|
int BSize = mapping::getMaxTeamThreads();
|
||||||
|
int TId = mapping::getThreadIdInBlock();
|
||||||
|
int Id = BId * BSize + TId;
|
||||||
|
return returnValIfLevelIsActive(omp_get_level(), Id, 0);
|
||||||
|
}
|
||||||
|
|
||||||
int omp_get_team_size(int Level) {
|
int omp_get_team_size(int Level) {
|
||||||
return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
|
return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
|
||||||
}
|
}
|
||||||
@@ -379,6 +391,11 @@ int omp_get_num_threads(void) {
|
|||||||
return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
|
return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int omp_get_bulk_num_threads(void) {
|
||||||
|
ASSERT(mapping::isSPMDMode(), "Not SPMD");
|
||||||
|
return mapping::getNumberOfThreadsInKernel();
|
||||||
|
}
|
||||||
|
|
||||||
int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
|
int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
|
||||||
|
|
||||||
int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
|
int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
|
||||||
|
|||||||
@@ -122,6 +122,8 @@ uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
|
|||||||
}
|
}
|
||||||
///}
|
///}
|
||||||
|
|
||||||
|
int32_t atomicAddSys(int32_t *Address, uint32_t Val);
|
||||||
|
|
||||||
// Forward declarations defined to be defined for AMDGCN and NVPTX.
|
// Forward declarations defined to be defined for AMDGCN and NVPTX.
|
||||||
uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
|
uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
|
||||||
atomic::MemScopeTy MemScope);
|
atomic::MemScopeTy MemScope);
|
||||||
@@ -336,6 +338,10 @@ uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
|
|||||||
return __nvvm_atom_inc_gen_ui(Address, Val);
|
return __nvvm_atom_inc_gen_ui(Address, Val);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int32_t atomicAddSys(int32_t *Address, int32_t Val) {
|
||||||
|
return __nvvm_atom_sys_add_gen_i(Address, Val);
|
||||||
|
}
|
||||||
|
|
||||||
void namedBarrierInit() {}
|
void namedBarrierInit() {}
|
||||||
|
|
||||||
void namedBarrier() {
|
void namedBarrier() {
|
||||||
@@ -509,6 +515,22 @@ void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
|
|||||||
|
|
||||||
void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
|
void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
|
||||||
|
|
||||||
|
int32_t atomic::addSys(int32_t *Addr, int32_t Val) {
|
||||||
|
return impl::atomicAddSys(Addr, Val);
|
||||||
|
}
|
||||||
|
|
||||||
|
void mutex::TicketLock::lock() {
|
||||||
|
uint64_t MyTicket = atomic::add(&NextTicket, 1, atomic::seq_cst);
|
||||||
|
while (atomic::load(&NowServing, atomic::aquire) != MyTicket)
|
||||||
|
;
|
||||||
|
fence::kernel(atomic::aquire);
|
||||||
|
}
|
||||||
|
|
||||||
|
void mutex::TicketLock::unlock() {
|
||||||
|
fence::kernel(atomic::release);
|
||||||
|
atomic::add(&NowServing, 1, atomic::seq_cst);
|
||||||
|
}
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}
|
void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}
|
||||||
|
|
||||||
|
|||||||
304
openmp/libomptarget/DeviceRTL/src/TeamAllocator.cpp
Normal file
304
openmp/libomptarget/DeviceRTL/src/TeamAllocator.cpp
Normal file
@@ -0,0 +1,304 @@
|
|||||||
|
//===------- WarpAllocator.cpp - Warp memory allocator ------- C++ -*-========//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#pragma omp begin declare target device_type(nohost)
|
||||||
|
|
||||||
|
#include "Debug.h"
|
||||||
|
#include "Mapping.h"
|
||||||
|
#include "Memory.h"
|
||||||
|
#include "State.h"
|
||||||
|
#include "Synchronization.h"
|
||||||
|
#include "Types.h"
|
||||||
|
#include "Utils.h"
|
||||||
|
|
||||||
|
using namespace ompx;
|
||||||
|
|
||||||
|
char *CONSTANT(omptarget_device_heap_buffer)
|
||||||
|
__attribute__((used, retain, weak, visibility("protected")));
|
||||||
|
|
||||||
|
size_t CONSTANT(omptarget_device_heap_size)
|
||||||
|
__attribute__((used, retain, weak, visibility("protected")));
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
constexpr const size_t Alignment = 16;
|
||||||
|
constexpr const size_t FirstThreadRatio = 40;
|
||||||
|
constexpr const size_t SplitThreadhold = Alignment * 4;
|
||||||
|
|
||||||
|
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
|
||||||
|
|
||||||
|
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator;
|
||||||
|
|
||||||
|
class WarpAllocatorEntry {
|
||||||
|
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> friend struct WarpAllocator;
|
||||||
|
|
||||||
|
/// If Size is less than 0, the entry is allocated (in use).
|
||||||
|
int64_t Size = 0;
|
||||||
|
/// PrevSize is also supposed to be greater than or equal to 0. When it is 0,
|
||||||
|
/// it is the first entry of the buffer.
|
||||||
|
int64_t PrevSize = 0;
|
||||||
|
|
||||||
|
public:
|
||||||
|
bool isFirst() const { return !PrevSize; }
|
||||||
|
|
||||||
|
size_t getSize() const { return abs(Size); }
|
||||||
|
void setSize(size_t V) { Size = V; }
|
||||||
|
|
||||||
|
void setPrevSize(WarpAllocatorEntry *Prev) {
|
||||||
|
PrevSize = Prev ? Prev->getSize() : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }
|
||||||
|
|
||||||
|
// Note: isUsed can not be !isUnused or other way around because when Size is
|
||||||
|
// 0, it is uninitialized.
|
||||||
|
bool isUsed() const { return Size < 0; }
|
||||||
|
bool isUnused() const { return Size > 0; }
|
||||||
|
|
||||||
|
void setUsed() {
|
||||||
|
assert(isUnused() && "the entry is in use");
|
||||||
|
Size *= -1;
|
||||||
|
}
|
||||||
|
void setUnused() {
|
||||||
|
assert(isUsed() && "the entry is not in use");
|
||||||
|
Size *= -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
|
||||||
|
char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }
|
||||||
|
|
||||||
|
WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
|
||||||
|
WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }
|
||||||
|
|
||||||
|
static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }
|
||||||
|
|
||||||
|
static WarpAllocatorEntry *fromPtr(void *Ptr) {
|
||||||
|
return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
|
||||||
|
|
||||||
|
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
|
||||||
|
void init() {
|
||||||
|
if (mapping::isSPMDMode() &&
|
||||||
|
(mapping::getThreadIdInBlock() || mapping::getBlockIdInKernel()))
|
||||||
|
return;
|
||||||
|
|
||||||
|
size_t HeapSize = omptarget_device_heap_size;
|
||||||
|
|
||||||
|
FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
|
||||||
|
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
|
||||||
|
size_t OtherThreadHeapSize =
|
||||||
|
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
|
||||||
|
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
|
||||||
|
|
||||||
|
size_t TeamHeapSize = FirstThreadHeapSize / TEAM_SIZE;
|
||||||
|
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
|
||||||
|
FirstTeamSize = TeamHeapSize;
|
||||||
|
|
||||||
|
char *LastLimit = omptarget_device_heap_buffer;
|
||||||
|
for (int I = 0; I < WARP_SIZE; ++I) {
|
||||||
|
for (int J = 0; J < TEAM_SIZE; ++J) {
|
||||||
|
Entries[I][J] = nullptr;
|
||||||
|
Limits[I][J] = LastLimit + TeamHeapSize * (J + 1);
|
||||||
|
}
|
||||||
|
LastLimit += I ? OtherThreadHeapSize : FirstThreadHeapSize;
|
||||||
|
Limits[I][TEAM_SIZE - 1] = LastLimit;
|
||||||
|
TeamHeapSize = OtherThreadHeapSize / TEAM_SIZE;
|
||||||
|
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void *allocate(size_t Size) {
|
||||||
|
int32_t TeamSlot = getTeamSlot();
|
||||||
|
int32_t TIdInWarp = mapping::getThreadIdInWarp();
|
||||||
|
|
||||||
|
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
|
||||||
|
|
||||||
|
// Error our early if the requested size is larger than the entire block.
|
||||||
|
if (Size > getBlockSize(TIdInWarp, TeamSlot))
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
WarpAllocatorEntry *E = nullptr;
|
||||||
|
{
|
||||||
|
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
|
||||||
|
|
||||||
|
auto *LastEntry = Entries[TIdInWarp][TeamSlot];
|
||||||
|
auto *NewWatermark = (LastEntry ? LastEntry->getEndPtr()
|
||||||
|
: getBlockBegin(TIdInWarp, TeamSlot)) +
|
||||||
|
Size;
|
||||||
|
if (NewWatermark >= Limits[TIdInWarp][TeamSlot]) {
|
||||||
|
E = findMemorySlow(Size, TIdInWarp, TeamSlot);
|
||||||
|
} else {
|
||||||
|
E = LastEntry ? LastEntry->getNext()
|
||||||
|
: WarpAllocatorEntry::fromPtr(
|
||||||
|
getBlockBegin(TIdInWarp, TeamSlot));
|
||||||
|
E->setSize(Size);
|
||||||
|
E->setPrevSize(LastEntry);
|
||||||
|
Entries[TIdInWarp][TeamSlot] = E;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!E)
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
E->setUsed();
|
||||||
|
}
|
||||||
|
|
||||||
|
return E->getUserPtr();
|
||||||
|
}
|
||||||
|
|
||||||
|
void deallocate(void *Ptr) {
|
||||||
|
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
|
||||||
|
|
||||||
|
auto TeamSlot = getTeamSlot();
|
||||||
|
auto TIdInWarp = mapping::getThreadIdInWarp();
|
||||||
|
|
||||||
|
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
|
||||||
|
E->setUnused();
|
||||||
|
// Is last entry?
|
||||||
|
if (E == Entries[TIdInWarp][TeamSlot]) {
|
||||||
|
do {
|
||||||
|
E = E->getPrev();
|
||||||
|
} while (!E->isFirst() && !E->isUsed());
|
||||||
|
Entries[TIdInWarp][TeamSlot] = E;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
|
||||||
|
if (!utils::isInRange(P, omptarget_device_heap_buffer,
|
||||||
|
omptarget_device_heap_size))
|
||||||
|
return {};
|
||||||
|
|
||||||
|
auto TeamSlot = getTeamSlot();
|
||||||
|
auto TIdInWarp = mapping::getThreadIdInWarp();
|
||||||
|
for (int I = TIdInWarp; I < TIdInWarp + WARP_SIZE; ++I) {
|
||||||
|
int TId = I % WARP_SIZE;
|
||||||
|
for (int J = TeamSlot; J < TeamSlot + TEAM_SIZE; ++J) {
|
||||||
|
int SId = J % TEAM_SIZE;
|
||||||
|
if (P < getBlockBegin(TId, SId) || P >= getBlockEnd(TId, SId))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
mutex::LockGuard LG(Locks[I][SId]);
|
||||||
|
WarpAllocatorEntry *E = Entries[I][SId];
|
||||||
|
if (!E)
|
||||||
|
return {};
|
||||||
|
if (E->getEndPtr() <= P)
|
||||||
|
return {};
|
||||||
|
do {
|
||||||
|
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
|
||||||
|
if (!E->isUsed())
|
||||||
|
return {};
|
||||||
|
return {E->getUserPtr(), E->getUserSize()};
|
||||||
|
}
|
||||||
|
E = E->getPrev();
|
||||||
|
} while (!E->isFirst());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
char *getBlockBegin(int32_t TIdInWarp, int32_t TeamSlot) const {
|
||||||
|
if (TeamSlot)
|
||||||
|
return Limits[TIdInWarp][TeamSlot - 1];
|
||||||
|
if (TIdInWarp)
|
||||||
|
return Limits[TIdInWarp - 1][TEAM_SIZE - 1];
|
||||||
|
return omptarget_device_heap_buffer;
|
||||||
|
}
|
||||||
|
char *getBlockEnd(int32_t TIdInWarp, int32_t TeamSlot) const {
|
||||||
|
return Limits[TIdInWarp][TeamSlot];
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t getBlockSize(int32_t TIdInWarp, int32_t TeamSlot) const {
|
||||||
|
return getBlockEnd(TIdInWarp, TeamSlot) -
|
||||||
|
getBlockBegin(TIdInWarp, TeamSlot);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int32_t getTeamSlot() { return mapping::getBlockIdInKernel() % TEAM_SIZE; }
|
||||||
|
|
||||||
|
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp,
|
||||||
|
int32_t TeamSlot) {
|
||||||
|
char *Ptr = getBlockBegin(TIdInWarp, TeamSlot);
|
||||||
|
char *Limit = getBlockEnd(TIdInWarp, TeamSlot);
|
||||||
|
|
||||||
|
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
|
||||||
|
do {
|
||||||
|
if (!E->isUsed() && E->getSize() >= Size)
|
||||||
|
break;
|
||||||
|
E = E->getNext();
|
||||||
|
if (reinterpret_cast<char *>(E) + Size > Limit)
|
||||||
|
return nullptr;
|
||||||
|
} while (1);
|
||||||
|
|
||||||
|
size_t OldSize = E->getSize();
|
||||||
|
if (OldSize - Size >= SplitThreadhold) {
|
||||||
|
auto *OldNext = E->getNext();
|
||||||
|
E->setSize(Size);
|
||||||
|
auto *LeftOverE = E->getNext();
|
||||||
|
LeftOverE->setPrevSize(E);
|
||||||
|
LeftOverE->setSize(OldSize - Size);
|
||||||
|
OldNext->setPrevSize(LeftOverE);
|
||||||
|
}
|
||||||
|
|
||||||
|
return E;
|
||||||
|
}
|
||||||
|
|
||||||
|
WarpAllocatorEntry *Entries[WARP_SIZE][TEAM_SIZE];
|
||||||
|
char *Limits[WARP_SIZE][TEAM_SIZE];
|
||||||
|
mutex::TicketLock Locks[WARP_SIZE][TEAM_SIZE];
|
||||||
|
size_t FirstThreadHeapSize;
|
||||||
|
size_t FirstTeamSize;
|
||||||
|
};
|
||||||
|
|
||||||
|
WarpAllocator<32, 16> Allocator;
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
namespace ompx {
|
||||||
|
namespace memory {
|
||||||
|
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
|
||||||
|
return Allocator.getMemoryAllocationInfo(P);
|
||||||
|
}
|
||||||
|
} // namespace memory
|
||||||
|
} // namespace ompx
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
void *malloc(size_t Size) {
|
||||||
|
if (!Size)
|
||||||
|
return nullptr;
|
||||||
|
void *P = Allocator.allocate(Size);
|
||||||
|
assert(P && "allocator out of memory");
|
||||||
|
assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
|
||||||
|
"misaligned address");
|
||||||
|
return P;
|
||||||
|
}
|
||||||
|
|
||||||
|
void free(void *P) {
|
||||||
|
if (!P)
|
||||||
|
return;
|
||||||
|
Allocator.deallocate(P);
|
||||||
|
}
|
||||||
|
|
||||||
|
void *realloc(void *ptr, size_t new_size) {
|
||||||
|
void *NewPtr = malloc(new_size);
|
||||||
|
if (!NewPtr)
|
||||||
|
return nullptr;
|
||||||
|
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
|
||||||
|
__builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
|
||||||
|
free(ptr);
|
||||||
|
return NewPtr;
|
||||||
|
}
|
||||||
|
|
||||||
|
void __kmpc_target_init_allocator() { Allocator.init(); }
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma omp end declare target
|
||||||
275
openmp/libomptarget/DeviceRTL/src/WarpAllocator.cpp
Normal file
275
openmp/libomptarget/DeviceRTL/src/WarpAllocator.cpp
Normal file
@@ -0,0 +1,275 @@
|
|||||||
|
//===------- WarpAllocator.cpp - Warp memory allocator ------- C++ -*-========//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#pragma omp begin declare target device_type(nohost)
|
||||||
|
|
||||||
|
#include "Debug.h"
|
||||||
|
#include "Mapping.h"
|
||||||
|
#include "Memory.h"
|
||||||
|
#include "State.h"
|
||||||
|
#include "Synchronization.h"
|
||||||
|
#include "Types.h"
|
||||||
|
#include "Utils.h"
|
||||||
|
|
||||||
|
using namespace ompx;
|
||||||
|
|
||||||
|
char *CONSTANT(omptarget_device_heap_buffer)
|
||||||
|
__attribute__((used, retain, weak, visibility("protected")));
|
||||||
|
|
||||||
|
size_t CONSTANT(omptarget_device_heap_size)
|
||||||
|
__attribute__((used, retain, weak, visibility("protected")));
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
constexpr const size_t Alignment = 16;
|
||||||
|
constexpr const size_t FirstThreadRatio = 40;
|
||||||
|
constexpr const size_t SplitThreadhold = Alignment * 16;
|
||||||
|
|
||||||
|
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
|
||||||
|
|
||||||
|
template <int32_t WARP_SIZE> struct WarpAllocator;
|
||||||
|
|
||||||
|
class WarpAllocatorEntry {
|
||||||
|
template <int32_t WARP_SIZE> friend struct WarpAllocator;
|
||||||
|
|
||||||
|
/// If Size is less than 0, the entry is allocated (in use).
|
||||||
|
int64_t Size = 0;
|
||||||
|
/// PrevSize is also supposed to be greater than or equal to 0. When it is 0,
|
||||||
|
/// it is the first entry of the buffer.
|
||||||
|
int64_t PrevSize = 0;
|
||||||
|
|
||||||
|
public:
|
||||||
|
bool isFirst() const { return !PrevSize; }
|
||||||
|
|
||||||
|
size_t getSize() const { return abs(Size); }
|
||||||
|
void setSize(size_t V) { Size = V; }
|
||||||
|
|
||||||
|
void setPrevSize(WarpAllocatorEntry *Prev) {
|
||||||
|
PrevSize = Prev ? Prev->getSize() : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }
|
||||||
|
|
||||||
|
// Note: isUsed can not be !isUnused or other way around because when Size is
|
||||||
|
// 0, it is uninitialized.
|
||||||
|
bool isUsed() const { return Size < 0; }
|
||||||
|
bool isUnused() const { return Size > 0; }
|
||||||
|
|
||||||
|
void setUsed() {
|
||||||
|
assert(isUnused() && "the entry is in use");
|
||||||
|
Size *= -1;
|
||||||
|
}
|
||||||
|
void setUnused() {
|
||||||
|
assert(isUsed() && "the entry is not in use");
|
||||||
|
Size *= -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
|
||||||
|
char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }
|
||||||
|
|
||||||
|
WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
|
||||||
|
WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }
|
||||||
|
|
||||||
|
static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }
|
||||||
|
|
||||||
|
static WarpAllocatorEntry *fromPtr(void *Ptr) {
|
||||||
|
return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
|
||||||
|
|
||||||
|
template <int32_t WARP_SIZE> struct WarpAllocator {
|
||||||
|
void init() {
|
||||||
|
if (mapping::isSPMDMode() &&
|
||||||
|
(mapping::getThreadIdInBlock() || mapping::getBlockId()))
|
||||||
|
return;
|
||||||
|
|
||||||
|
size_t HeapSize = omptarget_device_heap_size;
|
||||||
|
size_t FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
|
||||||
|
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
|
||||||
|
size_t OtherThreadHeapSize =
|
||||||
|
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
|
||||||
|
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
|
||||||
|
|
||||||
|
for (int I = 0; I < WARP_SIZE; ++I) {
|
||||||
|
Entries[I] = nullptr;
|
||||||
|
size_t PrivateOffset = OtherThreadHeapSize * I + FirstThreadHeapSize;
|
||||||
|
Limits[I] = omptarget_device_heap_buffer + PrivateOffset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void *allocate(size_t Size) {
|
||||||
|
int32_t TIdInWarp = mapping::getThreadIdInWarp();
|
||||||
|
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
|
||||||
|
|
||||||
|
// Error our early if the requested size is larger than the entire block.
|
||||||
|
if (Size > getBlockSize(TIdInWarp))
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
WarpAllocatorEntry *E = nullptr;
|
||||||
|
{
|
||||||
|
mutex::LockGuard LG(Locks[TIdInWarp]);
|
||||||
|
|
||||||
|
auto *LastEntry = Entries[TIdInWarp];
|
||||||
|
auto *NewWatermark =
|
||||||
|
(LastEntry ? LastEntry->getEndPtr() : getBlockBegin(TIdInWarp)) +
|
||||||
|
Size;
|
||||||
|
if (NewWatermark >= Limits[TIdInWarp]) {
|
||||||
|
E = findMemorySlow(Size, TIdInWarp);
|
||||||
|
} else {
|
||||||
|
E = LastEntry ? LastEntry->getNext()
|
||||||
|
: WarpAllocatorEntry::fromPtr(getBlockBegin(TIdInWarp));
|
||||||
|
E->setSize(Size);
|
||||||
|
E->setPrevSize(LastEntry);
|
||||||
|
Entries[TIdInWarp] = E;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!E)
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
assert(E->isUnused() && "entry is not set to use properly");
|
||||||
|
|
||||||
|
E->setUsed();
|
||||||
|
|
||||||
|
return E->getUserPtr();
|
||||||
|
}
|
||||||
|
|
||||||
|
void deallocate(void *Ptr) {
|
||||||
|
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
|
||||||
|
|
||||||
|
auto TIdInWarp = mapping::getThreadIdInWarp();
|
||||||
|
|
||||||
|
mutex::LockGuard LG(Locks[TIdInWarp]);
|
||||||
|
E->setUnused();
|
||||||
|
// Is last entry?
|
||||||
|
if (E == Entries[TIdInWarp]) {
|
||||||
|
do {
|
||||||
|
E = E->getPrev();
|
||||||
|
} while (!E->isFirst() && !E->isUsed());
|
||||||
|
Entries[TIdInWarp] = E;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
|
||||||
|
if (!utils::isInRange(P, omptarget_device_heap_buffer,
|
||||||
|
omptarget_device_heap_size))
|
||||||
|
return {};
|
||||||
|
|
||||||
|
auto TIdInWarp = mapping::getThreadIdInWarp();
|
||||||
|
for (int I = TIdInWarp; I < TIdInWarp + mapping::getWarpSize(); ++I) {
|
||||||
|
int TId = I % mapping::getWarpSize();
|
||||||
|
if (P < getBlockBegin(TId) || P >= getBlockEnd(TId))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
mutex::LockGuard LG(Locks[I]);
|
||||||
|
WarpAllocatorEntry *E = Entries[I];
|
||||||
|
if (!E)
|
||||||
|
return {};
|
||||||
|
if (E->getEndPtr() <= P)
|
||||||
|
return {};
|
||||||
|
do {
|
||||||
|
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
|
||||||
|
if (!E->isUsed())
|
||||||
|
return {};
|
||||||
|
return {E->getUserPtr(), E->getUserSize()};
|
||||||
|
}
|
||||||
|
E = E->getPrev();
|
||||||
|
} while (!E->isFirst());
|
||||||
|
}
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
char *getBlockBegin(int32_t TIdInWarp) const {
|
||||||
|
return TIdInWarp ? Limits[TIdInWarp - 1] : omptarget_device_heap_buffer;
|
||||||
|
}
|
||||||
|
char *getBlockEnd(int32_t TIdInWarp) const { return Limits[TIdInWarp]; }
|
||||||
|
|
||||||
|
size_t getBlockSize(int32_t TIdInWarp) const {
|
||||||
|
return getBlockEnd(TIdInWarp) - getBlockBegin(TIdInWarp);
|
||||||
|
}
|
||||||
|
|
||||||
|
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp) {
|
||||||
|
char *Ptr = getBlockBegin(TIdInWarp);
|
||||||
|
char *Limit = getBlockEnd(TIdInWarp);
|
||||||
|
|
||||||
|
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
|
||||||
|
do {
|
||||||
|
if (!E->isUsed() && E->getSize() >= Size)
|
||||||
|
break;
|
||||||
|
E = E->getNext();
|
||||||
|
if (reinterpret_cast<char *>(E) + Size > Limit)
|
||||||
|
return nullptr;
|
||||||
|
} while (1);
|
||||||
|
|
||||||
|
size_t OldSize = E->getSize();
|
||||||
|
if (OldSize - Size >= SplitThreadhold) {
|
||||||
|
auto *OldNext = E->getNext();
|
||||||
|
E->setSize(Size);
|
||||||
|
auto *LeftOverE = E->getNext();
|
||||||
|
LeftOverE->setPrevSize(E);
|
||||||
|
LeftOverE->setSize(OldSize - Size);
|
||||||
|
OldNext->setPrevSize(LeftOverE);
|
||||||
|
}
|
||||||
|
|
||||||
|
return E;
|
||||||
|
}
|
||||||
|
|
||||||
|
WarpAllocatorEntry *Entries[WARP_SIZE];
|
||||||
|
char *Limits[WARP_SIZE];
|
||||||
|
mutex::TicketLock Locks[WARP_SIZE];
|
||||||
|
};
|
||||||
|
|
||||||
|
WarpAllocator<32> Allocator;
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
namespace ompx {
|
||||||
|
namespace memory {
|
||||||
|
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
|
||||||
|
return Allocator.getMemoryAllocationInfo(P);
|
||||||
|
}
|
||||||
|
} // namespace memory
|
||||||
|
} // namespace ompx
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
void *malloc(size_t Size) {
|
||||||
|
if (!Size)
|
||||||
|
return nullptr;
|
||||||
|
void *P = Allocator.allocate(Size);
|
||||||
|
assert(P && "allocator out of memory");
|
||||||
|
assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
|
||||||
|
"misaligned address");
|
||||||
|
return P;
|
||||||
|
}
|
||||||
|
|
||||||
|
void free(void *P) {
|
||||||
|
if (!P)
|
||||||
|
return;
|
||||||
|
Allocator.deallocate(P);
|
||||||
|
}
|
||||||
|
|
||||||
|
void *realloc(void *ptr, size_t new_size) {
|
||||||
|
void *NewPtr = malloc(new_size);
|
||||||
|
if (!NewPtr)
|
||||||
|
return nullptr;
|
||||||
|
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
|
||||||
|
__builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
|
||||||
|
free(ptr);
|
||||||
|
return NewPtr;
|
||||||
|
}
|
||||||
|
|
||||||
|
void __kmpc_target_init_allocator() { Allocator.init(); }
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma omp end declare target
|
||||||
@@ -111,6 +111,38 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
|
|||||||
stride = loopSize; // make sure we only do 1 chunk per warp
|
stride = loopSize; // make sure we only do 1 chunk per warp
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void ForStaticNoLoopNoTrunk(int &last, T &lb, T &ub, ST &stride,
|
||||||
|
ST &chunk, T entityId,
|
||||||
|
T numberOfEntities) {
|
||||||
|
T loopSize = ub - lb + 1;
|
||||||
|
chunk = loopSize / numberOfEntities;
|
||||||
|
|
||||||
|
// We have more entities than iterations.
|
||||||
|
if (chunk == 0) {
|
||||||
|
chunk = 1;
|
||||||
|
lb = lb + entityId * chunk;
|
||||||
|
T inputUb = ub;
|
||||||
|
ub = lb + chunk - 1;
|
||||||
|
last = lb <= inputUb && inputUb <= ub;
|
||||||
|
stride = loopSize; // make sure we only do 1 chunk per warp
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
T leftOver = loopSize - chunk * numberOfEntities;
|
||||||
|
|
||||||
|
if (entityId < leftOver) {
|
||||||
|
chunk++;
|
||||||
|
lb = lb + entityId * chunk;
|
||||||
|
} else {
|
||||||
|
lb = lb + entityId * chunk + leftOver;
|
||||||
|
}
|
||||||
|
|
||||||
|
T inputUb = ub;
|
||||||
|
ub = lb + chunk - 1; // Clang uses i <= ub
|
||||||
|
last = lb <= inputUb && inputUb <= ub;
|
||||||
|
stride = loopSize; // make sure we only do 1 chunk per warp
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Support for Static Init
|
// Support for Static Init
|
||||||
|
|
||||||
@@ -184,6 +216,19 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
|
|||||||
omp_get_num_teams() * numberOfActiveOMPThreads);
|
omp_get_num_teams() * numberOfActiveOMPThreads);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case kmp_sched_distr_parallel_static_noloop_chunk: {
|
||||||
|
ForStaticChunk(lastiter, lb, ub, stride, chunk,
|
||||||
|
numberOfActiveOMPThreads * omp_get_team_num() + gtid,
|
||||||
|
omp_get_num_teams() * numberOfActiveOMPThreads);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case kmp_sched_distr_parallel_static_noloop_nochunk: {
|
||||||
|
ForStaticNoLoopNoTrunk(lastiter, lb, ub, stride, chunk,
|
||||||
|
numberOfActiveOMPThreads * omp_get_team_num() +
|
||||||
|
gtid,
|
||||||
|
omp_get_num_teams() * numberOfActiveOMPThreads);
|
||||||
|
break;
|
||||||
|
}
|
||||||
default: {
|
default: {
|
||||||
// ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
|
// ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
|
||||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
|
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
|
||||||
|
|||||||
@@ -11,8 +11,62 @@ _ZN4ompx*
|
|||||||
|
|
||||||
IsSPMDMode
|
IsSPMDMode
|
||||||
|
|
||||||
malloc
|
|
||||||
free
|
|
||||||
memcmp
|
|
||||||
printf
|
|
||||||
__assert_fail
|
__assert_fail
|
||||||
|
__ctype_b_loc
|
||||||
|
__ctype_tolower_loc
|
||||||
|
__xpg_basename
|
||||||
|
abs
|
||||||
|
atof
|
||||||
|
atoi
|
||||||
|
calloc
|
||||||
|
clock
|
||||||
|
clock_gettime
|
||||||
|
exit
|
||||||
|
fclose
|
||||||
|
feof
|
||||||
|
fflush
|
||||||
|
fgets
|
||||||
|
fopen
|
||||||
|
fprintf
|
||||||
|
fputs
|
||||||
|
fread
|
||||||
|
free
|
||||||
|
fscanf
|
||||||
|
fseek
|
||||||
|
ftell
|
||||||
|
fwrite
|
||||||
|
getc
|
||||||
|
gettimeofday
|
||||||
|
gmtime
|
||||||
|
malloc
|
||||||
|
memcmp
|
||||||
|
memcpy
|
||||||
|
memset
|
||||||
|
pclose
|
||||||
|
perror
|
||||||
|
popen
|
||||||
|
printf
|
||||||
|
qsort
|
||||||
|
rand
|
||||||
|
realloc
|
||||||
|
rewind
|
||||||
|
sprintf
|
||||||
|
srand
|
||||||
|
sscanf
|
||||||
|
stat
|
||||||
|
strcat
|
||||||
|
strchr
|
||||||
|
strcmp
|
||||||
|
strcpy
|
||||||
|
strftime
|
||||||
|
strlen
|
||||||
|
strncmp
|
||||||
|
strncpy
|
||||||
|
strstr
|
||||||
|
strtod
|
||||||
|
strtok
|
||||||
|
strtol
|
||||||
|
strtoul
|
||||||
|
time
|
||||||
|
tolower
|
||||||
|
strcasecmp
|
||||||
|
|||||||
12
openmp/libomptarget/DirectGPUCompilation/clang-gpu
Executable file
12
openmp/libomptarget/DirectGPUCompilation/clang-gpu
Executable file
@@ -0,0 +1,12 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
|
||||||
|
from driver.wrapper import run
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
run(is_cpp=False)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
12
openmp/libomptarget/DirectGPUCompilation/clang-gpu++
Executable file
12
openmp/libomptarget/DirectGPUCompilation/clang-gpu++
Executable file
@@ -0,0 +1,12 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
|
||||||
|
from driver.wrapper import run
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
run(is_cpp=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
40
openmp/libomptarget/DirectGPUCompilation/driver/Main.c
Normal file
40
openmp/libomptarget/DirectGPUCompilation/driver/Main.c
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
//===------- Main.c - Direct compilation program start point ------ C -----===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include <string.h>
|
||||||
|
extern int __user_main(int, char *[]);
|
||||||
|
extern void __kmpc_target_init_allocator(void);
|
||||||
|
|
||||||
|
#ifdef SINGLE_THREAD_EXECUTION
|
||||||
|
#define THREAD_LIMIT 1
|
||||||
|
#else
|
||||||
|
#define THREAD_LIMIT 1024
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
#pragma omp target enter data map(to: argv[:argc])
|
||||||
|
|
||||||
|
for (int I = 0; I < argc; ++I) {
|
||||||
|
#pragma omp target enter data map(to: argv[I][:strlen(argv[I])])
|
||||||
|
}
|
||||||
|
|
||||||
|
int Ret = 0;
|
||||||
|
#pragma omp target enter data map(to: Ret)
|
||||||
|
|
||||||
|
#pragma omp target teams num_teams(1) thread_limit(1)
|
||||||
|
{ __kmpc_target_init_allocator(); }
|
||||||
|
|
||||||
|
#pragma omp target teams num_teams(1) thread_limit(THREAD_LIMIT)
|
||||||
|
{
|
||||||
|
Ret = __user_main(argc, argv);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma omp target exit data map(from: Ret)
|
||||||
|
|
||||||
|
return Ret;
|
||||||
|
}
|
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
//===------- UserWrapper.h - User code wrapper --------------- C++ --------===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
|
||||||
|
#define OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
|
||||||
|
|
||||||
|
#pragma omp begin declare target device_type(nohost)
|
||||||
|
|
||||||
|
#endif
|
||||||
0
openmp/libomptarget/DirectGPUCompilation/driver/__init__.py
Executable file
0
openmp/libomptarget/DirectGPUCompilation/driver/__init__.py
Executable file
141
openmp/libomptarget/DirectGPUCompilation/driver/wrapper.py
Executable file
141
openmp/libomptarget/DirectGPUCompilation/driver/wrapper.py
Executable file
@@ -0,0 +1,141 @@
|
|||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
|
||||||
|
cwd = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
|
||||||
|
source_file_suffix = [".c", ".cpp", ".cu", ".hip", ".cc", ".cxx"]
|
||||||
|
|
||||||
|
|
||||||
|
def print_version():
|
||||||
|
cp = subprocess.run(["clang", "--version"])
|
||||||
|
if cp.returncode != 0:
|
||||||
|
sys.exit(cp.returncode)
|
||||||
|
|
||||||
|
|
||||||
|
def compile_loader(loader_name, targets, verbose, dry_run):
|
||||||
|
cmd = [
|
||||||
|
"clang",
|
||||||
|
"-c",
|
||||||
|
"-fopenmp",
|
||||||
|
"-foffload-lto",
|
||||||
|
"-fopenmp-offload-mandatory",
|
||||||
|
"-o",
|
||||||
|
loader_name,
|
||||||
|
]
|
||||||
|
if targets:
|
||||||
|
for arch in targets:
|
||||||
|
cmd.append("--offload-arch={}".format(arch))
|
||||||
|
else:
|
||||||
|
cmd.append("--offload-arch=native")
|
||||||
|
cmd.append(os.path.join(cwd, "Main.c"))
|
||||||
|
if verbose:
|
||||||
|
print(" ".join(cmd), file=sys.stderr)
|
||||||
|
if dry_run:
|
||||||
|
print(" ".join(cmd), file=sys.stderr)
|
||||||
|
return
|
||||||
|
cp = subprocess.run(cmd)
|
||||||
|
if cp.returncode != 0:
|
||||||
|
sys.exit(cp.returncode)
|
||||||
|
|
||||||
|
|
||||||
|
def invoke_clang(is_cpp, args, targets, verbose, dry_run):
|
||||||
|
cmd = [
|
||||||
|
"clang++" if is_cpp else "clang",
|
||||||
|
"-fopenmp",
|
||||||
|
"-foffload-lto",
|
||||||
|
"-fopenmp-offload-mandatory",
|
||||||
|
"-fopenmp-globalize-to-global-space",
|
||||||
|
"-include",
|
||||||
|
os.path.join(cwd, "UserWrapper.h"),
|
||||||
|
"--save-temps",
|
||||||
|
"-rdynamic",
|
||||||
|
"-mllvm",
|
||||||
|
"-enable-host-rpc",
|
||||||
|
"-mllvm",
|
||||||
|
"-openmp-opt-disable-state-machine-rewrite",
|
||||||
|
"-mllvm",
|
||||||
|
"-enable-canonicalize-main-function",
|
||||||
|
"-mllvm",
|
||||||
|
"-canonical-main-function-name=__user_main",
|
||||||
|
]
|
||||||
|
if targets:
|
||||||
|
for arch in targets:
|
||||||
|
cmd.append("--offload-arch={}".format(arch))
|
||||||
|
else:
|
||||||
|
cmd.append("--offload-arch=native")
|
||||||
|
cmd += args
|
||||||
|
if verbose:
|
||||||
|
print(" ".join(cmd))
|
||||||
|
if dry_run:
|
||||||
|
print(" ".join(cmd))
|
||||||
|
return
|
||||||
|
cp = subprocess.run(cmd)
|
||||||
|
if cp.returncode != 0:
|
||||||
|
sys.exit(cp.returncode)
|
||||||
|
|
||||||
|
|
||||||
|
def run(is_cpp=False):
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
prog="clang-gpu", description="clang LLVM GPU compiler"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-c",
|
||||||
|
action="store_true",
|
||||||
|
help="Only run preprocess, compile, and assemble steps",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-v", action="store_true", help="Show commands to run and use verbose output"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--version", action="store_true", help="Print version information"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-###",
|
||||||
|
action="store_true",
|
||||||
|
help="Print (but do not run) the commands to run for this compilation",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--offload-arch",
|
||||||
|
type=str,
|
||||||
|
help=(
|
||||||
|
"Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g."
|
||||||
|
" sm_35). If 'native' is used the compiler will detect locally installed"
|
||||||
|
" architectures. For HIP offloading, the device architecture can be"
|
||||||
|
" followed by target ID features delimited by a colon (e.g."
|
||||||
|
" gfx908:xnack+:sramecc-). May be specified more than once."
|
||||||
|
),
|
||||||
|
nargs="*",
|
||||||
|
)
|
||||||
|
|
||||||
|
args, fwd_args = parser.parse_known_args()
|
||||||
|
|
||||||
|
if args.version:
|
||||||
|
print_version()
|
||||||
|
return
|
||||||
|
|
||||||
|
if args.c:
|
||||||
|
fwd_args.append("-c")
|
||||||
|
if args.v:
|
||||||
|
fwd_args.append("-v")
|
||||||
|
|
||||||
|
dry_run = vars(args)["###"]
|
||||||
|
loader_name = None
|
||||||
|
temp_files = []
|
||||||
|
|
||||||
|
if not args.c:
|
||||||
|
tf = tempfile.NamedTemporaryFile()
|
||||||
|
loader_name = "{}.o".format(tf.name)
|
||||||
|
compile_loader(loader_name, args.offload_arch, args.v, dry_run)
|
||||||
|
fwd_args.append(loader_name)
|
||||||
|
temp_files.append(loader_name)
|
||||||
|
|
||||||
|
invoke_clang(is_cpp, fwd_args, args.offload_arch, args.v, dry_run)
|
||||||
|
|
||||||
|
for f in temp_files:
|
||||||
|
if os.path.isfile(f):
|
||||||
|
os.unlink(f)
|
||||||
67
openmp/libomptarget/include/HostRPC.h
Normal file
67
openmp/libomptarget/include/HostRPC.h
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
//===------- HostRPC.h - Host RPC ---------------------------- C++ --------===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
|
||||||
|
#define OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
|
||||||
|
|
||||||
|
#ifdef OMPTARGET_DEVICE_RUNTIME
|
||||||
|
#include "Types.h"
|
||||||
|
#else
|
||||||
|
#include <cstdint>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace hostrpc {
|
||||||
|
|
||||||
|
/// RPC call identifier. Note: negative value only. Non-negative values are for
|
||||||
|
/// compiler generated functions.
|
||||||
|
enum CallId {
|
||||||
|
CALLID___kmpc_launch_parallel_51_kernel = -1,
|
||||||
|
CALLID_invalid = -2147483648,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Execution status.
|
||||||
|
enum ExecutionStatus {
|
||||||
|
EXEC_STAT_CREATED = 0,
|
||||||
|
EXEC_STAT_DONE = 1,
|
||||||
|
};
|
||||||
|
|
||||||
|
enum Type {
|
||||||
|
ARG_LITERAL = 0,
|
||||||
|
ARG_POINTER = 1,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Argument {
|
||||||
|
intptr_t Value;
|
||||||
|
int64_t ArgType;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Descriptor {
|
||||||
|
// The following member will be used by both host and device.
|
||||||
|
int32_t Id;
|
||||||
|
struct Argument *Args;
|
||||||
|
int64_t NumArgs;
|
||||||
|
volatile int64_t Status;
|
||||||
|
volatile int64_t ReturnValue;
|
||||||
|
|
||||||
|
// The following members will only be used by device.
|
||||||
|
void **ArgInfo;
|
||||||
|
void *ArgMap;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// A wrapper of HostRPCDescriptor that will only be used between plugins and
|
||||||
|
/// libomptarget. It contains the three stdio global variables.
|
||||||
|
struct DescriptorWrapper {
|
||||||
|
Descriptor D;
|
||||||
|
void *StdIn = nullptr;
|
||||||
|
void *StdOut = nullptr;
|
||||||
|
void *StdErr = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace hostrpc
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -424,6 +424,10 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
|
|||||||
void *VAddr, bool IsRecord, bool SaveOutput,
|
void *VAddr, bool IsRecord, bool SaveOutput,
|
||||||
uint64_t &ReqPtrArgOffset);
|
uint64_t &ReqPtrArgOffset);
|
||||||
|
|
||||||
|
// Host RPC support
|
||||||
|
int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNo);
|
||||||
|
void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -69,7 +69,8 @@ VERS1.0 {
|
|||||||
__tgt_interop_init;
|
__tgt_interop_init;
|
||||||
__tgt_interop_use;
|
__tgt_interop_use;
|
||||||
__tgt_interop_destroy;
|
__tgt_interop_destroy;
|
||||||
ompt_libomptarget_connect;
|
__kmpc_host_rpc_get_arg;
|
||||||
|
__kmpc_host_rpc_set_ret_val;
|
||||||
local:
|
local:
|
||||||
*;
|
*;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -16,6 +16,8 @@
|
|||||||
#include "PluginManager.h"
|
#include "PluginManager.h"
|
||||||
#include "private.h"
|
#include "private.h"
|
||||||
|
|
||||||
|
#include "HostRPC.h"
|
||||||
|
|
||||||
#include "Shared/EnvironmentVar.h"
|
#include "Shared/EnvironmentVar.h"
|
||||||
#include "Shared/Profile.h"
|
#include "Shared/Profile.h"
|
||||||
|
|
||||||
@@ -504,3 +506,33 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
|
|||||||
delete AsyncInfo;
|
delete AsyncInfo;
|
||||||
*AsyncHandle = nullptr;
|
*AsyncHandle = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Host RPC support functions.
|
||||||
|
EXTERN int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNum) {
|
||||||
|
auto *W = reinterpret_cast<hostrpc::DescriptorWrapper *>(Wrapper);
|
||||||
|
auto &SD = W->D;
|
||||||
|
|
||||||
|
assert(ArgNum < SD.NumArgs && "out-of-range argument");
|
||||||
|
|
||||||
|
int64_t ArgVal = SD.Args[ArgNum].Value;
|
||||||
|
void *ArgPtr = reinterpret_cast<void *>(ArgVal);
|
||||||
|
|
||||||
|
DP("[host-rpc] get argno=%d arg=%lx...\n", ArgNum, ArgVal);
|
||||||
|
|
||||||
|
if (W->StdIn && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
|
||||||
|
ArgPtr == W->StdIn)
|
||||||
|
return (int64_t)stdin;
|
||||||
|
if (W->StdOut && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
|
||||||
|
ArgPtr == W->StdOut)
|
||||||
|
return (int64_t)stdout;
|
||||||
|
if (W->StdErr && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
|
||||||
|
ArgPtr == W->StdErr)
|
||||||
|
return (int64_t)stderr;
|
||||||
|
|
||||||
|
return ArgVal;
|
||||||
|
}
|
||||||
|
|
||||||
|
EXTERN void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal) {
|
||||||
|
auto &SD = reinterpret_cast<hostrpc::DescriptorWrapper *>(Wrapper)->D;
|
||||||
|
SD.ReturnValue = RetVal;
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user