Compare commits

...

28 Commits

Author SHA1 Message Date
Nicolas Marie
7a714f7795 Split runtime libs install into multiple directories 2024-08-09 10:12:57 -07:00
Nicolas Marie
6ee95b22e5 Fix segmentation fault from Attributor Attributes 2024-08-09 10:09:56 -07:00
Nicolas Marie
762b333c3a Fix issue where, when emitting an OMP dynamic allocation instead of pre-allocated memory, we tried to attach debug information to an invalid uninitialized address 2024-08-09 10:01:37 -07:00
Nicolas Marie
13beaff03f Add MPI header include directory to clang-mpi-gpu & add the ability to set up GPU MPI ranks & threads from the env variables MPI_RANKS & MPI_THREADS 2024-08-09 09:58:59 -07:00
Nicolas Marie
c7399b2039 Modify RTL to be able to use multi-grid syncs 2024-08-09 09:51:58 -07:00
Nicolas Marie
1ce8887a1a Temporarily remove the new operator that conflicts with the std C++ header when using host headers with host RPCs 2024-08-09 09:49:59 -07:00
Nicolas Marie
de0a400cdb Make host RPC debug information cleaner 2024-08-09 09:47:22 -07:00
Nicolas Marie
dc0bac14f1 Continue MPI implementation to run LULESH 2024-08-09 09:45:59 -07:00
Nicolas Marie
c8b441d7c4 Fix allocations and namespace 2024-07-03 15:25:03 -07:00
Nicolas Marie
6b947a0cb3 Remove leftover debug output 2024-07-03 08:58:32 -07:00
Nicolas Marie
bb9875d4b4 Continue implementation of MPI P2P 2024-07-02 17:29:14 -07:00
Nicolas Marie
0ae02a120c First try at MPI P2P communications 2024-07-02 09:40:45 -07:00
Nicolas Marie
ad1e11e0d9 Fix GPU-first RPCs to work when given a pointer
- Fix GPUFirst memory allocator to work with the new offload plugin.
- Fix TeamAllocator to not ignore the first allocation.
2024-06-18 16:42:25 -07:00
Nicolas Marie
3b1aae9380 Use GPUFirst with libc RPC
- Add missing headers in rpc.h.def
- Add an opcode in libc RPC to handle GPU-first host function calls
- Fix pointer casting
- Fix generated function to account for the AMD address space
- Remove duplicate libc FILE declarations
- Remove global variable to allow asynchronous RPC calls
2024-06-18 08:42:58 -07:00
Nicolas Marie
c59cbdebfd Fix wrapper to use -O1 (-O0 is not supported) 2024-06-17 17:46:29 -07:00
Nicolas Marie
87cc6ecc5f Fix libc RPC test 2024-06-17 17:40:54 -07:00
Nicolas Marie
8aa3a4431a Fix Attributor nullptr 2024-06-17 17:39:25 -07:00
Nicolas Marie
1582f964a4 Fix DeviceRTL atomics:
- Remove unsupported & unused operations
- Add scope awareness to NVIDIA atomicInc
2024-06-17 17:17:40 -07:00
Nicolas Marie
ef5d9941c3 Temporary fix for the main function canonicalization error. 2024-06-17 17:13:43 -07:00
Nicolas Marie
b4d5dec977 Fix: remove call to the removed LIBC_HAS_BUILTIN macro 2024-06-17 17:10:41 -07:00
Nicolas Marie
eae9159c1d Stop using outdated LTO pipeline for HostRPC 2024-06-17 17:09:15 -07:00
Nicolas Marie
b553ad7fd7 Replace getRawPointer with emitRawPointer 2024-06-17 17:03:07 -07:00
Nicolas Marie
3b6c5536c4 Remove unused CMakeLists.txt 2024-06-17 16:03:17 -07:00
Nicolas Marie
8980abe311 Remove outdated AutoRPC Folder 2024-06-17 14:57:29 -07:00
Joseph Huber
bf407f0829 [libc] Export the RPC interface from libc
Summary:
This patch adds new extensions that allow us to export the RPC interface
from `libc` to other programs. This should allow external users of the
GPU `libc` to interface with the RPC client (which more or less behaves
like syscalls in this context). This is done by wrapping the interface
into a C-style function call.

Obviously, this approach is far less safe than the carefully crafted C++
interface. For example, we now expose the internal packet buffer, and it
is theoretically possible to open a single port with conflicting opcodes
and break the whole interface. So, extra care will be needed when
interacting with this. However, the usage is very similar, as shown by
the new test.

This somewhat stretches the concept of `libc` just doing `libc` things,
but I think this is important enough to justify it. It is difficult to
split this out, as we do not want to have the situation where we have
multiple RPC clients running at one time, so for now it makes sense to
just leave it in `libc`.
2024-05-23 09:36:28 -07:00
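For context, a minimal sketch (not part of the commit message) of how device code might call the exported C-style interface, using the rpc_open_port / rpc_send_n / rpc_recv_n / rpc_close_port entrypoints that this changeset adds; it mirrors the rpc_external_interface_test.cpp integration test further down:

#include <gpu/rpc.h>
#include <stdint.h>

// Round-trip a counter through the host: send it, let the registered host
// handler increment it, then receive the updated value back.
// RPC_TEST_EXTERNAL comes from test_rpc_opcodes_t.h (assumed in scope here).
void increment_on_host(uint64_t *cnt) {
  size_t size = sizeof(uint64_t);
  rpc_port_t port = rpc_open_port(RPC_TEST_EXTERNAL); // opcode picks the handler
  rpc_send_n(&port, cnt, size);                       // ship the value out
  rpc_recv_n(&port, cnt, &size);                      // read the result back
  rpc_close_port(&port);                              // release the port
}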
Nicolas Marie
2a4a8ef91e Fix compilation issues after rebase of llvm-test-suite-gpu 2024-05-20 14:52:00 -07:00
Nicolas Marie
f1001ba68d Revert "[LTO] Remove Config.UseDefaultPipeline (#82587)"
This reverts commit ec24094b56.
We do need Config.UseDefaultPipeline.
2024-05-20 14:44:46 -07:00
Shilei Tian
1ef92ff738 [OpenMP] Add the initial support for direct gpu compilation
Rebase from llvm-test-suite-gpu, fix rebase conflicts in:

clang/include/clang/Driver/Options.td
clang/lib/CodeGen/CGDecl.cpp
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/lib/Driver/ToolChains/Clang.cpp
clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
llvm/lib/Passes/PassRegistry.def
llvm/lib/Transforms/IPO/AttributorAttributes.cpp
llvm/lib/Transforms/IPO/OpenMPOpt.cpp
openmp/libomptarget/DeviceRTL/src/Mapping.cpp
openmp/libomptarget/DeviceRTL/src/State.cpp
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
openmp/libomptarget/DeviceRTL/src/Utils.cpp
openmp/libomptarget/DeviceRTL/src/exports
openmp/libomptarget/include/omptarget.h
openmp/libomptarget/src/exports
openmp/libomptarget/src/interface.cpp
2024-05-20 14:44:29 -07:00
91 changed files with 6561 additions and 216 deletions


@@ -267,6 +267,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Globalize to global space for the globalized variables")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")


@@ -3561,6 +3561,10 @@ def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested
HelpText<"Assert no nested parallel regions in the GPU">,
MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
def fopenmp_globalize_to_global_space : Flag<["-"], "fopenmp-globalize-to-global-space">,
HelpText<"Globalize to global space for the globalized variables">,
MarshallingInfoFlag<LangOpts<"OpenMPGlobalizeToGlobalSpace">>;
} // let Group = f_Group
} // let Visibility = [ClangOption, CC1Option, FC1Option]
} // let Flags = [NoArgumentUnused, HelpHidden]


@@ -1475,6 +1475,7 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
bool NRVO = getLangOpts().ElideConstructors && D.isNRVOVariable();
bool VarAllocated = false; // omp dynamic allocation
if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
address = OpenMPLocalAddr;
AllocaAddr = OpenMPLocalAddr;
@@ -1608,7 +1609,6 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
// by the definition of the VLA. Since this is an escaped declaration, in
// OpenMP we have to use a call to __kmpc_alloc_shared(). The matching
// deallocation call to __kmpc_free_shared() is emitted later.
bool VarAllocated = false;
if (getLangOpts().OpenMPIsTargetDevice) {
auto &RT = CGM.getOpenMPRuntime();
if (RT.isDelayedVariableLengthDecl(*this, &D)) {
@@ -1668,7 +1668,7 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
emission.AllocaAddr = AllocaAddr;
// Emit debug info for local var declaration.
if (EmitDebugInfo && HaveInsertPoint()) {
if (EmitDebugInfo && HaveInsertPoint() && !VarAllocated) {
Address DebugAddr = address;
bool UsePointerValue = NRVO && ReturnValuePointer.isValid();
DI->setLocation(D.getLocation());
@@ -2577,48 +2577,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
(IPD->getParameterKind() == ImplicitParamKind::ThreadPrivateVar);
}
Address DeclPtr = Address::invalid();
RawAddress AllocaPtr = Address::invalid();
bool DoStore = false;
bool IsScalar = hasScalarEvaluationKind(Ty);
bool UseIndirectDebugAddress = false;
// If we already have a pointer to the argument, reuse the input pointer.
if (Arg.isIndirect()) {
DeclPtr = Arg.getIndirectAddress();
DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
// Indirect argument is in alloca address space, which may be different
// from the default address space.
auto AllocaAS = CGM.getASTAllocaAddressSpace();
auto *V = DeclPtr.emitRawPointer(*this);
AllocaPtr = RawAddress(V, DeclPtr.getElementType(), DeclPtr.getAlignment());
// For truly ABI indirect arguments -- those that are not `byval` -- store
// the address of the argument on the stack to preserve debug information.
ABIArgInfo ArgInfo = CurFnInfo->arguments()[ArgNo - 1].info;
if (ArgInfo.isIndirect())
UseIndirectDebugAddress = !ArgInfo.getIndirectByVal();
if (UseIndirectDebugAddress) {
auto PtrTy = getContext().getPointerType(Ty);
AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy),
D.getName() + ".indirect_addr");
EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy);
}
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
auto DestLangAS =
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
if (SrcLangAS != DestLangAS) {
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
CGM.getDataLayout().getAllocaAddrSpace());
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
DeclPtr =
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
*this, V, SrcLangAS, DestLangAS, T, true),
DeclPtr.isKnownNonNull());
}
auto PushCleanupIfNeeded = [this, Ty, &D](Address DeclPtr) {
// Push a destructor cleanup for this parameter if the ABI requires it.
// Don't push a cleanup in a thunk for a method that will also emit a
// cleanup.
@@ -2634,87 +2593,124 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
EHStack.stable_begin();
}
}
} else {
// Check if the parameter address is controlled by OpenMP runtime.
Address OpenMPLocalAddr =
getLangOpts().OpenMP
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
: Address::invalid();
if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
DeclPtr = OpenMPLocalAddr;
AllocaPtr = DeclPtr;
};
Address DeclPtr = Address::invalid();
RawAddress AllocaPtr = Address::invalid();
Address OpenMPLocalAddr =
getLangOpts().OpenMP
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
: Address::invalid();
bool DoStore = false;
bool IsScalar = hasScalarEvaluationKind(Ty);
bool UseIndirectDebugAddress = false;
if (OpenMPLocalAddr.isValid()) {
DeclPtr = OpenMPLocalAddr;
AllocaPtr = DeclPtr;
LValue Dst = MakeAddrLValue(DeclPtr, Ty);
if (Arg.isIndirect()) {
LValue Src = MakeAddrLValue(Arg.getIndirectAddress(), Ty);
callCStructCopyConstructor(Dst, Src);
PushCleanupIfNeeded(Arg.getIndirectAddress());
} else {
// Otherwise, create a temporary to hold the value.
EmitStoreOfScalar(Arg.getDirectValue(), Dst, /* isInitialization */ true);
}
} else {
// If we already have a pointer to the argument, reuse the input pointer.
if (Arg.isIndirect()) {
DeclPtr = Arg.getIndirectAddress();
DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
// Indirect argument is in alloca address space, which may be different
// from the default address space.
auto AllocaAS = CGM.getASTAllocaAddressSpace();
auto *V = DeclPtr.emitRawPointer(*this);
AllocaPtr = RawAddress(V, DeclPtr.getElementType(), DeclPtr.getAlignment());
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
auto DestLangAS =
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
if (SrcLangAS != DestLangAS) {
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
CGM.getDataLayout().getAllocaAddrSpace());
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
DeclPtr =
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
*this, V, SrcLangAS, DestLangAS, T, true),
DeclPtr.isKnownNonNull());
}
PushCleanupIfNeeded(DeclPtr);
} else {
// Create a temporary to hold the value.
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
D.getName() + ".addr", &AllocaPtr);
DoStore = true;
}
DoStore = true;
}
llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
LValue lv = MakeAddrLValue(DeclPtr, Ty);
if (IsScalar) {
Qualifiers qs = Ty.getQualifiers();
if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
// We honor __attribute__((ns_consumed)) for types with lifetime.
// For __strong, it's handled by just skipping the initial retain;
// otherwise we have to balance out the initial +1 with an extra
// cleanup to do the release at the end of the function.
bool isConsumed = D.hasAttr<NSConsumedAttr>();
LValue lv = MakeAddrLValue(DeclPtr, Ty);
if (IsScalar) {
Qualifiers qs = Ty.getQualifiers();
if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
// We honor __attribute__((ns_consumed)) for types with lifetime.
// For __strong, it's handled by just skipping the initial retain;
// otherwise we have to balance out the initial +1 with an extra
// cleanup to do the release at the end of the function.
bool isConsumed = D.hasAttr<NSConsumedAttr>();
// If a parameter is pseudo-strong then we can omit the implicit retain.
if (D.isARCPseudoStrong()) {
assert(lt == Qualifiers::OCL_Strong &&
"pseudo-strong variable isn't strong?");
assert(qs.hasConst() && "pseudo-strong variable should be const!");
lt = Qualifiers::OCL_ExplicitNone;
}
// If a parameter is pseudo-strong then we can omit the implicit retain.
if (D.isARCPseudoStrong()) {
assert(lt == Qualifiers::OCL_Strong &&
"pseudo-strong variable isn't strong?");
assert(qs.hasConst() && "pseudo-strong variable should be const!");
lt = Qualifiers::OCL_ExplicitNone;
}
// Load objects passed indirectly.
if (Arg.isIndirect() && !ArgVal)
ArgVal = Builder.CreateLoad(DeclPtr);
// Load objects passed indirectly.
if (Arg.isIndirect() && !ArgVal)
ArgVal = Builder.CreateLoad(DeclPtr);
if (lt == Qualifiers::OCL_Strong) {
if (!isConsumed) {
if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
// use objc_storeStrong(&dest, value) for retaining the
// object. But first, store a null into 'dest' because
// objc_storeStrong attempts to release its old value.
llvm::Value *Null = CGM.EmitNullConstant(D.getType());
EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
DoStore = false;
if (lt == Qualifiers::OCL_Strong) {
if (!isConsumed) {
if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
// use objc_storeStrong(&dest, value) for retaining the
// object. But first, store a null into 'dest' because
// objc_storeStrong attempts to release its old value.
llvm::Value *Null = CGM.EmitNullConstant(D.getType());
EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
DoStore = false;
} else
// Don't use objc_retainBlock for block pointers, because we
// don't want to Block_copy something just because we got it
// as a parameter.
ArgVal = EmitARCRetainNonBlock(ArgVal);
}
} else {
// Push the cleanup for a consumed parameter.
if (isConsumed) {
ARCPreciseLifetime_t precise =
(D.hasAttr<ObjCPreciseLifetimeAttr>() ? ARCPreciseLifetime
: ARCImpreciseLifetime);
EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(),
ArgVal, precise);
}
if (lt == Qualifiers::OCL_Weak) {
EmitARCInitWeak(DeclPtr, ArgVal);
DoStore = false; // The weak init is a store, no need to do two.
}
else
// Don't use objc_retainBlock for block pointers, because we
// don't want to Block_copy something just because we got it
// as a parameter.
ArgVal = EmitARCRetainNonBlock(ArgVal);
}
} else {
// Push the cleanup for a consumed parameter.
if (isConsumed) {
ARCPreciseLifetime_t precise = (D.hasAttr<ObjCPreciseLifetimeAttr>()
? ARCPreciseLifetime : ARCImpreciseLifetime);
EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(), ArgVal,
precise);
}
if (lt == Qualifiers::OCL_Weak) {
EmitARCInitWeak(DeclPtr, ArgVal);
DoStore = false; // The weak init is a store, no need to do two.
}
// Enter the cleanup scope.
EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
}
// Enter the cleanup scope.
EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
}
}
// Store the initial value into the alloca.
if (DoStore)
EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
// Store the initial value into the alloca.
if (DoStore)
EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
}
setAddrOfLocalVar(&D, DeclPtr);


@@ -1088,10 +1088,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for the variable to be globalized
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
? OMPRTL_malloc
: OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
// FIXME: We should use the variables actual alignment as an argument.
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment,
@@ -1122,12 +1124,14 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
CGM.getContext().getDeclAlign(VD),
AlignmentSource::Decl);
I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));
}
I->getSecond().MappedParams->apply(CGF);
}
bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
const VarDecl *VD) const {
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
@@ -1155,10 +1159,12 @@ CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
// Allocate space for this VLA object to be globalized.
llvm::Value *AllocArgs[] = {Size};
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
? OMPRTL_malloc
: OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
@@ -1184,20 +1190,29 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
// globalized in the prolog (i.e. emitGenericVarsProlog).
for (const auto &AddrSizePair :
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{AddrSizePair.first});
else
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
}
// Deallocate the memory for each globalized value
for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
const auto *VD = cast<VarDecl>(Rec.first);
I->getSecond().MappedParams->restore(CGF);
llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
CGF.getTypeSize(VD->getType())};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
FreeArgs);
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{Rec.second.GlobalizedVal});
else
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
OMPRTL___kmpc_free_shared),
{Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())});
}
}
}
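To make the new -fopenmp-globalize-to-global-space behavior concrete, a hedged sketch (illustrative only, not from the patch) of the kind of code it affects: an escaped local in a target region is normally globalized through __kmpc_alloc_shared / __kmpc_free_shared, and with the flag those calls are replaced by plain malloc / free as shown in the hunks above.

void escape(int *p); // opaque use that forces globalization

void kernel() {
#pragma omp target teams
  {
    int x = 0;   // escapes into the parallel region, so it is globalized
#pragma omp parallel
    escape(&x);  // with the flag, storage comes from malloc and is released
                 // with free instead of __kmpc_alloc_shared/__kmpc_free_shared
  }
}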


@@ -6641,6 +6641,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fopenmp-offload-mandatory");
if (Args.hasArg(options::OPT_fopenmp_force_usm))
CmdArgs.push_back("-fopenmp-force-usm");
if (Args.hasArg(options::OPT_fopenmp_globalize_to_global_space))
CmdArgs.push_back("-fopenmp-globalize-to-global-space");
break;
default:
// By default, if Clang doesn't know how to generate useful OpenMP code


@@ -83,6 +83,8 @@ inline void *operator new(__SIZE_TYPE__ size) {
inline void *operator new[](__SIZE_TYPE__ size) { return ::operator new(size); }
//inline void *operator new(__SIZE_TYPE__ size, void *ptr) { return ptr; }
inline void operator delete(void *ptr)OPENMP_NOEXCEPT { ::free(ptr); }
inline void operator delete[](void *ptr) OPENMP_NOEXCEPT {


@@ -0,0 +1,33 @@
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
// expected-no-diagnostics
extern int printf(const char *, ...);
struct S {
int a;
float b;
};
// CHECK: define{{.*}}void @test(%struct.S* noundef byval(%struct.S) align {{[0-9]+}} [[arg:%[0-9a-zA-Z]+]])
// CHECK: [[g:%[0-9a-zA-Z]+]] = call align {{[0-9]+}} i8* @__kmpc_alloc_shared
// CHECK: bitcast i8* [[g]] to %struct.S*
// CHECK: bitcast %struct.S* [[arg]] to i8**
// CHECK: call void [[cc:@__copy_constructor[_0-9a-zA-Z]+]]
// CHECK: void [[cc]]
void test(struct S s) {
#pragma omp parallel for
for (int i = 0; i < s.a; ++i) {
printf("%i : %i : %f\n", i, s.a, s.b);
}
}
void foo() {
#pragma omp target teams num_teams(1)
{
struct S s;
s.a = 7;
s.b = 11;
test(s);
}
}


@@ -137,6 +137,13 @@ static constexpr OptTable::Info InfoTable[] = {
#undef OPTION
};
/// Host RPC module that will be shared to the corresponding pass.
Module *HostModule = nullptr;
/// We only need to generate the host RPC module once.
bool IsHostModuleGenerated = false;
/// Host RPC object file.
StringRef HostRPCObjFile;
class WrapperOptTable : public opt::GenericOptTable {
public:
WrapperOptTable() : opt::GenericOptTable(InfoTable) {}
@@ -615,10 +622,12 @@ std::vector<std::string> getTargetFeatures(ArrayRef<OffloadFile> InputFiles) {
return UnifiedFeatures;
}
template <typename ModuleHook = function_ref<bool(size_t, const Module &)>>
template <typename PreHookTy = function_ref<bool(size_t, const Module &)>,
typename PostHookTy = function_ref<bool(size_t, const Module &)>>
std::unique_ptr<lto::LTO> createLTO(
const ArgList &Args, const std::vector<std::string> &Features,
ModuleHook Hook = [](size_t, const Module &) { return true; }) {
PreHookTy PreHook = [](size_t, const Module &) { return true; },
PostHookTy PostHook = [](size_t, const Module &) { return true; }) {
const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
// We need to remove AMD's target-id from the processor if present.
StringRef Arch = Args.getLastArgValue(OPT_arch_EQ).split(":").first;
@@ -673,7 +682,9 @@ std::unique_ptr<lto::LTO> createLTO(
return true;
};
}
Conf.PostOptModuleHook = Hook;
Conf.PreOptModuleHook = PreHook;
Conf.PostOptModuleHook = PostHook;
Conf.CGFileType = (Triple.isNVPTX() || SaveTemps)
? CodeGenFileType::AssemblyFile
: CodeGenFileType::ObjectFile;
@@ -692,6 +703,57 @@ bool isValidCIdentifier(StringRef S) {
[](char C) { return C == '_' || isAlnum(C); });
}
bool writeHostModule(std::string &FileName) {
if (!HostModule)
return false;
if (HostModule->getFunctionList().empty())
return false;
auto HostTriple = HostModule->getTargetTriple();
FileName =
sys::path::filename(ExecutableName).str() + "-host-rpc-" + HostTriple;
auto TempFileOrErr = createOutputFile(FileName, "bc");
if (!TempFileOrErr)
reportError(TempFileOrErr.takeError());
int FD = -1;
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
reportError(errorCodeToError(EC));
auto Out = std::make_unique<llvm::raw_fd_ostream>(FD, true);
WriteBitcodeToFile(*HostModule, *Out);
return true;
}
std::unique_ptr<lto::LTO> createHostRPCLTO(StringRef HostTriple) {
const llvm::Triple Triple(HostTriple);
lto::Config Conf;
lto::ThinBackend Backend;
Backend =
lto::createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
// TODO: host arch?
// Conf.CPU = Arch.str();
Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple);
// TODO: host features?
// Conf.MAttrs = Features;
Conf.CGOptLevel = *CodeGenOpt::getLevel(3);
Conf.OptLevel = 3;
Conf.DefaultTriple = Triple.getTriple();
LTOError = false;
Conf.DiagHandler = diagnosticHandler;
Conf.PTO.LoopVectorization = Conf.OptLevel > 1;
Conf.PTO.SLPVectorization = Conf.OptLevel > 1;
Conf.CGFileType = CodeGenFileType::ObjectFile;
Conf.HasWholeProgramVisibility = false;
return std::make_unique<lto::LTO>(std::move(Conf), Backend);
}
Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
SmallVectorImpl<StringRef> &OutputFiles,
const ArgList &Args) {
@@ -777,14 +839,52 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
BitcodeOutput.push_back(*TempFileOrErr);
return false;
};
auto AddHostModuleAddr = [&](size_t, const Module &M) {
if (!HostModule)
return true;
Module &CM = const_cast<Module &>(M);
auto *MD = CM.getOrInsertNamedMetadata("llvm.hostrpc.hostmodule");
MD->clearOperands();
MD->addOperand(MDTuple::get(
CM.getContext(), {ConstantAsMetadata::get(ConstantInt::get(
Type::getInt64Ty(CM.getContext()),
reinterpret_cast<uintptr_t>(HostModule)))}));
return true;
};
// We assume visibility of the whole program if every input file was bitcode.
auto Features = getTargetFeatures(BitcodeInputFiles);
auto LTOBackend = Args.hasArg(OPT_embed_bitcode) ||
Args.hasArg(OPT_builtin_bitcode_EQ) ||
Args.hasArg(OPT_clang_backend)
? createLTO(Args, Features, OutputBitcode)
: createLTO(Args, Features);
? createLTO(Args, Features, AddHostModuleAddr, OutputBitcode)
: createLTO(Args, Features, AddHostModuleAddr);
LLVMContext &Ctx = LTOBackend->getContext();
StringRef HostTriple =
Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple());
std::unique_ptr<Module> HostModulePtr;
if (!IsHostModuleGenerated) {
HostModulePtr = std::make_unique<Module>(
sys::path::filename(ExecutableName).str() + "-host-rpc.bc", Ctx);
HostModule = HostModulePtr.get();
HostModulePtr->setTargetTriple(HostTriple);
std::string Msg;
const Target *T =
TargetRegistry::lookupTarget(HostModule->getTargetTriple(), Msg);
if (!T)
return createStringError(inconvertibleErrorCode(), Msg);
auto Options =
codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple(HostTriple));
StringRef CPU = "";
StringRef Features = "";
std::unique_ptr<TargetMachine> TM(
T->createTargetMachine(HostTriple, CPU, Features, Options, Reloc::PIC_,
HostModule->getCodeModel()));
HostModule->setDataLayout(TM->createDataLayout());
}
// We need to resolve the symbols so the LTO backend knows which symbols need
// to be kept or can be internalized. This is a simplified symbol resolution
@@ -808,6 +908,7 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
size_t Idx = 0;
for (auto &Sym : Symbols) {
lto::SymbolResolution &Res = Resolutions[Idx++];
// We will use this as the prevailing symbol definition in LTO unless
@@ -878,6 +979,57 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
if (Error Err = LTOBackend->run(AddStream))
return Err;
std::string HostModuleTempFile;
bool ValidHostModule = writeHostModule(HostModuleTempFile);
// Reset the HostModule pointer.
HostModulePtr.reset();
HostModule = nullptr;
// TODO: this is really redundant code.
if (ValidHostModule) {
auto HostLTO = createHostRPCLTO(HostTriple);
std::string HostBitCodeFile = HostModuleTempFile + ".bc";
auto BufferOrError = MemoryBuffer::getFile(HostBitCodeFile);
if (!BufferOrError)
reportError(createFileError(HostBitCodeFile, BufferOrError.getError()));
Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr =
llvm::lto::InputFile::create(*BufferOrError.get());
if (!BitcodeFileOrErr)
return BitcodeFileOrErr.takeError();
const auto Symbols = (*BitcodeFileOrErr)->symbols();
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
size_t Idx = 0;
for (auto &Sym : Symbols) {
(void)Sym;
lto::SymbolResolution &Res = Resolutions[Idx++];
Res.ExportDynamic = true;
Res.VisibleToRegularObj = true;
Res.LinkerRedefined = false;
Res.Prevailing = true;
}
if (Error Err = HostLTO->add(std::move(*BitcodeFileOrErr), Resolutions))
return Err;
auto RPCAddStream =
[&](size_t Task,
const Twine &ModuleName) -> std::unique_ptr<CachedFileStream> {
int FD = -1;
auto TempFileOrErr = createOutputFile(
sys::path::filename(ExecutableName) + "-host-rpc-" + HostTriple, "o");
if (!TempFileOrErr)
reportError(TempFileOrErr.takeError());
HostRPCObjFile = *TempFileOrErr;
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
reportError(errorCodeToError(EC));
return std::make_unique<CachedFileStream>(
std::make_unique<llvm::raw_fd_ostream>(FD, true));
};
if (Error Err = HostLTO->run(RPCAddStream))
return Err;
}
if (LTOError)
return createStringError(inconvertibleErrorCode(),
"Errors encountered inside the LTO pipeline.");
@@ -1285,6 +1437,9 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
WrappedOutput.push_back(*OutputOrErr);
}
if (!HostRPCObjFile.empty())
WrappedOutput.push_back(HostRPCObjFile);
return WrappedOutput;
}

View File

@@ -248,12 +248,12 @@ else()
endif()
if(LIBC_TARGET_TRIPLE)
set(LIBC_INSTALL_LIBRARY_DIR lib${LLVM_LIBDIR_SUFFIX}/${LIBC_TARGET_TRIPLE})
set(LIBC_INSTALL_LIBRARY_DIR "lib${LLVM_LIBDIR_SUFFIX}/${LIBC_TARGET_TRIPLE}/libc")
elseif(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR)
set(LIBC_INSTALL_LIBRARY_DIR
lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE})
"lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE}/libc")
else()
set(LIBC_INSTALL_LIBRARY_DIR lib${LLVM_LIBDIR_SUFFIX})
set(LIBC_INSTALL_LIBRARY_DIR "lib${LLVM_LIBDIR_SUFFIX}")
endif()
if(LIBC_TARGET_OS_IS_GPU)


@@ -212,6 +212,10 @@ set(TARGET_LIBC_ENTRYPOINTS
# gpu/rpc.h entrypoints
libc.src.gpu.rpc_host_call
libc.src.gpu.rpc_fprintf
libc.src.gpu.rpc_open_port
libc.src.gpu.rpc_send_n
libc.src.gpu.rpc_recv_n
libc.src.gpu.rpc_close_port
)
set(TARGET_LIBM_ENTRYPOINTS


@@ -636,6 +636,7 @@ if(LIBC_TARGET_OS_IS_GPU)
DEPENDS
.llvm_libc_common_h
.llvm-libc-types.rpc_opcodes_t
.llvm-libc-types.rpc_port_t
)
endif()


@@ -11,7 +11,13 @@
#include "__llvm-libc-common.h"
#include "llvm-libc-types/rpc_opcodes_t.h"
#include <llvm-libc-types/size_t.h>
#include <llvm-libc-types/rpc_opcodes_t.h>
#include <llvm-libc-types/rpc_port_t.h>
#include <stddef.h>
#include <stdio.h>
%%public_api()


@@ -114,6 +114,7 @@ add_header(socklen_t HDR socklen_t.h)
add_header(struct_sockaddr_un HDR struct_sockaddr_un.h)
add_header(struct_sockaddr HDR struct_sockaddr.h)
add_header(rpc_opcodes_t HDR rpc_opcodes_t.h)
add_header(rpc_port_t HDR rpc_port_t.h)
add_header(ACTION HDR ACTION.h)
add_header(ENTRY HDR ENTRY.h)
add_header(struct_hsearch_data HDR struct_hsearch_data.h)


@@ -34,6 +34,7 @@ typedef enum {
RPC_PRINTF_TO_STDOUT,
RPC_PRINTF_TO_STDERR,
RPC_PRINTF_TO_STREAM,
RPC_GPUFIRST,
RPC_LAST = 0xFFFF,
} rpc_opcode_t;


@@ -0,0 +1,16 @@
//===-- Definition of type rpc_port_t -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef __LLVM_LIBC_TYPES_RPC_PORT_T_H__
#define __LLVM_LIBC_TYPES_RPC_PORT_T_H__
typedef struct {
__UINT8_TYPE__ reserved[32];
} rpc_port_t;
#endif // __LLVM_LIBC_TYPES_RPC_PORT_T_H__
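The 32 reserved bytes make rpc_port_t an opaque, fixed-size stand-in for the C++ rpc::Client::Port: the implementation files later in this changeset round-trip the handle through cpp::bit_cast behind a static_assert that the sizes match. A minimal sketch of that pattern (hypothetical helper, same shape as rpc_send_n.cpp below):

// Operate on the C++ port hiding behind the opaque C handle.
static void with_port(rpc_port_t *handle, const void *src, size_t size) {
  static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port),
                "ABI mismatch");
  rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
  port.send_n(src, size);                     // use the real port type
  *handle = cpp::bit_cast<rpc_port_t>(port);  // persist the updated state
}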


@@ -15,6 +15,7 @@ typedef enum : unsigned short {
RPC_TEST_NOOP = 1 << 15,
RPC_TEST_INCREMENT,
RPC_TEST_INTERFACE,
RPC_TEST_EXTERNAL,
RPC_TEST_STREAM,
} rpc_test_opcode_t;


@@ -79,7 +79,7 @@ install(
if(LIBC_TARGET_OS_IS_GPU)
set(gpu_install_dir lib${LLVM_LIBDIR_SUFFIX})
if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR)
set(gpu_install_dir lib${LLVM_LIBDIR_SUFFIX}/${LLVM_HOST_TRIPLE})
set(gpu_install_dir "lib${LLVM_LIBDIR_SUFFIX}/${LLVM_HOST_TRIPLE}/libcgpu")
endif()
install(
TARGETS ${added_gpu_archive_targets}


@@ -1,8 +1,11 @@
def RPCPortT : NamedType<"rpc_port_t">;
def RPCPortPtrT : PtrType<RPCPortT>;
def GPUExtensions : StandardSpec<"GPUExtensions"> {
HeaderSpec RPC = HeaderSpec<
"gpu/rpc.h",
[], // Macros
[], // Types
[RPCPortT, SizeTType], // Types
[], // Enumerations
[
FunctionSpec<
@@ -18,6 +21,26 @@ def GPUExtensions : StandardSpec<"GPUExtensions"> {
ArgSpec<VoidPtr>,
ArgSpec<SizeTType>]
>,
FunctionSpec<
"rpc_open_port",
RetValSpec<RPCPortT>,
[ArgSpec<UnsignedIntType>]
>,
FunctionSpec<
"rpc_send_n",
RetValSpec<VoidType>,
[ArgSpec<RPCPortPtrT>, ArgSpec<VoidPtr>, ArgSpec<SizeTType>]
>,
FunctionSpec<
"rpc_recv_n",
RetValSpec<VoidType>,
[ArgSpec<RPCPortPtrT>, ArgSpec<VoidPtr>, ArgSpec<SizeTPtr>]
>,
FunctionSpec<
"rpc_close_port",
RetValSpec<VoidType>,
[ArgSpec<RPCPortPtrT>]
>,
]
>;
let Headers = [


@@ -24,12 +24,14 @@ namespace LIBC_NAMESPACE::cpp {
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
#endif
// This implementation of bit_cast requires trivially-constructible To, to avoid
// UB in the implementation.
template <typename To, typename From>
LIBC_INLINE constexpr cpp::enable_if_t<
(sizeof(To) == sizeof(From)) &&
// An implementation of bit_cast that cannot use the compiler builtin requires
// a trivially-constructible To, to avoid UB in the implementation.
#if !__has_builtin(__builtin_bit_cast)
cpp::is_trivially_constructible<To>::value &&
#endif
cpp::is_trivially_copyable<To>::value &&
cpp::is_trivially_copyable<From>::value,
To>


@@ -320,6 +320,7 @@ public:
LIBC_INLINE void send_n(const void *src, uint64_t size);
template <typename A>
LIBC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
LIBC_INLINE void recv_n(void *dst, uint64_t *size);
LIBC_INLINE uint16_t get_opcode() const {
return process.header[index].opcode;
@@ -359,7 +360,9 @@ struct Client {
: process(port_count, buffer) {}
using Port = rpc::Port<false>;
template <uint16_t opcode> LIBC_INLINE Port open();
template <uint16_t opcode> LIBC_INLINE Port open() { return open(opcode); }
LIBC_INLINE Port open(uint16_t opcode);
private:
Process<false> process;
@@ -484,6 +487,14 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
}
}
/// Helper routine to simplify the interface when receiving from the GPU using
/// thread private pointers to the underlying value, where the destination
/// pointer already provides enough storage to receive the values.
template <bool T> LIBC_INLINE void Port<T>::recv_n(void *dst, uint64_t *size) {
void **dst_ptr = &dst;
recv_n(dst_ptr, size, [=](uint64_t) { return dst; });
}
/// Receives an arbitrarily sized data buffer across the shared channel in
/// multiples of the packet length. The \p alloc function is called with the
/// size of the data so that we can initialize the size of the \p dst buffer.
@@ -522,9 +533,9 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
/// is, there are send operations pending that haven't been serviced on this
/// port. Each port instance uses an associated \p opcode to tell the server
/// what to do. The Client interface provides the appropriate lane size to the
/// port using the platform's returned value.
template <uint16_t opcode>
[[clang::convergent]] LIBC_INLINE Client::Port Client::open() {
/// port using the platform's returned value. It is required that \p opcode is
/// uniform between all the lanes for this to work.
[[clang::convergent]] LIBC_INLINE Client::Port Client::open(uint16_t opcode) {
// Repeatedly perform a naive linear scan for a port that can be opened to
// send data.
for (uint32_t index = gpu::get_cluster_id();; ++index) {


@@ -1073,24 +1073,13 @@ using make_integral_or_big_int_signed_t =
namespace cpp {
// Specialization of cpp::bit_cast ('bit.h') from T to BigInt.
template <typename To, typename From>
LIBC_INLINE constexpr cpp::enable_if_t<
(sizeof(To) == sizeof(From)) && cpp::is_trivially_copyable<To>::value &&
cpp::is_trivially_copyable<From>::value && is_big_int<To>::value,
To>
bit_cast(const From &from) {
To out;
using Storage = decltype(out.val);
out.val = cpp::bit_cast<Storage>(from);
return out;
}
// Specialization of cpp::bit_cast ('bit.h') from BigInt to T.
template <typename To, size_t Bits>
LIBC_INLINE constexpr cpp::enable_if_t<
sizeof(To) == sizeof(UInt<Bits>) &&
#if !__has_builtin(__builtin_bit_cast)
cpp::is_trivially_constructible<To>::value &&
#endif
cpp::is_trivially_copyable<To>::value &&
cpp::is_trivially_copyable<UInt<Bits>>::value,
To>


@@ -17,6 +17,59 @@ add_entrypoint_object(
rpc_fprintf.h
DEPENDS
libc.src.stdio.gpu.gpu_file
)
add_entrypoint_object(
rpc_open_port
SRCS
rpc_open_port.cpp
HDRS
rpc_open_port.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
add_entrypoint_object(
rpc_close_port
SRCS
rpc_close_port.cpp
HDRS
rpc_close_port.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
add_entrypoint_object(
rpc_get_buffer
SRCS
rpc_get_buffer.cpp
HDRS
rpc_get_buffer.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
add_entrypoint_object(
rpc_send_n
SRCS
rpc_send_n.cpp
HDRS
rpc_send_n.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
add_entrypoint_object(
rpc_recv_n
SRCS
rpc_recv_n.cpp
HDRS
rpc_recv_n.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)


@@ -0,0 +1,25 @@
//===---------- GPU implementation of the external RPC port interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/gpu/rpc_close_port.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
LLVM_LIBC_FUNCTION(void, rpc_close_port, (rpc_port_t * handle)) {
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
port.close();
*handle = cpp::bit_cast<rpc_port_t>(port);
}
} // namespace LIBC_NAMESPACE


@@ -0,0 +1,20 @@
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H
#define LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H
#include <gpu/rpc.h>
namespace LIBC_NAMESPACE {
void rpc_close_port(rpc_port_t *handle);
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H


@@ -0,0 +1,26 @@
//===---------- GPU implementation of the external RPC port interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/gpu/rpc_open_port.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
LLVM_LIBC_FUNCTION(rpc_port_t, rpc_open_port, (unsigned opcode)) {
uint32_t uniform = gpu::broadcast_value(gpu::get_lane_mask(), opcode);
rpc::Client::Port port = rpc::client.open(static_cast<uint16_t>(uniform));
return cpp::bit_cast<rpc_port_t>(port);
}
} // namespace LIBC_NAMESPACE


@@ -0,0 +1,20 @@
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H
#define LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H
#include <gpu/rpc.h>
namespace LIBC_NAMESPACE {
rpc_port_t rpc_open_port(unsigned opcode);
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H


@@ -0,0 +1,26 @@
//===---------- GPU implementation of the external RPC port interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/gpu/rpc_recv_n.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
LLVM_LIBC_FUNCTION(void, rpc_recv_n,
(rpc_port_t * handle, void *dst, size_t *size)) {
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
port.recv_n(dst, reinterpret_cast<uint64_t *>(size));
*handle = cpp::bit_cast<rpc_port_t>(port);
}
} // namespace LIBC_NAMESPACE

libc/src/gpu/rpc_recv_n.h

@@ -0,0 +1,20 @@
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_GPU_RPC_RECV_H
#define LLVM_LIBC_SRC_GPU_RPC_RECV_H
#include <gpu/rpc.h>
namespace LIBC_NAMESPACE {
void rpc_recv_n(rpc_port_t *handle, void *dst, size_t *size);
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_GPU_RPC_RECV_H


@@ -0,0 +1,26 @@
//===---------- GPU implementation of the external RPC port interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/gpu/rpc_send_n.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
LLVM_LIBC_FUNCTION(void, rpc_send_n,
(rpc_port_t * handle, const void *src, size_t size)) {
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
port.send_n(src, size);
*handle = cpp::bit_cast<rpc_port_t>(port);
}
} // namespace LIBC_NAMESPACE

libc/src/gpu/rpc_send_n.h

@@ -0,0 +1,20 @@
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_GPU_RPC_SEND_H
#define LLVM_LIBC_SRC_GPU_RPC_SEND_H
#include <gpu/rpc.h>
namespace LIBC_NAMESPACE {
void rpc_send_n(rpc_port_t *handle, const void *src, size_t size);
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_GPU_RPC_SEND_H


@@ -44,6 +44,21 @@ add_integration_test(
rpc_interface_test.cpp
)
add_integration_test(
startup_rpc_external_interface_test
SUITE libc-startup-tests
SRCS
rpc_external_interface_test.cpp
DEPENDS
libc.src.gpu.rpc_open_port
libc.src.gpu.rpc_send_n
libc.src.gpu.rpc_recv_n
libc.src.gpu.rpc_close_port
LOADER_ARGS
--threads 32
--blocks 8
)
add_integration_test(
startup_rpc_stream_test
SUITE libc-startup-tests


@@ -0,0 +1,43 @@
//===-- Loader test to check the external RPC interface with the loader ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <gpu/rpc.h>
#include "src/gpu/rpc_close_port.h"
#include "src/gpu/rpc_open_port.h"
#include "src/gpu/rpc_recv_n.h"
#include "src/gpu/rpc_send_n.h"
#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "test/IntegrationTest/test.h"
using namespace LIBC_NAMESPACE;
static void test_interface() {
uint32_t num_additions =
10 + 10 * gpu::get_thread_id() + 10 * gpu::get_block_id();
uint64_t cnt = 0;
for (uint32_t i = 0; i < num_additions; ++i) {
size_t size = sizeof(uint64_t);
rpc_port_t port = LIBC_NAMESPACE::rpc_open_port(RPC_TEST_EXTERNAL);
LIBC_NAMESPACE::rpc_send_n(&port, &cnt, size);
LIBC_NAMESPACE::rpc_recv_n(&port, &cnt, &size);
LIBC_NAMESPACE::rpc_close_port(&port);
ASSERT_TRUE(size == sizeof(uint64_t));
}
ASSERT_TRUE(cnt == num_additions && "Invalid number of increments");
}
TEST_MAIN(int argc, char **argv, char **envp) {
test_interface();
return 0;
}


@@ -11,8 +11,8 @@
#include "utils/gpu/server/llvmlibc_rpc_server.h"
#include "llvm-libc-types/rpc_opcodes_t.h"
#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
#include "llvm-libc-types/rpc_opcodes_t.h"
#include <cstddef>
#include <cstdint>
@@ -222,6 +222,28 @@ inline void register_rpc_callbacks(rpc_device_t device) {
}
},
nullptr);
// Register the external interface test handler.
rpc_register_callback(
device, static_cast<rpc_opcode_t>(RPC_TEST_EXTERNAL),
[](rpc_port_t port, void *data) {
uint64_t sizes[lane_size] = {0};
void *dst[lane_size] = {nullptr};
rpc_recv_n(
port, dst, sizes,
[](uint64_t size, void *) -> void * { return new char[size]; },
nullptr);
for (uint64_t i = 0; i < lane_size; ++i) {
if (dst[i])
*reinterpret_cast<uint64_t *>(dst[i]) += 1;
}
rpc_send_n(port, dst, sizes);
for (uint64_t i = 0; i < lane_size; ++i) {
if (dst[i])
delete[] reinterpret_cast<uint8_t *>(dst[i]);
}
},
nullptr);
}
#endif


@@ -25,6 +25,19 @@ enum OMPTgtExecModeFlags : unsigned char {
OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
};
enum OMPTgtHostRPCArgType {
// No need to copy.
OMP_HOST_RPC_ARG_SCALAR = 0,
OMP_HOST_RPC_ARG_PTR = 1,
// Copy to device.
OMP_HOST_RPC_ARG_COPY_TO = OMP_HOST_RPC_ARG_PTR | (1 << 1),
// Copy from device.
OMP_HOST_RPC_ARG_COPY_FROM = OMP_HOST_RPC_ARG_PTR | (1 << 2),
// Copy to and from device.
OMP_HOST_RPC_ARG_COPY_TOFROM =
OMP_HOST_RPC_ARG_COPY_TO | OMP_HOST_RPC_ARG_COPY_FROM,
};
} // end namespace omp
} // end namespace llvm
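A small illustration of how these flags compose (hypothetical helpers, assuming the enum above is in scope): every OMP_HOST_RPC_ARG_COPY_* value carries the OMP_HOST_RPC_ARG_PTR bit, and OMP_HOST_RPC_ARG_COPY_TOFROM satisfies both predicates.

static bool needsCopyTo(unsigned ArgType) {
  return (ArgType & llvm::omp::OMP_HOST_RPC_ARG_COPY_TO) ==
         llvm::omp::OMP_HOST_RPC_ARG_COPY_TO;
}
static bool needsCopyFrom(unsigned ArgType) {
  return (ArgType & llvm::omp::OMP_HOST_RPC_ARG_COPY_FROM) ==
         llvm::omp::OMP_HOST_RPC_ARG_COPY_FROM;
}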


@@ -227,7 +227,9 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
__OMP_RTL(omp_get_thread_num, false, Int32, )
__OMP_RTL(omp_get_bulk_thread_num, false, Int32, )
__OMP_RTL(omp_get_num_threads, false, Int32, )
__OMP_RTL(omp_get_bulk_num_threads, false, Int32, )
__OMP_RTL(omp_get_max_threads, false, Int32, )
__OMP_RTL(omp_in_parallel, false, Int32, )
__OMP_RTL(omp_get_dynamic, false, Int32, )
@@ -489,6 +491,8 @@ __OMP_RTL(__kmpc_reduction_get_fixed_buffer, false, VoidPtr, )
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
__OMP_RTL(malloc, false, VoidPtr, SizeTy)
__OMP_RTL(free, false, Void, VoidPtr)
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
@@ -502,6 +506,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
__OMP_RTL(__kmpc_launch_parallel_51_kernel, false, Void, Int8Ptr, Int32, Int32,
Int32, VoidPtrPtr, Int64)
__OMP_RTL(__last, false, Void, )
#undef __OMP_RTL
@@ -709,6 +716,8 @@ __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_bulk_thread_num, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_bulk_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_dynamic, GetterAttrs, SExt, ParamAttrs())


@@ -60,6 +60,9 @@ struct Config {
bool VerifyEach = false;
bool DisableVerify = false;
/// Use the standard optimization pipeline.
bool UseDefaultPipeline = false;
/// Flag to indicate that the optimizer should not assume builtins are present
/// on the target.
bool Freestanding = false;


@@ -303,6 +303,9 @@ public:
/// by LTO but might not be visible from bitcode symbol table.
static ArrayRef<const char*> getRuntimeLibcallSymbols();
/// Returns the context.
LLVMContext &getContext() { return RegularLTO.Ctx; }
private:
Config Conf;


@@ -0,0 +1,31 @@
//===- Transform/IPO/HostRPC.h - Code of automatic host rpc -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_IPO_HOSTRPC_H
#define LLVM_TRANSFORMS_IPO_HOSTRPC_H
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/IR/PassManager.h"
namespace llvm {
class HostRPCPass : public PassInfoMixin<HostRPCPass> {
public:
HostRPCPass() : LTOPhase(ThinOrFullLTOPhase::None) {}
HostRPCPass(ThinOrFullLTOPhase LTOPhase) : LTOPhase(LTOPhase) {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
private:
const ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None;
};
} // namespace llvm
#endif // LLVM_TRANSFORMS_IPO_HOSTRPC_H


@@ -0,0 +1,30 @@
//===--------------- CanonicalizeMainFunction.h -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility function to canonicalize main function.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
#define LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
#include "llvm/IR/PassManager.h"
namespace llvm {
/// A pass that canonicalizes main function in a module.
class CanonicalizeMainFunctionPass
: public PassInfoMixin<CanonicalizeMainFunctionPass> {
public:
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H


@@ -330,6 +330,8 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
report_fatal_error(Twine("unable to parse pass pipeline description '") +
Conf.OptPipeline + "': " + toString(std::move(Err)));
}
} else if (Conf.UseDefaultPipeline) {
MPM.addPass(PB.buildPerModuleDefaultPipeline(OL));
} else if (IsThinLTO) {
MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary));
} else {


@@ -143,6 +143,7 @@
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include "llvm/Transforms/IPO/IROutliner.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
@@ -267,6 +268,7 @@
#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/DXILUpgrade.h"
#include "llvm/Transforms/Utils/Debugify.h"


@@ -53,6 +53,7 @@
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include "llvm/Transforms/IPO/IROutliner.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
@@ -127,6 +128,7 @@
#include "llvm/Transforms/Utils/AddDiscriminators.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
@@ -183,9 +185,9 @@ static cl::opt<bool> EnablePostPGOLoopRotation(
"enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden,
cl::desc("Run the loop rotation transformation after PGO instrumentation"));
static cl::opt<bool> EnableGlobalAnalyses(
"enable-global-analyses", cl::init(true), cl::Hidden,
cl::desc("Enable inter-procedural analyses"));
static cl::opt<bool>
EnableGlobalAnalyses("enable-global-analyses", cl::init(true), cl::Hidden,
cl::desc("Enable inter-procedural analyses"));
static cl::opt<bool>
RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
@@ -305,6 +307,13 @@ extern cl::opt<bool> EnableMemProfContextDisambiguation;
extern cl::opt<bool> EnableInferAlignmentPass;
} // namespace llvm
static cl::opt<bool> EnableHostRPC("enable-host-rpc", cl::init(false),
cl::Hidden, cl::desc("Enable HostRPC pass"));
static cl::opt<bool> EnableCanonicalizeMainFunction(
"enable-canonicalize-main-function", cl::init(false), cl::Hidden,
cl::desc("Enable CanonicalizeMainFunction pass"));
PipelineTuningOptions::PipelineTuningOptions() {
LoopInterleaving = true;
LoopVectorization = true;
@@ -1101,6 +1110,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
}
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
// Try to perform OpenMP specific optimizations on the module. This is a
// (quick!) no-op if there are no OpenMP runtime calls present in the module.
MPM.addPass(OpenMPOptPass());
@@ -1120,11 +1132,10 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
// and prior to optimizing globals.
// FIXME: This position in the pipeline hasn't been carefully considered in
// years, it should be re-analyzed.
MPM.addPass(IPSCCPPass(
IPSCCPOptions(/*AllowFuncSpec=*/
Level != OptimizationLevel::Os &&
Level != OptimizationLevel::Oz &&
!isLTOPreLink(Phase))));
MPM.addPass(IPSCCPPass(IPSCCPOptions(/*AllowFuncSpec=*/
Level != OptimizationLevel::Os &&
Level != OptimizationLevel::Oz &&
!isLTOPreLink(Phase))));
// Attach metadata to indirect call sites indicating the set of functions
// they may target at run-time. This should follow IPSCCP.
@@ -1616,7 +1627,7 @@ PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO,
ModulePassManager
PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
if (Level == OptimizationLevel::O0)
return buildO0DefaultPipeline(Level, /*LTOPreLink*/true);
return buildO0DefaultPipeline(Level, /*LTOPreLink*/ true);
ModulePassManager MPM;
@@ -1739,6 +1750,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// in the current module.
MPM.addPass(CrossDSOCFIPass());
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
if (Level == OptimizationLevel::O0) {
// The WPD and LowerTypeTest passes need to run at -O0 to lower type
// metadata and intrinsics.
@@ -1769,6 +1783,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));
if (EnableHostRPC)
MPM.addPass(HostRPCPass(ThinOrFullLTOPhase::FullLTOPostLink));
// Remove unused virtual tables to improve the quality of code generated by
// whole-program devirtualization and bitset lowering.
MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));
@@ -2143,6 +2160,9 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
return MPM;
}

View File

@@ -47,6 +47,7 @@ MODULE_PASS("attributor", AttributorPass())
MODULE_PASS("attributor-light", AttributorLightPass())
MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass())
MODULE_PASS("canonicalize-main-function", CanonicalizeMainFunctionPass())
MODULE_PASS("check-debugify", NewPMCheckDebugifyPass())
MODULE_PASS("constmerge", ConstantMergePass())
MODULE_PASS("coro-cleanup", CoroCleanupPass())
@@ -142,6 +143,7 @@ MODULE_PASS("tsan-module", ModuleThreadSanitizerPass())
MODULE_PASS("verify", VerifierPass())
MODULE_PASS("view-callgraph", CallGraphViewerPass())
MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
MODULE_PASS("host-rpc", HostRPCPass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS

View File

@@ -397,12 +397,27 @@ static bool getPotentialCopiesOfMemoryValue(
dbgs() << "Underlying object is a valid nullptr, giving up.\n";);
return false;
}
if (!isa<AllocaInst>(&Obj) && !isa<GlobalVariable>(&Obj)) {
auto *CB = dyn_cast<CallBase>(&Obj);
bool R = false;
if (CB) {
const AANoAlias *AAN =
A.getOrCreateAAFor<AANoAlias>(IRPosition::callsite_returned(*CB),
&QueryingAA, DepClassTy::OPTIONAL);
if (!AAN)
return false;
if (AAN->isValidState())
R = AAN->isAssumedNoAlias();
else
R = isNoAliasCall(&Obj);
if (!R)
R = isAllocationFn(&Obj, TLI);
}
if (!R) {
LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << Obj
<< "\n";);
return false;
}
}
if (auto *GV = dyn_cast<GlobalVariable>(&Obj))
if (!GV->hasLocalLinkage() &&
@@ -2226,7 +2241,8 @@ void Attributor::runTillFixpoint() {
LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: "
<< IterationCounter << "/" << MaxIterations
<< " iterations\n");
<< " iterations. #Changed AAs: " << ChangedAAs.size()
<< "\n");
// Reset abstract arguments not settled in a sound fixpoint by now. This
// happens when we stopped the fixpoint iteration early. Note that only the
@@ -2241,13 +2257,21 @@ void Attributor::runTillFixpoint() {
AbstractState &State = ChangedAA->getState();
if (!State.isAtFixpoint()) {
LLVM_DEBUG(
dbgs() << "\n[Attributor] Invalidate AA " << *ChangedAA
<< " as it is not at fix point after max iterations.\n");
State.indicatePessimisticFixpoint();
NumAttributesTimedOut++;
}
for (auto &DepIt : ChangedAA->Deps) {
LLVM_DEBUG(dbgs() << "\n[Attributor] Invalidate dependent AA "
<< *(DepIt.getPointer()) << " of AA " << *ChangedAA
<< "\n");
ChangedAAs.push_back(cast<AbstractAttribute>(DepIt.getPointer()));
}
ChangedAA->Deps.clear();
}

View File

@@ -1669,6 +1669,8 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
// might change while we iterate through a loop. For now, we give up if
// the PHI is not invariant.
if (isa<PHINode>(Usr)) {
if (!Usr->getType()->isPointerTy())
return false;
// Note the order here, the Usr access might change the map, CurPtr is
// already in it though.
bool IsFirstPHIUser = !OffsetInfoMap.count(Usr);
@@ -1899,6 +1901,31 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
return false;
}
if (auto *II = dyn_cast<ICmpInst>(Usr)) {
auto CheckIfUsedAsPred = [&](const Use &U, bool &Follow) {
const auto *UU = U.getUser();
if (isa<SelectInst>(UU))
return true;
if (isa<BranchInst>(UU))
return true;
LLVM_DEBUG(dbgs() << "[AAPointerInfo] ICmpInst user not handled " << *UU
<< "\n");
return false;
};
if (!A.checkForAllUses(CheckIfUsedAsPred, *this, *II,
/* CheckBBLivenessOnly */ true,
DepClassTy::OPTIONAL,
/* IgnoreDroppableUses */ true)) {
LLVM_DEBUG(
dbgs() << "[AAPointerInfo] Check for all uses failed for ICmpInst "
<< *II << "\n");
return false;
}
return true;
}
LLVM_DEBUG(dbgs() << "[AAPointerInfo] User not handled " << *Usr << "\n");
return false;
};
@@ -2040,6 +2067,7 @@ struct AAPointerInfoCallSiteReturned final : AAPointerInfoFloating {
};
} // namespace
/// -----------------------NoUnwind Function Attribute--------------------------
namespace {
@@ -11906,10 +11934,35 @@ struct AAUnderlyingObjectsImpl
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override {}
// TODO: This is a temporary solution for terminals.
bool checkIfTerminals(Attributor &A, Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
Function *Scope = CI->getFunction();
const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*Scope);
LibFunc TLIFn;
if (TLI && TLI->getLibFunc(*CI, TLIFn)) {
if (TLIFn == LibFunc::LibFunc_malloc ||
TLIFn == LibFunc::LibFunc___kmpc_alloc_shared)
return true;
}
return false;
}
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
auto &Ptr = getAssociatedValue();
if (checkIfTerminals(A, &Ptr)) {
bool Changed = false;
Changed |= InterAssumedUnderlyingObjects.insert(&Ptr);
Changed |= IntraAssumedUnderlyingObjects.insert(&Ptr);
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
}
auto DoUpdate = [&](SmallSetVector<Value *, 8> &UnderlyingObjects,
AA::ValueScope Scope) {
bool UsedAssumedInformation = false;
@@ -11925,6 +11978,10 @@ struct AAUnderlyingObjectsImpl
for (unsigned I = 0; I < Values.size(); ++I) {
auto &VAC = Values[I];
auto *Obj = VAC.getValue();
if (checkIfTerminals(A, Obj)) {
Changed |= UnderlyingObjects.insert(Obj);
continue;
}
Value *UO = getUnderlyingObject(Obj);
if (UO && UO != VAC.getValue() && SeenObjects.insert(UO).second) {
const auto *OtherAA = A.getAAFor<AAUnderlyingObjects>(
@@ -11955,6 +12012,16 @@ struct AAUnderlyingObjectsImpl
continue;
}
if (auto *LI = dyn_cast<LoadInst>(Obj)) {
LLVM_DEBUG({
dbgs() << "[AAUnderlyingObjects] for CtxI ";
getCtxI()->print(dbgs());
dbgs() << " at position " << getIRPosition() << " has LoadInst ";
LI->print(dbgs());
dbgs() << '\n';
});
}
Changed |= UnderlyingObjects.insert(Obj);
}
@@ -11965,6 +12032,8 @@ struct AAUnderlyingObjectsImpl
Changed |= DoUpdate(IntraAssumedUnderlyingObjects, AA::Intraprocedural);
Changed |= DoUpdate(InterAssumedUnderlyingObjects, AA::Interprocedural);
LLVM_DEBUG(dumpState(dbgs()));
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
}
@@ -12003,6 +12072,19 @@ private:
return Changed;
}
void dumpState(raw_ostream &O) {
O << "Underlying objects:\nintra procedureal:\n";
for (auto *Obj : IntraAssumedUnderlyingObjects) {
Obj->print(O);
O << '\n';
}
O << "inter procedureal:\n";
for (auto *Obj : InterAssumedUnderlyingObjects) {
Obj->print(O);
O << '\n';
}
}
/// All the underlying objects collected so far via intra procedural scope.
SmallSetVector<Value *, 8> IntraAssumedUnderlyingObjects;
/// All the underlying objects collected so far via inter procedural scope.

View File

@@ -20,6 +20,7 @@ add_llvm_component_library(LLVMipo
GlobalDCE.cpp
GlobalOpt.cpp
GlobalSplit.cpp
HostRPC.cpp
HotColdSplitting.cpp
IPO.cpp
IROutliner.cpp

View File

@@ -0,0 +1,970 @@
//===- HostRPC.cpp - Automatic host RPC for device code ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass rewrites device-side calls to functions that are only available
// on the host into remote procedure calls serviced by the host runtime.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include <cstdint>
#define DEBUG_TYPE "host-rpc"
using namespace llvm;
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
static cl::opt<bool>
UseDummyHostModule("host-rpc-use-dummy-host-module", cl::init(false),
cl::Hidden,
cl::desc("Use dummy host module if there no host module "
"attached to the device module"));
namespace {
enum class HostRPCRuntimeFunction {
#define __OMPRTL_HOST_RPC(_ENUM) OMPRTL_##_ENUM
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper),
__OMPRTL_HOST_RPC(__last),
#undef __OMPRTL_HOST_RPC
};
#define __OMPRTL_HOST_RPC(_ENUM) \
auto OMPRTL_##_ENUM = HostRPCRuntimeFunction::OMPRTL_##_ENUM;
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper)
#undef __OMPRTL_HOST_RPC
// TODO: Remove the functions that are already implemented in the device runtime.
static constexpr const char *InternalPrefix[] = {
"__kmp", "llvm.", "nvm.",
"omp_", "vprintf", "malloc",
"free", "__keep_alive", "__llvm_omp_vprintf",
"rpc_", "MPI_", "fprintf", "sprintf"
};
bool isInternalFunction(Function &F) {
auto Name = F.getName();
for (auto *P : InternalPrefix)
if (Name.starts_with(P))
return true;
return false;
}
std::string typeToString(Type *T) {
if (T->is16bitFPTy())
return "f16";
if (T->isFloatTy())
return "f32";
if (T->isDoubleTy())
return "f64";
if (T->isPointerTy())
return "ptr";
if (T->isStructTy())
return std::string(T->getStructName());
if (T->isIntegerTy())
return "i" + std::to_string(T->getIntegerBitWidth());
LLVM_DEBUG(dbgs() << "[HostRPC] unknown type " << *T
<< " for typeToString.\n";);
llvm_unreachable("unknown type");
}
class HostRPC {
/// LLVM context instance
LLVMContext &Context;
/// Device module.
Module &M;
/// Host module
Module &HM;
/// Data layout of the device module.
DataLayout DL;
IRBuilder<> Builder;
/// External functions we are operating on.
SmallSetVector<Function *, 8> FunctionWorkList;
/// Attributor instance.
Attributor &A;
// Types
Type *Int8PtrTy;
Type *VoidTy;
Type *Int32Ty;
Type *Int64Ty;
StructType *ArgInfoTy;
// Values
Constant *NullPtr;
Constant *NullInt64;
struct CallSiteInfo {
CallInst *CI = nullptr;
SmallVector<Type *> Params;
};
struct HostRPCArgInfo {
Value *BasePtr = nullptr;
Constant *Type = nullptr;
Value *Size = nullptr;
};
/// Host wrapper functions, indexed by RPC call number.
SmallVector<Function *> HostEntryTable;
EnumeratedArray<Function *, HostRPCRuntimeFunction,
HostRPCRuntimeFunction::OMPRTL___last> RFIs;
SmallVector<std::pair<CallInst *, CallInst *>> CallInstMap;
Constant *getConstantInt64(uint64_t Val) {
return ConstantInt::get(Int64Ty, Val);
}
static std::string getWrapperFunctionName(Function *F, CallSiteInfo &CSI) {
std::string Name = "__kmpc_host_rpc_wrapper_" + std::string(F->getName());
if (!F->isVarArg())
return Name;
for (unsigned I = F->getFunctionType()->getNumParams();
I < CSI.Params.size(); ++I) {
Name.push_back('_');
Name.append(typeToString(CSI.Params[I]));
}
return Name;
}
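// Illustrative example (not from the source): for a variadic callee such as
//   int printf(const char *fmt, ...);
// called as printf("%d %f", 42, 3.14), only the arguments beyond the fixed
// parameters are mangled into the name, yielding
//   __kmpc_host_rpc_wrapper_printf_i32_f64
// (printf itself is filtered out by isInternalFunction above; it is shown
// here purely for illustration).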
void registerAAs();
Value *convertToInt64Ty(Value *V);
Value *convertFromInt64TyTo(Value *V, Type *TargetTy);
Constant *convertToInt64Ty(Constant *C);
Constant *convertFromInt64TyTo(Constant *C, Type *T);
// int device_wrapper(call_no, arg_info, ...) {
// void *desc = __kmpc_host_rpc_get_desc(call_no, num_args, arg_info);
// __kmpc_host_rpc_add_arg(desc, arg1, sizeof(arg1));
// __kmpc_host_rpc_add_arg(desc, arg2, sizeof(arg2));
// ...
// int r = (int)__kmpc_host_rpc_send_and_wait(desc);
// return r;
// }
Function *getDeviceWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
// void host_wrapper(desc) {
// int arg1 = (int)__kmpc_host_rpc_get_arg(desc, 0);
// float arg2 = (float)__kmpc_host_rpc_get_arg(desc, 1);
// char *arg3 = (char *)__kmpc_host_rpc_get_arg(desc, 2);
// ...
// int r = actual_call(arg1, arg2, arg3, ...);
// __kmpc_host_rpc_set_ret_val(desc, (int64_t)r);
// }
Function *getHostWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
bool rewriteWithHostRPC(Function *F);
void emitHostWrapperInvoker();
bool recollectInformation();
public:
HostRPC(Module &DeviceModule, Module &HostModule, Attributor &A)
: Context(DeviceModule.getContext()), M(DeviceModule), HM(HostModule),
DL(M.getDataLayout()), Builder(Context), A(A) {
assert(&M.getContext() == &HM.getContext() &&
"device and host modules have different context");
Int8PtrTy = PointerType::getUnqual(Context);
VoidTy = Type::getVoidTy(Context);
Int32Ty = Type::getInt32Ty(Context);
Int64Ty = Type::getInt64Ty(Context);
NullPtr = ConstantInt::getNullValue(Int8PtrTy);
NullInt64 = ConstantInt::getNullValue(Int64Ty);
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...) \
{ \
SmallVector<Type *> Params{__VA_ARGS__}; \
Function *F = (MOD).getFunction(#_ENUM); \
if (!F) { \
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
F = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage, \
#_ENUM, (MOD)); \
} \
RFIs[OMPRTL_##_ENUM] = F; \
}
// Device functions:
// Get a descriptor for the host function we are about to call.
__OMP_RTL(__kmpc_host_rpc_get_desc, M, false, Int8PtrTy, Int32Ty, Int32Ty,
Int8PtrTy)
// Add one argument and its information to the descriptor.
__OMP_RTL(__kmpc_host_rpc_add_arg, M, false, VoidTy, Int8PtrTy, Int64Ty,
Int32Ty)
// Send the descriptor to the host and wait for completion.
__OMP_RTL(__kmpc_host_rpc_send_and_wait, M, false, Int64Ty, Int8PtrTy)
// host functions:
// get arguments (mirror of add arg)
__OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
// Send the return value back to the device.
__OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
Int64Ty)
// Invoke the function on the host
__OMP_RTL(__kmpc_host_rpc_invoke_host_wrapper, HM, false, VoidTy, Int32Ty,
Int8PtrTy)
#undef __OMP_RTL
ArgInfoTy = StructType::create({Int64Ty, Int64Ty, Int64Ty, Int8PtrTy},
"struct.arg_info_t");
}
bool run();
};
Value *HostRPC::convertToInt64Ty(Value *V) {
if (auto *C = dyn_cast<Constant>(V))
return convertToInt64Ty(C);
Type *T = V->getType();
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreatePtrToInt(V, Int64Ty);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
if (T->isFloatingPointTy()) {
V = Builder.CreateBitCast(
V, Type::getIntNTy(V->getContext(), T->getScalarSizeInBits()));
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
}
llvm_unreachable("unknown cast to int64_t");
}
Value *HostRPC::convertFromInt64TyTo(Value *V, Type *T) {
if (auto *C = dyn_cast<Constant>(V))
return convertFromInt64TyTo(C, T);
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreateIntToPtr(V, T);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, T, /* isSigned */ true);
if (T->isFloatingPointTy()) {
V = Builder.CreateIntCast(
V, Type::getIntNTy(V->getContext(), T->getScalarSizeInBits()),
/* isSigned */ true);
return Builder.CreateBitCast(V, T);
}
LLVM_DEBUG(dbgs() << "[HostRPC] unknown type " << *T
<< " for typeFromint64_t.\n";);
llvm_unreachable("unknown cast from int64_t");
}
Constant *HostRPC::convertToInt64Ty(Constant *C) {
Type *T = C->getType();
if (T == Int64Ty)
return C;
if (T->isPointerTy())
return ConstantExpr::getPtrToInt(C, Int64Ty);
if (T->isIntegerTy()) {
return ConstantFoldIntegerCast(C, Int64Ty, true, DL);
}
if (T->isFloatingPointTy()) {
// cast to an int of the same size
C = ConstantExpr::getBitCast(C,
Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()));
// set the int of size 64
return ConstantFoldIntegerCast(C, Int64Ty, true, DL);
}
llvm_unreachable("unknown cast to int64_t");
}
Constant *HostRPC::convertFromInt64TyTo(Constant *C, Type *T) {
assert(C->getType() == Int64Ty);
if (T == Int64Ty)
return C;
if (T->isPointerTy())
return ConstantExpr::getIntToPtr(C, T);
if (T->isIntegerTy()) {
return ConstantFoldIntegerCast(C, T, true, DL);
}
if (T->isFloatingPointTy()) {
// change size to T size
C = ConstantFoldIntegerCast(C,
Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()), true, DL);
// from int to float
return ConstantExpr::getBitCast(C, T);
}
llvm_unreachable("unknown cast from int64_t");
}
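// Example (illustrative): a constant double 1.0 is bitcast to the 64-bit
// integer 0x3FF0000000000000 on its way into the i64-based RPC protocol;
// convertFromInt64TyTo applies the inverse conversion on the other side.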
void HostRPC::registerAAs() {
for (auto *F : FunctionWorkList)
for (User *U : F->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
for (unsigned I = 0; I < CI->arg_size(); ++I) {
Value *Operand = CI->getArgOperand(I);
if (!Operand->getType()->isPointerTy())
continue;
A.getOrCreateAAFor<AAUnderlyingObjects>(
IRPosition::callsite_argument(*CI, I),
/* QueryingAA */ nullptr, DepClassTy::NONE);
}
}
}
bool HostRPC::recollectInformation() {
FunctionWorkList.clear();
for (Function &F : M) {
// If the function is already defined, it definitely does not require RPC.
if (!F.isDeclaration())
continue;
// If it is an internal function, skip it as well.
if (isInternalFunction(F))
continue;
// If there is no use of the function, skip it.
if (F.use_empty())
continue;
LLVM_DEBUG({
dbgs() << "[HostRPC] RPCing function: " << F.getName() << "\n"
<< F << "\n";
});
FunctionWorkList.insert(&F);
}
return !FunctionWorkList.empty();
}
bool HostRPC::run() {
bool Changed = false;
LLVM_DEBUG(dbgs() << "[HostRPC] Running Pass\n");
if (!recollectInformation())
return Changed;
Changed = true;
LLVM_DEBUG(dbgs() << "[HostRPC] Reading Function to relocate:\n");
// We add a couple of assumptions to those RPC functions such that AAs will
// not error out because of unknown implementation of those functions.
for (Function &F : M) {
if (!F.isDeclaration())
continue;
F.addFnAttr(Attribute::NoRecurse);
for (auto &Arg : F.args())
if (Arg.getType()->isPointerTy())
Arg.addAttr(Attribute::NoCapture);
if (!F.isVarArg())
continue;
for (User *U : F.users()) {
auto *CB = dyn_cast<CallBase>(U);
if (!CB)
continue;
for (unsigned I = F.getFunctionType()->getNumParams(); I < CB->arg_size();
++I) {
Value *Arg = CB->getArgOperand(I);
if (Arg->getType()->isPointerTy())
CB->addParamAttr(I, Attribute::NoCapture);
}
}
}
//LLVM_DEBUG(M.dump());
registerAAs();
ChangeStatus Status = A.run();
if (!recollectInformation())
return Status == ChangeStatus::CHANGED;
LLVM_DEBUG(dbgs() << "[HostRPC] Rewrite Functions:\n");
for (Function *F : FunctionWorkList)
Changed |= rewriteWithHostRPC(F);
if (!Changed)
return Changed;
LLVM_DEBUG(dbgs() << "[HostRPC] Replace Function Call:\n");
// Replace all calls to the original functions with calls to the RPC
// wrappers that replaced them.
for (auto Itr = CallInstMap.rbegin(); Itr != CallInstMap.rend(); ++Itr) {
auto *CI = Itr->first;
auto *NewCI = Itr->second;
CI->replaceAllUsesWith(NewCI);
CI->eraseFromParent();
}
LLVM_DEBUG(dbgs() << "[HostRPC] Erase Call to Non existing Functions:\n");
// erase all trace of the function in the Module
for (Function *F : FunctionWorkList)
if (F->user_empty())
F->eraseFromParent();
LLVM_DEBUG(dbgs() << "[HostRPC] Emit Host Wrapper:\n");
emitHostWrapperInvoker();
LLVM_DEBUG(dbgs() << "[HostRPC] Done\n");
return Changed;
}
bool HostRPC::rewriteWithHostRPC(Function *F) {
LLVM_DEBUG({
dbgs() << "[HostRPC] Rewriting Function: " << *F << "\n";
});
SmallVector<CallInst *> WorkList;
for (User *U : F->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
WorkList.push_back(CI);
}
if (WorkList.empty())
return false;
for (CallInst *CI : WorkList) {
CallSiteInfo CSI;
CSI.CI = CI;
unsigned NumArgs = CI->arg_size();
for (unsigned I = 0; I < NumArgs; ++I)
CSI.Params.push_back(CI->getArgOperand(I)->getType());
std::string WrapperName = getWrapperFunctionName(F, CSI);
Function *DeviceWrapperFn = getDeviceWrapperFunction(WrapperName, F, CSI);
Function *HostWrapperFn = getHostWrapperFunction(WrapperName, F, CSI);
int32_t WrapperNumber = -1;
for (unsigned I = 0; I < HostEntryTable.size(); ++I) {
if (HostEntryTable[I] == HostWrapperFn) {
WrapperNumber = I;
break;
}
}
if (WrapperNumber == -1) {
WrapperNumber = HostEntryTable.size();
HostEntryTable.push_back(HostWrapperFn);
}
auto CheckIfIdentifierPtr = [this](const Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
Function *Callee = CI->getCalledFunction();
// Guard against indirect calls, which have no statically known callee.
if (!Callee)
return false;
if (this->FunctionWorkList.count(Callee))
return true;
return Callee->getName().starts_with("__kmpc_host_rpc_wrapper_");
};
auto CheckIfDynAlloc = [](Value *V) -> CallInst * {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return nullptr;
Function *Callee = CI->getCalledFunction();
// Indirect calls have no statically known callee.
if (!Callee)
return nullptr;
auto Name = Callee->getName();
if (Name == "malloc" || Name == "__kmpc_alloc_shared")
return CI;
return nullptr;
};
auto CheckIfStdIO = [](Value *V) -> GlobalVariable * {
auto *LI = dyn_cast<LoadInst>(V);
if (!LI)
return nullptr;
auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
if (!GV)
return nullptr;
auto Name = GV->getName();
if (Name == "stdout" || Name == "stderr" || Name == "stdin")
return GV;
return nullptr;
};
auto CheckIfGlobalVariable = [](Value *V) {
if (auto *GV = dyn_cast<GlobalVariable>(V))
return GV;
if (auto *LI = dyn_cast<LoadInst>(V))
if (auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()))
return GV;
return static_cast<GlobalVariable *>(nullptr);
};
auto CheckIfNullPtr = [](Value *V) {
if (!V->getType()->isPointerTy())
return false;
return V == ConstantInt::getNullValue(V->getType());
};
auto HandleDirectUse = [&](Value *Ptr, HostRPCArgInfo &AI,
bool IsPointer = false) {
AI.BasePtr = Ptr;
AI.Type = getConstantInt64(IsPointer ? ArgType::OMP_HOST_RPC_ARG_PTR
: ArgType::OMP_HOST_RPC_ARG_SCALAR);
AI.Size = NullInt64;
};
SmallVector<SmallVector<HostRPCArgInfo>> ArgInfo;
bool IsConstantArgInfo = true;
for (unsigned I = 0; I < CI->arg_size(); ++I) {
ArgInfo.emplace_back();
auto &AII = ArgInfo.back();
Value *Operand = CI->getArgOperand(I);
//LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Argument: " << I << ": " << *Operand << "\n"; });
// Check if scalar type.
if (!Operand->getType()->isPointerTy()) {
AII.emplace_back();
HandleDirectUse(Operand, AII.back());
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
//LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Constant: " << *Operand << "\n"; });
continue;
}
if (CheckIfNullPtr(Operand)){
//LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Null Ptr: " << *Operand << "\n"; });
continue;
}
auto Pred = [&](Value &Obj) {
if (CheckIfNullPtr(&Obj))
return true;
bool IsConstantArgument = false;
if (!F->isVarArg() &&
F->hasParamAttribute(I, Attribute::AttrKind::ReadOnly))
IsConstantArgument = true;
HostRPCArgInfo AI;
if (auto *IO = CheckIfStdIO(&Obj)) {
HandleDirectUse(IO, AI, /* IsPointer */ true);
} else if (CheckIfIdentifierPtr(&Obj)) {
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
HandleDirectUse(Operand, AI, /* IsPointer */ true);
} else if (auto *GV = CheckIfGlobalVariable(&Obj)) {
AI.BasePtr = GV;
AI.Size = getConstantInt64(DL.getTypeStoreSize(GV->getValueType()));
AI.Type =
getConstantInt64(GV->isConstant() || IsConstantArgument
? ArgType::OMP_HOST_RPC_ARG_COPY_TO
: ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM);
} else if (CheckIfDynAlloc(&Obj)) {
// We will handle this case at runtime so here we don't do anything.
//LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Dynamic Alloc: " << *Operand << "\n"; });
return true;
} else if (isa<AllocaInst>(&Obj)) {
llvm_unreachable("alloca instruction needs to be handled!");
} else {
LLVM_DEBUG({
dbgs() << "[HostRPC] warning: call site " << *CI << ", operand "
<< *Operand << ", underlying object " << Obj
<< " cannot be handled.\n";
});
return true;
}
AII.push_back(std::move(AI));
return true;
};
//LLVM_DEBUG({
// dbgs() << "[HostRPC] function rewrite:\n"
// << "Function: " << *F << "\n"
// << "Call site: " << *CI << "\n "
// << "Operand: " << *Operand << "\n";
//});
// TODO: Replace with plain LLVM utilities so the Attributor is not required here.
assert(!IRPosition::callsite_argument(*CI, I)
.getAnchorScope()->hasFnAttribute(Attribute::OptimizeNone)
&& "[HostRPC]: Optimize None is not supported");
const llvm::AAUnderlyingObjects* AAUO =
A.getOrCreateAAFor<AAUnderlyingObjects>(
IRPosition::callsite_argument(*CI, I));
//LLVM_DEBUG({dbgs() << "[HostRPC] AAUO:" << AAUO << "\n";});
if (!AAUO->forallUnderlyingObjects(Pred))
llvm_unreachable("internal error");
}
// Reset the insert point to the call site.
Builder.SetInsertPoint(CI);
Value *ArgInfoVal = nullptr;
if (!IsConstantArgInfo) {
ArgInfoVal = Builder.CreateAlloca(Int8PtrTy, getConstantInt64(NumArgs),
"arg_info");
for (unsigned I = 0; I < NumArgs; ++I) {
auto &AII = ArgInfo[I];
Value *Next = NullPtr;
for (auto &AI : AII) {
Value *AIV = Builder.CreateAlloca(ArgInfoTy);
Value *AIIArg =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(0)});
Builder.Insert(AIIArg);
Builder.CreateStore(convertToInt64Ty(AI.BasePtr), AIIArg);
Value *AIIType =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(1)});
Builder.Insert(AIIType);
Builder.CreateStore(AI.Type, AIIType);
Value *AIISize =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(2)});
Builder.Insert(AIISize);
Builder.CreateStore(AI.Size, AIISize);
Value *AIINext =
GetElementPtrInst::Create(Int8PtrTy, AIV, {getConstantInt64(3)});
Builder.Insert(AIINext);
Builder.CreateStore(Next, AIINext);
Next = AIV;
}
Value *AIIV = GetElementPtrInst::Create(Int8PtrTy, ArgInfoVal,
{getConstantInt64(I)});
Builder.Insert(AIIV);
Builder.CreateStore(Next, AIIV);
}
} else {
SmallVector<Constant *> ArgInfoInitVar;
for (auto &AII : ArgInfo) {
Constant *Last = NullPtr;
for (auto &AI : AII) {
auto *Arg = cast<Constant>(AI.BasePtr);
auto *CS =
ConstantStruct::get(ArgInfoTy, {convertToInt64Ty(Arg), AI.Type,
cast<Constant>(AI.Size), Last});
auto *GV = new GlobalVariable(
M, ArgInfoTy, /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, CS, "",
nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 0);
// Force address space 0 on AMD GPUs instead of
// address space 1, which is the default for globals.
Last = GV;
}
LLVM_DEBUG({
dbgs() << "[HostRPC] ArgInfoInitVar:" << *Last << "\n";
});
ArgInfoInitVar.push_back(Last);
}
Constant *ArgInfoInit = ConstantArray::get(
ArrayType::get(Int8PtrTy, NumArgs), ArgInfoInitVar);
ArgInfoVal = new GlobalVariable(
M, ArrayType::get(Int8PtrTy, NumArgs), /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info",
nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 0);
}
SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
ArgInfoVal};
for (Value *Operand : CI->args())
Args.push_back(Operand);
CallInst *NewCall = Builder.CreateCall(DeviceWrapperFn, Args);
CallInstMap.emplace_back(CI, NewCall);
}
return true;
}
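// Net effect at a call site (illustrative; `foo` is hypothetical): the device
// call
//   r = foo(x, buf);
// is rewritten into
//   r = __kmpc_host_rpc_wrapper_foo(/*call_no=*/N, arg_info, x, buf);
// where arg_info, built above either as a constant global or on the stack,
// tells the runtime how to mirror the memory behind buf to the host and back.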
Function *HostRPC::getDeviceWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI) {
Function *WrapperFn = M.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
// return_type device_wrapper(int32_t call_no, void *arg_info, ...)
SmallVector<Type *> Params{Int32Ty, Int8PtrTy};
Params.append(CSI.Params);
Type *RetTy = F->getReturnType();
FunctionType *FT = FunctionType::get(RetTy, Params, /*isVarArg*/ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::InternalLinkage,
WrapperName, M);
// Emit the body of the device wrapper
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
// skip call_no and arg_info.
constexpr const unsigned NumArgSkipped = 2;
Value *Desc = nullptr;
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
//LLVM_DEBUG({dbgs() << "[HostRPC] Building: rpc get desc: " << Fn->getName() << "\n"; });
Desc = Builder.CreateCall(
Fn,
{
WrapperFn->getArg(0),
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
WrapperFn->getArg(1)
},
"desc"
);
}
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
//LLVM_DEBUG({dbgs() << "[HostRPC] Building: rpc add arg\n"; });
for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
Value *V = convertToInt64Ty(WrapperFn->getArg(I));
Builder.CreateCall(
Fn, {Desc, V, ConstantInt::get(Int32Ty, I - NumArgSkipped)});
}
}
//LLVM_DEBUG({dbgs() << "[HostRPC] Building: rpc send and wait\n"; });
Value *RetVal =
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});
if (RetTy->isVoidTy()) {
Builder.CreateRetVoid();
return WrapperFn;
}
if (RetTy != RetVal->getType())
RetVal = convertFromInt64TyTo(RetVal, RetTy);
Builder.CreateRet(RetVal);
//LLVM_DEBUG({dbgs() << "[HostRPC] Device Wrapper Function:\n" << *WrapperFn; });
return WrapperFn;
}
Function *HostRPC::getHostWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI) {
Function *WrapperFn = HM.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
SmallVector<Type *> Params{Int8PtrTy};
FunctionType *FT = FunctionType::get(VoidTy, Params, /* isVarArg */ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage,
WrapperName, HM);
Value *Desc = WrapperFn->getArg(0);
// Emit the body of the host wrapper
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
SmallVector<Value *> Args;
for (unsigned I = 0; I < CSI.CI->arg_size(); ++I) {
Value *V = Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_arg],
{Desc, ConstantInt::get(Int32Ty, I)});
Args.push_back(convertFromInt64TyTo(V, CSI.Params[I]));
}
// The host callee that will be called eventually by the host wrapper.
Function *HostCallee = HM.getFunction(F->getName());
if (!HostCallee)
HostCallee = Function::Create(F->getFunctionType(), F->getLinkage(),
F->getName(), HM);
Value *RetVal = Builder.CreateCall(HostCallee, Args);
if (!RetVal->getType()->isVoidTy()) {
RetVal = convertToInt64Ty(RetVal);
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_set_ret_val],
{Desc, RetVal});
}
Builder.CreateRetVoid();
return WrapperFn;
}
void HostRPC::emitHostWrapperInvoker() {
IRBuilder<> Builder(Context);
unsigned NumEntries = HostEntryTable.size();
Function *F = RFIs[OMPRTL___kmpc_host_rpc_invoke_host_wrapper];
F->setDLLStorageClass(
GlobalValue::DLLStorageClassTypes::DLLExportStorageClass);
Value *CallNo = F->getArg(0);
Value *Desc = F->getArg(1);
SmallVector<BasicBlock *> SwitchBBs;
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", F);
BasicBlock *ReturnBB = BasicBlock::Create(Context, "return", F);
// Emit code for the return bb.
Builder.SetInsertPoint(ReturnBB);
Builder.CreateRetVoid();
// Create BB for each host entry and emit function call.
for (unsigned I = 0; I < NumEntries; ++I) {
BasicBlock *BB = BasicBlock::Create(Context, "invoke.bb", F, ReturnBB);
SwitchBBs.push_back(BB);
Builder.SetInsertPoint(BB);
Builder.CreateCall(HostEntryTable[I], {Desc});
Builder.CreateBr(ReturnBB);
}
// Emit code for the entry BB.
Builder.SetInsertPoint(EntryBB);
SwitchInst *Switch = Builder.CreateSwitch(CallNo, ReturnBB, NumEntries);
for (unsigned I = 0; I < NumEntries; ++I)
Switch->addCase(ConstantInt::get(cast<IntegerType>(Int32Ty), I),
SwitchBBs[I]);
}
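// The emitted invoker is, in effect (illustrative C rendering):
//
//   void __kmpc_host_rpc_invoke_host_wrapper(int32_t call_no, void *desc) {
//     switch (call_no) {
//     case 0: __kmpc_host_rpc_wrapper_<entry0>(desc); break;
//     case 1: __kmpc_host_rpc_wrapper_<entry1>(desc); break;
//     ...
//     default: return;
//     }
//   }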
Module *getHostModule(Module &M) {
auto *MD = M.getNamedMetadata("llvm.hostrpc.hostmodule");
if (!MD || MD->getNumOperands() == 0)
return nullptr;
auto *Node = MD->getOperand(0);
assert(Node->getNumOperands() == 1 && "invliad named metadata");
auto *CAM = dyn_cast<ConstantAsMetadata>(Node->getOperand(0));
if (!CAM)
return nullptr;
auto *CI = cast<ConstantInt>(CAM->getValue());
Module *Mod = reinterpret_cast<Module *>(CI->getZExtValue());
M.eraseNamedMetadata(MD);
return Mod;
}
} // namespace
PreservedAnalyses HostRPCPass::run(Module &M, ModuleAnalysisManager &AM) {
std::unique_ptr<Module> DummyHostModule;
Module *HostModule = nullptr;
if (UseDummyHostModule) {
DummyHostModule =
std::make_unique<Module>("dummy-host-rpc.bc", M.getContext());
HostModule = DummyHostModule.get();
} else {
HostModule = getHostModule(M);
}
if (!HostModule)
return PreservedAnalyses::all();
bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
// The pass only runs when it is invoked directly (with the dummy host
// module) or at link time.
if (!UseDummyHostModule && !PostLink)
return PreservedAnalyses::all();
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
AnalysisGetter AG(FAM);
CallGraphUpdater CGUpdater;
BumpPtrAllocator Allocator;
AttributorConfig AC(CGUpdater);
AC.DefaultInitializeLiveInternals = false;
AC.RewriteSignatures = false;
AC.PassName = DEBUG_TYPE;
AC.MaxFixpointIterations = 1024;
InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
SetVector<Function *> Functions;
Attributor A(Functions, InfoCache, AC);
HostRPC RPC(M, *HostModule, A);
bool Changed = RPC.run();
LLVM_DEBUG({
if (Changed && UseDummyHostModule) {
M.dump();
HostModule->dump();
}
});
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
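A minimal way to exercise the pass in isolation (a sketch, assuming an opt
built from this branch) is the dummy-host-module mode declared above:

  opt -passes=host-rpc -host-rpc-use-dummy-host-module device.bc -o out.bc

"host-rpc" is the name registered in PassRegistry.def; the flag substitutes an
empty host module so the pass can run outside of the LTO link step.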

View File

@@ -56,6 +56,7 @@
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <optional>
@@ -138,6 +139,15 @@ static cl::opt<bool>
cl::desc("Enables more verbose remarks."), cl::Hidden,
cl::init(false));
static cl::opt<bool>
EnableParallel51Split("openmp-opt-enable-parallel-51-split",
cl::desc("Enable the kernel split at parallel_51."),
cl::Hidden, cl::init(false));
static cl::opt<bool> EnableParallel51SplitMultiTeams(
"openmp-opt-enable-parallel-51-split-multi-teams",
cl::desc("Enable multi-teams support for the parallel 51 kernel."),
cl::Hidden, cl::init(false));
static cl::opt<unsigned>
SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
cl::desc("Maximal number of attributor iterations."),
@@ -917,6 +927,28 @@ private:
}
};
bool canUseMultiTeam(Function *OutlinedFn, OMPInformationCache &OMPInfoCache) {
if (!EnableParallel51SplitMultiTeams)
return false;
return true;
}
void collectReachingKernels(Function *F,
SmallVector<Kernel> &ReachingKernels) {
if (omp::isOpenMPKernel(*F)) {
if (F->hasFnAttribute("omp_parallel_51_kernel"))
ReachingKernels.push_back(F);
return;
}
for (User *U : F->users()) {
auto *I = dyn_cast<Instruction>(U);
if (!I)
continue;
collectReachingKernels(I->getFunction(), ReachingKernels);
}
}
struct OpenMPOpt {
using OptimizationRemarkGetter =
@@ -945,6 +977,11 @@ struct OpenMPOpt {
<< " functions\n");
if (IsModulePass) {
if (EnableParallel51Split)
Changed |= splitKernels();
OMPInfoCache.recollectUses();
Changed |= runAttributor(IsModulePass);
// Recollect uses, in case Attributor deleted any.
@@ -1047,6 +1084,308 @@ struct OpenMPOpt {
}
private:
/// Create a new kernel for each function call to __kmpc_parallel_51.
bool splitKernels() {
bool Changed = false;
OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
if (!Parallel51RFI.Declaration)
return Changed;
SmallVector<CallInst *> WorkItems;
for (User *U : Parallel51RFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
Function *F = CI->getFunction();
// If the parent function is already from kernel split, skip it.
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
WorkItems.push_back(CI);
}
if (WorkItems.empty())
return Changed;
Changed = true;
auto CreateNewKernel = [](LLVMContext &Ctx, Module &M,
OpenMPIRBuilder &IRBuilder, Function *Parallel51,
const CallInst *CI) {
SmallVector<Type *, 4> ArgTypes;
// int32_t gtid
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// int32_t if_expr
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// int32_t num_threads
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// void **args
ArgTypes.push_back(PointerType::getUnqual(Ctx));
// int64_t nargs
ArgTypes.push_back(Type::getInt64Ty(Ctx));
FunctionType *FT =
FunctionType::get(Type::getVoidTy(Ctx), ArgTypes, false);
std::string KernelName = "__omp_offloading_parallel_51_from_";
KernelName += CI->getFunction()->getName();
// Sanitize the kernel name
{
size_t DotPos = KernelName.find('.');
while (DotPos != std::string::npos) {
KernelName[DotPos] = '_';
DotPos = KernelName.find('.');
}
}
Function *K =
Function::Create(FT, GlobalValue::WeakODRLinkage, KernelName, &M);
K->setVisibility(GlobalValue::ProtectedVisibility);
// exec mode global variable
GlobalVariable *ModeGV = new GlobalVariable(
M, Type::getInt8Ty(Ctx), /*isConstant=*/true,
GlobalValue::WeakAnyLinkage,
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD),
K->getName() + "_exec_mode");
appendToCompilerUsed(M, {ModeGV});
// Attach "kernel" metadata to the new kernel.
NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
Metadata *MDVals[] = {ConstantAsMetadata::get(K),
MDString::get(Ctx, "kernel"),
ConstantAsMetadata::get(ConstantInt::get(
llvm::Type::getInt32Ty(Ctx), 1))};
MD->addOperand(MDNode::get(Ctx, MDVals));
// Set kernel attributes.
K->setAttributes(
AttributeList::get(Ctx, AttributeList::FunctionIndex,
CI->getFunction()->getAttributes().getFnAttrs()));
K->removeFnAttr("omp_target_thread_limit");
K->removeFnAttr("omp_target_num_teams");
K->addFnAttr("omp_parallel_51_kernel");
K->addFnAttr("kernel");
K->getArg(3)->addAttr(Attribute::NoAlias);
auto *EntryBB = BasicBlock::Create(Ctx, "entry", K);
OpenMPIRBuilder::LocationDescription TargetInitLoc(
{EntryBB, EntryBB->end()});
auto IP = IRBuilder.createTargetInit(TargetInitLoc, /* IsSPMD */ true);
BasicBlock *UserCodeBB = IP.getBlock();
auto UseDirectIfPossible = [](Value *LHS, Value *RHS) -> Value * {
if (isa<Constant>(LHS))
return LHS;
return RHS;
};
Value *Ident = CI->getOperand(0);
Value *TId = UseDirectIfPossible(CI->getOperand(1), K->getArg(0));
Value *IfExpr = UseDirectIfPossible(CI->getOperand(2), K->getArg(1));
Value *NumThreads = UseDirectIfPossible(CI->getOperand(3), K->getArg(2));
Value *ProcBind = CI->getOperand(4);
Value *Fn = CI->getOperand(5);
Value *WrapperFn = CI->getOperand(6);
Value *Args = K->getArg(3);
Value *NArgs = UseDirectIfPossible(CI->getOperand(8), K->getArg(4));
(void)CallInst::Create(Parallel51,
{Ident, TId, IfExpr, NumThreads, ProcBind, Fn,
WrapperFn, Args, NArgs},
"", UserCodeBB);
OpenMPIRBuilder::LocationDescription TargetDeInitLoc(
{UserCodeBB, UserCodeBB->end()});
IRBuilder.createTargetDeinit(TargetDeInitLoc, /* IsSPMD */ true);
IRBuilder.Builder.CreateRetVoid();
return K;
};
auto EnableMultiTeam = [](Function *OutlinedFn) {
SmallVector<CallInst *> WorkItems;
for (BasicBlock &BB : *OutlinedFn)
for (Instruction &I : BB) {
CallInst *C = dyn_cast<CallInst>(&I);
if (!C)
continue;
Function *Callee = C->getCalledFunction();
// Skip indirect calls, which have no statically known callee.
if (!Callee)
continue;
auto CalleeName = Callee->getName();
if (CalleeName == "__kmpc_for_static_init_4" ||
CalleeName == "__kmpc_for_static_init_4u" ||
CalleeName == "__kmpc_for_static_init_8" ||
CalleeName == "__kmpc_for_static_init_8u")
WorkItems.push_back(C);
}
for (CallInst *C : WorkItems) {
constexpr const unsigned SchedTypeArgNum = 2;
Value *SchedTypeVal = C->getOperand(SchedTypeArgNum);
ConstantInt *SchedTypeCI = cast<ConstantInt>(SchedTypeVal);
int32_t SchedType = SchedTypeCI->getSExtValue();
Value *NewSchedType = SchedTypeVal;
if (SchedType == /*kmp_sched_static_chunk*/ 33)
NewSchedType = ConstantInt::get(SchedTypeCI->getType(), 100);
else if (SchedType == /*kmp_sched_static*/ 34)
NewSchedType = ConstantInt::get(SchedTypeCI->getType(), 101);
C->setOperand(SchedTypeArgNum, NewSchedType);
}
};
auto &Ctx = M.getContext();
for (CallInst *CI : WorkItems) {
Function *K = CreateNewKernel(Ctx, M, OMPInfoCache.OMPBuilder,
Parallel51RFI.Declaration, CI);
constexpr const unsigned OutlinedFnArgNum = 5;
Function *OutlinedFn =
dyn_cast<Function>(CI->getOperand(OutlinedFnArgNum));
assert(OutlinedFn && "arg fn is not a function");
if (canUseMultiTeam(OutlinedFn, OMPInfoCache))
EnableMultiTeam(OutlinedFn);
IRBuilder<> Builder(CI);
FunctionCallee Callee =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_launch_parallel_51_kernel);
Constant *KernelName = ConstantDataArray::getString(Ctx, K->getName());
GlobalVariable *KernelNameVar =
new GlobalVariable(M, KernelName->getType(), /*isConstant=*/true,
GlobalValue::WeakAnyLinkage, KernelName);
Value *Args[6] = {KernelNameVar, CI->getArgOperand(1),
CI->getArgOperand(2), CI->getArgOperand(3),
CI->getArgOperand(7), CI->getArgOperand(8)};
CallInst *C = Builder.CreateCall(Callee, Args);
OMPInfoCache.setCallingConvention(Callee, C);
CI->eraseFromParent();
}
// Check all the uses of __kmpc_parallel_51 again. If it is only used in
// parallel kernels, we can mark the main kernel as SPMD mode as well.
bool OnlyUsedByParallelKernel = true;
for (User *U : Parallel51RFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
Function *F = CI->getFunction();
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
OnlyUsedByParallelKernel = false;
break;
}
if (OnlyUsedByParallelKernel) {
for (auto K : SCC) {
if (!omp::isOpenMPKernel(*K))
continue;
Attribute Attr = K->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
GlobalVariable *ExecMode =
M.getGlobalVariable((K->getName() + "_exec_mode").str());
assert(ExecMode && "kernel without exec mode");
assert(ExecMode->getInitializer() &&
"ExecMode doesn't have initializer!");
assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
"ExecMode is not an integer!");
const int8_t ExecModeVal =
cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
// Kernel is already in SPMD mode, skip.
if (ExecModeVal & OMP_TGT_EXEC_MODE_SPMD)
continue;
auto *NewExecModeC =
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD);
ExecMode->setInitializer(NewExecModeC);
OMPInformationCache::RuntimeFunctionInfo &TargetInitRFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
OMPInformationCache::RuntimeFunctionInfo &TargetDeInitRFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
CallInst *InitCI = nullptr;
CallInst *DeInitCI = nullptr;
for (User *U : TargetInitRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (CI->getFunction() == K) {
InitCI = CI;
break;
}
}
for (User *U : TargetDeInitRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (CI->getFunction() == K) {
DeInitCI = CI;
break;
}
}
assert(InitCI && DeInitCI && "kernel without init and deinit");
InitCI->setArgOperand(1, NewExecModeC);
InitCI->setArgOperand(2,
ConstantInt::getNullValue(Type::getInt1Ty(Ctx)));
DeInitCI->setArgOperand(1, NewExecModeC);
}
}
// Check the use of omp_get_thread_num and omp_get_num_threads.
OMPInformationCache::RuntimeFunctionInfo &ThreadNumRFI =
OMPInfoCache.RFIs[OMPRTL_omp_get_thread_num];
OMPInformationCache::RuntimeFunctionInfo &NumThreadsRFI =
OMPInfoCache.RFIs[OMPRTL_omp_get_num_threads];
auto CheckIfOnlyUsedByParallelKernel = [&](CallInst *CI) {
SmallVector<Kernel> ReachingKernels;
collectReachingKernels(CI->getFunction(),
ReachingKernels);
return ReachingKernels.size() < 2;
};
SmallVector<CallInst *> WorkList;
for (User *U : ThreadNumRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (!CheckIfOnlyUsedByParallelKernel(CI))
continue;
WorkList.push_back(CI);
}
for (User *U : NumThreadsRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (!CheckIfOnlyUsedByParallelKernel(CI))
continue;
WorkList.push_back(CI);
}
for (CallInst *CI : WorkList) {
Function *Callee = nullptr;
if (CI->getCalledFunction() == ThreadNumRFI.Declaration)
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_thread_num].Declaration;
if (CI->getCalledFunction() == NumThreadsRFI.Declaration)
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_num_threads].Declaration;
assert(Callee && "unknown callee");
auto &Builder = OMPInfoCache.OMPBuilder.Builder;
Builder.SetInsertPoint(CI);
Value *C = Builder.CreateCall(Callee);
CI->replaceAllUsesWith(C);
CI->eraseFromParent();
}
return Changed;
}
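// Net effect of the split (illustrative, names abbreviated): a call
//   __kmpc_parallel_51(ident, tid, if_expr, nthreads, proc_bind, fn,
//                      wrapper_fn, args, nargs)
// inside a generic-mode kernel is replaced with
//   __kmpc_launch_parallel_51_kernel(
//       "__omp_offloading_parallel_51_from_<caller>", tid, if_expr,
//       nthreads, args, nargs);
// while the new SPMD kernel created above re-issues the original
// __kmpc_parallel_51 call between target init and deinit.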
/// Merge parallel regions when it is safe.
bool mergeParallelRegions() {
const unsigned CallbackCalleeOperand = 2;

View File

@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMTransformUtils
CallGraphUpdater.cpp
CanonicalizeAliases.cpp
CanonicalizeFreezeInLoops.cpp
CanonicalizeMainFunction.cpp
CloneFunction.cpp
CloneModule.cpp
CodeExtractor.cpp

View File

@@ -0,0 +1,105 @@
//===-------------- CanonicalizeMainFunction.cpp ----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility pass to canonicalize the main function.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "canonicalize-main-function"
static cl::opt<std::string>
MainFunctionName("canonical-main-function-name",
cl::desc("New main function name"),
cl::value_desc("main function name"));
bool rewriteMainFunction(Function &F) {
if (F.arg_size() == 2 && F.getReturnType()->isIntegerTy(32))
return false;
auto &Ctx = F.getContext();
auto &DL = F.getParent()->getDataLayout();
auto *Int32Ty = IntegerType::getInt32Ty(Ctx);
auto *PtrTy = PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace());
FunctionType *NewFnTy =
FunctionType::get(Int32Ty, {Int32Ty, PtrTy}, /* isVarArg */ false);
Function *NewFn =
Function::Create(NewFnTy, F.getLinkage(), F.getAddressSpace(), "");
F.getParent()->getFunctionList().insert(F.getIterator(), NewFn);
NewFn->takeName(&F);
NewFn->copyAttributesFrom(&F);
NewFn->setSubprogram(F.getSubprogram());
F.setSubprogram(nullptr);
NewFn->splice(NewFn->begin(), &F);
if (!F.getReturnType()->isIntegerTy(32)) {
SmallVector<ReturnInst *> WorkList;
for (BasicBlock &BB : *NewFn)
for (Instruction &I : BB) {
auto *RI = dyn_cast<ReturnInst>(&I);
if (!RI)
continue;
assert(RI->getReturnValue() == nullptr &&
"return value of a void main function is not nullptr");
WorkList.push_back(RI);
}
for (auto *RI : WorkList) {
(void)ReturnInst::Create(Ctx, ConstantInt::getNullValue(Int32Ty), RI);
RI->eraseFromParent();
}
}
if (F.arg_size() == NewFn->arg_size())
for (unsigned I = 0; I < NewFn->arg_size(); ++I) {
Argument *OldArg = F.getArg(I);
Argument *NewArg = NewFn->getArg(I);
NewArg->takeName(OldArg);
OldArg->replaceAllUsesWith(NewArg);
}
return true;
}
PreservedAnalyses CanonicalizeMainFunctionPass::run(Module &M,
ModuleAnalysisManager &AM) {
Function *MainFunc = nullptr;
for (Function &F : M)
if (F.getName() == "main") {
assert(MainFunc == nullptr && "more than one main function");
MainFunc = &F;
}
if (MainFunc == nullptr)
return PreservedAnalyses::all();
bool Changed = false;
if (!MainFunctionName.empty() && MainFunc->getName() != MainFunctionName) {
MainFunc->setName(MainFunctionName);
Changed = true;
}
assert(MainFunc->arg_size() == 2 &&
MainFunc->getReturnType()->isIntegerTy(32) &&
"[CanonicalizeMainFunction] wrong user main function type; expected: "
"int main(int argc, char **argv)");
// if (rewriteMainFunction(*MainFunc)) {
// MainFunc->eraseFromParent();
// Changed = true;
// }
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
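As a quick illustration (assuming an opt built from this branch; the target
name __user_main is hypothetical), the pass can be invoked directly to rename
the entry point:

  opt -passes=canonicalize-main-function \
      -canonical-main-function-name=__user_main in.bc -o out.bc

"canonicalize-main-function" is the name registered in PassRegistry.def; the
module must already define int main(int argc, char **argv), otherwise the
assert above fires.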

View File

@@ -28,10 +28,10 @@ if(OPENMP_STANDALONE_BUILD)
else()
# When building in tree we install the runtime according to the LLVM settings.
if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT APPLE)
set(OFFLOAD_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE}/offload" CACHE STRING
"Path where built offload libraries should be installed.")
else()
set(OFFLOAD_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}" CACHE STRING
set(OFFLOAD_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}/offload" CACHE STRING
"Path where built offload libraries should be installed.")
endif()
endif()
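# For example (triple value illustrative), with a per-target runtime dir on an
# x86_64-unknown-linux-gnu host the offload libraries now install into
#   lib/x86_64-unknown-linux-gnu/offload/
# and otherwise into lib/offload/.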

View File

@@ -77,16 +77,19 @@ set(include_files
${include_directory}/Interface.h
${include_directory}/LibC.h
${include_directory}/Mapping.h
${include_directory}/Memory.h
${include_directory}/State.h
${include_directory}/Synchronization.h
${include_directory}/Types.h
${include_directory}/Utils.h
${include_directory}/Mpi.h
)
set(src_files
${source_directory}/Allocator.cpp
${source_directory}/Configuration.cpp
${source_directory}/Debug.cpp
${source_directory}/HostRPC.cpp
${source_directory}/Kernel.cpp
${source_directory}/LibC.cpp
${source_directory}/Mapping.cpp
@@ -98,8 +101,22 @@ set(src_files
${source_directory}/Tasking.cpp
${source_directory}/Utils.cpp
${source_directory}/Workshare.cpp
${source_directory}/Mpi.cpp
)
# WarpAllocator.cpp is missing from this list
if (LIBOMPTARGET_DEVICE_BUILTIN_ALLOCATOR)
# Use the built-in DeviceRTL allocator instead of the GPUFirst one.
# It does not support RPC calls whose arguments point into GPU memory,
# because the allocation information is not saved.
list(APPEND src_files ${source_directory}/BuiltinAllocator.cpp)
elseif (LIBOMPTARGET_GENERIC_ALLOCATOR)
list(APPEND src_files ${source_directory}/GenericAllocator.cpp)
else()
list(APPEND src_files ${source_directory}/TeamAllocator.cpp)
endif()
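# Configure-time selection (illustrative):
#   cmake ... -DLIBOMPTARGET_DEVICE_BUILTIN_ALLOCATOR=ON  # built-in allocator
#   cmake ... -DLIBOMPTARGET_GENERIC_ALLOCATOR=ON         # generic allocator
# With neither variable set, TeamAllocator.cpp is the default.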
# We disable the slp vectorizer during the runtime optimization to avoid
# vectorized accesses to the shared state. Generally, those are "good" but
# the optimizer pipeline (esp. Attributor) does not fully support vectorized
@@ -129,8 +146,18 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
-I${include_directory}
-I${devicertl_base_directory}/../include
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
-I${CMAKE_SOURCE_DIR}/../libc/include
)
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_DEBUG)
list(APPEND bc_flags "-DHOSTRPC_DEBUG")
endif()
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_PROFILING)
list(APPEND bc_flags "-DHOSTRPC_PROFILING")
endif()
# first create an object target
add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
function(compileDeviceRTLLibrary target_cpu target_name target_triple)
@@ -298,6 +325,7 @@ set_target_properties(omptarget.devicertl PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY "${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}"
LINKER_LANGUAGE CXX
)
target_link_libraries(omptarget.devicertl PRIVATE omptarget.devicertl.all_objs)
install(TARGETS omptarget.devicertl ARCHIVE DESTINATION ${OFFLOAD_INSTALL_LIBDIR})

View File

@@ -39,6 +39,12 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
__builtin_trap(); \
__builtin_unreachable();
#define assert(expr) \
{ \
if (!(expr)) \
__assert_fail(#expr, __FILE__, __LINE__, __PRETTY_FUNCTION__); \
}
///}
#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);

View File

@@ -360,6 +360,22 @@ int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal);
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
///}
/// Host RPC
///
///{
void *__kmpc_host_rpc_get_desc(int32_t CallNo, int32_t NumArgs, void *ArgInfo);
void __kmpc_host_rpc_add_arg(void *Desc, int64_t Arg, int32_t ArgNum);
int64_t __kmpc_host_rpc_send_and_wait(void *Desc);
///}
/// Launch parallel kernel
///
///{
void __kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
int32_t if_expr, int32_t num_threads,
void **args, int64_t nargs);
///}
}
#endif

View File

@@ -14,12 +14,112 @@
#include "Types.h"
#ifndef _ASM_GENERIC_ERRNO_BASE_H
#define _ASM_GENERIC_ERRNO_BASE_H
#define EPERM 1 /* Operation not permitted */
#define ENOENT 2 /* No such file or directory */
#define ESRCH 3 /* No such process */
#define EINTR 4 /* Interrupted system call */
#define EIO 5 /* I/O error */
#define ENXIO 6 /* No such device or address */
#define E2BIG 7 /* Argument list too long */
#define ENOEXEC 8 /* Exec format error */
#define EBADF 9 /* Bad file number */
#define ECHILD 10 /* No child processes */
#define EAGAIN 11 /* Try again */
#define ENOMEM 12 /* Out of memory */
#define EACCES 13 /* Permission denied */
#define EFAULT 14 /* Bad address */
#define ENOTBLK 15 /* Block device required */
#define EBUSY 16 /* Device or resource busy */
#define EEXIST 17 /* File exists */
#define EXDEV 18 /* Cross-device link */
#define ENODEV 19 /* No such device */
#define ENOTDIR 20 /* Not a directory */
#define EISDIR 21 /* Is a directory */
#define EINVAL 22 /* Invalid argument */
#define ENFILE 23 /* File table overflow */
#define EMFILE 24 /* Too many open files */
#define ENOTTY 25 /* Not a typewriter */
#define ETXTBSY 26 /* Text file busy */
#define EFBIG 27 /* File too large */
#define ENOSPC 28 /* No space left on device */
#define ESPIPE 29 /* Illegal seek */
#define EROFS 30 /* Read-only file system */
#define EMLINK 31 /* Too many links */
#define EPIPE 32 /* Broken pipe */
#define EDOM 33 /* Math argument out of domain of func */
#define ERANGE 34 /* Math result not representable */
#endif
#define errno (*__errno_location())
extern "C" {
int memcmp(const void *lhs, const void *rhs, size_t count);
void *memset(void *dst, int C, size_t count);
int printf(const char *format, ...);
long strtol(const char *str, char **str_end, int base);
int strcmp(const char *lhs, const char *rhs);
void *calloc(size_t num, size_t size);
int strcasecmp(const char *string1, const char *string2);
void exit(int exit_code);
size_t strlen(const char *str);
int atoi(const char *str);
char *strcpy(char *dest, const char *src);
int stat(const char *path, struct stat *buf);
int *__errno_location();
char *strcat(char *dest, const char *src);
void perror(const char *s);
int strncmp(const char *lhs, const char *rhs, size_t count);
char *strncpy(char *dest, const char *src, size_t count);
char *strchr(const char *str, int ch);
char *strtok(char *str, const char *delim);
const unsigned short **__ctype_b_loc(void);
void *realloc(void *ptr, size_t new_size);
void qsort(void *const pbase, size_t total_elems, size_t size,
int (*comp)(const void *, const void *));
int gettimeofday(struct timeval *tv, struct timezone *tz);
char *__xpg_basename(const char *path);
void srand(unsigned seed);
int rand();
int abs(int n);
void *memcpy(void *dest, const void *src, size_t count);
double atof(const char *str);
double strtod(const char *str, char **ptr);
}
#endif

View File

@@ -0,0 +1,39 @@
//===--- Memory.h - OpenMP device runtime memory allocator ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Declarations for the device runtime memory allocator and a helper to
// query allocation metadata for a pointer.
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_MEMORY_H
#define OMPTARGET_MEMORY_H
#include "Types.h"
extern "C" {
__attribute__((leaf)) void *malloc(size_t Size);
__attribute__((leaf)) void free(void *Ptr);
}
namespace ompx {
namespace memory {
struct MemoryAllocationInfo {
void *BasePtr = nullptr;
size_t Size = 0;
bool isValid() const { return BasePtr; }
};
/// Get the memory allocation information if the pointer \p P lies within one
/// of the buffers allocated by \p malloc.
MemoryAllocationInfo getMemoryAllocationInfo(void *P);
} // namespace memory
} // namespace ompx
#endif
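As a usage illustration, here is a minimal sketch of a bounds check built on this query; the helper and its caller are hypothetical, and only `getMemoryAllocationInfo` / `MemoryAllocationInfo` come from the header above (with `getPtrDiff` from Utils.h):

#include "Memory.h"
#include "Utils.h"

// Hypothetical helper: clamp a requested copy size to the bounds of the
// dynamic allocation that backs Ptr, when the allocator knows about it.
static size_t clampToAllocation(void *Ptr, size_t Requested) {
  ompx::memory::MemoryAllocationInfo MAI =
      ompx::memory::getMemoryAllocationInfo(Ptr);
  if (!MAI.isValid())
    return Requested; // Unknown pointer; nothing to check against.
  size_t Avail = MAI.Size - (size_t)ompx::utils::getPtrDiff(Ptr, MAI.BasePtr);
  return Requested < Avail ? Requested : Avail;
}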

View File

@@ -0,0 +1,158 @@
#ifndef MPI_H
#define MPI_H
#ifdef __cplusplus
extern "C" {
#endif
//struct MPI_Comm_s;
//struct MPI_Comm_s {
// int id; // id = 0 -> MPI_COMM_WORLD (the only supported)
// uint32_t size;
// uint32_t barrier_counter;
// uint32_t barrier_generation_counter;
// uint32_t *nodes_infos;
//};
typedef struct MPI_Comm_s *MPI_Comm;
MPI_Comm MPI_COMM_WORLD;
typedef enum MPI_Datatype_e {
MPI_CHAR,
MPI_SHORT,
MPI_INT,
MPI_LONG,
MPI_LONG_LONG_INT,
MPI_LONG_LONG,
MPI_SIGNED_CHAR,
MPI_UNSIGNED_CHAR,
MPI_UNSIGNED_SHORT,
MPI_UNSIGNED,
MPI_UNSIGNED_LONG,
MPI_UNSIGNED_LONG_LONG,
MPI_FLOAT,
MPI_DOUBLE,
MPI_LONG_DOUBLE,
MPI_WCHAR,
MPI_C_BOOL,
MPI_INT8_T,
MPI_INT16_T,
MPI_INT32_T,
MPI_INT64_T,
MPI_UINT8_T,
MPI_UINT16_T,
MPI_UINT32_T,
MPI_UINT64_T,
MPI_C_COMPLEX,
MPI_C_FLOAT_COMPLEX,
MPI_C_DOUBLE_COMPLEX,
MPI_C_LONG_DOUBLE_COMPLEX,
MPI_BYTE,
//MPI_PACKED,
//MPI_AINT,
//MPI_OFFSET,
//MPI_COUNT,
MPI_CXX_BOOL
//MPI_CXX_FLOAT_COMPLEX,
//MPI_CXX_DOUBLE_COMPLEX,
//MPI_CXX_LONG_DOUBLE_COMPLEX,
} MPI_Datatype;
typedef enum MPI_Intent_e {
MPI_THREAD_SINGLE,
MPI_THREAD_FUNNELED,
MPI_THREAD_SERIALIZED,
MPI_THREAD_MULTIPLE
} MPI_Intent;
typedef void MPI_User_function(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype);
typedef struct MPI_Op_s {
MPI_User_function *func;
} MPI_Op;
typedef struct MPI_Status_s {
int MPI_SOURCE;
int MPI_TAG;
int MPI_ERROR;
} MPI_Status;
typedef struct MPI_Request_s *MPI_Request;
//const MPI_Status MPI_STATUS_IGNORE;
//const MPI_Status MPI_STATUSES_IGNORE;
// global variables
const int MPI_ANY_SOURCE = -1;
const int MPI_ANY_TAG = -1;
const int MPI_SUCCESS = 0;
MPI_Status MPI_STATUS_IGNORE;
MPI_Status MPI_STATUSES_IGNORE;
MPI_Op MPI_MIN;
MPI_Op MPI_MAX;
MPI_Request MPI_REQUEST_NULL = 0;
// Common functions
int MPI_Init(int *argc, char ***argv);
int MPI_Init_thread(int *argc, char ***argv, int required, int *provided);
int MPI_Finalize(void);
int MPI_Abort(MPI_Comm comm, int errorcode);
int MPI_Barrier(MPI_Comm comm);
int MPI_Comm_rank(MPI_Comm comm, int *rank);
int MPI_Comm_size(MPI_Comm comm, int *size);
int MPI_Type_size(MPI_Datatype datatype, int *size);
// Blocking Comm
int MPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm);
//int MPI_Rsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm);
int MPI_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm);
int MPI_Bsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm);
int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status);
// Non Blocking Comm
int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
//int MPI_Irsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
int MPI_Issend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
int MPI_Ibsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request);
// Non Blocking Wait
int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status);
int MPI_Wait(MPI_Request *request, MPI_Status *status);
int MPI_Testall(int count, MPI_Request array_of_requests[], int *flag, MPI_Status array_of_statuses[]);
int MPI_Waitall(int count, MPI_Request array_of_requests[], MPI_Status array_of_statuses[]);
int MPI_Testany(int count, MPI_Request array_of_requests[], int *index, int *flag, MPI_Status *status);
int MPI_Waitany(int count, MPI_Request array_of_requests[], int *index, MPI_Status *status);
int MPI_Testsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[]);
int MPI_Waitsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[]);
// Persistent Communications setup
int MPI_Send_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
//int MPI_Rsend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
int MPI_Ssend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
int MPI_Bsend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
int MPI_Recv_init(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request);
// Persistent Communications start and end
int MPI_Start(MPI_Request *request);
int MPI_Startall(int count, MPI_Request array_of_requests[]);
int MPI_Request_free(MPI_Request *request);
// Collective Operation
int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm);
int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm);
// Timing
double MPI_Wtime(void);
#ifdef __cplusplus
}
#endif
#endif // MPI_H
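For orientation, a minimal sketch of how this device-side API is meant to be driven; the `ring` kernel is hypothetical and assumes each OpenMP team acts as one MPI rank (as the `ranks` map in the commented-out `MPI_Comm_s` suggests). Note that in this header `MPI_STATUS_IGNORE` is an object, so its address is what gets passed:

#include "mpi.h"

// Hypothetical ring exchange: every rank sends its id to the next rank.
void ring(void) {
  int Rank = 0, Size = 0;
  MPI_Init(nullptr, nullptr);
  MPI_Comm_rank(MPI_COMM_WORLD, &Rank);
  MPI_Comm_size(MPI_COMM_WORLD, &Size);
  int SendVal = Rank, RecvVal = -1;
  MPI_Request Req;
  MPI_Irecv(&RecvVal, 1, MPI_INT, (Rank + Size - 1) % Size, /*tag=*/0,
            MPI_COMM_WORLD, &Req);
  MPI_Send(&SendVal, 1, MPI_INT, (Rank + 1) % Size, /*tag=*/0,
           MPI_COMM_WORLD);
  MPI_Wait(&Req, &MPI_STATUS_IGNORE);
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
}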

View File

@@ -36,6 +36,9 @@ enum MemScopeTy {
uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
MemScopeTy MemScope = MemScopeTy::all);
/// Atomically add \p V to \p *Addr with system scope.
int32_t addSys(int32_t *Addr, int32_t V);
/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
/// result is stored in \p *Addr;
/// {
@@ -135,6 +138,58 @@ void system(atomic::OrderingTy Ordering);
} // namespace fence
namespace atomic {
/// Atomically load \p Addr with \p Ordering semantics.
uint32_t load(uint32_t *Addr, int Ordering);
/// Atomically load \p Addr with \p Ordering semantics.
uint64_t load(uint64_t *Addr, int Ordering);
/// Atomically store \p V to \p Addr with \p Ordering semantics.
void store(uint32_t *Addr, uint32_t V, int Ordering);
void store(uint64_t *Addr, uint64_t V, int Ordering);
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
uint32_t add(uint32_t *Addr, uint32_t V, int Ordering);
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);
} // namespace atomic
namespace mutex {
class TicketLock {
uint64_t NowServing = 0;
uint64_t NextTicket = 0;
public:
TicketLock() = default;
TicketLock(const TicketLock &) = delete;
void lock();
void unlock();
};
template <typename T> class LockGuard {
T &Lock;
public:
explicit LockGuard(T &L) : Lock(L) { Lock.lock(); }
~LockGuard() { Lock.unlock(); }
};
} // namespace mutex
} // namespace ompx
#endif
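A minimal sketch of the ticket lock and guard in combination (the counter and the function are hypothetical):

#include "Synchronization.h"

namespace {
ompx::mutex::TicketLock CounterLock;
uint64_t Counter = 0;
} // namespace

// Each caller takes the next ticket and is served in FIFO order, so the
// increment below is serialized across all device threads that call this.
void bumpCounter() {
  ompx::mutex::LockGuard<ompx::mutex::TicketLock> Guard(CounterLock);
  ++Counter;
}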

View File

@@ -36,6 +36,7 @@ using size_t = decltype(sizeof(char));
// TODO: Properly implement this
using intptr_t = int64_t;
using uintptr_t = uint64_t;
using ptrdiff_t = intptr_t;
static_assert(sizeof(int8_t) == 1, "type size mismatch");
static_assert(sizeof(uint8_t) == 1, "type size mismatch");
@@ -83,6 +84,11 @@ enum kmp_sched_t {
kmp_sched_distr_static_nochunk = 92,
kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
// The following two are for direct GPU compilation where we don't have outer
// loop to distribute workload among teams but we still need to.
kmp_sched_distr_parallel_static_noloop_chunk = 100,
kmp_sched_distr_parallel_static_noloop_nochunk = 101,
kmp_sched_default = kmp_sched_static_nochunk,
kmp_sched_unordered_first = kmp_sched_static_chunk,
kmp_sched_unordered_last = kmp_sched_auto,

View File

@@ -16,6 +16,9 @@
#pragma omp begin declare target device_type(nohost)
extern "C" double omp_get_wtime();
extern "C" int printf(const char *, ...);
namespace ompx {
namespace utils {
@@ -37,8 +40,8 @@ template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
}
/// Advance \p Ptr by \p Bytes bytes.
template <typename Ty1, typename Ty2> inline Ty1 advance(Ty1 Ptr, Ty2 Bytes) {
  return reinterpret_cast<Ty1>(reinterpret_cast<char *>(Ptr) + Bytes);
}
/// Return the first bit set in \p V.
@@ -82,12 +85,39 @@ template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
return *((DstTy *)(&V));
}
template <typename Ty = char>
ptrdiff_t getPtrDiff(const void *End, const void *Begin) {
return reinterpret_cast<const Ty *>(End) -
reinterpret_cast<const Ty *>(Begin);
}
inline bool isInRange(void *Ptr, void *BasePtr, int64_t Offset) {
ptrdiff_t Diff = getPtrDiff(Ptr, BasePtr);
return Diff >= 0 && Diff < Offset;
}
inline intptr_t ptrtoint(void *Ptr) { return reinterpret_cast<intptr_t>(Ptr); }
template <typename T> T min(T a, T b) { return a < b ? a : b; }
/// A pointer variable that has by design an `undef` value. Use with care.
[[clang::loader_uninitialized]] static void *const UndefPtr;
#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
class SimpleProfiler {
const char *HeadLine = nullptr;
double Start;
public:
SimpleProfiler(const char *HL) : HeadLine(HL), Start(omp_get_wtime()) {}
~SimpleProfiler() {
double End = omp_get_wtime();
printf("%s --> %lf s.\n", HeadLine, End - Start);
}
};
} // namespace utils
} // namespace ompx
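A small sketch of the scope-based profiler above in use (the function is hypothetical):

#include "Utils.h"

void expensiveStep() {
  // Prints "expensiveStep --> <elapsed> s." when the scope ends.
  ompx::utils::SimpleProfiler P("expensiveStep");
  for (volatile int I = 0; I < (1 << 20); ++I)
    ;
}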

View File

@@ -0,0 +1,44 @@
//===------- BuiltinAllocator.cpp - Generic GPU memory allocator -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a trivial allocator on top of the target's builtin
// malloc/free.
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Memory.h"
#include "Utils.h"
using namespace ompx;
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) { return {}; }
void init() {}
} // namespace memory
} // namespace ompx
extern "C" {
void *realloc(void *ptr, size_t new_size) {
  if (!ptr)
    return malloc(new_size);
  void *NewPtr = malloc(new_size);
  if (!NewPtr)
    return nullptr;
  // The builtin allocator keeps no per-allocation size, so the copy cannot
  // be clamped to the old size; copying new_size bytes over-reads when the
  // allocation grows.
  __builtin_memcpy(NewPtr, ptr, new_size);
  free(ptr);
  return NewPtr;
}
}
extern "C" {
void __kmpc_target_init_allocator() { return; }
}
#pragma omp end declare target

View File

@@ -0,0 +1,312 @@
//===------- GenericAllocator.cpp - Generic GPU memory allocator -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a free-list based memory allocator for the OpenMP
// device runtime.
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Utils.h"
using namespace ompx;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
__omp_rtl_device_memory_pool_tracker;
// TODO: implement Device Debug Allocation Tracker
namespace {
size_t HeapCurPos = 0;
mutex::TicketLock HeapLock;
mutex::TicketLock AllocationListLock;
mutex::TicketLock FreeListLock;
constexpr const size_t Alignment = 16;
using intptr_t = int64_t;
struct SimpleLinkListNode {
template <typename T> friend struct SimpleLinkList;
protected:
SimpleLinkListNode *Prev = nullptr;
SimpleLinkListNode *Next = nullptr;
bool operator>(const SimpleLinkListNode &RHS) const {
const auto This = reinterpret_cast<int64_t>(this);
const auto RHSThis = reinterpret_cast<int64_t>(&RHS);
return This > RHSThis;
}
SimpleLinkListNode() = default;
SimpleLinkListNode(const SimpleLinkListNode &) = delete;
SimpleLinkListNode(SimpleLinkListNode &&) = delete;
};
struct AllocationMetadata final : SimpleLinkListNode {
public:
size_t getUserSize() const;
void *getUserAddr() const {
return const_cast<void *>(reinterpret_cast<const void *>(this + 1));
}
size_t getSize() const { return Size; }
void setSize(size_t V) { Size = V; }
bool isInRange(void *Ptr) {
return utils::isInRange(Ptr, this + 1, getUserSize());
}
static AllocationMetadata *getFromUserAddr(void *P) {
return getFromAddr(P) - 1;
}
static AllocationMetadata *getFromAddr(void *P) {
return reinterpret_cast<AllocationMetadata *>(P);
}
private:
size_t Size = 0;
int64_t Reserved = 0;
};
constexpr const size_t AllocationMetadataSize = sizeof(AllocationMetadata);
static_assert(AllocationMetadataSize == 32,
"expect the metadata size to be 32");
size_t AllocationMetadata::getUserSize() const {
return Size - AllocationMetadataSize;
}
template <typename T> struct SimpleLinkList {
struct iterator {
friend SimpleLinkList;
T *operator->() { return reinterpret_cast<T *>(Node); }
T &operator*() { return reinterpret_cast<T &>(*Node); }
iterator operator++() {
Node = Node->Next;
return *this;
}
bool operator==(const iterator &RHS) const { return Node == RHS.Node; }
bool operator!=(const iterator &RHS) const { return !(*this == RHS); }
iterator next() const {
iterator Itr;
Itr.Node = Node->Next;
return Itr;
}
private:
SimpleLinkListNode *Node = nullptr;
};
iterator begin() {
iterator Itr;
Itr.Node = Head.Next;
return Itr;
}
iterator end() { return {}; }
void insert(SimpleLinkListNode *Node) { insertImpl(&Head, Node); }
void remove(iterator Itr) {
SimpleLinkListNode *Node = Itr.Node;
remove(Node);
}
void remove(SimpleLinkListNode *Node) { removeImpl(Node); }
bool empty() const { return Head.Next == nullptr; }
private:
static void insertImpl(SimpleLinkListNode *Current,
SimpleLinkListNode *Node) {
SimpleLinkListNode *OldNext = Current->Next;
Node->Prev = Current;
Node->Next = OldNext;
if (OldNext)
OldNext->Prev = Node;
Current->Next = Node;
}
static void removeImpl(SimpleLinkListNode *Node) {
SimpleLinkListNode *Prev = Node->Prev;
SimpleLinkListNode *Next = Node->Next;
Prev->Next = Next;
if (Next)
Next->Prev = Prev;
Node->Prev = nullptr;
Node->Next = nullptr;
}
/// Check if the node is in the list.
bool checkExist(SimpleLinkListNode *Node) const {
SimpleLinkListNode *P = Head.Next;
while (P) {
if (P == Node)
return true;
P = P->Next;
}
return false;
}
static bool checkSanity(SimpleLinkListNode *Current,
SimpleLinkListNode *Next) {
return Current->Next == Next && (!Next || Next->Prev == Current);
}
static bool checkSanity(SimpleLinkListNode *Prev, SimpleLinkListNode *Current,
SimpleLinkListNode *Next) {
return Prev->Next == Current && Current->Next == Next &&
Current->Prev == Prev && (!Next || Next->Prev == Current);
}
static bool checkDangling(SimpleLinkListNode *Node) {
return Node->Next == nullptr && Node->Prev == nullptr;
}
SimpleLinkListNode Head;
};
SimpleLinkList<AllocationMetadata> AllocationList;
SimpleLinkList<AllocationMetadata> FreeList;
constexpr const int SplitRatio = 5;
#ifndef LIBOMPTARGET_MEMORY_PROFILING
struct MemoryProfiler {
MemoryProfiler(const char *S) {}
~MemoryProfiler() {}
};
#else
struct MemoryProfiler : utils::SimpleProfiler {
  MemoryProfiler(const char *S) : utils::SimpleProfiler(S) {}
};
#endif
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
mutex::LockGuard ALG(AllocationListLock);
for (auto Itr = AllocationList.begin(); Itr != AllocationList.end(); ++Itr) {
AllocationMetadata *MD = &(*Itr);
if (MD->isInRange(P))
return {MD->getUserAddr(), MD->getUserSize()};
}
return {};
}
void init() {}
} // namespace memory
} // namespace ompx
extern "C" {
void *memset(void *dest, int ch, size_t count);
void *malloc(size_t Size) {
MemoryProfiler Profiler(__FUNCTION__);
Size = utils::align_up(Size + AllocationMetadataSize, Alignment);
AllocationMetadata *MD = nullptr;
{
mutex::LockGuard FLG(FreeListLock);
auto Itr = FreeList.begin();
for (; Itr != FreeList.end(); ++Itr) {
if (Itr->getSize() >= Size)
break;
}
    // Reuse a free block only if it is big enough but not oversized, i.e.,
    // less than SplitRatio times the requested size.
    bool Found = Itr != FreeList.end() && (Itr->getSize() / Size < SplitRatio);
if (Found) {
MD = &(*Itr);
FreeList.remove(Itr);
}
}
if (MD) {
mutex::LockGuard ALG(AllocationListLock);
AllocationList.insert(MD);
return MD->getUserAddr();
}
{
mutex::LockGuard LG(HeapLock);
if (Size + HeapCurPos < __omp_rtl_device_memory_pool.Size) {
void *R = reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr) + HeapCurPos;
(void)atomic::add(&HeapCurPos, Size, atomic::acq_rel);
MD = AllocationMetadata::getFromAddr(R);
}
}
if (MD) {
// We need to reset the head in case of any dirty data.
memset(MD, 0, AllocationMetadataSize);
MD->setSize(Size);
mutex::LockGuard ALG(AllocationListLock);
AllocationList.insert(MD);
return MD->getUserAddr();
}
printf("out of heap memory! size=%lu, cur=%lu.\n", Size, HeapCurPos);
printf("%s:%d\n", __FILE__, __LINE__);
__builtin_trap();
}
void free(void *P) {
MemoryProfiler Profiler(__FUNCTION__);
if (!P)
return;
auto *MD = AllocationMetadata::getFromUserAddr(P);
{
mutex::LockGuard ALG(AllocationListLock);
AllocationList.remove(MD);
}
{
mutex::LockGuard FLG(FreeListLock);
FreeList.insert(MD);
}
}
void *realloc(void *ptr, size_t new_size) {
  MemoryProfiler Profiler(__FUNCTION__);
  if (!ptr)
    return malloc(new_size);
  void *NewPtr = malloc(new_size);
if (!NewPtr)
return nullptr;
auto *OldMD = AllocationMetadata::getFromUserAddr(ptr);
assert(ptr == OldMD->getUserAddr());
__builtin_memcpy(NewPtr, ptr, utils::min(OldMD->getUserSize(), new_size));
free(ptr);
return NewPtr;
}
void __kmpc_target_init_allocator() {}
}
#pragma omp end declare target
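To make the bookkeeping concrete, a worked example of the arithmetic above; the numbers follow from Alignment == 16 and the 32-byte AllocationMetadata header:

// malloc(40): Size = align_up(40 + 32, 16) = 80 bytes carved from the pool;
// the user pointer is MD + 1 (32 bytes past the block start), and
// getUserSize() == 80 - 32 == 48, slightly more than requested.
// On free, the 80-byte block enters the free list and is reused for a later
// rounded request S only when S <= 80 and 80 / S < SplitRatio (== 5), which
// bounds the slack wasted by reusing an oversized block.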

View File

@@ -0,0 +1,428 @@
//===------- HostRPC.cpp - Implementation of host RPC ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "HostRPC.h"
#include "Debug.h"
#include "LibC.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm-libc-types/rpc_opcodes_t.h"
#include "llvm-libc-types/rpc_port_t.h"
#ifdef HOSTRPC_DEBUG
#define DEBUG_PREFIX "host-rpc-device"
#define DP(FMT, ...) \
{ printf("%s --> " FMT, DEBUG_PREFIX, __VA_ARGS__); }
#else
#define DP(FMT, ...)
#endif
using namespace ompx;
using namespace hostrpc;
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
#ifdef HOSTRPC_PROFILING
int32_t HostRPCId;
double GetDescStart;
double GetDescEnd;
double AddArgStart;
double AddArgEnd;
double IssueAndWaitStart;
double IssueAndWaitEnd;
double CopyBackStart;
double CopyBackEnd;
#endif
// libc rpc functions forward declare:
// TODO: replace when a proper header exposing device functions is created
extern "C" {
rpc_port_t rpc_open_port(rpc_opcode_t);
void rpc_send_n(rpc_port_t *handle, const void *src, size_t size);
void rpc_recv_n(rpc_port_t *handle, void *dst, size_t *size);
void rpc_close_port(rpc_port_t *handle);
}
namespace {
size_t HostRPCMemoryBufferCurrentPosition = 0;
constexpr const size_t Alignment = 16;
static_assert(sizeof(intptr_t) == sizeof(int64_t), "pointer size not match");
struct HostRPCArgInfo {
void *BasePtr;
int64_t Type;
int64_t Size;
HostRPCArgInfo *Next;
};
struct HostRPCPointerMapEntry {
void *BasePtr;
void *MappedBasePtr;
int64_t Size;
int64_t Kind;
};
void *getMappedPointer(Descriptor *D, void *BasePtr, int64_t Size,
int64_t Offset, int64_t Kind) {
assert(D->ArgMap && "ArgMap should not be nullptr");
HostRPCPointerMapEntry *MapTable =
reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
int I = 0;
for (; I < D->NumArgs && MapTable[I].BasePtr; ++I)
if (MapTable[I].BasePtr == BasePtr)
return utils::advance(MapTable[I].MappedBasePtr, Offset);
MapTable[I].BasePtr = BasePtr;
MapTable[I].MappedBasePtr = malloc(Size);
MapTable[I].Size = Size;
MapTable[I].Kind = Kind;
if (Kind & ArgType::OMP_HOST_RPC_ARG_COPY_TO) {
__builtin_memcpy(MapTable[I].MappedBasePtr, BasePtr, Size);
DP("getMappedPointer: copy %ld bytes memory from %p to %p.\n", Size,
BasePtr, MapTable[I].MappedBasePtr);
}
return utils::advance(MapTable[I].MappedBasePtr, Offset);
}
void copybackIfNeeded(Descriptor *D) {
if (!D->ArgMap)
return;
auto *MapTable = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
for (int I = 0; I < D->NumArgs && MapTable[I].BasePtr; ++I)
if (MapTable[I].Kind & ArgType::OMP_HOST_RPC_ARG_COPY_FROM) {
__builtin_memcpy(MapTable[I].BasePtr, MapTable[I].MappedBasePtr,
MapTable[I].Size);
DP("copybackIfNeeded: copy %ld bytes memory from %p to %p.\n",
MapTable[I].Size, MapTable[I].MappedBasePtr, MapTable[I].BasePtr);
}
}
} // namespace
extern "C" {
__attribute__((noinline, used)) void *
__kmpc_host_rpc_get_desc(int32_t CallId, int32_t NumArgs, void *ArgInfo) {
DP("get desc for request (id=%d), NumArgs=%d, ArgInfo=%p.\n", CallId, NumArgs,
ArgInfo);
#ifdef HOSTRPC_DEBUG
{
void **AIs = reinterpret_cast<void **>(ArgInfo);
for (int I = 0; I < NumArgs; ++I)
DP("ArgInfo[%d]=%p.\n", I, AIs[I]);
}
#endif
#ifdef HOSTRPC_PROFILING
HostRPCId = CallId;
GetDescStart = omp_get_wtime();
#endif
Descriptor *D = (Descriptor *) malloc(sizeof(Descriptor));
D->Id = CallId;
D->ArgInfo = reinterpret_cast<void **>(ArgInfo);
D->NumArgs = NumArgs;
D->Status = EXEC_STAT_CREATED;
D->ReturnValue = 0;
D->Args =
reinterpret_cast<Argument *>(malloc(sizeof(Argument) * NumArgs));
D->ArgMap = malloc(sizeof(HostRPCPointerMapEntry) * NumArgs);
assert(!NumArgs || (D->Args && D->ArgMap) && "out of host rpc memory!");
// Reset the map table.
auto *ArgMap = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
for (int I = 0; I < NumArgs; ++I)
ArgMap[I].BasePtr = nullptr;
#ifdef HOSTRPC_PROFILING
GetDescEnd = omp_get_wtime();
AddArgStart = omp_get_wtime();
#endif
return D;
}
__attribute__((noinline, used)) void
__kmpc_host_rpc_add_arg(void *Desc, int64_t ArgVal, int32_t ArgNum) {
auto *D = reinterpret_cast<Descriptor *>(Desc);
assert(ArgNum < D->NumArgs && "out-of-range arguments");
Argument &ArgInDesc = D->Args[ArgNum];
DP("add arg (no=%d), arg=%lx to request (id=%d).\n", ArgNum, ArgVal, D->Id);
// This early branch can rule out nullptr and zero scalar value because it
// doesn't matter whether it is a pointer or scalar value.
if (ArgVal == 0) {
ArgInDesc.Value = 0;
ArgInDesc.ArgType = Type::ARG_LITERAL;
DP("arg (no=%d) is null, done.\n", ArgNum);
return;
}
void *ArgPtr = reinterpret_cast<void *>(ArgVal);
const auto *AI = reinterpret_cast<HostRPCArgInfo *>(D->ArgInfo[ArgNum]);
DP("try to find arg (no=%d) from args AI=%p\n", ArgNum, AI);
if (AI) {
// Let's first check if Arg is a scalar.
if (AI->Type == ArgType::OMP_HOST_RPC_ARG_SCALAR) {
assert(AI->BasePtr == ArgPtr && "invalid scalar argument info");
assert(AI->Next == nullptr && "invalid scalar argument info");
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_LITERAL;
DP("arg (no=%d) is scalar, done.\n", ArgNum);
return;
}
// Then let's see if it is a literal pointer that we don't need copy.
auto *P = AI;
while (P) {
if (P->Type == ArgType::OMP_HOST_RPC_ARG_PTR && P->BasePtr == ArgPtr) {
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is literal pointer, done.\n", ArgNum);
return;
}
P = P->Next;
}
// Next we check if it is within the range of any buffer described in
// argument info.
P = AI;
while (P) {
if ((P->Type & ArgType::OMP_HOST_RPC_ARG_PTR) && P->Size) {
if (utils::isInRange(ArgPtr, P->BasePtr, P->Size)) {
auto Size = P->Size;
auto Offset = utils::getPtrDiff(ArgPtr, P->BasePtr);
ArgInDesc.Value = utils::ptrtoint(
getMappedPointer(D, P->BasePtr, Size, Offset, P->Type));
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("found a match for arg (no=%d). done.\n", ArgNum);
return;
}
}
P = P->Next;
}
}
// Now we can't find a match from argument info, then we assume it is from
// dynamic allocation.
memory::MemoryAllocationInfo MAI = memory::getMemoryAllocationInfo(ArgPtr);
if (MAI.isValid()) {
auto Size = MAI.Size;
auto Offset = utils::getPtrDiff(ArgPtr, MAI.BasePtr);
ArgInDesc.Value = utils::ptrtoint(
getMappedPointer(D, MAI.BasePtr, Size, Offset,
/* Kind */ ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM));
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is from malloc. done.\n", ArgNum);
return;
}
printf("request (id=%d) arg (no=%d, val=%p) is unknown. send it to host "
"directly.\n",
D->Id, ArgNum, ArgPtr);
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
}
__attribute__((noinline, used)) int64_t
__kmpc_host_rpc_send_and_wait(void *Desc) {
Descriptor *D = reinterpret_cast<Descriptor *>(Desc);
int32_t Id = D->Id;
#ifdef HOSTRPC_PROFILING
AddArgEnd = omp_get_wtime();
IssueAndWaitStart = omp_get_wtime();
#endif
  // Protocol: ship the descriptor, then the argument array; the host runs
  // the call and streams both back with the results filled in.
rpc_port_t port = rpc_open_port(RPC_GPUFIRST);
Argument *Args = D->Args;
rpc_send_n(&port, D, sizeof(Descriptor));
rpc_send_n(&port, Args, sizeof(Argument) * D->NumArgs);
  // The host executes the requested function between the sends above and the
  // receives below. The returned sizes are not used; the buffers are
  // preallocated.
  uint64_t size_recv = 0;
  rpc_recv_n(&port, D, &size_recv);
  rpc_recv_n(&port, Args, &size_recv);
  // Receiving into D may have clobbered the device-side Args pointer with the
  // value echoed back by the host; restore it.
  D->Args = Args;
  (void)size_recv;
rpc_close_port(&port);
#ifdef HOSTRPC_PROFILING
IssueAndWaitEnd = omp_get_wtime();
#endif
DP("finish waiting for request (id=%d).\n", Id);
int64_t Ret = D->ReturnValue;
  assert((!D->NumArgs || D->ArgMap) && "arg map should not be nullptr");
#ifdef HOSTRPC_PROFILING
CopyBackStart = omp_get_wtime();
#endif
if (D->ArgMap) {
DP("copy memory back for request (id=%d).\n", Id);
copybackIfNeeded(D);
DP("finish copy memory back for request (id=%d).\n", Id);
}
#ifdef HOSTRPC_PROFILING
CopyBackEnd = omp_get_wtime();
#endif
// free memory allocated for the call
  HostRPCPointerMapEntry *MapTable =
      reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
  for (int I = 0; I < D->NumArgs && MapTable[I].BasePtr; ++I)
    free(MapTable[I].MappedBasePtr);
free(D->Args);
free(D->ArgMap);
free(D);
DP("request (id=%d) is done with return code=%lx.\n", Id, Ret);
#ifdef HOSTRPC_PROFILING
printf("[host-rpc-profiling-device] id=%d, init=%lf, add_arg=%lf, wait=%lf, "
"copy=%lf.\n",
HostRPCId, GetDescEnd - GetDescStart, AddArgEnd - AddArgStart,
IssueAndWaitEnd - IssueAndWaitStart, CopyBackEnd - CopyBackStart);
#endif
return Ret;
}
__attribute__((noinline, used)) void
__kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
int32_t if_expr, int32_t num_threads,
void **args, int64_t nargs) {
constexpr const int64_t NumArgs = 6;
HostRPCArgInfo ArgInfoArray[NumArgs];
void *ArgInfo[NumArgs];
for (unsigned I = 0; I < NumArgs; ++I) {
ArgInfoArray[I].BasePtr = 0;
ArgInfoArray[I].Type = ArgType::OMP_HOST_RPC_ARG_SCALAR;
ArgInfoArray[I].Size = 0;
ArgInfoArray[I].Next = nullptr;
ArgInfo[I] = &ArgInfoArray[I];
}
auto *D = (Descriptor *)__kmpc_host_rpc_get_desc(0, NumArgs, (void *)ArgInfo);
// Set up arg info struct.
ArgInfoArray[0].BasePtr = const_cast<char *>(name);
ArgInfoArray[0].Type = ArgType::OMP_HOST_RPC_ARG_COPY_TO;
ArgInfoArray[0].Size = strlen(name) + 1;
ArgInfoArray[1].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(gtid));
ArgInfoArray[2].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(if_expr));
ArgInfoArray[3].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(num_threads));
ArgInfoArray[5].BasePtr = reinterpret_cast<void *>(nargs);
// We need to treat args in a little bit different way because nargs might be
// on the stack.
ArgInfoArray[4].Size = sizeof(void *) * nargs;
void *Args = nullptr;
if (nargs) {
Args = malloc(ArgInfoArray[4].Size);
__builtin_memcpy(Args, args, ArgInfoArray[4].Size);
}
ArgInfoArray[4].BasePtr = Args;
D->Id = CALLID___kmpc_launch_parallel_51_kernel;
__kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(name), 0);
__kmpc_host_rpc_add_arg(D, gtid, 1);
__kmpc_host_rpc_add_arg(D, if_expr, 2);
__kmpc_host_rpc_add_arg(D, num_threads, 3);
__kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(Args), 4);
__kmpc_host_rpc_add_arg(D, nargs, 5);
(void)__kmpc_host_rpc_send_and_wait(D);
}
}
#pragma omp end declare target
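For illustration, a hand-written sketch of the sequence the compiler-generated wrappers are expected to follow; `host_puts` and `CALLID_host_puts` are hypothetical, and the single argument is a buffer copied to the host:

int64_t host_puts(const char *Str) {
  // One argument: the string, NUL terminator included, copied host-ward.
  HostRPCArgInfo AI{const_cast<char *>(Str),
                    ArgType::OMP_HOST_RPC_ARG_COPY_TO,
                    static_cast<int64_t>(strlen(Str)) + 1, nullptr};
  void *ArgInfo[1] = {&AI};
  auto *D = (Descriptor *)__kmpc_host_rpc_get_desc(CALLID_host_puts,
                                                   /*NumArgs=*/1, ArgInfo);
  __kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(Str), 0);
  return __kmpc_host_rpc_send_and_wait(D);
}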

View File

@@ -16,6 +16,7 @@
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"

View File

@@ -7,6 +7,12 @@
//===----------------------------------------------------------------------===//
#include "LibC.h"
#include "Debug.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Utils.h"
using namespace ompx;
#pragma omp begin declare target device_type(nohost)
@@ -47,6 +53,321 @@ int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
#endif
#pragma omp end declare variant
// Dummy variables backing the stdio stream handles.
int StdInDummyVar;
int StdOutDummyVar;
int StdErrDummyVar;
struct FILE;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdin =
(FILE *)&StdInDummyVar;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdout =
(FILE *)&StdOutDummyVar;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stderr =
(FILE *)&StdErrDummyVar;
typedef int (*__compar_fn_t)(const void *, const void *);
typedef int (*__compar_d_fn_t)(const void *, const void *, void *);
namespace {
const int32_t ToLowerMapTable[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 'a', 'b',
'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 91, 92, 93, 94, 95, 96,
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 123, 124, 125, 126,
127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const int32_t *ToLowerPTable = ToLowerMapTable + 128;
// X swaps the two bytes of each 16-bit ctype entry; the constants below are
// written byte-swapped.
#define X(x) (((x) / 256 | (x)*256) % 65536)
static const unsigned short BLocTable[] = {
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x320), X(0x220), X(0x220),
X(0x220), X(0x220), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x160),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8),
X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x8d5), X(0x8d5), X(0x8d5),
X(0x8d5), X(0x8d5), X(0x8d5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x200), 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
};
#undef X
static const unsigned short *BLocTablePTable = BLocTable + 128;
int ErrNo __attribute__((used, retain));
static unsigned long int RandomNext = 1;
#define SWAP(a, b, size) \
do { \
size_t __size = (size); \
char *__a = (a), *__b = (b); \
do { \
char __tmp = *__a; \
*__a++ = *__b; \
*__b++ = __tmp; \
} while (--__size > 0); \
} while (0)
/* Discontinue quicksort algorithm when partition gets below this size.
This particular magic number was chosen to work best on a Sun 4/260. */
#define MAX_THRESH 4
/* Stack node declarations used to store unfulfilled partition obligations. */
typedef struct {
char *lo;
char *hi;
} stack_node;
/* The next 4 #defines implement a very fast in-line stack abstraction. */
/* The stack needs log (total_elements) entries (we could even subtract
log(MAX_THRESH)). Since total_elements has type size_t, we get as
upper bound for log (total_elements):
bits per byte (CHAR_BIT) * sizeof(size_t). */
#define CHAR_BIT 8
#define STACK_SIZE (CHAR_BIT * sizeof(size_t))
#define PUSH(low, high) ((void)((top->lo = (low)), (top->hi = (high)), ++top))
#define POP(low, high) ((void)(--top, (low = top->lo), (high = top->hi)))
#define STACK_NOT_EMPTY (stack < top)
/* Order size using quicksort. This implementation incorporates
four optimizations discussed in Sedgewick:
1. Non-recursive, using an explicit stack of pointer that store the
next array partition to sort. To save time, this maximum amount
of space required to store an array of SIZE_MAX is allocated on the
stack. Assuming a 32-bit (64 bit) integer for size_t, this needs
only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes).
Pretty cheap, actually.
2. Chose the pivot element using a median-of-three decision tree.
This reduces the probability of selecting a bad pivot value and
eliminates certain extraneous comparisons.
3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
insertion sort to order the MAX_THRESH items within each partition.
This is a big win, since insertion sort is faster for small, mostly
sorted array segments.
4. The larger of the two sub-partitions is always pushed onto the
stack first, with the algorithm then concentrating on the
smaller partition. This *guarantees* no more than log (total_elems)
stack size is needed (actually O(1) in this case)! */
void _quicksort(void *const pbase, size_t total_elems, size_t size,
__compar_d_fn_t cmp, void *arg) {
char *base_ptr = (char *)pbase;
const size_t max_thresh = MAX_THRESH * size;
if (total_elems == 0)
/* Avoid lossage with unsigned arithmetic below. */
return;
if (total_elems > MAX_THRESH) {
char *lo = base_ptr;
char *hi = &lo[size * (total_elems - 1)];
stack_node stack[STACK_SIZE];
stack_node *top = stack;
PUSH(nullptr, nullptr);
while (STACK_NOT_EMPTY) {
char *left_ptr;
char *right_ptr;
/* Select median value from among LO, MID, and HI. Rearrange
LO and HI so the three values are sorted. This lowers the
probability of picking a pathological pivot value and
skips a comparison for both the LEFT_PTR and RIGHT_PTR in
the while loops. */
char *mid = lo + size * ((hi - lo) / size >> 1);
if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
SWAP(mid, lo, size);
if ((*cmp)((void *)hi, (void *)mid, arg) < 0)
SWAP(mid, hi, size);
else
goto jump_over;
if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
SWAP(mid, lo, size);
jump_over:;
left_ptr = lo + size;
right_ptr = hi - size;
/* Here's the famous ``collapse the walls'' section of quicksort.
Gotta like those tight inner loops! They are the main reason
that this algorithm runs much faster than others. */
do {
while ((*cmp)((void *)left_ptr, (void *)mid, arg) < 0)
left_ptr += size;
while ((*cmp)((void *)mid, (void *)right_ptr, arg) < 0)
right_ptr -= size;
if (left_ptr < right_ptr) {
SWAP(left_ptr, right_ptr, size);
if (mid == left_ptr)
mid = right_ptr;
else if (mid == right_ptr)
mid = left_ptr;
left_ptr += size;
right_ptr -= size;
} else if (left_ptr == right_ptr) {
left_ptr += size;
right_ptr -= size;
break;
}
} while (left_ptr <= right_ptr);
/* Set up pointers for next iteration. First determine whether
left and right partitions are below the threshold size. If so,
ignore one or both. Otherwise, push the larger partition's
bounds on the stack and continue sorting the smaller one. */
if ((size_t)(right_ptr - lo) <= max_thresh) {
if ((size_t)(hi - left_ptr) <= max_thresh)
/* Ignore both small partitions. */
POP(lo, hi);
else
/* Ignore small left partition. */
lo = left_ptr;
} else if ((size_t)(hi - left_ptr) <= max_thresh)
/* Ignore small right partition. */
hi = right_ptr;
else if ((right_ptr - lo) > (hi - left_ptr)) {
/* Push larger left partition indices. */
PUSH(lo, right_ptr);
lo = left_ptr;
} else {
/* Push larger right partition indices. */
PUSH(left_ptr, hi);
hi = right_ptr;
}
}
}
/* Once the BASE_PTR array is partially sorted by quicksort the rest
is completely sorted using insertion sort, since this is efficient
for partitions below MAX_THRESH size. BASE_PTR points to the beginning
of the array to sort, and END_PTR points at the very last element in
the array (*not* one beyond it!). */
#define min(x, y) ((x) < (y) ? (x) : (y))
{
char *const end_ptr = &base_ptr[size * (total_elems - 1)];
char *tmp_ptr = base_ptr;
char *thresh = min(end_ptr, base_ptr + max_thresh);
char *run_ptr;
/* Find smallest element in first threshold and place it at the
array's beginning. This is the smallest array element,
and the operation speeds up insertion sort's inner loop. */
for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
if ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
tmp_ptr = run_ptr;
if (tmp_ptr != base_ptr)
SWAP(tmp_ptr, base_ptr, size);
/* Insertion sort, running from left-hand-side up to right-hand-side. */
run_ptr = base_ptr + size;
while ((run_ptr += size) <= end_ptr) {
tmp_ptr = run_ptr - size;
while ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
tmp_ptr -= size;
tmp_ptr += size;
if (tmp_ptr != run_ptr) {
char *trav;
trav = run_ptr + size;
while (--trav >= run_ptr) {
char c = *trav;
char *hi, *lo;
for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
*hi = *lo;
*hi = c;
}
}
}
}
}
} // namespace
extern "C" {
int memcmp(const void *lhs, const void *rhs, size_t count) {
@@ -60,16 +381,419 @@ int memcmp(const void *lhs, const void *rhs, size_t count) {
return 0;
}
void memset(void *dst, int C, size_t count) {
auto *dstc = reinterpret_cast<char *>(dst);
for (size_t I = 0; I < count; ++I)
dstc[I] = C;
}
/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
return impl::omp_vprintf(Format, Arguments, Size);
}
// -----------------------------------------------------------------------------
#ifndef ULONG_MAX
#define ULONG_MAX ((unsigned long)(~0L)) /* 0xFFFFFFFF */
#endif
#ifndef LONG_MAX
#define LONG_MAX ((long)(ULONG_MAX >> 1)) /* 0x7FFFFFFF */
#endif
#ifndef LONG_MIN
#define LONG_MIN ((long)(~LONG_MAX)) /* 0x80000000 */
#endif
#ifndef ISDIGIT
#define ISDIGIT(C) ((C) >= '0' && (C) <= '9')
#endif
#ifndef ISSPACE
#define ISSPACE(C)                                                             \
  ((C) == ' ' || (C) == '\t' || (C) == '\n' || (C) == '\v' || (C) == '\f' ||   \
   (C) == '\r')
#endif
#ifndef ISALPHA
#define ISALPHA(C) (((C) >= 'a' && (C) <= 'z') || ((C) >= 'A' && (C) <= 'Z'))
#endif
#ifndef ISUPPER
#define ISUPPER(C) ((C) >= 'A' && (C) <= 'Z')
#endif
long strtol(const char *nptr, char **endptr, int base) {
const char *s = nptr;
unsigned long acc;
int c;
unsigned long cutoff;
int neg = 0, any, cutlim;
/*
* Skip white space and pick up leading +/- sign if any.
* If base is 0, allow 0x for hex and 0 for octal, else
* assume decimal; if base is already 16, allow 0x.
*/
do {
c = *s++;
} while (ISSPACE(c));
if (c == '-') {
neg = 1;
c = *s++;
} else if (c == '+')
c = *s++;
if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X')) {
c = s[1];
s += 2;
base = 16;
}
if (base == 0)
base = c == '0' ? 8 : 10;
/*
* Compute the cutoff value between legal numbers and illegal
* numbers. That is the largest legal value, divided by the
* base. An input number that is greater than this value, if
* followed by a legal input character, is too big. One that
* is equal to this value may be valid or not; the limit
* between valid and invalid numbers is then based on the last
* digit. For instance, if the range for longs is
* [-2147483648..2147483647] and the input base is 10,
* cutoff will be set to 214748364 and cutlim to either
* 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
* a value > 214748364, or equal but the next digit is > 7 (or 8),
* the number is too big, and we will return a range error.
*
* Set any if any `digits' consumed; make it negative to indicate
* overflow.
*/
cutoff = neg ? -(unsigned long)LONG_MIN : LONG_MAX;
cutlim = cutoff % (unsigned long)base;
cutoff /= (unsigned long)base;
for (acc = 0, any = 0;; c = *s++) {
if (ISDIGIT(c))
c -= '0';
else if (ISALPHA(c))
c -= ISUPPER(c) ? 'A' - 10 : 'a' - 10;
else
break;
if (c >= base)
break;
if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
any = -1;
else {
any = 1;
acc *= base;
acc += c;
}
}
if (any < 0) {
acc = neg ? LONG_MIN : LONG_MAX;
errno = ERANGE;
} else if (neg)
acc = -acc;
if (endptr != 0)
*endptr = const_cast<char *>(any ? s - 1 : nptr);
return (acc);
}
int strcmp(const char *lhs, const char *rhs) {
  while (*lhs != '\0' && *rhs != '\0') {
    if (*lhs != *rhs)
      break;
    ++lhs;
    ++rhs;
  }
  return (unsigned char)*lhs - (unsigned char)*rhs;
}
void *calloc(size_t num, size_t size) {
  size_t bytes = num * size;
  char *p = (char *)malloc(bytes);
  if (!p)
    return p;
  char *q = p;
  // Zero word-by-word while a full int fits, then finish byte-by-byte.
  while ((size_t)(q - p) + sizeof(int) <= bytes) {
    *(int *)q = 0;
    q += sizeof(int);
  }
  while ((size_t)(q - p) < bytes) {
    *q = 0;
    q++;
  }
  return p;
}
// PTX instruction; an AMDGCN build would need a target variant (e.g.
// __builtin_amdgcn_endpgm()).
void exit(int exit_code) { asm volatile("exit;"); }
size_t strlen(const char *str) {
  size_t r = 0;
  while (*str != '\0') {
    ++r;
    ++str;
  }
  return r;
}
char *strcpy(char *dest, const char *src) {
char *pd = dest;
const char *ps = src;
while (*ps != '\0')
*(pd++) = *(ps++);
*pd = '\0';
return dest;
}
int *__errno_location() { return &ErrNo; }
char *strcat(char *dest, const char *src) {
char *pd = dest;
const char *ps = src;
while (*pd != '\0')
++pd;
while (*ps != '\0')
*(pd++) = *(ps++);
*pd = '\0';
return dest;
}
void perror(const char *s) { printf("%s: errno %d\n", s, *__errno_location()); }
int strncmp(const char *lhs, const char *rhs, size_t count) {
  size_t c = 0;
  while (c < count && *lhs != '\0' && *rhs != '\0') {
    if (*lhs != *rhs)
      break;
    ++lhs;
    ++rhs;
    ++c;
  }
  if (c == count)
    return 0;
  return (unsigned char)*lhs - (unsigned char)*rhs;
}
char *strncpy(char *dest, const char *src, size_t count) {
  char *pd = dest;
  const char *ps = src;
  size_t c = 0;
  while (*ps != '\0' && c < count) {
    *(pd++) = *(ps++);
    ++c;
  }
  // Pad with NULs up to count, as the standard requires.
  while (c < count) {
    *(pd++) = '\0';
    ++c;
  }
  return dest;
}
char *strchr(const char *s, int c) {
do {
if (*s == c)
return const_cast<char *>(s);
} while (*s++);
return nullptr;
}
char *strtok(char *str, const char *delim) {
  static char *s = nullptr; // Shared parser state: not thread-safe on device.
char *tok;
if (str == nullptr) {
if (s == nullptr)
return nullptr;
} else
s = str;
  for (size_t i = 0; (*s != '\0'); s++) {
for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
;
if (delim[i] == '\0')
break;
}
if (*s == '\0')
return s = nullptr;
tok = s++;
  for (size_t i = 0; (*s != '\0'); s++) {
for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
;
if (delim[i] != '\0')
break;
}
if (*s != '\0') {
*s = '\0';
s++;
}
return tok;
}
void srand(unsigned seed) { RandomNext = seed; }
int rand() {
RandomNext = RandomNext * 1103515245 + 12345;
return static_cast<unsigned int>((RandomNext / 65536)) % 2147483647;
}
int abs(int n) { return n > 0 ? n : -n; }
void *memcpy(void *dest, const void *src, size_t count) {
__builtin_memcpy(dest, src, count);
return dest;
}
unsigned long strtoul(const char *str, char **str_end, int base) {
  // Note: only base-10 digits are handled; the base argument is ignored.
  unsigned long res = 0;
  while (*str != '\0') {
    if (*str == ' ') {
      ++str;
      continue;
    }
    if (*str >= '0' && *str <= '9') {
      res = res * 10 + *str - '0';
      ++str;
      continue;
    }
    break;
  }
  if (str_end)
    *str_end = const_cast<char *>(str);
  return res;
}
double atof(const char *s) { return strtod(s, nullptr); }
int atoi(const char *str) { return (int)strtol(str, nullptr, 10); }
double strtod(const char *str, char **ptr) {
  char *p = const_cast<char *>(str);
  double sign = 1.0;
  while (ISSPACE(*p))
    ++p;
  if (*p == '+' || *p == '-') {
    if (*p == '-')
      sign = -1.0;
    ++p;
  }
  /* INF or INFINITY. */
  if ((p[0] == 'i' || p[0] == 'I') && (p[1] == 'n' || p[1] == 'N') &&
      (p[2] == 'f' || p[2] == 'F')) {
    if ((p[3] == 'i' || p[3] == 'I') && (p[4] == 'n' || p[4] == 'N') &&
        (p[5] == 'i' || p[5] == 'I') && (p[6] == 't' || p[6] == 'T') &&
        (p[7] == 'y' || p[7] == 'Y'))
      p += 8;
    else
      p += 3;
    if (ptr)
      *ptr = p;
    return sign * __builtin_inf();
  }
  /* NAN or NAN(foo). */
  if ((p[0] == 'n' || p[0] == 'N') && (p[1] == 'a' || p[1] == 'A') &&
      (p[2] == 'n' || p[2] == 'N')) {
    p += 3;
    if (*p == '(') {
      ++p;
      while (*p != '\0' && *p != ')')
        ++p;
      if (*p == ')')
        ++p;
    }
    if (ptr)
      *ptr = p;
    return __builtin_nan("");
  }
  /* Digits, with 0 or 1 periods in it, then an optional exponent. */
  if (ISDIGIT(*p) || *p == '.') {
    double value = 0.0;
    while (ISDIGIT(*p)) {
      value = value * 10.0 + (*p - '0');
      ++p;
    }
    if (*p == '.') {
      double scale = 0.1;
      ++p;
      while (ISDIGIT(*p)) {
        value += scale * (*p - '0');
        scale *= 0.1;
        ++p;
      }
    }
    /* Exponent. */
    if (*p == 'e' || *p == 'E') {
      int i = 1;
      int esign = 1;
      if (p[i] == '+' || p[i] == '-') {
        if (p[i] == '-')
          esign = -1;
        ++i;
      }
      if (ISDIGIT(p[i])) {
        int exp10 = 0;
        while (ISDIGIT(p[i])) {
          exp10 = exp10 * 10 + (p[i] - '0');
          ++i;
        }
        while (exp10--)
          value = esign < 0 ? value / 10.0 : value * 10.0;
        p += i;
      }
    }
    if (ptr)
      *ptr = p;
    return sign * value;
  }
  /* Didn't find any digits. Doesn't look like a number. */
  if (ptr)
    *ptr = const_cast<char *>(str);
  return 0.0;
}
int tolower(int ch) {
if (ch >= 'A' && ch <= 'Z')
return ch + 32;
return ch;
}
char *strstr(const char *s1, const char *s2) {
  const size_t len = strlen(s2);
  while (*s1) {
    // strncmp stops at the terminator, so this never reads past s1's end.
    if (!strncmp(s1, s2, len))
      return const_cast<char *>(s1);
    ++s1;
  }
  return nullptr;
}
char *__xpg_basename(const char *path) {
  // Return the component after the last '/'; trailing-slash handling is
  // deliberately kept simple.
  const char *base = path;
  for (const char *p = path; *p != '\0'; ++p)
    if (*p == '/')
      base = p + 1;
  return const_cast<char *>(base);
}
const unsigned short **__ctype_b_loc(void) { return &BLocTablePTable; }
const int32_t **__ctype_tolower_loc(void) { return &ToLowerPTable; }
void qsort(void *b, size_t n, size_t s, __compar_fn_t cmp) {
return _quicksort(b, n, s, (__compar_d_fn_t)cmp, nullptr);
}
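// For illustration, a sketch of using qsort on the device (cmpInt and V are
// hypothetical):
//   int cmpInt(const void *A, const void *B) {
//     return *(const int *)A - *(const int *)B;
//   }
//   int V[4] = {3, 1, 4, 1};
//   qsort(V, 4, sizeof(int), cmpInt);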
int strcasecmp(const char *s1, const char *s2) {
const unsigned char *p1 = (const unsigned char *)s1;
const unsigned char *p2 = (const unsigned char *)s2;
int result;
if (p1 == p2)
return 0;
while ((result = tolower(*p1) - tolower(*p2++)) == 0)
if (*p1++ == '\0')
break;
return result;
}
}
#pragma omp end declare target

View File

@@ -279,9 +279,15 @@ uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
  // In generic mode the last warp is reserved for the main thread, so remove
  // it; if the last warp is only partially populated, reserve just that part.
if (IsSPMD)
return BlockSize;
uint32_t WarpSize = impl::getWarpSize();
bool IsFullLastWarp = !(BlockSize % WarpSize);
if (OMP_LIKELY(IsFullLastWarp))
return BlockSize - WarpSize;
return BlockSize - BlockSize % WarpSize;
}
uint32_t mapping::getMaxTeamThreads() {
return mapping::getMaxTeamThreads(mapping::isSPMDMode());
}
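For concreteness: with a 96-thread block and 64-lane warps the last warp is partial (96 % 64 == 32), so generic mode reserves only those 32 lanes and returns 64; with a 128-thread block the full last warp is reserved and the result is 128 - 64 = 64 as well.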

View File

@@ -0,0 +1,885 @@
#pragma omp begin declare target device_type(nohost)
#include "Utils.h"
#include "Synchronization.h"
#include "Interface.h"
#include "Memory.h"
#include "Debug.h"
#include "LibC.h"
#include "Mpi.h"
//using namespace ompx;
// Placement new, defined inline to avoid pulling in <new> on the device.
inline void *operator new(__SIZE_TYPE__ size, void *ptr) { return ptr; }
namespace mpiutils {
template <typename T>
struct LinkListNode {
template <typename R> friend struct LinkList;
public:
T *getPrev(){ return Prev; };
T *getNext(){ return Next; };
private:
T *Prev = nullptr;
T *Next = nullptr;
};
template <typename T>
struct LinkList {
public:
T *getHead(){return Head;};
T *getTail(){return Tail;};
void push(T *Node){ insertImpl(Tail, Node); }
T *remove(T *Node){ return removeImpl(Node); }
void lock(){ listlock.lock(); }
void unlock(){ listlock.unlock(); }
private:
T *Head = nullptr;
T *Tail = nullptr;
ompx::mutex::TicketLock listlock;
void insertImpl(T *Prev, T *Node) {
lock();
// Set current Node link
Node->Prev = Prev;
if (Node->Prev)
Node->Next = Node->Prev->Next;
else
Node->Next = Head;
// Set adjacent nodes links
if (Node->Prev)
Node->Prev->Next = Node;
if (Node->Next)
Node->Next->Prev = Node;
// Set head & tail
if (!Node->Prev)
Head = Node;
if (!Node->Next)
Tail = Node;
unlock();
}
T *removeImpl(T *Node) {
lock();
// Set adjacent nodes links
if (Node->Prev)
Node->Prev->Next = Node->Next;
if (Node->Next)
Node->Next->Prev = Node->Prev;
// Set Head and tail
if (!Node->Prev)
Head = Node->Next;
if (!Node->Next)
Tail = Node->Prev;
unlock();
// Set current nodes links
Node->Prev = nullptr;
Node->Next = nullptr;
return Node;
}
};
}
extern "C" {
const int MPI_MAX_BUF_SEND = 8 * 512 * 1024;
enum MPI_Request_type_e {
MPI_SEND,
MPI_RECV
};
struct MPI_Message_info_s {
int count; // nb. of items
MPI_Datatype datatype; // type of items
int rank; // sender or recv rank
int tag; // tag
struct MPI_Comm_s *comm; // communicator
};
struct MPI_Message_s : MPI_Message_info_s,
mpiutils::LinkListNode<struct MPI_Message_s> {
const void *send_data;
void *buf_data;
uint32_t status;
// 0 send done (may be waiting for recv)
// 1 recv done (done, sender can clean)
bool buffered;
};
struct MPI_Request_s : MPI_Message_info_s {
enum MPI_Request_type_e req_type;
MPI_Status mpi_status;
bool persistent;
bool enable;
// true: running (not finish)
// false: finish (send / recv done)
};
struct MPI_Send_Request_s : MPI_Request_s {
const void *send_data;
bool buffered;
struct MPI_Message_s *msg;
};
struct MPI_Recv_Request_s : MPI_Request_s {
void *recv_data;
};
struct MPI_Comm_s {
int id; // id = 0 -> MPI_COMM_WORLD (the only supported)
uint32_t size;
uint32_t barrier_counter;
uint32_t barrier_generation_counter;
int *ranks; // map teams to ranks
struct mpiutils::LinkList<struct MPI_Message_s> *messagebox;
};
}
namespace impl {
// Forward declaration for the variants below.
void yield(void);
// amdgcn
#pragma omp begin declare variant match(device = {arch(amdgcn)})
void yield(void){
// split kernel here (if it is ever implemented)
//__ompx_split();
__builtin_amdgcn_s_sleep(1);
}
#pragma omp end declare variant
// nvptx
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})
void yield(void){
// split kernel here
//__ompx_split();
}
#pragma omp end declare variant
void barrier(uint32_t *counter, uint32_t *gen_counter, uint32_t size){
  // Generation barrier: the last arrival bumps the generation counter, which
  // releases everyone spinning on it.
  uint32_t previous_gen = ompx::atomic::load(gen_counter, ompx::atomic::seq_cst);
  uint32_t current = ompx::atomic::inc(counter, size - 1,
      ompx::atomic::seq_cst, ompx::atomic::MemScopeTy::device);
if (current + 1 == size)
ompx::atomic::add(gen_counter, 1, ompx::atomic::seq_cst);
while(ompx::atomic::load(gen_counter, ompx::atomic::seq_cst) <= previous_gen){
yield();
}
}
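// For illustration (a sketch, assuming one rank per team): MPI_Barrier can
// sit directly on top of this generation barrier, e.g.
//   int MPI_Barrier(MPI_Comm comm) {
//     impl::barrier(&comm->barrier_counter,
//                   &comm->barrier_generation_counter, comm->size);
//     return MPI_SUCCESS;
//   }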
size_t mpi_type_size(MPI_Datatype datatype){
switch(datatype){
case MPI_CHAR : return sizeof(char) ;
case MPI_SHORT : return sizeof(signed short int) ;
case MPI_INT : return sizeof(signed int) ;
case MPI_LONG : return sizeof(signed long int) ;
case MPI_LONG_LONG_INT : return sizeof(signed long long int) ;
case MPI_LONG_LONG : return sizeof(signed long long int) ;
case MPI_SIGNED_CHAR : return sizeof(signed char) ;
case MPI_UNSIGNED_CHAR : return sizeof(unsigned char) ;
case MPI_UNSIGNED_SHORT : return sizeof(unsigned short int) ;
case MPI_UNSIGNED : return sizeof(unsigned int) ;
case MPI_UNSIGNED_LONG : return sizeof(unsigned long int) ;
case MPI_UNSIGNED_LONG_LONG : return sizeof(unsigned long long int) ;
case MPI_FLOAT : return sizeof(float) ;
case MPI_DOUBLE : return sizeof(double) ;
case MPI_LONG_DOUBLE : return sizeof(long double) ;
case MPI_WCHAR : return sizeof(wchar_t) ;
case MPI_C_BOOL : return sizeof(bool) ; // should be `_Bool` but it is not defined
case MPI_INT8_T : return sizeof(int8_t) ;
case MPI_INT16_T : return sizeof(int16_t) ;
case MPI_INT32_T : return sizeof(int32_t) ;
case MPI_INT64_T : return sizeof(int64_t) ;
case MPI_UINT8_T : return sizeof(uint8_t) ;
case MPI_UINT16_T : return sizeof(uint16_t) ;
case MPI_UINT32_T : return sizeof(uint32_t) ;
case MPI_UINT64_T : return sizeof(uint64_t) ;
case MPI_C_COMPLEX : return sizeof(float _Complex) ;
case MPI_C_FLOAT_COMPLEX : return sizeof(float _Complex) ;
case MPI_C_DOUBLE_COMPLEX : return sizeof(double _Complex) ;
case MPI_C_LONG_DOUBLE_COMPLEX : return sizeof(long double _Complex) ;
    case MPI_BYTE : return 1 ; // one byte per element
    //case MPI_PACKED : return // unsupported ;
//case MPI_AINT : return sizeof(MPI_Aint) ;
//case MPI_OFFSET : return sizeof(MPI_Offset) ;
//case MPI_COUNT : return sizeof(MPI_Count) ;
case MPI_CXX_BOOL : return sizeof(bool) ;
//case MPI_CXX_FLOAT_COMPLEX : return sizeof(std::complex<float>) ;
//case MPI_CXX_DOUBLE_COMPLEX : return sizeof(std::complex<double>) ;
//case MPI_CXX_LONG_DOUBLE_COMPLEX: return sizeof(std::complex<long double>);
default:
__builtin_unreachable();
}
__builtin_unreachable();
}
int mpi_rank(MPI_Comm comm) {
return comm->ranks[omp_get_team_num()];
}
void mpi_req_init(int count, MPI_Datatype datatype,
int rank, int tag, MPI_Comm comm,
enum MPI_Request_type_e req_type, struct MPI_Request_s *req)
{
req->req_type = req_type;
req->count = count;
req->datatype = datatype;
req->rank = rank;
req->tag = tag;
req->comm = comm;
req->enable = false;
}
struct MPI_Send_Request_s *mpi_send_init(
const void *buf, int count, MPI_Datatype datatype,
int recv_rank, int tag, MPI_Comm comm,
bool buffered, bool persistent)
{
struct MPI_Send_Request_s *req =
reinterpret_cast<struct MPI_Send_Request_s *>(
malloc(sizeof(struct MPI_Send_Request_s)));
req->send_data = buf;
req->buffered = buffered;
req->persistent = persistent;
mpi_req_init(count, datatype, recv_rank, tag, comm, MPI_SEND, req);
return req;
}
struct MPI_Recv_Request_s *mpi_recv_init(
void *buf, int count, MPI_Datatype datatype,
int send_rank, int tag, MPI_Comm comm)
{
struct MPI_Recv_Request_s *req =
reinterpret_cast<struct MPI_Recv_Request_s *>(
malloc(sizeof(struct MPI_Recv_Request_s)));
req->recv_data = buf;
req->mpi_status.MPI_SOURCE = MPI_ANY_SOURCE;
req->mpi_status.MPI_TAG = MPI_ANY_TAG;
req->mpi_status.MPI_ERROR = MPI_SUCCESS;
mpi_req_init(count, datatype, send_rank, tag, comm, MPI_RECV, req);
return req;
}
void mpi_req_free(struct MPI_Request_s **req){
free(*req);
*req = MPI_REQUEST_NULL;
}
bool mpi_msg_test(struct MPI_Message_s *msg)
{
return ompx::atomic::load(&msg->status, ompx::atomic::seq_cst) == 1;
}
void mpi_msg_wait(struct MPI_Message_s *msg)
{
while (! mpi_msg_test(msg))
yield();
}
void mpi_msg_free(struct MPI_Message_s *msg)
{
free(msg);
}
struct MPI_Message_s *mpi_send(
const void *buf, int count, MPI_Datatype datatype, int recv_rank,
int tag, MPI_Comm comm, bool buffered, bool blocking)
{
int data_size = mpi_type_size(datatype) * count;
int send_rank = mpi_rank(comm);
struct MPI_Message_s *msg = reinterpret_cast<struct MPI_Message_s *>(
malloc(sizeof(struct MPI_Message_s)));
msg->datatype = datatype;
msg->count = count;
msg->rank = send_rank;
msg->tag = tag;
msg->buffered = buffered;
ompx::atomic::store(&msg->status, 0, ompx::atomic::seq_cst);
if (buffered){
msg->buf_data = malloc(data_size);
memcpy(msg->buf_data, buf, data_size);
} else {
msg->send_data = buf;
}
comm->messagebox[recv_rank].push(msg);
if (blocking && !buffered) {// ssend
mpi_msg_wait(msg);
mpi_msg_free(msg);
}
return msg;
}
void mpi_send_start(struct MPI_Send_Request_s *req)
{
req->enable = true;
struct MPI_Message_s *msg = mpi_send(req->send_data, req->count, req->datatype,
req->rank, req->tag, req->comm, req->buffered, false);
req->msg = msg;
}
bool mpi_send_test(struct MPI_Send_Request_s *req)
{
if (req->buffered)
return true;
return mpi_msg_test(req->msg);
}
void mpi_send_wait(struct MPI_Send_Request_s *req)
{
if (req->buffered)
return;
mpi_msg_wait(req->msg);
}
struct MPI_Message_s *__mpi_recv_test(int count, MPI_Datatype datatype,
int source, int tag, MPI_Comm comm)
{
struct mpiutils::LinkList<struct MPI_Message_s> *messages =
&comm->messagebox[mpi_rank(comm)];
messages->lock();
for (struct MPI_Message_s *msg = messages->getHead();
msg != nullptr; msg = msg->getNext()){
if ((source == MPI_ANY_SOURCE || source == msg->rank)
&& (tag == MPI_ANY_TAG || tag == msg->tag)){
assert(count == msg->count && datatype == msg->datatype
&& "[MPI_recv]: count or datatype invalid");
messages->unlock();
// TODO: fix the race condition between unlock() and remove()
messages->remove(msg);
return msg;
}
}
messages->unlock();
return nullptr;
}
struct MPI_Message_s *__mpi_recv_wait(int count, MPI_Datatype datatype,
int source, int tag, MPI_Comm comm)
{
struct MPI_Message_s *msg = nullptr;
while ((msg = __mpi_recv_test(count, datatype, source, tag, comm)) == nullptr){
// we have not received any messages yet
yield();
}
return msg;
}
void __mpi_recv_do(struct MPI_Message_s *msg, void *buf, MPI_Status *status)
{
int data_size = mpi_type_size(msg->datatype) * msg->count;
if (msg->buffered) {
memcpy(buf, msg->buf_data, data_size);
free(msg->buf_data);
} else {
memcpy(buf, msg->send_data, data_size);
}
if (status != &MPI_STATUS_IGNORE && status != &MPI_STATUSES_IGNORE) {
status->MPI_SOURCE = msg->rank;
status->MPI_TAG = msg->tag;
}
if (msg->buffered)
mpi_msg_free(msg);
else
ompx::atomic::store(&msg->status, 1, ompx::atomic::seq_cst);
}
bool mpi_recv_test(void *buf, int count, MPI_Datatype datatype,
int source, int tag, MPI_Comm comm, MPI_Status *status)
{
struct MPI_Message_s *msg = __mpi_recv_test(count, datatype, source, tag, comm);
if (msg == nullptr)
return false;
__mpi_recv_do(msg, buf, status);
return true;
}
void mpi_recv_wait(void *buf, int count, MPI_Datatype datatype,
int source, int tag, MPI_Comm comm, MPI_Status *status)
{
struct MPI_Message_s *msg = __mpi_recv_wait(count, datatype, source, tag, comm);
__mpi_recv_do(msg, buf, status);
}
void mpi_recv(void *buf, int count, MPI_Datatype datatype,
int source, int tag, MPI_Comm comm, MPI_Status *status)
{
mpi_recv_wait(buf, count, datatype, source, tag, comm, status);
}
void mpi_recv_start(struct MPI_Recv_Request_s *req){
// attempt an early receive to reduce the deadlock window; completion is
// checked again through test/wait
(void)mpi_recv_test(req->recv_data, req->count, req->datatype,
req->rank, req->tag, req->comm, &req->mpi_status);
}
bool mpi_recv_test(struct MPI_Recv_Request_s *req){
bool res = mpi_recv_test(req->recv_data, req->count, req->datatype,
req->rank, req->tag, req->comm, &req->mpi_status);
return res;
}
void mpi_recv_wait(struct MPI_Recv_Request_s *req){
mpi_recv_wait(req->recv_data, req->count, req->datatype,
req->rank, req->tag, req->comm, &req->mpi_status);
}
void mpi_req_start(struct MPI_Request_s **reqp){
if (*reqp == MPI_REQUEST_NULL)
return;
struct MPI_Request_s *req = *reqp;
if (!req->enable)
return;
switch (req->req_type) {
case (MPI_SEND):
mpi_send_start(static_cast<struct MPI_Send_Request_s *>(req));
break;
case (MPI_RECV):
mpi_recv_start(static_cast<struct MPI_Recv_Request_s *>(req));
break;
default:
__builtin_unreachable();
}
}
bool mpi_req_test(struct MPI_Request_s **reqp){
if (*reqp == MPI_REQUEST_NULL)
return false;
struct MPI_Request_s *req = *reqp;
if (!req->enable)
return false;
bool res = false;
switch (req->req_type) {
case (MPI_SEND):
res = mpi_send_test(static_cast<struct MPI_Send_Request_s *>(req));
break;
case (MPI_RECV):
res = mpi_recv_test(static_cast<struct MPI_Recv_Request_s *>(req));
break;
default:
__builtin_unreachable();
}
return res;
}
void mpi_req_wait(struct MPI_Request_s **reqp){
if (*reqp == MPI_REQUEST_NULL)
return;
struct MPI_Request_s *req = *reqp;
if (!req->enable)
return;
switch (req->req_type) {
case (MPI_SEND):
mpi_send_wait(static_cast<struct MPI_Send_Request_s *>(req));
break;
case (MPI_RECV):
mpi_recv_wait(static_cast<struct MPI_Recv_Request_s *>(req));
break;
default:
__builtin_unreachable();
}
}
void mpi_req_deactivte(struct MPI_Request_s **reqp, MPI_Status *status) {
struct MPI_Request_s *req = *reqp;
if (status != &MPI_STATUS_IGNORE) {
status->MPI_SOURCE = req->mpi_status.MPI_SOURCE;
status->MPI_TAG = req->mpi_status.MPI_TAG;
status->MPI_ERROR = req->mpi_status.MPI_ERROR;
}
if (req->persistent) {
req->enable = false;
} else {
mpi_req_free(reqp);
}
}
} // namespace impl
extern "C" {
// global used for atomics
uint32_t global_counter = 0;
uint32_t global_generation_counter = 0;
int MPI_Init(int *argc, char **argv){
(void) argc;
(void) argv;
int size = omp_get_num_teams();
int rank = omp_get_team_num();
if (omp_get_team_num() == 0){
MPI_COMM_WORLD = reinterpret_cast<MPI_Comm>(
malloc(sizeof(struct MPI_Comm_s)));
MPI_COMM_WORLD->id = 0;
MPI_COMM_WORLD->size = size;
MPI_COMM_WORLD->barrier_counter = 0;
MPI_COMM_WORLD->barrier_generation_counter = 0;
MPI_COMM_WORLD->ranks = reinterpret_cast<int *>(
malloc(MPI_COMM_WORLD->size * sizeof(int)));
MPI_COMM_WORLD->messagebox =
reinterpret_cast<struct mpiutils::LinkList<struct MPI_Message_s> *>(
malloc(MPI_COMM_WORLD->size
* sizeof(struct mpiutils::LinkList<struct MPI_Message_s>)));
}
impl::barrier(&global_counter, &global_generation_counter,
omp_get_num_teams());
MPI_COMM_WORLD->ranks[omp_get_team_num()] = rank;
new (&MPI_COMM_WORLD->messagebox[rank])
mpiutils::LinkList<struct MPI_Message_s>();
impl::barrier(&global_counter, &global_generation_counter,
omp_get_num_teams());
return 0;
}
int MPI_Init_thread(int *argc, char ***argv, int required, int *provided)
{
if (required == MPI_THREAD_MULTIPLE)
return 1; // TODO: return a proper MPI error code
*provided = MPI_THREAD_FUNNELED;
return MPI_Init(argc, *argv);
}
int MPI_Finalize(void){
impl::barrier(&global_counter, &global_generation_counter,
omp_get_num_teams());
if(omp_get_team_num() == 0){
free(MPI_COMM_WORLD->ranks);
free(MPI_COMM_WORLD->messagebox);
free(MPI_COMM_WORLD);
}
return 0;
}
int MPI_Abort(MPI_Comm comm, int errorcode)
{
printf("MPI_Abort(%d)\n", errorcode);
// TODO: abort the GPU kernel here
return 0;
}
int MPI_Barrier(MPI_Comm comm) {
impl::barrier(&comm->barrier_counter, &comm->barrier_generation_counter,
comm->size);
return 0;
}
int MPI_Comm_rank(MPI_Comm comm, int *rank){
*rank = impl::mpi_rank(comm);
return 0;
}
int MPI_Comm_size(MPI_Comm comm, int *size){
*size = comm->size;
return 0;
}
int MPI_Type_size(MPI_Datatype datatype, int *size){
*size = impl::mpi_type_size(datatype);
return 0;
}
// Blocking Communications
int MPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)
{
impl::mpi_send(buf, count, datatype, dest, tag, comm,
(count * impl::mpi_type_size(datatype)) < MPI_MAX_BUF_SEND, true);
return 0;
}
int MPI_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)
{
impl::mpi_send(buf, count, datatype, dest, tag, comm, false, true);
return 0;
}
int MPI_Bsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)
{
impl::mpi_send(buf, count, datatype, dest, tag, comm, true, true);
return 0;
}
int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status)
{
impl::mpi_recv(buf, count, datatype, source, tag, comm, status);
return 0;
}
// Non-Blocking Communications
int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Send_Request_s *req =
impl::mpi_send_init(buf, count, datatype, dest, tag, comm,
(count * impl::mpi_type_size(datatype)) < MPI_MAX_BUF_SEND, false);
impl::mpi_send_start(req);
*request = req;
return 0;
}
int MPI_Issend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Send_Request_s *req =
impl::mpi_send_init(buf, count, datatype, dest, tag, comm, false, false);
impl::mpi_send_start(req);
*request = req;
return 0;
}
int MPI_Ibsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Send_Request_s *req =
impl::mpi_send_init(buf, count, datatype, dest, tag, comm, true, false);
impl::mpi_send_start(req);
*request = req;
return 0;
}
int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Recv_Request_s *req =
impl::mpi_recv_init(buf, count, datatype, source, tag, comm);
impl::mpi_recv_start(req); // try to receive early to reduce deadlock probability
*request = req;
return 0;
}
// Test & Wait
int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status)
{
*flag = impl::mpi_req_test(request);
if (*flag)
impl::mpi_req_deactivte(request, status);
return 0;
}
int MPI_Wait(MPI_Request *request, MPI_Status *status)
{
impl::mpi_req_wait(request);
impl::mpi_req_deactivte(request, status);
return 0;
}
int MPI_Testall(int count, MPI_Request array_of_requests[], int *flag, MPI_Status array_of_statuses[])
{
int finished = 0;
for (int i = 0; i < count; ++i)
finished += impl::mpi_req_test(&array_of_requests[i]);
*flag = (finished == count);
if (*flag)
for (int i = 0; i < count; ++i)
impl::mpi_req_deactivte(&array_of_requests[i], &array_of_statuses[i]);
return 0;
}
int MPI_Waitall(int count, MPI_Request array_of_requests[], MPI_Status array_of_statuses[])
{
// we don't want to wait for the requests one after the other
while (true) {
int finished = 0;
for (int i = 0; i < count; ++i)
finished += impl::mpi_req_test(&array_of_requests[i]);
if (finished >= count)
break;
impl::yield();
}
for (int i = 0; i < count; ++i)
impl::mpi_req_deactivte(&array_of_requests[i], &array_of_statuses[i]);
return 0;
}
int MPI_Testany(int count, MPI_Request array_of_requests[], int *index, int *flag, MPI_Status *status)
{
for (int i = 0; i < count; ++i) {
if (impl::mpi_req_test(&array_of_requests[i])) {
*flag = true;
*index = i;
impl::mpi_req_deactivte(&array_of_requests[i], status);
return 0;
}
}
*flag = false;
return 0;
}
int MPI_Waitany(int count, MPI_Request array_of_requests[], int *index, MPI_Status *status)
{
while (true) {
for (int i = 0; i < count; ++i) {
if (impl::mpi_req_test(&array_of_requests[i])) {
*index = i;
impl::mpi_req_deactivte(&array_of_requests[i], status);
return 0;
}
}
impl::yield();
}
return 0;
}
int MPI_Testsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[])
{
*outcount = 0;
for (int i = 0; i < incount; ++i) {
if(impl::mpi_req_test(&array_of_requests[i])) {
array_of_indices[*outcount] = i;
impl::mpi_req_deactivte(&array_of_requests[i],
&array_of_statuses[*outcount]);
(*outcount)++;
}
}
return 0;
}
int MPI_Waitsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[])
{
while (true) {
for (int i = 0; i < incount; ++i)
if (impl::mpi_req_test(&array_of_requests[i]))
return MPI_Testsome(incount, array_of_requests,
outcount, array_of_indices, array_of_statuses);
impl::yield();
}
return 0;
}
// Persistent Communications setup
int MPI_Send_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Send_Request_s *req =
impl::mpi_send_init(buf, count, datatype, dest, tag, comm,
(count * impl::mpi_type_size(datatype)) < MPI_MAX_BUF_SEND, true);
*request = req;
return 0;
}
int MPI_Ssend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Send_Request_s *req =
impl::mpi_send_init(buf, count, datatype, dest, tag, comm, false, true);
*request = req;
return 0;
}
int MPI_Bsend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Send_Request_s *req =
impl::mpi_send_init(buf, count, datatype, dest, tag, comm, true, true);
*request = req;
return 0;
}
int MPI_Recv_init(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Recv_Request_s *req =
impl::mpi_recv_init(buf, count, datatype, source, tag, comm);
*request = req;
return 0;
}
// Persistent Communications start and end
int MPI_Start(MPI_Request *request)
{
impl::mpi_req_start(request);
return 0;
}
int MPI_Startall(int count, MPI_Request array_of_requests[])
{
for (int i = 0; i < count; ++i)
MPI_Start(&array_of_requests[i]);
return 0;
}
int MPI_Request_free(MPI_Request *request)
{
impl::mpi_req_free(request);
return 0;
}
// Collective Operations
int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm)
{
// TODO: implement MPI_Reduce
return 0;
}
int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
// TODO: implement MPI_Allreduce
return 0;
}
// Timing
double MPI_Wtime(void)
{
return omp_get_wtime(); // delegate to the OpenMP runtime clock
}
} // extern "C"
#pragma omp end declare target
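Taken together, user code against this implementation reads like ordinary MPI. Below is a minimal sketch, assuming the repository's mpi.h declares the entry points above along with the usual basic datatypes such as MPI_INT, and that the program is built with the clang-mpi-gpu wrapper so that each OpenMP team becomes one rank; note that this MPI_Init takes char **, not the standard char ***.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[]) {
  MPI_Init(&argc, argv); // this implementation takes char **, not char ***
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  if (size < 2) { // the exchange below needs at least two ranks (teams)
    MPI_Finalize();
    return 0;
  }
  int token = -1;
  MPI_Status st;
  if (rank == 0) {
    token = 42;
    MPI_Send(&token, 1, MPI_INT, /*dest=*/1, /*tag=*/7, MPI_COMM_WORLD);
  } else if (rank == 1) {
    MPI_Recv(&token, 1, MPI_INT, /*source=*/0, /*tag=*/7, MPI_COMM_WORLD, &st);
    printf("rank 1 received %d from rank %d\n", token, st.MPI_SOURCE);
  }
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  return 0;
}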

View File

@@ -47,6 +47,9 @@ using namespace ompx;
namespace {
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
if (NumThreadsClause != -1)
return NumThreadsClause;
uint32_t NThreadsICV =
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
uint32_t NumThreads = mapping::getMaxTeamThreads();

View File

@@ -17,6 +17,7 @@
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
@@ -83,6 +84,7 @@ struct SharedMemorySmartStackTy {
void *push(uint64_t Bytes);
/// Deallocate the last allocation made by the encountering thread and pointed
/// to by \p Ptr from the stack. Each thread can call this function.
void pop(void *Ptr, uint32_t Bytes);
@@ -274,7 +276,8 @@ void state::enterDataEnvironment(IdentTy *Ident) {
uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
uint32_t Bytes =
sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
sizeof(ThreadStates[0]) *
(mapping::getNumberOfThreadsInBlock() + (mapping::isSPMDMode() ? 0 : 1));
void *ThreadStatesPtr =
memory::allocGlobal(Bytes, "Thread state array allocation");
memset(ThreadStatesPtr, 0, Bytes);
@@ -371,6 +374,15 @@ int omp_get_thread_num(void) {
return omp_get_ancestor_thread_num(omp_get_level());
}
int omp_get_bulk_thread_num() {
ASSERT(mapping::isSPMDMode(), "Not SPMD");
int BId = mapping::getBlockIdInKernel();
int BSize = mapping::getMaxTeamThreads();
int TId = mapping::getThreadIdInBlock();
int Id = BId * BSize + TId;
return returnValIfLevelIsActive(omp_get_level(), Id, 0);
}
int omp_get_team_size(int Level) {
return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
}
@@ -379,6 +391,11 @@ int omp_get_num_threads(void) {
return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
}
int omp_get_bulk_num_threads(void) {
ASSERT(mapping::isSPMDMode(), "Not SPMD");
return mapping::getNumberOfThreadsInKernel();
}
int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

View File

@@ -333,7 +333,14 @@ void setCriticalLock(omp_lock_t *Lock) {
uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
atomic::MemScopeTy MemScope) {
return __nvvm_atom_inc_gen_ui(Address, Val);
switch(MemScope) {
case atomic::MemScopeTy::all:
return __nvvm_atom_sys_inc_gen_ui(Address, Val);
case atomic::MemScopeTy::device:
return __nvvm_atom_inc_gen_ui(Address, Val);
case atomic::MemScopeTy::cgroup:
return __nvvm_atom_cta_inc_gen_ui(Address, Val);
}
}
void namedBarrierInit() {}
@@ -509,6 +516,19 @@ void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
void mutex::TicketLock::lock() {
uint64_t MyTicket = atomic::add(&NextTicket, 1, atomic::seq_cst);
while (atomic::load(&NowServing, atomic::aquire) != MyTicket)
;
fence::kernel(atomic::aquire);
}
void mutex::TicketLock::unlock() {
fence::kernel(atomic::release);
atomic::add(&NowServing, 1, atomic::seq_cst);
}
extern "C" {
void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}

View File

@@ -0,0 +1,308 @@
//===------- WarpAllocator.cpp - Warp memory allocator ------- C++ -*-========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
using namespace ompx;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
__omp_rtl_device_memory_pool_tracker;
// TODO: implement Device Debug Allocation Tracker
namespace {
constexpr const size_t Alignment = 16;
constexpr const size_t FirstThreadRatio = 40;
constexpr const size_t SplitThreshold = Alignment * 4;
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator;
class WarpAllocatorEntry {
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> friend struct WarpAllocator;
/// If Size is less than 0, the entry is allocated (in use).
int64_t Size = 0;
/// PrevSize is also supposed to be greater than or equal to 0. When it is 0,
/// it is the first entry of the buffer.
int64_t PrevSize = 0;
public:
bool isFirst() const { return !PrevSize; }
size_t getSize() const { return abs(Size); }
void setSize(size_t V) { Size = V; }
void setPrevSize(WarpAllocatorEntry *Prev) {
PrevSize = Prev ? Prev->getSize() : 0;
}
size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }
// Note: isUsed cannot be defined as !isUnused (or vice versa) because when
// Size is 0 the entry is uninitialized.
bool isUsed() const { return Size < 0; }
bool isUnused() const { return Size > 0; }
void setUsed() {
assert(isUnused() && "the entry is in use");
Size *= -1;
}
void setUnused() {
assert(isUsed() && "the entry is not in use");
Size *= -1;
}
char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }
WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }
static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }
static WarpAllocatorEntry *fromPtr(void *Ptr) {
return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
}
};
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
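// Layout sketch: each block is an implicit boundary-tag list of 16-byte
// headers, each followed by its user data:
//
//   [E0|user...][E1|user...][E2|user...] ... watermark ... Limit
//
// Size < 0 marks an entry in use; |Size| is the distance to the next header
// and PrevSize the distance back to the previous one, so the list can be
// walked in both directions. Freeing the most recent entry rolls the
// watermark back over any trailing unused entries (see deallocate below).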
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
void init() {
if (mapping::isSPMDMode() &&
(mapping::getThreadIdInBlock() || mapping::getBlockIdInKernel()))
return;
size_t HeapSize = __omp_rtl_device_memory_pool.Size;
FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
size_t OtherThreadHeapSize =
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
size_t TeamHeapSize = FirstThreadHeapSize / TEAM_SIZE;
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
FirstTeamSize = TeamHeapSize;
char *LastLimit = reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr);
for (int I = 0; I < WARP_SIZE; ++I) {
for (int J = 0; J < TEAM_SIZE; ++J) {
Entries[I][J] = nullptr;
Limits[I][J] = LastLimit + TeamHeapSize * (J + 1);
}
LastLimit += I ? OtherThreadHeapSize : FirstThreadHeapSize;
Limits[I][TEAM_SIZE - 1] = LastLimit;
TeamHeapSize = OtherThreadHeapSize / TEAM_SIZE;
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
}
}
void *allocate(size_t Size) {
int32_t TeamSlot = getTeamSlot();
int32_t TIdInWarp = mapping::getThreadIdInWarp();
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
// Error out early if the requested size is larger than the entire block.
if (Size > getBlockSize(TIdInWarp, TeamSlot))
return nullptr;
WarpAllocatorEntry *E = nullptr;
{
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
auto *LastEntry = Entries[TIdInWarp][TeamSlot];
auto *NewWatermark = (LastEntry ? LastEntry->getEndPtr()
: getBlockBegin(TIdInWarp, TeamSlot)) +
Size;
if (NewWatermark >= Limits[TIdInWarp][TeamSlot]) {
E = findMemorySlow(Size, TIdInWarp, TeamSlot);
} else {
E = LastEntry ? LastEntry->getNext()
: WarpAllocatorEntry::fromPtr(
getBlockBegin(TIdInWarp, TeamSlot));
E->setSize(Size);
E->setPrevSize(LastEntry);
Entries[TIdInWarp][TeamSlot] = E;
}
if (!E)
return nullptr;
E->setUsed();
}
return E->getUserPtr();
}
void deallocate(void *Ptr) {
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
auto TeamSlot = getTeamSlot();
auto TIdInWarp = mapping::getThreadIdInWarp();
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
E->setUnused();
// Is last entry?
if (E == Entries[TIdInWarp][TeamSlot]) {
do {
E = E->getPrev();
} while (!E->isFirst() && !E->isUsed());
Entries[TIdInWarp][TeamSlot] = E;
}
}
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
if (!utils::isInRange(P, reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr),
__omp_rtl_device_memory_pool.Size))
return {};
auto TeamSlot = getTeamSlot();
auto TIdInWarp = mapping::getThreadIdInWarp();
for (int I = TIdInWarp; I < TIdInWarp + WARP_SIZE; ++I) {
int TId = I % WARP_SIZE;
for (int J = TeamSlot; J < TeamSlot + TEAM_SIZE; ++J) {
int SId = J % TEAM_SIZE;
if (P < getBlockBegin(TId, SId) || P >= getBlockEnd(TId, SId))
continue;
mutex::LockGuard LG(Locks[I][SId]);
WarpAllocatorEntry *E = Entries[I][SId];
if (!E)
return {};
if (E->getEndPtr() <= P)
return {};
bool isFirst = false;
while (!isFirst) {
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
if (!E->isUsed())
return {};
return {E->getUserPtr(), E->getUserSize()};
}
isFirst = E->isFirst();
E = E->getPrev();
}
}
}
return {};
}
private:
char *getBlockBegin(int32_t TIdInWarp, int32_t TeamSlot) const {
if (TeamSlot)
return Limits[TIdInWarp][TeamSlot - 1];
if (TIdInWarp)
return Limits[TIdInWarp - 1][TEAM_SIZE - 1];
return reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr);
}
char *getBlockEnd(int32_t TIdInWarp, int32_t TeamSlot) const {
return Limits[TIdInWarp][TeamSlot];
}
size_t getBlockSize(int32_t TIdInWarp, int32_t TeamSlot) const {
return getBlockEnd(TIdInWarp, TeamSlot) -
getBlockBegin(TIdInWarp, TeamSlot);
}
static int32_t getTeamSlot() { return mapping::getBlockIdInKernel() % TEAM_SIZE; }
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp,
int32_t TeamSlot) {
char *Ptr = getBlockBegin(TIdInWarp, TeamSlot);
char *Limit = getBlockEnd(TIdInWarp, TeamSlot);
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
do {
if (!E->isUsed() && E->getSize() >= Size)
break;
E = E->getNext();
if (reinterpret_cast<char *>(E) + Size > Limit)
return nullptr;
} while (1);
size_t OldSize = E->getSize();
if (OldSize - Size >= SplitThreshold) {
auto *OldNext = E->getNext();
E->setSize(Size);
auto *LeftOverE = E->getNext();
LeftOverE->setPrevSize(E);
LeftOverE->setSize(OldSize - Size);
OldNext->setPrevSize(LeftOverE);
}
return E;
}
WarpAllocatorEntry *Entries[WARP_SIZE][TEAM_SIZE];
char *Limits[WARP_SIZE][TEAM_SIZE];
mutex::TicketLock Locks[WARP_SIZE][TEAM_SIZE];
size_t FirstThreadHeapSize;
size_t FirstTeamSize;
};
WarpAllocator<32, 16> Allocator;
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
return Allocator.getMemoryAllocationInfo(P);
}
} // namespace memory
} // namespace ompx
extern "C" {
void *malloc(size_t Size) {
if (!Size)
return nullptr;
void *P = Allocator.allocate(Size);
assert(P && "allocator out of memory");
assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
"misaligned address");
return P;
}
void free(void *P) {
if (!P)
return;
Allocator.deallocate(P);
}
void *realloc(void *ptr, size_t new_size) {
void *NewPtr = malloc(new_size);
// realloc(nullptr, n) must behave like malloc(n)
if (!NewPtr || !ptr)
return NewPtr;
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
__builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
free(ptr);
return NewPtr;
}
void __kmpc_target_init_allocator() { Allocator.init(); }
}
#pragma omp end declare target

View File

@@ -0,0 +1,277 @@
//===------- WarpAllocator.cpp - Warp memory allocator ------- C++ -*-========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
using namespace ompx;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
__omp_rtl_device_memory_pool_tracker;
// TODO: implement Device Debug Allocation Tracker
namespace {
constexpr const size_t Alignment = 16;
constexpr const size_t FirstThreadRatio = 40;
constexpr const size_t SplitThreshold = Alignment * 16;
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
template <int32_t WARP_SIZE> struct WarpAllocator;
class WarpAllocatorEntry {
template <int32_t WARP_SIZE> friend struct WarpAllocator;
/// If Size is less than 0, the entry is allocated (in use).
int64_t Size = 0;
/// PrevSize is also supposed to be greater than or equal to 0. When it is 0,
/// it is the first entry of the buffer.
int64_t PrevSize = 0;
public:
bool isFirst() const { return !PrevSize; }
size_t getSize() const { return abs(Size); }
void setSize(size_t V) { Size = V; }
void setPrevSize(WarpAllocatorEntry *Prev) {
PrevSize = Prev ? Prev->getSize() : 0;
}
size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }
// Note: isUsed cannot be defined as !isUnused (or vice versa) because when
// Size is 0 the entry is uninitialized.
bool isUsed() const { return Size < 0; }
bool isUnused() const { return Size > 0; }
void setUsed() {
assert(isUnused() && "the entry is in use");
Size *= -1;
}
void setUnused() {
assert(isUsed() && "the entry is not in use");
Size *= -1;
}
char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }
WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }
static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }
static WarpAllocatorEntry *fromPtr(void *Ptr) {
return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
}
};
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
template <int32_t WARP_SIZE> struct WarpAllocator {
void init() {
if (mapping::isSPMDMode() &&
(mapping::getThreadIdInBlock() || mapping::getBlockId()))
return;
size_t HeapSize = __omp_rtl_device_memory_pool.Size;
size_t FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
size_t OtherThreadHeapSize =
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
for (int I = 0; I < WARP_SIZE; ++I) {
Entries[I] = nullptr;
size_t PrivateOffset = OtherThreadHeapSize * I + FirstThreadHeapSize;
Limits[I] = reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr) + PrivateOffset;
}
}
void *allocate(size_t Size) {
int32_t TIdInWarp = mapping::getThreadIdInWarp();
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
// Error out early if the requested size is larger than the entire block.
if (Size > getBlockSize(TIdInWarp))
return nullptr;
WarpAllocatorEntry *E = nullptr;
{
mutex::LockGuard LG(Locks[TIdInWarp]);
auto *LastEntry = Entries[TIdInWarp];
auto *NewWatermark =
(LastEntry ? LastEntry->getEndPtr() : getBlockBegin(TIdInWarp)) +
Size;
if (NewWatermark >= Limits[TIdInWarp]) {
E = findMemorySlow(Size, TIdInWarp);
} else {
E = LastEntry ? LastEntry->getNext()
: WarpAllocatorEntry::fromPtr(getBlockBegin(TIdInWarp));
E->setSize(Size);
E->setPrevSize(LastEntry);
Entries[TIdInWarp] = E;
}
}
if (!E)
return nullptr;
assert(E->isUnused() && "entry is not set to use properly");
E->setUsed();
return E->getUserPtr();
}
void deallocate(void *Ptr) {
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
auto TIdInWarp = mapping::getThreadIdInWarp();
mutex::LockGuard LG(Locks[TIdInWarp]);
E->setUnused();
// Is last entry?
if (E == Entries[TIdInWarp]) {
do {
E = E->getPrev();
} while (!E->isFirst() && !E->isUsed());
Entries[TIdInWarp] = E;
}
}
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
if (!utils::isInRange(P, reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr),
__omp_rtl_device_memory_pool.Size))
return {};
auto TIdInWarp = mapping::getThreadIdInWarp();
for (int I = TIdInWarp; I < TIdInWarp + mapping::getWarpSize(); ++I) {
int TId = I % mapping::getWarpSize();
if (P < getBlockBegin(TId) || P >= getBlockEnd(TId))
continue;
mutex::LockGuard LG(Locks[I]);
WarpAllocatorEntry *E = Entries[I];
if (!E)
return {};
if (E->getEndPtr() <= P)
return {};
do {
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
if (!E->isUsed())
return {};
return {E->getUserPtr(), E->getUserSize()};
}
E = E->getPrev();
} while (!E->isFirst());
}
return {};
}
private:
char *getBlockBegin(int32_t TIdInWarp) const {
return TIdInWarp ? Limits[TIdInWarp - 1] : reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr);
}
char *getBlockEnd(int32_t TIdInWarp) const { return Limits[TIdInWarp]; }
size_t getBlockSize(int32_t TIdInWarp) const {
return getBlockEnd(TIdInWarp) - getBlockBegin(TIdInWarp);
}
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp) {
char *Ptr = getBlockBegin(TIdInWarp);
char *Limit = getBlockEnd(TIdInWarp);
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
do {
if (!E->isUsed() && E->getSize() >= Size)
break;
E = E->getNext();
if (reinterpret_cast<char *>(E) + Size > Limit)
return nullptr;
} while (1);
size_t OldSize = E->getSize();
if (OldSize - Size >= SplitThreshold) {
auto *OldNext = E->getNext();
E->setSize(Size);
auto *LeftOverE = E->getNext();
LeftOverE->setPrevSize(E);
LeftOverE->setSize(OldSize - Size);
OldNext->setPrevSize(LeftOverE);
}
return E;
}
WarpAllocatorEntry *Entries[WARP_SIZE];
char *Limits[WARP_SIZE];
mutex::TicketLock Locks[WARP_SIZE];
};
WarpAllocator<32> Allocator;
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
return Allocator.getMemoryAllocationInfo(P);
}
} // namespace memory
} // namespace ompx
extern "C" {
void *malloc(size_t Size) {
if (!Size)
return nullptr;
void *P = Allocator.allocate(Size);
assert(P && "allocator out of memory");
assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
"misaligned address");
return P;
}
void free(void *P) {
if (!P)
return;
Allocator.deallocate(P);
}
void *realloc(void *ptr, size_t new_size) {
void *NewPtr = malloc(new_size);
// realloc(nullptr, n) must behave like malloc(n)
if (!NewPtr || !ptr)
return NewPtr;
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
__builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
free(ptr);
return NewPtr;
}
void __kmpc_target_init_allocator() { Allocator.init(); }
}
#pragma omp end declare target

View File

@@ -111,6 +111,38 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
stride = loopSize; // make sure we only do 1 chunk per warp
}
static void ForStaticNoLoopNoTrunk(int &last, T &lb, T &ub, ST &stride,
ST &chunk, T entityId,
T numberOfEntities) {
T loopSize = ub - lb + 1;
chunk = loopSize / numberOfEntities;
// We have more entities than iterations.
if (chunk == 0) {
chunk = 1;
lb = lb + entityId * chunk;
T inputUb = ub;
ub = lb + chunk - 1;
last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
return;
}
T leftOver = loopSize - chunk * numberOfEntities;
if (entityId < leftOver) {
chunk++;
lb = lb + entityId * chunk;
} else {
lb = lb + entityId * chunk + leftOver;
}
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
}
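// Worked example: lb = 0, ub = 9, numberOfEntities = 4 gives loopSize = 10,
// chunk = 2 and leftOver = 2, so entities 0 and 1 take 3 iterations each
// ([0,2] and [3,5]) while entities 2 and 3 take 2 ([6,7] and [8,9]); `last`
// is set for the entity whose range covers the original upper bound.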
////////////////////////////////////////////////////////////////////////////////
// Support for Static Init
@@ -184,6 +216,19 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_parallel_static_noloop_chunk: {
ForStaticChunk(lastiter, lb, ub, stride, chunk,
numberOfActiveOMPThreads * omp_get_team_num() + gtid,
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_parallel_static_noloop_nochunk: {
ForStaticNoLoopNoTrunk(lastiter, lb, ub, stride, chunk,
numberOfActiveOMPThreads * omp_get_team_num() +
gtid,
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
default: {
// ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,

View File

@@ -11,8 +11,65 @@ _ZN4ompx*
IsSPMDMode
malloc
free
memcmp
printf
__assert_fail
__ctype_b_loc
__ctype_tolower_loc
__xpg_basename
abs
atof
atoi
calloc
clock
clock_gettime
exit
fclose
feof
fflush
fgets
fopen
fprintf
fputs
fread
free
fscanf
fseek
ftell
fwrite
getc
gettimeofday
gmtime
malloc
memcmp
memcpy
memset
pclose
perror
popen
printf
qsort
rand
realloc
rewind
sprintf
srand
sscanf
stat
strcat
strchr
strcmp
strcpy
strftime
strlen
strncmp
strncpy
strstr
strtod
strtok
strtol
strtoul
time
tolower
strcasecmp
MPI_*

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=False, is_mpi=False)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=True, is_mpi=False)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=False, is_mpi=True)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=True, is_mpi=True)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,104 @@
//===------- Main.c - Direct compilation program start point ------ C -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
extern int __user_main(int, char *[]);
extern void __kmpc_target_init_allocator(void);
//#ifdef SINGLE_THREAD_EXECUTION
//#define THREAD_LIMIT 1
//#define TEAM_LIMIT 1
//#else
// 110 * 960 = 105 600 threads max for now
// could probably do a 110 * 1024 = 112640 if we fix offload
//#define TEAM_LIMIT 110
//#define THREAD_LIMIT 961 // limit is not included, max is 960
// 220 * 512 = 112 640 threads max
//#define TEAM_LIMIT 221
//#define THREAD_LIMIT 513 // limit is not included, max is 512
// 440 * 256 = 112 640 threads max
//#define TEAM_LIMIT 440
//#define THREAD_LIMIT 257 // limit is not included, max is 256
// 880 * 128 = 112 640 threads max
//#define TEAM_LIMIT 880
//#define THREAD_LIMIT 129 // limit is not included, max is 128
// 1320 * 64 = 84 480 threads max
//#define TEAM_LIMIT 1320 // will not do more than 1320 thread blocks per GPU, I don't know why
//#define THREAD_LIMIT 65 // limit is not included, max is 64
// /!\ Do not use fewer than 64 threads per team; that would put more than 1 warp
// per team, and the synchronization functions are not built to handle that.
// Light for testing
//#define TEAM_LIMIT 880
//#define THREAD_LIMIT 65 // limit is not included, max is 64
//#define TEAM_LIMIT 1760
//#define TEAM_LIMIT 1320
//#define TEAM_LIMIT 660
//#define THREAD_LIMIT 128
//#endif
//#pragma omp begin declare target device_type(nohost)
//void scheduler_init(void);
//#pragma omp begin declare variant match(device = {arch(amdgcn)})
//void scheduler_init(void){
// __builtin_amdgcn_ds_gws_init(TEAM_LIMIT - 1, 0);
//}
//#pragma omp end declare variant
//#pragma omp end declare target
int main(int argc, char *argv[]) {
char* MPI_Ranks = getenv("MPI_RANKS");
char* MPI_Threads = getenv("MPI_THREADS");
int nb_teams = 8;
int nb_threads = 64;
if (MPI_Ranks != NULL)
nb_teams = atoi(MPI_Ranks);
if (MPI_Threads != NULL)
nb_threads = atoi(MPI_Threads);
printf("Using %d MPI_Ranks and %d MPI_Threads\n", nb_teams, nb_threads);
#pragma omp target enter data map(to: argv[:argc])
for (int I = 0; I < argc; ++I) {
#pragma omp target enter data map(to: argv[I][:strlen(argv[I])])
}
int Ret = 0;
#pragma omp target enter data map(to: Ret)
#pragma omp target teams num_teams(1) thread_limit(1)
{
__kmpc_target_init_allocator();
}
#pragma omp target teams num_teams(nb_teams) thread_limit(nb_threads)
{
Ret = __user_main(argc, argv);
}
#pragma omp target exit data map(from: Ret)
return Ret;
}
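For reference, a sketch of the user side under the assumptions above: the -canonical-main-function-name=__user_main pass (enabled through the wrapper scripts below) renames the user's main, so an ordinary program like the following becomes the __user_main that the loader launches on the device.

// hello.c, built with the clang-gpu wrapper; main is renamed to __user_main
// by the canonicalization pass and runs inside the target region above.
#include <omp.h>
#include <stdio.h>

int main(int argc, char *argv[]) {
  (void)argc;
  (void)argv;
  // printf is serviced host-side through the RPC mechanism
  printf("hello from team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
  return 0; // becomes Ret, mapped back to the host by the loader
}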

View File

@@ -0,0 +1,14 @@
//===------- UserWrapper.h - User code wrapper --------------- C++ --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
#define OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
#pragma omp begin declare target device_type(nohost)
#endif

View File

@@ -0,0 +1,4 @@
#include <stdint.h>
#include "../../DeviceRTL/include/Mpi.h"

View File

@@ -0,0 +1,147 @@
import argparse
import os
import pathlib
import subprocess
import sys
import tempfile
cwd = os.path.dirname(os.path.realpath(__file__))
source_file_suffix = [".c", ".cpp", ".cu", ".hip", ".cc", ".cxx"]
def print_version():
cp = subprocess.run(["clang", "--version"])
if cp.returncode != 0:
sys.exit(cp.returncode)
def compile_loader(loader_name, args, targets, verbose, dry_run):
cmd = [
"clang",
"-c",
"-fopenmp",
"-foffload-lto",
"-fopenmp-offload-mandatory",
"-o",
loader_name,
]
if targets:
for arch in targets:
cmd.append("--offload-arch={}".format(arch))
else:
cmd.append("--offload-arch=native")
cmd += args
cmd.append(os.path.join(cwd, "Main.c"))
if verbose:
print(" ".join(cmd), file=sys.stderr)
if dry_run:
print(" ".join(cmd), file=sys.stderr)
return
cp = subprocess.run(cmd)
if cp.returncode != 0:
sys.exit(cp.returncode)
def invoke_clang(is_cpp, is_mpi, args, targets, verbose, dry_run):
cmd = [
"clang++" if is_cpp else "clang",
"-fopenmp",
"-foffload-lto",
"-fopenmp-offload-mandatory",
"-fopenmp-globalize-to-global-space",
"-include",
os.path.join(cwd, "UserWrapper.h"),
]
# add the MPI include path and header only for the MPI wrappers; appending
# "" placeholders would pass empty arguments to clang
if is_mpi:
    cmd += ["-I", cwd, "-include", os.path.join(cwd, "mpi.h")]
cmd += [
"--save-temps",
"-rdynamic",
"-mllvm",
"-enable-host-rpc",
"-mllvm",
"-openmp-opt-disable-state-machine-rewrite",
"-mllvm",
"-enable-canonicalize-main-function",
"-mllvm",
"-canonical-main-function-name=__user_main",
]
if targets:
for arch in targets:
cmd.append("--offload-arch={}".format(arch))
else:
cmd.append("--offload-arch=native")
cmd += args
if verbose:
print(" ".join(cmd), file=sys.stderr)
if dry_run:
print(" ".join(cmd))
return
cp = subprocess.run(cmd)
if cp.returncode != 0:
sys.exit(cp.returncode)
def run(is_cpp=False, is_mpi=False):
parser = argparse.ArgumentParser(
prog="clang-gpu", description="clang LLVM GPU compiler"
)
# should be changed; this prevents calls with -ccc-print-phases
parser.add_argument(
"-c",
action="store_true",
help="Only run preprocess, compile, and assemble steps",
)
parser.add_argument(
"-v", action="store_true", help="Show commands to run and use verbose output"
)
parser.add_argument(
"--version", action="store_true", help="Print version information"
)
parser.add_argument(
"-###",
action="store_true",
help="Print (but do not run) the commands to run for this compilation",
)
parser.add_argument(
"--offload-arch",
type=str,
help=(
"Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g."
" sm_35). If 'native' is used the compiler will detect locally installed"
" architectures. For HIP offloading, the device architecture can be"
" followed by target ID features delimited by a colon (e.g."
" gfx908:xnack+:sramecc-). May be specified more than once."
),
nargs="*",
)
args, fwd_args = parser.parse_known_args()
if args.version:
print_version()
return
if args.c:
fwd_args.append("-c")
if args.v:
fwd_args.append("-v")
dry_run = vars(args)["###"]
loader_name = None
temp_files = []
if not args.c:
tf = tempfile.NamedTemporaryFile()
loader_name = "{}.o".format(tf.name)
compile_loader(loader_name, ["-O1"], args.offload_arch, args.v, dry_run)
fwd_args.append(loader_name)
temp_files.append(loader_name)
invoke_clang(is_cpp, is_mpi, fwd_args, args.offload_arch, args.v, dry_run)
for f in temp_files:
if os.path.isfile(f):
os.unlink(f)

offload/include/HostRPC.h
View File

@@ -0,0 +1,67 @@
//===------- HostRPC.h - Host RPC ---------------------------- C++ --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
#define OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
#ifdef OMPTARGET_DEVICE_RUNTIME
#include "Types.h"
#else
#include <cstdint>
#endif
namespace hostrpc {
/// RPC call identifier. Note: negative values only; non-negative values are
/// reserved for compiler-generated functions.
enum CallId {
CALLID___kmpc_launch_parallel_51_kernel = -1,
CALLID_invalid = -2147483648,
};
/// Execution status.
enum ExecutionStatus {
EXEC_STAT_CREATED = 0,
EXEC_STAT_DONE = 1,
};
enum Type {
ARG_LITERAL = 0,
ARG_POINTER = 1,
};
struct Argument {
intptr_t Value;
int64_t ArgType;
};
struct Descriptor {
// The following member will be used by both host and device.
int32_t Id;
struct Argument *Args;
int64_t NumArgs;
int64_t Status;
int64_t ReturnValue;
// The following members will only be used by device.
void **ArgInfo;
void *ArgMap;
};
/// A wrapper of Descriptor that is only used between plugins and
/// libomptarget. It contains the three stdio global variables.
struct DescriptorWrapper {
Descriptor D;
void *StdIn = nullptr;
void *StdOut = nullptr;
void *StdErr = nullptr;
};
} // namespace hostrpc
#endif
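To make the calling convention concrete, here is a hypothetical host-side wrapper sketch: the device ships a Descriptor plus its Argument array to the host, which dispatches on Id and reports the result through ReturnValue. The wrapper name and target function below are invented for illustration; real wrappers are emitted by the compiler and reached through __kmpc_host_rpc_invoke_host_wrapper.

#include <cstdint>

extern "C" int64_t __kmpc_host_rpc_get_arg(void *Descriptor, int32_t ArgNo);
extern "C" void __kmpc_host_rpc_set_ret_val(void *Descriptor, int64_t RetVal);

// Hypothetical wrapper for a host function `int add(int, int)`.
extern "C" void __host_rpc_wrapper_add(void *Desc) {
  int64_t A = __kmpc_host_rpc_get_arg(Desc, 0); // reads Args[0].Value
  int64_t B = __kmpc_host_rpc_get_arg(Desc, 1); // reads Args[1].Value
  __kmpc_host_rpc_set_ret_val(Desc, A + B);     // stored in Descriptor::ReturnValue
}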

View File

@@ -425,6 +425,10 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
void *VAddr, bool IsRecord, bool SaveOutput,
uint64_t &ReqPtrArgOffset);
// Host RPC support
int64_t __kmpc_host_rpc_get_arg(void *descriptor, int32_t ArgNo);
void __kmpc_host_rpc_set_ret_val(void *descriptor, int64_t RetVal);
#ifdef __cplusplus
}
#endif

View File

@@ -678,7 +678,7 @@ struct AMDGPUQueueTy {
if (Queue)
return Plugin::success();
hsa_status_t Status =
hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
hsa_queue_create(Agent, QueueSize, 2 /*HSA_QUEUE_TYPE_MULTI*/, callbackError,
nullptr, UINT32_MAX, UINT32_MAX, &Queue);
return Plugin::check(Status, "Error in hsa_queue_create: %s");
}
@@ -3315,16 +3315,33 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (GenericDevice.getRPCServer())
Stream->setRPCServer(GenericDevice.getRPCServer());
utils::AMDGPUMGSyncInfo *multigridSyncArgs = nullptr;
if (auto Err = ArgsMemoryManager.allocate(sizeof(utils::AMDGPUMGSyncInfo),
reinterpret_cast<void **>(&multigridSyncArgs)))
return Err;
multigridSyncArgs->mgs = nullptr; // multigrid sync barrier data pointer
multigridSyncArgs->sgs = {0, 0}; // singlegrid sync barrier data
multigridSyncArgs->grid_id = 0; // id of the current grid in multigpu launch
multigridSyncArgs->num_grids = 1; // number of grid in multigpu launch
multigridSyncArgs->prev_sum = 0; // sum of all thread that have already been run
multigridSyncArgs->all_sum = NumBlocks * NumThreads; // sum of all thread to run
multigridSyncArgs->num_wg = NumBlocks; // number of workgroup (nb. teams)
//printf("%lu\n", NumBlocks);
// Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
ImplArgs->BlockCountX = NumBlocks;
ImplArgs->BlockCountY = 1;
ImplArgs->BlockCountZ = 1;
ImplArgs->GroupSizeX = NumThreads;
ImplArgs->GroupSizeY = 1;
ImplArgs->GroupSizeZ = 1;
ImplArgs->GridDims = 1;
ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
ImplArgs->block_count_x = NumBlocks;
ImplArgs->block_count_y = 1;
ImplArgs->block_count_z = 1;
ImplArgs->group_size_x = NumThreads;
ImplArgs->group_size_y = 1;
ImplArgs->group_size_z = 1;
ImplArgs->grid_dims = 1;
ImplArgs->dynamic_lds_size = KernelArgs.DynCGroupMem;
ImplArgs->multigrid_sync_arg = multigridSyncArgs;
}
// Push the kernel launch into the stream.

View File

@@ -35,19 +35,55 @@ namespace target {
namespace plugin {
namespace utils {
// GPU struct for cooperative group / grid sync / multigrid sync
// The structures below for MGPU launch match the device library format
struct AMDGPUMGSyncData {
uint32_t w0;
uint32_t w1;
};
struct AMDGPUMGSyncInfo {
struct AMDGPUMGSyncData* mgs;
uint32_t grid_id;
uint32_t num_grids;
uint64_t prev_sum;
uint64_t all_sum;
struct AMDGPUMGSyncData sgs;
uint32_t num_wg;
};
// The implicit arguments of COV5 AMDGPU kernels.
// See: https://github.com/llvm/llvm-project/blob/6e86e11148474e4ecd49dbf0ca5dd9caddcdbd11/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp#L583
// static_assert(sizeof(void *) == 8)
struct AMDGPUImplicitArgsTy {
uint32_t BlockCountX;
uint32_t BlockCountY;
uint32_t BlockCountZ;
uint16_t GroupSizeX;
uint16_t GroupSizeY;
uint16_t GroupSizeZ;
uint8_t Unused0[46]; // 46 byte offset.
uint16_t GridDims;
uint8_t Unused1[54]; // 54 byte offset.
uint32_t DynamicLdsSize;
uint8_t Unused2[132]; // 132 byte offset.
uint32_t block_count_x;
uint32_t block_count_y;
uint32_t block_count_z;
uint16_t group_size_x;
uint16_t group_size_y;
uint16_t group_size_z;
uint16_t remainder_x;
uint16_t remainder_y;
uint16_t remainder_z;
uint8_t reserved0[16]; // reserved
uint64_t global_offset_x;
uint64_t global_offset_y;
uint64_t global_offset_z;
uint16_t grid_dims;
uint8_t reserved1[6]; // reserved
void * printf_buffer;
void * hostcall_buffer;
AMDGPUMGSyncInfo *multigrid_sync_arg;
void * heap_v1;
void * completion_action;
uint32_t dynamic_lds_size;
uint8_t reserved2[68]; // reserved
uint32_t private_base;
uint32_t shared_base;
void * queue_ptr;
uint8_t unused[48]; // 48 byte offset.
};
// Dummy struct for COV4 implicitargs.

View File

@@ -15,12 +15,45 @@
#if defined(LIBOMPTARGET_RPC_SUPPORT)
#include "llvm-libc-types/rpc_opcodes_t.h"
#include "llvmlibc_rpc_server.h"
#include "HostRPC.h"
#include "llvm/Support/DynamicLibrary.h"
#endif
using namespace llvm;
using namespace omp;
using namespace target;
#ifdef LIBOMPTARGET_RPC_SUPPORT
// GPUFirst Host Function Wrapper Invoker
class HostRPCInvokerWrapper {
void (*Invoker)(int32_t, void *) = nullptr;
std::unique_ptr<sys::DynamicLibrary> DL;
std::once_flag Flag;
void initInvoker() {
std::string ErrMsg;
DL = std::make_unique<sys::DynamicLibrary>(
sys::DynamicLibrary::getPermanentLibrary(nullptr, &ErrMsg));
assert(DL->isValid() && "invalid DL");
*((void **)&Invoker) =
DL->getAddressOfSymbol("__kmpc_host_rpc_invoke_host_wrapper");
assert(Invoker && "Invoker is nullptr");
}
public:
void invoke(int32_t CallNo, void *Desc) {
std::call_once(Flag, &HostRPCInvokerWrapper::initInvoker, this);
Invoker(CallNo, Desc);
}
};
HostRPCInvokerWrapper *Invoker;
// GPUFirst END
#endif
RPCServerTy::RPCServerTy(plugin::GenericPluginTy &Plugin)
: Handles(Plugin.getNumDevices()) {}
@@ -89,6 +122,75 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
"Failed to register RPC free handler for device %d: %d\n",
Device.getDeviceId(), Err);
// GPUFirst
// Register custom opcode handler for gpu first
auto GPUFirstHandler = [](rpc_port_t port, void *Data) {
// printf("[HostRPC] [Host]: GPUFirstHandler\n");
// // WORKING back & forth of an uint64_t
//
// printf("[HostRPC] [Host]: Start \n");
//
// uint64_t size_recv = 0;
// void *buf_recv = nullptr;
//
// rpc_recv_n(port, &buf_recv, &size_recv,
// [](uint64_t size, void* data){ return malloc(size); }, nullptr);
//
// printf("[HostRPC] [Host] [RECV]: %lu\n", *((uint64_t *) buf_recv));
// printf("[HostRPC] [Host] [RECV] Size: %lu\n", size_recv);
//
// uint64_t size_send = sizeof(uint64_t);
// void *buf_send = malloc(size_send);
// *((uint64_t *) buf_send) = 987654321;
//
// printf("[Hostrpc] [Host] [SEND]: %lu\n", *((uint64_t *) buf_send));
// printf("[HostRPC] [Host] [SEND] Size: %lu\n", size_send);
//
// rpc_send_n(port, &buf_send, &size_send);
//
// printf("[HostRPC] [Host]: End \n");
//
// // END of working part
auto _rpc_recv_n = [](rpc_port_t *handle, void **dst, size_t *size){
rpc_recv_n(*handle, dst, size,
[](uint64_t size, void* data){ return malloc(size); },
nullptr);
};
auto _rpc_send_n = [](rpc_port_t *handle, void *src, size_t size){
rpc_send_n(*handle, &src, &size);
};
uint64_t size_recv = 0;
hostrpc::Descriptor *D = nullptr;
hostrpc::Argument *Args = nullptr;
_rpc_recv_n(&port, reinterpret_cast<void **>(&D), &size_recv);
_rpc_recv_n(&port, reinterpret_cast<void **>(&Args), &size_recv);
D->Args = Args;
if(Invoker == nullptr)
Invoker = new HostRPCInvokerWrapper();
Invoker->invoke(D->Id, D);
_rpc_send_n(&port, D, sizeof(hostrpc::Descriptor));
_rpc_send_n(&port, D->Args, sizeof(hostrpc::Argument) * D->NumArgs);
free(D->Args);
free(D);
};
if (rpc_status_t Err =
rpc_register_callback(RPCDevice, RPC_GPUFIRST, GPUFirstHandler, &Invoker))
return plugin::Plugin::error(
"Failed to register RPC GPU First handler for device %d: %d\n", Device.getDeviceId(),
Err);
// GPUFirst END
// Get the address of the RPC client from the device.
void *ClientPtr;
plugin::GlobalTy ClientGlobal(rpc_client_symbol_name, sizeof(void *));

View File

@@ -70,7 +70,8 @@ VERS1.0 {
__tgt_interop_init;
__tgt_interop_use;
__tgt_interop_destroy;
ompt_libomptarget_connect;
__kmpc_host_rpc_get_arg;
__kmpc_host_rpc_set_ret_val;
local:
*;
};

View File

@@ -16,6 +16,8 @@
#include "PluginManager.h"
#include "private.h"
#include "HostRPC.h"
#include "Shared/EnvironmentVar.h"
#include "Shared/Profile.h"
@@ -510,3 +512,21 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
delete AsyncInfo;
*AsyncHandle = nullptr;
}
// Host RPC support functions.
EXTERN int64_t __kmpc_host_rpc_get_arg(void *descriptor, int32_t ArgNum) {
hostrpc::Descriptor &SD = *reinterpret_cast<hostrpc::Descriptor *>(descriptor);
assert(ArgNum < SD.NumArgs && "out-of-range argument");
int64_t ArgVal = SD.Args[ArgNum].Value;
DP("[host-rpc] get argno=%d arg=%lx...\n", ArgNum, ArgVal);
return ArgVal;
}
EXTERN void __kmpc_host_rpc_set_ret_val(void *descriptor, int64_t RetVal) {
hostrpc::Descriptor &SD = *reinterpret_cast<hostrpc::Descriptor *>(descriptor);
SD.ReturnValue = RetVal;
}

View File

@@ -49,10 +49,10 @@ else()
# When building in tree we install the runtime according to the LLVM settings.
if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT APPLE)
set(OPENMP_INSTALL_LIBDIR lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE} CACHE STRING
set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE}/openmp" CACHE STRING
"Path where built openmp libraries should be installed.")
else()
set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}" CACHE STRING
set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}/openmp" CACHE STRING
"Path where built OpenMP libraries should be installed.")
endif()