Compare commits
2 Commits
target_ent ... jit

| Author | SHA1 | Date |
|---|---|---|
| | 02bc7effcc | |
| | 6f3e60f1c0 | |
@@ -160,6 +160,8 @@ def err_drv_invalid_Xarch_argument_with_args : Error<
  "invalid Xarch argument: '%0', options requiring arguments are unsupported">;
def err_drv_Xopenmp_target_missing_triple : Error<
  "cannot deduce implicit triple value for -Xopenmp-target, specify triple using -Xopenmp-target=<triple>">;
def err_drv_openmp_jit_without_lto : Error<
  "cannot enable OpenMP offloading JIT, specify bitcode compilation with '-foffload-lto'">;
def err_drv_invalid_Xopenmp_target_with_args : Error<
  "invalid -Xopenmp-target argument: '%0', options requiring arguments are unsupported">;
def err_drv_argument_only_allowed_with : Error<
@@ -2539,6 +2539,10 @@ def fopenmp_target_new_runtime : Flag<["-"], "fopenmp-target-new-runtime">,
  Group<f_Group>, Flags<[CC1Option, HelpHidden]>;
def fno_openmp_target_new_runtime : Flag<["-"], "fno-openmp-target-new-runtime">,
  Group<f_Group>, Flags<[CC1Option, HelpHidden]>;
def fopenmp_target_jit : Flag<["-"], "fopenmp-target-jit">, Group<f_Group>,
  HelpText<"Enable JIT compilation for OpenMP offloading">, Flags<[NoArgumentUnused]>;
def fno_openmp_target_jit : Flag<["-"], "fno-openmp-target-jit">, Group<f_Group>,
  Flags<[NoArgumentUnused, HelpHidden]>;
defm openmp_optimistic_collapse : BoolFOption<"openmp-optimistic-collapse",
  LangOpts<"OpenMPOptimisticCollapse">, DefaultFalse,
  PosFlag<SetTrue, [CC1Option]>, NegFlag<SetFalse>, BothFlags<[NoArgumentUnused, HelpHidden]>>;
@@ -8285,6 +8285,12 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
  auto OpenMPTCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
  ArgStringList CmdArgs;

  if (!C.getDriver().isUsingLTO(/* IsOffload */ true) &&
      Args.hasFlag(options::OPT_fopenmp_target_jit,
                   options::OPT_fno_openmp_target_jit, /*Default*/ false)) {
    C.getDriver().Diag(clang::diag::err_drv_openmp_jit_without_lto);
  }

  // Pass the CUDA path to the linker wrapper tool.
  for (Action::OffloadKind Kind : {Action::OFK_Cuda, Action::OFK_OpenMP}) {
    auto TCRange = C.getOffloadToolChains(Kind);

@@ -8355,6 +8361,11 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
    if (!OOpt.empty())
      CmdArgs.push_back(Args.MakeArgString(Twine("-opt-level=O") + OOpt));
  }

  if (Args.hasFlag(options::OPT_fopenmp_target_jit,
                   options::OPT_fno_openmp_target_jit,
                   /*Default=*/false))
    CmdArgs.push_back(Args.MakeArgString("-target-embed-bc"));
}

  CmdArgs.push_back("-host-triple");

@@ -98,6 +98,7 @@ struct LTOCodeGenerator {

  void setCpu(StringRef MCpu) { Config.CPU = std::string(MCpu); }
  void setAttrs(std::vector<std::string> MAttrs) { Config.MAttrs = MAttrs; }
  void setUseDefaultPipeline(bool Value) { Config.UseDefaultPipeline = Value; }
  void setOptLevel(unsigned OptLevel);

  void setShouldInternalize(bool Value) { ShouldInternalize = Value; }

@@ -193,6 +194,8 @@ struct LTOCodeGenerator {
  void resetMergedModule() { MergedModule.reset(); }
  void DiagnosticHandler(const DiagnosticInfo &DI);

  Module &getMergedModule() const { return *MergedModule; }

private:
  /// Verify the merged module on first call.
  ///

@@ -111,6 +111,8 @@ public:
  createInLocalContext(std::unique_ptr<LLVMContext> Context, const void *mem,
                       size_t length, const TargetOptions &options,
                       StringRef path);
  static ErrorOr<std::unique_ptr<LTOModule>>
  clone(const LTOModule &LM, const TargetOptions &options);

  const Module &getModule() const { return *Mod; }
  Module &getModule() { return *Mod; }

@@ -538,7 +538,8 @@ bool LTOCodeGenerator::optimize() {
  this->applyScopeRestrictions();

  // Write LTOPostLink flag for passes that require all the modules.
  MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1);
  if (!MergedModule->getModuleFlag("LTOPostLink"))
    MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1);

  // Add an appropriate DataLayout instance for this module...
  MergedModule->setDataLayout(TargetMach->createDataLayout());

@@ -38,6 +38,7 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
#include <system_error>
using namespace llvm;

@@ -701,3 +702,47 @@ bool LTOModule::hasCtorDtor() const {
  }
  return false;
}

ErrorOr<std::unique_ptr<LTOModule>>
LTOModule::clone(const LTOModule &LM, const TargetOptions &options) {
  auto NM = CloneModule(LM.getModule());

  std::string TripleStr = NM->getTargetTriple();
  if (TripleStr.empty())
    TripleStr = sys::getDefaultTargetTriple();
  llvm::Triple Triple(TripleStr);

  // find machine architecture for this module
  std::string errMsg;
  const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
  if (!march)
    return make_error_code(object::object_error::arch_not_found);

  // construct LTOModule, hand over ownership of module and target
  SubtargetFeatures Features;
  Features.getDefaultSubtargetFeatures(Triple);
  std::string FeatureStr = Features.getString();
  // Set a default CPU for Darwin triples.
  std::string CPU;
  if (Triple.isOSDarwin()) {
    if (Triple.getArch() == llvm::Triple::x86_64)
      CPU = "core2";
    else if (Triple.getArch() == llvm::Triple::x86)
      CPU = "yonah";
    else if (Triple.isArm64e())
      CPU = "apple-a12";
    else if (Triple.getArch() == llvm::Triple::aarch64 ||
             Triple.getArch() == llvm::Triple::aarch64_32)
      CPU = "cyclone";
  }

  TargetMachine *target =
      march->createTargetMachine(TripleStr, CPU, FeatureStr, options, None);

  std::unique_ptr<LTOModule> Ret(
      new LTOModule(std::move(NM), LM.MBRef, target));
  Ret->parseSymbols();
  Ret->parseMetadata();

  return std::move(Ret);
}
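A minimal usage sketch of the new `LTOModule::clone` entry point (illustration only, not part of the diff; `OriginalModule` is a placeholder for an already-parsed `LTOModule`):

```cpp
// Clone a parsed LTOModule so it can be re-optimized or re-codegened without
// touching the original copy (e.g. once per JIT specialization).
llvm::ErrorOr<std::unique_ptr<llvm::LTOModule>> ClonedOrErr =
    llvm::LTOModule::clone(*OriginalModule, llvm::TargetOptions());
if (std::error_code EC = ClonedOrErr.getError())
  llvm::report_fatal_error("cloning LTO module failed: " + EC.message());
std::unique_ptr<llvm::LTOModule> Cloned = std::move(*ClonedOrErr);

// The clone owns a fresh copy of the IR; module flags added later (such as
// "LTOPostLink" in LTOCodeGenerator::optimize) do not leak back into the
// cached original, which is why optimize() now guards against re-adding it.
llvm::Module &IR = Cloned->getModule();
(void)IR;
```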
@@ -1497,6 +1497,14 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
    MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
  }

  {
    FunctionPassManager EarlyFPM;
    // Break up allocas
    EarlyFPM.addPass(SROAPass());
    MPM.addPass(createModuleToFunctionPassAdaptor(
        std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses));
  }

  // Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
  MPM.addPass(OpenMPOptPass());

@@ -1632,9 +1640,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
                 PGOOpt->ProfileRemappingFile);
  }

  // Break up allocas
  FPM.addPass(SROAPass());

  // LTO provides additional opportunities for tailcall elimination due to
  // link-time inlining, and visibility of nocapture attribute.
  FPM.addPass(TailCallElimPass());

@@ -128,6 +128,7 @@ template <typename VTy, typename Ty> struct ValueRAII {
        Val(OldValue), Active(Active) {
    if (!Active)
      return;
    Ptr = &V.lookup(/* IsReadonly */ false, Ident);
    ASSERT(*Ptr == OldValue &&
           "ValueRAII initialization with wrong old value!");
    *Ptr = NewValue;

@@ -22,6 +22,7 @@
#include <mutex>
#include <set>
#include <thread>
#include <unordered_map>
#include <vector>

#include "ExclusiveAccess.h"

@@ -302,6 +303,9 @@ typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy;
struct PendingCtorDtorListsTy {
  std::list<void *> PendingCtors;
  std::list<void *> PendingDtors;

  std::unordered_map<__tgt_device_image *, std::list<void *>> PendingJITCtors;
  std::unordered_map<__tgt_device_image *, std::list<void *>> PendingJITDtors;
};
typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy>
    PendingCtorsDtorsPerLibrary;

@@ -461,6 +465,10 @@ struct DeviceTy {
  int32_t destroyEvent(void *Event);
  /// }

  __tgt_target_table *loadJITImage(__tgt_device_image *Image,
                                   const char *EntryName, void **TgtArgs,
                                   ptrdiff_t *TgtOffsets, int NumArgs);

private:
  // Call to RTL
  void init(); // To be called only via DeviceTy::initOnce()

@@ -197,6 +197,12 @@ struct __tgt_device_info {
  void *Device = nullptr;
};

struct __tgt_kernel_launch_entry {
  __tgt_offload_entry *HostEntry = nullptr;
  void *TargetEntry = nullptr;
  __tgt_device_image *Image = nullptr;
};

#ifdef __cplusplus
extern "C" {
#endif
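The plugins further down no longer receive a raw device entry pointer at launch time; they receive a pointer to one of these `__tgt_kernel_launch_entry` records. A rough sketch of the intended contract, inferred from the libomptarget changes below (the surrounding variables are placeholders, not code from the diff):

```cpp
// Host-runtime side: describe the kernel to launch.
__tgt_kernel_launch_entry LaunchEntry;
if (IsJITImage) {
  // JIT path: no device code exists yet, so pass the host entry (which names
  // the kernel) plus the bitcode image; the plugin JITs it on first launch.
  LaunchEntry.HostEntry = HostEntry;
  LaunchEntry.Image = Image;
} else {
  // Precompiled path: the device-side entry address is already known.
  LaunchEntry.TargetEntry = TargetTable->EntriesBegin[EntryIndex].addr;
}
// &LaunchEntry is what the plugin run-region entry points now cast back to a
// __tgt_kernel_launch_entry * (see runRegionLocked and runTargetTeamRegion below).
```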
@@ -18,6 +18,7 @@
#include <map>
#include <mutex>
#include <string>
#include <unordered_set>
#include <vector>

// Forward declarations.

@@ -184,4 +185,7 @@ struct TableMap {
};
typedef std::map<void *, TableMap> HostPtrToTableMapTy;

/// A set that stores all registered JIT images.
extern std::unordered_set<__tgt_device_image *> RegisteredJITImages;

#endif

@@ -57,7 +57,7 @@ else()
  set(LIBOMPTARGET_DEP_LIBRARIES)
endif()

add_library(omptarget.rtl.amdgpu SHARED
add_llvm_library(omptarget.rtl.amdgpu SHARED
  impl/impl.cpp
  impl/interop_hsa.cpp
  impl/data.cpp

@@ -100,6 +100,7 @@ target_link_libraries(
  ${OPENMP_PTHREAD_LIB}
  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
  ${LDFLAGS_UNDEFINED}
  LLVM-LIBOMPTARGET-JIT
)

# in case of amdgcn, skip running tests if amdgpu-arch was not built or fails

@@ -35,8 +35,16 @@
#include "omptargetplugin.h"
#include "print_tracing.h"

#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Program.h"

#include "JIT.h"

using namespace llvm;

// hostrpc interface, FIXME: consider moving to its own include these are
// statically linked into amdgpu/plugin if present from hostrpc_services.a,

@@ -850,8 +858,81 @@ pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;

static RTLDeviceInfoTy DeviceInfo;

static __tgt_target_table *
__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);

namespace {

std::unique_ptr<jit::JITEngine> JITEngine;

class AMDDeviceToolChain : public jit::DeviceToolChain {
  static std::string getMainExecutable(const char *Name) {
    void *Ptr = (void *)(intptr_t)&getMainExecutable;
    auto COWPath = sys::fs::getMainExecutable(Name, Ptr);
    return sys::path::parent_path(COWPath).str();
  }

  /// Get a temporary filename suitable for output.
  static Error createOutputFile(const Twine &Prefix, StringRef Extension,
                                SmallString<128> &NewFilename) {
    if (std::error_code EC =
            sys::fs::createTemporaryFile(Prefix, Extension, NewFilename))
      return createFileError(NewFilename, EC);
    return Error::success();
  }

  Expected<std::string> link(StringRef Input, StringRef Prefix) {
    ErrorOr<std::string> LLDPath =
        sys::findProgramByName("lld", {getMainExecutable("lld")});
    if (!LLDPath)
      LLDPath = sys::findProgramByName("lld");
    if (!LLDPath)
      return createStringError(LLDPath.getError(),
                               "Unable to find 'lld' in path");

    SmallString<128> TempFile;
    if (Error Err = createOutputFile(Prefix, "o", TempFile))
      return std::move(Err);

    SmallVector<StringRef, 16> CmdArgs;
    CmdArgs.push_back(*LLDPath);
    CmdArgs.push_back("-flavor");
    CmdArgs.push_back("gnu");
    CmdArgs.push_back("--no-undefined");
    CmdArgs.push_back("-shared");
    CmdArgs.push_back("-o");
    CmdArgs.push_back(TempFile);
    CmdArgs.push_back(Input);

    if (sys::ExecuteAndWait(*LLDPath, CmdArgs))
      return createStringError(inconvertibleErrorCode(), "'lld' failed");

    return static_cast<std::string>(TempFile);
  }

public:
  std::unique_ptr<MemoryBuffer> run(const std::string &FileName,
                                    const jit::DeviceInfo &DI) override {
    std::string Prefix = "libomptarget-amdgcn-" + DI.MCpu + "-jit";
    auto FileNameOrErr = link(FileName, Prefix);
    if (!FileNameOrErr) {
      Error E = FileNameOrErr.takeError();
      return nullptr;
    }

    std::string TempFile = *FileNameOrErr;
    auto MBOrError = MemoryBuffer::getFile(TempFile, /*IsText=*/false,
                                           /*RequiresNullTerminator=*/false);
    if (std::error_code EC = MBOrError.getError()) {
      sys::fs::remove(TempFile);
      return nullptr;
    }

    sys::fs::remove(TempFile);
    return std::move(*MBOrError);
  }
} AMDDTC;

int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
                     __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");

@@ -1090,6 +1171,30 @@ static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
  return packet_id;
}

__tgt_target_table *loadJITImage(int DeviceId, __tgt_device_image *Image,
                                 __tgt_offload_entry *Entry, void **Args,
                                 int NumArgs, int TeamNum, int ThreadLimit,
                                 int LoopTripCount) {
  auto Kernel =
      jit::Kernel::create(Image, Entry->name, DeviceInfo.GPUName[DeviceId],
                          Args, NumArgs, TeamNum, ThreadLimit, LoopTripCount);
  if (auto *TT = JITEngine->getTargetTable(DeviceId, Kernel))
    return TT;

  auto *NewImage = JITEngine->getImage(DeviceId, Kernel, Image);
  if (!NewImage)
    return nullptr;

  auto *TT = __tgt_rtl_load_binary_locked(DeviceId, NewImage);
  if (!TT)
    return nullptr;

  if (!JITEngine->insertTargetTable(DeviceId, Kernel, TT))
    return nullptr;

  return TT;
}

int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
                        ptrdiff_t *tgt_offsets, int32_t arg_num,
                        int32_t num_teams, int32_t thread_limit,

@@ -1111,7 +1216,24 @@ int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
  auto LaunchEntry = reinterpret_cast<__tgt_kernel_launch_entry *>(tgt_entry_ptr);
  KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(LaunchEntry->TargetEntry);
  // If kernel info is nullptr, it means we are dealing with JIT image.
  if (KernelInfo == nullptr) {
    assert(LaunchEntry->Image && LaunchEntry->HostEntry);
    __tgt_device_image NewImage = *(LaunchEntry->Image);
    NewImage.EntriesBegin = LaunchEntry->HostEntry;
    NewImage.EntriesEnd = NewImage.EntriesBegin + 1;
    auto TargetTable =
        loadJITImage(device_id, &NewImage, LaunchEntry->HostEntry, ptrs.data(),
                     arg_num, num_teams, thread_limit, loop_tripcount);
    if (!TargetTable)
      return OFFLOAD_FAIL;

    KernelInfo = reinterpret_cast<KernelTy *>(TargetTable->EntriesBegin->addr);
  }

  assert(KernelInfo && "KernelInfo should not be nullptr");

  std::string kernel_name = std::string(KernelInfo->Name);
  auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
@@ -1640,7 +1762,22 @@ hsa_status_t allow_access_to_all_gpu_agents(void *ptr) {

extern "C" {
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_machine_id_is_amdgcn(image);
  if (elf_machine_id_is_amdgcn(image))
    return 1;

  jit::JITEngine::init();

  if (!JITEngine)
    JITEngine = std::make_unique<jit::JITEngine>("amdgcn", AMDDTC,
                                                 DeviceInfo.NumberOfDevices);

  if (!jit::JITEngine::isValidModule("amdgcn", image))
    return 0;

  if (jit::JITEngine::isSpecializationSupported(image))
    return 2;

  return 3;
}

int __tgt_rtl_number_of_devices() {
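`__tgt_rtl_is_valid_binary` now returns more than a boolean. One possible reading of the return codes used here and in the CUDA plugin below (the enum names are illustrative only; the patch itself leaves a "should use enum here" TODO in libomptarget):

```cpp
// Assumed meaning of the return values of __tgt_rtl_is_valid_binary.
enum ImageCompatibility {
  IC_Incompatible = 0,       // image is not usable by this RTL at all
  IC_PrecompiledDevice = 1,  // regular device ELF, loaded exactly as before
  IC_JITSpecializable = 2,   // LLVM bitcode image, JIT'ed per kernel specialization
  IC_JITGeneric = 3,         // LLVM bitcode image, JIT'ed once without specialization
};
```

libomptarget currently treats only a return value of 2 as a JIT image when registering a library; the CUDA plugin records value-3 images in `NonSpecializedImages` and compiles them when the binary is loaded.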
@@ -1811,12 +1948,23 @@ int32_t __tgt_rtl_init_device(int device_id) {
              DeviceInfo.GroupsPerDevice[device_id] *
                  DeviceInfo.ThreadsPerGroup[device_id]);

  if (JITEngine) {
    jit::DeviceInfo DI;
    DI.Arch = "amdgcn";
    DI.MCpu = DeviceInfo.GPUName[device_id];
    DI.ThreadsPerBlock = DeviceInfo.ThreadsPerGroup[device_id];
    DI.BlocksPerGrid = DeviceInfo.GroupsPerDevice[device_id];
    DI.WarpSize = 32;
    DI.NumThreads = DeviceInfo.NumThreads[device_id];
    DI.NumTeams = DeviceInfo.NumTeams[device_id];
    DI.EnvNumThreads = DeviceInfo.Env.TeamThreadLimit;
    DI.EnvNumTeams = DeviceInfo.Env.NumTeams;
    JITEngine->init(device_id, DI);
  }

  return OFFLOAD_SUCCESS;
}

static __tgt_target_table *
__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  DeviceInfo.load_run_lock.lock();

@@ -11,4 +11,5 @@
##===----------------------------------------------------------------------===##

add_subdirectory(elf_common)
add_subdirectory(JIT)
add_subdirectory(MemoryManager)

openmp/libomptarget/plugins/common/JIT/CMakeLists.txt (new file, 36 lines)
@@ -0,0 +1,36 @@
##===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##
#
# JIT module
#
##===----------------------------------------------------------------------===##

set(LLVM_LINK_COMPONENTS
  AllTargetsAsmParsers
  AllTargetsCodeGens
  AllTargetsDescs
  AllTargetsInfos
  LTO
)

add_llvm_library(LLVM-LIBOMPTARGET-JIT STATIC BUILDTREE_ONLY JIT.cpp)

# Build the JIT library with PIC to be able to link it with plugin shared libraries.
set_property(TARGET LLVM-LIBOMPTARGET-JIT PROPERTY POSITION_INDEPENDENT_CODE ON)

target_link_libraries(LLVM-LIBOMPTARGET-JIT INTERFACE ${OPENMP_PTHREAD_LIB} ncurses dl)

# Expose JIT.h directory to the users of this library.
target_include_directories(LLVM-LIBOMPTARGET-JIT
  INTERFACE
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}
  ${LLVM_INCLUDE_DIRS}
  PRIVATE
  ${LIBOMPTARGET_INCLUDE_DIR}
)
openmp/libomptarget/plugins/common/JIT/JIT.cpp (new file, 1332 lines)
File diff suppressed because it is too large

openmp/libomptarget/plugins/common/JIT/JIT.h (new file, 361 lines)
@@ -0,0 +1,361 @@
//===-- JIT.h --- JIT module ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// JIT module for target plugins.
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstdint>
#include <fstream>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

// Forward declarations.
struct __tgt_target_table;
struct __tgt_device_image;
struct __tgt_offload_entry;
struct __tgt_async_info;

namespace llvm {
class MemoryBuffer;
} // namespace llvm

namespace jit {
class Kernel;

namespace impl {

/// Optimization action applied to a kernel, which is in the form:
/// operation:index:value
///
/// 'operation' can be:
/// 's': value specialization;
/// 'a': alignment specialization;
/// 't': number of threads;
/// 'T': number of teams.
///
/// 'index' can be 'n' for those operations that don't require an index, or an
/// integer number.
///
/// 'value' can be an action (recursively defined, but in fact we don't
/// support it for now), or an integer value.
class Action {
public:
  enum class ActionKind : uint8_t {
    None = 0,
    Alignment,
    Specialization,
    NumTeams,
    NumThreads,
  };

  explicit Action(const std::string &S);

  explicit Action(ActionKind AK, uintptr_t V, int Index);

  explicit Action(ActionKind AK, uintptr_t V);

  std::string toString() const;

  bool match(const Kernel &K) const;

  static std::string ActionsToString(const std::vector<Action> &Actions);

private:
  enum ValuePos : uint8_t {
    POS_OpCode = 0,
    POS_Index = 1,
    POS_Value = 2,
  };

  ActionKind Kind;
  uintptr_t Value;
  int Index;
};
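Under the format documented above, a few example action strings (assumed semantics, for illustration only):

```cpp
// "s:0:42"  - specialize kernel argument 0 to the constant value 42
// "a:1:16"  - specialize the alignment of kernel argument 1 to 16
// "t:n:128" - fix the number of threads to 128 ('n' because no index is needed)
// "T:n:8"   - fix the number of teams to 8
jit::impl::Action Specialize("s:0:42");
```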
class KernelSpecialization {
  /// Kernel entry name.
  const std::string Name;
  /// Target architecture.
  const std::string MCpu;
  ///
  std::vector<Action> Actions;

  friend class Image;

public:
  explicit KernelSpecialization(const std::string &Name,
                                const std::string &MCpu)
      : Name(Name), MCpu(MCpu) {}

  explicit KernelSpecialization(const std::string &Name,
                                const std::string &MCpu,
                                const std::string &ActionString);

  explicit KernelSpecialization(const std::string &Name,
                                const std::string &MCpu,
                                const std::vector<Action> &A);

  bool match(const Kernel &K) const;

  const std::string &getName() const { return Name; }
};

class SpecializationStatistics {
  /// Kernel name.
  const std::string KernelName;
  ///
  uint64_t ThresholdTotalCount = 20;
  ///
  float ThresholdRatio = 0.5f;
  /// Total number of specialization variants that have been generated for the
  /// corresponding kernel.
  uint64_t TotalCount = 0;
  /// Count for each argument.
  std::vector<uint64_t> ArgCount;
  /// Count for num_thread.
  uint64_t NumThreadsCount = 0;
  /// Count for num_team.
  uint64_t NumTeamsCount = 0;
  /// Guard lock.
  std::mutex Lock;

  friend class StatisticsUpdater;

public:
  SpecializationStatistics(const std::string &Name, int NumArgs)
      : KernelName(Name), ArgCount(NumArgs, 0) {}

  bool reachThreshold(Action::ActionKind Kind, int Index) const;

  bool reachThreshold(Action::ActionKind Kind) const;
};

class TargetTable {
  const KernelSpecialization *Specialization;
  __tgt_target_table *Table;

public:
  TargetTable(const KernelSpecialization *KS, __tgt_target_table *Table)
      : Specialization(KS), Table(Table) {}

  bool match(const Kernel &K) const;

  __tgt_target_table *get() const { return Table; }
};

class TargetTableCache {
  ///
  std::unordered_map<std::string, std::list<TargetTable>> Map;

public:
  __tgt_target_table *insert(const KernelSpecialization *KS,
                             __tgt_target_table *Table) {
    auto &Tables = Map[KS->getName()];
    Tables.emplace_back(KS, Table);

    return Tables.back().get();
  }

  __tgt_target_table *get(const Kernel &K) const;
};

class Image {
  KernelSpecialization Specialization;
  ///
  const char *Start = nullptr;
  ///
  const char *End = nullptr;

  void dump(std::ostream &OS) const;

  friend class ImageCache;

public:
  Image(const KernelSpecialization &KS, const char *ImageStart,
        const char *ImageEnd)
      : Specialization(KS), Start(ImageStart), End(ImageEnd) {}

  ///
  std::pair<void *, void *> get() const {
    return std::make_pair((void *)Start, (void *)End);
  }

  ///
  bool match(const Kernel &K) const { return Specialization.match(K); }

  const KernelSpecialization &getKernelSpecialization() const {
    return Specialization;
  }
};

class ImageCache {
public:
  ImageCache(const std::string &Arch);

  ~ImageCache();

  ///
  const Image *insert(const std::string &Key, const KernelSpecialization &KS,
                      std::unique_ptr<llvm::MemoryBuffer> MB);

  ///
  const Image *get(const std::string &Key, const Kernel &K) const {
    auto Itr = Map.find(Key);
    if (Itr == Map.end())
      return nullptr;

    auto &L = Itr->second;
    for (auto &I : L)
      if (I.match(K))
        return &I;

    return nullptr;
  }

private:
  const std::string Arch;
  ///
  std::list<std::unique_ptr<llvm::MemoryBuffer>> NewBuffer;
  ///
  std::unordered_map<std::string, std::list<Image>> Map;
};

} // namespace impl

struct DeviceInfo {
  /// Architecture, e.g. nvptx64, amdgcn.
  std::string Arch;
  /// GPU code name, e.g. sm_75 for an Nvidia GPU.
  std::string MCpu;
  /// Maximum number of registers the device can support.
  uint64_t MaxNumRegs = 0;
  uint64_t ThreadsPerBlock = 0;
  uint64_t BlocksPerGrid = 0;
  uint64_t WarpSize = 32;
  /// Values set by users.
  int64_t EnvNumThreads = -1;
  int64_t EnvNumTeams = -1;
  /// Default values when users don't set them explicitly.
  uint64_t NumThreads = 0;
  uint64_t NumTeams = 0;
};

class Kernel {
  /// Kernel entry name.
  std::string Name;
  /// Target architecture where the kernel is about to be launched.
  std::string MCpu;
  /// Number of threads.
  int NumThreads = 0;
  /// Number of teams.
  int NumTeams = 0;
  ///
  int LoopTripCount = 0;
  /// Number of arguments.
  int NumArgs = 0;
  /// Pointer to the kernel arguments.
  uintptr_t *Args = nullptr;
  /// If the kernel is specialized, an id will be assigned.
  uintptr_t Id = 0;

  Kernel() = default;

public:
  static Kernel create(__tgt_device_image *Image, const char *Name,
                       const std::string &MCpu, void **Args, int NumArgs,
                       int NumTeams, int NumThreads, int LoopTripCount);

  const std::string &getName() const { return Name; }

  const std::string &getMCpu() const { return MCpu; }

  int getNumThreads() const { return NumThreads; }

  int getNumTeams() const { return NumTeams; }

  uintptr_t getArg(int Index) const {
    assert(Index < NumArgs && "out of range access");
    return Args[Index];
  }

  int getNumArgs() const { return NumArgs; }

  friend class JITEngine;
};

class DeviceToolChain {
public:
  virtual std::unique_ptr<llvm::MemoryBuffer> run(const std::string &FileName,
                                                  const DeviceInfo &DI) = 0;
};

class JITEngine {
  const std::string Arch;
  int NumDevices = 0;

  DeviceToolChain &DTC;
  std::vector<DeviceInfo> DI;
  std::unique_ptr<impl::ImageCache> IC;
  std::vector<std::unique_ptr<impl::TargetTableCache>> TTC;

  class StatisticMap {
    std::unordered_map<std::string,
                       std::unique_ptr<impl::SpecializationStatistics>>
        Map;
    std::mutex Mtx;

  public:
    impl::SpecializationStatistics &get(const std::string &K, int NumArgs) {
      std::lock_guard<std::mutex> LG(Mtx);
      auto Itr = Map.find(K);
      if (Itr != Map.end())
        return *Itr->second;
      auto R = Map.insert(
          {K, std::make_unique<impl::SpecializationStatistics>(K, NumArgs)});
      return *R.first->second;
    }
  } Statistics;

public:
  JITEngine(const char *A, DeviceToolChain &DTC, int NumDevices);

  ///
  bool init(int DeviceId, const DeviceInfo &D) {
    if (DeviceId >= NumDevices)
      return false;
    DI[DeviceId] = D;
    TTC[DeviceId] = std::make_unique<impl::TargetTableCache>();
    return true;
  }

  /// Look up the target table cache. Return nullptr if there is no cache match
  /// for that specific kernel.
  __tgt_target_table *getTargetTable(int DeviceId, const Kernel &K);

  /// Get the device image.
  __tgt_device_image *getImage(int DeviceId, Kernel &K,
                               __tgt_device_image *Image);
  /// Get the device image without any kernel specialization.
  __tgt_device_image *getImage(int DeviceId, __tgt_device_image *Image);

  bool insertTargetTable(int DeviceId, const Kernel &K,
                         __tgt_target_table *Table);

  static bool isValidModule(const std::string &Arch, __tgt_device_image *Image);

  static bool isSpecializationSupported(__tgt_device_image *Image);

  static void init();
};
} // namespace jit
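Putting the pieces together, a plugin is expected to drive the engine roughly as follows (a sketch assembled from the plugin changes below; `MyDeviceToolChain`, `loadBinaryForDevice` and the surrounding variables are placeholders, not part of the diff):

```cpp
// One engine per plugin, created lazily in __tgt_rtl_is_valid_binary.
static MyDeviceToolChain DTC;  // subclass of jit::DeviceToolChain
auto Engine = std::make_unique<jit::JITEngine>("nvptx64", DTC, NumDevices);

// Per device, once its properties are known.
jit::DeviceInfo Info;          // Arch, MCpu, launch limits, ...
Engine->init(DeviceId, Info);

// Per kernel launch on the JIT path.
jit::Kernel K = jit::Kernel::create(Image, EntryName, Info.MCpu, Args, NumArgs,
                                    NumTeams, NumThreads, LoopTripCount);
__tgt_target_table *TT = Engine->getTargetTable(DeviceId, K);  // cached variant?
if (!TT) {
  __tgt_device_image *NewImage = Engine->getImage(DeviceId, K, Image);  // JIT compile
  TT = loadBinaryForDevice(DeviceId, NewImage);  // plugin-specific load
  Engine->insertTargetTable(DeviceId, K, TT);    // cache for the next launch
}
```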
@@ -37,17 +37,38 @@ if (LIBOMPTARGET_DEP_CUDA_FOUND AND LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
  set(LIBOMPTARGET_CAN_LINK_LIBCUDA TRUE)
endif()

set(LLVM_LINK_COMPONENTS
  AllTargetsAsmParsers
  AllTargetsCodeGens
  AllTargetsDescs
  AllTargetsInfos
  LTO
)

set(src_files src/rtl.cpp)

if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
  libomptarget_say("Building CUDA plugin linked against libcuda")
  include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
  add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
  set (LIBOMPTARGET_DEP_LIBRARIES ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES})
  set(dependences ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES})
else()
  libomptarget_say("Building CUDA plugin for dlopened libcuda")
  include_directories(dynamic_cuda)
  add_library(omptarget.rtl.cuda SHARED src/rtl.cpp dynamic_cuda/cuda.cpp)
  set (LIBOMPTARGET_DEP_LIBRARIES ${CMAKE_DL_LIBS})
  list(APPEND src_files dynamic_cuda/cuda.cpp)
  set(dependences ${CMAKE_DL_LIBS})
endif()

add_llvm_library(omptarget.rtl.cuda SHARED ${src_files}
  LINK_LIBS elf_common
  MemoryManager
  ${LIBOMPTARGET_DEP_LIBRARIES}
  ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
  ${OPENMP_PTHREAD_LIB}
  ${dependences}
  LLVM-LIBOMPTARGET-JIT
  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
  "-Wl,-z,defs")

add_dependencies(omptarget.rtl.cuda omptarget.devicertl.nvptx)

# Install plugin under the lib destination folder.

@@ -58,15 +79,6 @@ target_include_directories(omptarget.rtl.cuda PRIVATE
  ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}
)

target_link_libraries(omptarget.rtl.cuda
  elf_common
  MemoryManager
  ${LIBOMPTARGET_DEP_LIBRARIES}
  ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
  ${OPENMP_PTHREAD_LIB}
  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
  "-Wl,-z,defs")

# Report to the parent scope that we are building a plugin for CUDA.
# This controls whether tests are run for the nvptx offloading target
# Run them if libcuda is available, or if the user explicitly asked for dlopen

@@ -13,12 +13,17 @@
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <cuda.h>
#include <fstream>
#include <iostream>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "Debug.h"

@@ -31,7 +36,12 @@

#include "MemoryManager.h"

#include "JIT.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Support/MemoryBuffer.h"

using llvm::MemoryBuffer;

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG

@@ -91,6 +101,24 @@ struct KernelTy {
};

namespace {
std::unique_ptr<jit::JITEngine> JITEngine;

class NVDeviceToolChain : public jit::DeviceToolChain {
public:
  std::unique_ptr<llvm::MemoryBuffer> run(const std::string &FileName,
                                          const jit::DeviceInfo &DI) override {
    auto MBOrError = llvm::MemoryBuffer::getFile(
        FileName, /*IsText=*/true, /*RequiresNullTerminator=*/false);
    if (!MBOrError)
      return nullptr;
    if (const char *Str = getenv("LIBOMPTARGET_JIT_DUMP_ASM"))
      fprintf(stderr, ">>> ptx:\n%s\n", (*MBOrError)->getBufferStart());
    return std::move(*MBOrError);
  }
} NVDTC;

std::unordered_set<void *> NonSpecializedImages;

bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

@@ -158,9 +186,20 @@ struct DeviceDataTy {
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // Maximum number of registers available per block
  int MaxRegisters = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;

  struct ComputeCapabilityTy {
    int Major = 3;
    int Minor = 5;

    std::string toString() const { return "sm_" + std::to_string(toInt()); }

    int toInt() const { return Major * 10 + Minor; }
  } ComputeCapability;
};

/// Resource allocator where \p T is the resource type.

@@ -471,7 +510,6 @@ class DeviceRTLTy {
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

public:
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");
@@ -486,6 +524,40 @@
    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }

  __tgt_device_image *loadJITImage(int DeviceId, __tgt_device_image *Image) {
    return JITEngine->getImage(DeviceId, Image);
  }

  __tgt_target_table *loadJITImage(int DeviceId, __tgt_device_image *Image,
                                   __tgt_offload_entry *Entry, void **Args,
                                   int NumArgs, int TeamNum, int ThreadLimit,
                                   int LoopTripCount) {
    auto Kernel = jit::Kernel::create(
        Image, Entry->name, DeviceData[DeviceId].ComputeCapability.toString(),
        Args, NumArgs, TeamNum, ThreadLimit, LoopTripCount);
    if (auto *TT = JITEngine->getTargetTable(DeviceId, Kernel)) {
      DP("found cached target table for kernel entry " DPxMOD ".\n",
         DPxPTR(Entry));
      return TT;
    }

    auto *NewImage = JITEngine->getImage(DeviceId, Kernel, Image);
    if (!NewImage) {
      DP("failed to jit image for kernel entry " DPxMOD ".\n", DPxPTR(Entry));
      return nullptr;
    }

    auto *TT = loadBinary(DeviceId, NewImage);
    if (!TT)
      return nullptr;

    if (!JITEngine->insertTargetTable(DeviceId, Kernel, TT))
      return nullptr;

    return TT;
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;
@@ -749,6 +821,50 @@ public:
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    // Get compute capability
    int SM;
    Err = cuDeviceGetAttribute(
        &SM, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting compute capability major, use default value %d\n",
         DeviceData[DeviceId].ComputeCapability.Major);
    } else {
      DeviceData[DeviceId].ComputeCapability.Major = SM;
    }
    Err = cuDeviceGetAttribute(
        &SM, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting compute capability minor, use default value %d\n",
         DeviceData[DeviceId].ComputeCapability.Minor);
    } else {
      DeviceData[DeviceId].ComputeCapability.Minor = SM;
    }
    int MaxRegs;
    Err = cuDeviceGetAttribute(
        &MaxRegs, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max registers per block, use default value %d\n",
         DeviceData[DeviceId].MaxRegisters);
    } else {
      DeviceData[DeviceId].MaxRegisters = MaxRegs;
    }

    if (JITEngine) {
      jit::DeviceInfo DI;
      DI.Arch = "nvptx64";
      DI.MCpu = DeviceData[DeviceId].ComputeCapability.toString();
      DI.MaxNumRegs = DeviceData[DeviceId].MaxRegisters;
      DI.ThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
      DI.BlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
      DI.WarpSize = 32;
      DI.NumThreads = DeviceData[DeviceId].NumThreads;
      DI.NumTeams = DeviceData[DeviceId].NumTeams;
      DI.EnvNumThreads = EnvTeamThreadLimit;
      DI.EnvNumTeams = EnvNumTeams;

      JITEngine->init(DeviceId, DI);
    }

    return OFFLOAD_SUCCESS;
  }
@@ -790,14 +906,24 @@ public:

  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    void *ImageStart = Image->ImageStart;
    if (NonSpecializedImages.find(ImageStart) != NonSpecializedImages.end()) {
      auto *NewImage =
          loadJITImage(DeviceId, const_cast<__tgt_device_image *>(Image));
      if (!NewImage)
        return nullptr;

      ImageStart = NewImage->ImageStart;
    }

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    DP("Load data from image " DPxMOD "\n", DPxPTR(ImageStart));
    CUresult Err =
        cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
        cuModuleLoadDataEx(&Module, ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

@@ -1073,7 +1199,7 @@ public:
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
                          __tgt_async_info *AsyncInfo) {
    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

@@ -1083,7 +1209,27 @@ public:
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
    auto LaunchEntry =
        reinterpret_cast<__tgt_kernel_launch_entry *>(TgtEntryPtr);
    KernelTy *KernelInfo =
        reinterpret_cast<KernelTy *>(LaunchEntry->TargetEntry);
    // If kernel info is nullptr, it means we are dealing with JIT image.
    if (KernelInfo == nullptr) {
      assert(LaunchEntry->Image && LaunchEntry->HostEntry);
      __tgt_device_image NewImage = *(LaunchEntry->Image);
      NewImage.EntriesBegin = LaunchEntry->HostEntry;
      NewImage.EntriesEnd = NewImage.EntriesBegin + 1;
      auto TargetTable =
          loadJITImage(DeviceId, &NewImage, LaunchEntry->HostEntry, Ptrs.data(),
                       ArgNum, TeamNum, ThreadLimit, LoopTripCount);
      if (!TargetTable)
        return OFFLOAD_FAIL;

      KernelInfo =
          reinterpret_cast<KernelTy *>(TargetTable->EntriesBegin->addr);
    }

    assert(KernelInfo && "KernelInfo should not be nullptr");

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;

@@ -1484,7 +1630,24 @@ extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
  if (elf_check_machine(image, /* EM_CUDA */ 190))
    return 1;

  jit::JITEngine::init();

  if (!JITEngine)
    JITEngine = std::make_unique<jit::JITEngine>("nvptx64", NVDTC,
                                                 DeviceRTL.getNumOfDevices());

  if (!jit::JITEngine::isValidModule("nvptx64", image))
    return 0;

  if (jit::JITEngine::isSpecializationSupported(image))
    return 2;

  NonSpecializedImages.insert(image->ImageStart);

  return 3;
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

@@ -19,7 +19,10 @@
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <memory>
#include <mutex>
#include <vector>

////////////////////////////////////////////////////////////////////////////////
/// adds requires flags

@@ -40,6 +43,7 @@ EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) {
      }
    }
  }
  // PM->RTLs.RegisterLib(createBinDescFrom(desc));
  PM->RTLs.RegisterLib(desc);
}

@@ -107,6 +107,13 @@ static int InitLibrary(DeviceTy &Device) {
      rc = OFFLOAD_FAIL;
      break;
    }

    const bool IsJITImage =
        RegisteredJITImages.find(img) != RegisteredJITImages.end();

    if (IsJITImage)
      continue;

    // 2) load image into the target table.
    __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
        Device.load_binary(img);

@@ -1500,16 +1507,6 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
    return OFFLOAD_FAIL;
  }

  // get target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we have
  // mapped an object only partially, e.g. A[N:M], although the kernel is

@@ -1536,11 +1533,42 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
    }
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
  __tgt_device_image *Image = TM->Table->TargetsImages[Device.DeviceID];
  const bool UseJIT =
      RegisteredJITImages.find(Image) != RegisteredJITImages.end();
  void *TgtEntryPtr = nullptr;
  __tgt_kernel_launch_entry LaunchEntry;
  // get target table if in non-JIT mode.
  if (UseJIT) {
    __tgt_offload_entry *Entry = nullptr;
    __tgt_target_table *HostTable = &TM->Table->HostTable;
    // Find the entry name from the host entries
    // TODO: We might want a map for this
    for (auto Itr = HostTable->EntriesBegin; Itr != HostTable->EntriesEnd;
         ++Itr)
      if (Itr->addr == HostPtr) {
        Entry = Itr;
        break;
      }
    assert(Entry && "cannot find entry");
    LaunchEntry.HostEntry = Entry;
    LaunchEntry.Image = Image;
    DP("Launching target jit execution %s with pointer " DPxMOD ".\n",
       Entry->name, DPxPTR(TgtEntryPtr));
  } else {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    __tgt_target_table *TargetTable = TM->Table->TargetsTable[DeviceId];
    assert(TargetTable && "Global data has not been mapped\n");
    LaunchEntry.TargetEntry = TargetTable->EntriesBegin[TM->Index].addr;
    DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
       TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr),
       TM->Index);
  }
  TgtEntryPtr = &LaunchEntry;

  // Launch device execution.
  {
    TIMESCOPE_WITH_NAME_AND_IDENT(
        IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
@@ -38,6 +38,8 @@ PluginManager *PM;
static char *ProfileTraceFile = nullptr;
#endif

std::unordered_set<__tgt_device_image *> RegisteredJITImages;

__attribute__((constructor(101))) void init() {
  DP("Init target library!\n");

@@ -250,8 +252,7 @@ static void RegisterImageIntoTranslationTable(TranslationTable &TT,

static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
                                             __tgt_device_image *img,
                                             RTLInfoTy *RTL) {

                                             RTLInfoTy *RTL, bool IsJITImage) {
  for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) {
    DeviceTy &Device = *PM->Devices[RTL->Idx + i];
    Device.PendingGlobalsMtx.lock();

@@ -261,13 +262,21 @@ static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
      if (entry->flags & OMP_DECLARE_TARGET_CTOR) {
        DP("Adding ctor " DPxMOD " to the pending list.\n",
           DPxPTR(entry->addr));
        Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr);
        if (IsJITImage)
          Device.PendingCtorsDtors[desc].PendingJITCtors[img].push_back(
              entry->addr);
        else
          Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr);
      } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) {
        // Dtors are pushed in reverse order so they are executed from end
        // to beginning when unregistering the library!
        DP("Adding dtor " DPxMOD " to the pending list.\n",
           DPxPTR(entry->addr));
        Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr);
        if (IsJITImage)
          Device.PendingCtorsDtors[desc].PendingJITDtors[img].push_front(
              entry->addr);
        else
          Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr);
      }

      if (entry->flags & OMP_DECLARE_TARGET_LINK) {

@@ -363,14 +372,21 @@ void RTLsTy::RegisterLib(__tgt_bin_desc *desc) {
    // Scan the RTLs that have associated images until we find one that supports
    // the current image.
    for (auto &R : AllRTLs) {
      if (!R.is_valid_binary(img)) {
      int Ret = R.is_valid_binary(img);
      if (Ret == 0) {
        DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
           DPxPTR(img->ImageStart), R.RTLName.c_str());
        continue;
      }

      DP("Image " DPxMOD " is compatible with RTL %s!\n",
         DPxPTR(img->ImageStart), R.RTLName.c_str());
      // TODO: should use enum here.
      const bool IsJITImage = Ret == 2;

      DP("%sImage " DPxMOD " is compatible with RTL %s!\n",
         IsJITImage ? "JIT " : "", DPxPTR(img->ImageStart), R.RTLName.c_str());

      if (IsJITImage)
        RegisteredJITImages.insert(img);

      initRTLonce(R);

@@ -395,7 +411,7 @@ void RTLsTy::RegisterLib(__tgt_bin_desc *desc) {
      FoundRTL = &R;

      // Load ctors/dtors for static objects
      RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL);
      RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL, IsJITImage);

      // if an RTL was found we are done - proceed to register the next image
      break;

@@ -427,6 +443,9 @@ void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) {

      assert(R->isUsed && "Expecting used RTLs.");

      // FIXME: This is WRONG!!!
      continue;

      if (!R->is_valid_binary(img)) {
        DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n",
           DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));