Compare commits
2 Commits
test-alloc
...
jit
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
02bc7effcc | ||
|
|
6f3e60f1c0 |
@@ -160,6 +160,8 @@ def err_drv_invalid_Xarch_argument_with_args : Error<
|
|||||||
"invalid Xarch argument: '%0', options requiring arguments are unsupported">;
|
"invalid Xarch argument: '%0', options requiring arguments are unsupported">;
|
||||||
def err_drv_Xopenmp_target_missing_triple : Error<
|
def err_drv_Xopenmp_target_missing_triple : Error<
|
||||||
"cannot deduce implicit triple value for -Xopenmp-target, specify triple using -Xopenmp-target=<triple>">;
|
"cannot deduce implicit triple value for -Xopenmp-target, specify triple using -Xopenmp-target=<triple>">;
|
||||||
|
def err_drv_openmp_jit_without_lto : Error<
|
||||||
|
"cannot enable OpenMP offloading JIT, specify bitcode compilation with '-foffload-lto'">;
|
||||||
def err_drv_invalid_Xopenmp_target_with_args : Error<
|
def err_drv_invalid_Xopenmp_target_with_args : Error<
|
||||||
"invalid -Xopenmp-target argument: '%0', options requiring arguments are unsupported">;
|
"invalid -Xopenmp-target argument: '%0', options requiring arguments are unsupported">;
|
||||||
def err_drv_argument_only_allowed_with : Error<
|
def err_drv_argument_only_allowed_with : Error<
|
||||||
|
|||||||
@@ -2539,6 +2539,10 @@ def fopenmp_target_new_runtime : Flag<["-"], "fopenmp-target-new-runtime">,
|
|||||||
Group<f_Group>, Flags<[CC1Option, HelpHidden]>;
|
Group<f_Group>, Flags<[CC1Option, HelpHidden]>;
|
||||||
def fno_openmp_target_new_runtime : Flag<["-"], "fno-openmp-target-new-runtime">,
|
def fno_openmp_target_new_runtime : Flag<["-"], "fno-openmp-target-new-runtime">,
|
||||||
Group<f_Group>, Flags<[CC1Option, HelpHidden]>;
|
Group<f_Group>, Flags<[CC1Option, HelpHidden]>;
|
||||||
|
// Positive form of the OpenMP offloading JIT switch; the matching negative
// form is fno_openmp_target_jit. The help text is user-visible, so it must be
// spelled correctly.
def fopenmp_target_jit : Flag<["-"], "fopenmp-target-jit">, Group<f_Group>,
  HelpText<"Enable JIT compilation for OpenMP Offloading">, Flags<[NoArgumentUnused]>;
|
||||||
|
def fno_openmp_target_jit : Flag<["-"], "fno-openmp-target-jit">, Group<f_Group>,
|
||||||
|
Flags<[NoArgumentUnused, HelpHidden]>;
|
||||||
defm openmp_optimistic_collapse : BoolFOption<"openmp-optimistic-collapse",
|
defm openmp_optimistic_collapse : BoolFOption<"openmp-optimistic-collapse",
|
||||||
LangOpts<"OpenMPOptimisticCollapse">, DefaultFalse,
|
LangOpts<"OpenMPOptimisticCollapse">, DefaultFalse,
|
||||||
PosFlag<SetTrue, [CC1Option]>, NegFlag<SetFalse>, BothFlags<[NoArgumentUnused, HelpHidden]>>;
|
PosFlag<SetTrue, [CC1Option]>, NegFlag<SetFalse>, BothFlags<[NoArgumentUnused, HelpHidden]>>;
|
||||||
|
|||||||
@@ -8285,6 +8285,12 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
|
|||||||
auto OpenMPTCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
|
auto OpenMPTCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
|
||||||
ArgStringList CmdArgs;
|
ArgStringList CmdArgs;
|
||||||
|
|
||||||
|
if (!C.getDriver().isUsingLTO(/* IsOffload */ true) &&
|
||||||
|
Args.hasFlag(options::OPT_fopenmp_target_jit,
|
||||||
|
options::OPT_fno_openmp_target_jit, /*Default*/ false)) {
|
||||||
|
C.getDriver().Diag(clang::diag::err_drv_openmp_jit_without_lto);
|
||||||
|
}
|
||||||
|
|
||||||
// Pass the CUDA path to the linker wrapper tool.
|
// Pass the CUDA path to the linker wrapper tool.
|
||||||
for (Action::OffloadKind Kind : {Action::OFK_Cuda, Action::OFK_OpenMP}) {
|
for (Action::OffloadKind Kind : {Action::OFK_Cuda, Action::OFK_OpenMP}) {
|
||||||
auto TCRange = C.getOffloadToolChains(Kind);
|
auto TCRange = C.getOffloadToolChains(Kind);
|
||||||
@@ -8355,6 +8361,11 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
|
|||||||
if (!OOpt.empty())
|
if (!OOpt.empty())
|
||||||
CmdArgs.push_back(Args.MakeArgString(Twine("-opt-level=O") + OOpt));
|
CmdArgs.push_back(Args.MakeArgString(Twine("-opt-level=O") + OOpt));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (Args.hasFlag(options::OPT_fopenmp_target_jit,
|
||||||
|
options::OPT_fno_openmp_target_jit,
|
||||||
|
/*Default=*/false))
|
||||||
|
CmdArgs.push_back(Args.MakeArgString("-target-embed-bc"));
|
||||||
}
|
}
|
||||||
|
|
||||||
CmdArgs.push_back("-host-triple");
|
CmdArgs.push_back("-host-triple");
|
||||||
|
|||||||
@@ -98,6 +98,7 @@ struct LTOCodeGenerator {
|
|||||||
|
|
||||||
void setCpu(StringRef MCpu) { Config.CPU = std::string(MCpu); }
|
void setCpu(StringRef MCpu) { Config.CPU = std::string(MCpu); }
|
||||||
void setAttrs(std::vector<std::string> MAttrs) { Config.MAttrs = MAttrs; }
|
void setAttrs(std::vector<std::string> MAttrs) { Config.MAttrs = MAttrs; }
|
||||||
|
void setUseDefaultPipeline(bool Value) { Config.UseDefaultPipeline = Value; }
|
||||||
void setOptLevel(unsigned OptLevel);
|
void setOptLevel(unsigned OptLevel);
|
||||||
|
|
||||||
void setShouldInternalize(bool Value) { ShouldInternalize = Value; }
|
void setShouldInternalize(bool Value) { ShouldInternalize = Value; }
|
||||||
@@ -193,6 +194,8 @@ struct LTOCodeGenerator {
|
|||||||
void resetMergedModule() { MergedModule.reset(); }
|
void resetMergedModule() { MergedModule.reset(); }
|
||||||
void DiagnosticHandler(const DiagnosticInfo &DI);
|
void DiagnosticHandler(const DiagnosticInfo &DI);
|
||||||
|
|
||||||
|
Module &getMergedModule() const { return *MergedModule; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// Verify the merged module on first call.
|
/// Verify the merged module on first call.
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -111,6 +111,8 @@ public:
|
|||||||
createInLocalContext(std::unique_ptr<LLVMContext> Context, const void *mem,
|
createInLocalContext(std::unique_ptr<LLVMContext> Context, const void *mem,
|
||||||
size_t length, const TargetOptions &options,
|
size_t length, const TargetOptions &options,
|
||||||
StringRef path);
|
StringRef path);
|
||||||
|
static ErrorOr<std::unique_ptr<LTOModule>>
|
||||||
|
clone(const LTOModule &LM, const TargetOptions &options);
|
||||||
|
|
||||||
const Module &getModule() const { return *Mod; }
|
const Module &getModule() const { return *Mod; }
|
||||||
Module &getModule() { return *Mod; }
|
Module &getModule() { return *Mod; }
|
||||||
|
|||||||
@@ -538,7 +538,8 @@ bool LTOCodeGenerator::optimize() {
|
|||||||
this->applyScopeRestrictions();
|
this->applyScopeRestrictions();
|
||||||
|
|
||||||
// Write LTOPostLink flag for passes that require all the modules.
|
// Write LTOPostLink flag for passes that require all the modules.
|
||||||
MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1);
|
if (!MergedModule->getModuleFlag("LTOPostLink"))
|
||||||
|
MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1);
|
||||||
|
|
||||||
// Add an appropriate DataLayout instance for this module...
|
// Add an appropriate DataLayout instance for this module...
|
||||||
MergedModule->setDataLayout(TargetMach->createDataLayout());
|
MergedModule->setDataLayout(TargetMach->createDataLayout());
|
||||||
|
|||||||
@@ -38,6 +38,7 @@
|
|||||||
#include "llvm/Support/SourceMgr.h"
|
#include "llvm/Support/SourceMgr.h"
|
||||||
#include "llvm/Support/TargetSelect.h"
|
#include "llvm/Support/TargetSelect.h"
|
||||||
#include "llvm/Target/TargetLoweringObjectFile.h"
|
#include "llvm/Target/TargetLoweringObjectFile.h"
|
||||||
|
#include "llvm/Transforms/Utils/Cloning.h"
|
||||||
#include "llvm/Transforms/Utils/GlobalStatus.h"
|
#include "llvm/Transforms/Utils/GlobalStatus.h"
|
||||||
#include <system_error>
|
#include <system_error>
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
@@ -701,3 +702,47 @@ bool LTOModule::hasCtorDtor() const {
|
|||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Build a new LTOModule around a clone of the IR module held by \p LM.
///
/// A fresh TargetMachine is created for the cloned module's triple (or the
/// default target triple when the module has none) using \p options, and the
/// new module's symbols and metadata are parsed before it is returned.
///
/// \returns the new module, or object_error::arch_not_found when no target is
/// registered for the triple.
ErrorOr<std::unique_ptr<LTOModule>>
LTOModule::clone(const LTOModule &LM, const TargetOptions &options) {
  auto ClonedIR = CloneModule(LM.getModule());

  // Resolve the triple the clone is built for.
  std::string TripleName = ClonedIR->getTargetTriple();
  if (TripleName.empty())
    TripleName = sys::getDefaultTargetTriple();
  llvm::Triple TheTriple(TripleName);

  // Look up the registered target for that triple.
  std::string LookupError;
  const Target *TheTarget =
      TargetRegistry::lookupTarget(TripleName, LookupError);
  if (TheTarget == nullptr)
    return make_error_code(object::object_error::arch_not_found);

  // Default subtarget features for the triple.
  SubtargetFeatures DefaultFeatures;
  DefaultFeatures.getDefaultSubtargetFeatures(TheTriple);
  std::string FeatureString = DefaultFeatures.getString();

  // Darwin triples get a default CPU; every other triple keeps it empty.
  std::string CPUName;
  if (TheTriple.isOSDarwin()) {
    const llvm::Triple::ArchType Arch = TheTriple.getArch();
    if (Arch == llvm::Triple::x86_64)
      CPUName = "core2";
    else if (Arch == llvm::Triple::x86)
      CPUName = "yonah";
    else if (TheTriple.isArm64e())
      CPUName = "apple-a12";
    else if (Arch == llvm::Triple::aarch64 ||
             Arch == llvm::Triple::aarch64_32)
      CPUName = "cyclone";
  }

  TargetMachine *Machine = TheTarget->createTargetMachine(
      TripleName, CPUName, FeatureString, options, None);

  // The LTOModule takes over both the cloned IR and the target machine.
  std::unique_ptr<LTOModule> Result(
      new LTOModule(std::move(ClonedIR), LM.MBRef, Machine));
  Result->parseSymbols();
  Result->parseMetadata();

  return std::move(Result);
}
|
||||||
|
|||||||
@@ -1497,6 +1497,14 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
|
|||||||
MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
|
MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
FunctionPassManager EarlyFPM;
|
||||||
|
// Break up allocas
|
||||||
|
EarlyFPM.addPass(SROAPass());
|
||||||
|
MPM.addPass(createModuleToFunctionPassAdaptor(
|
||||||
|
std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses));
|
||||||
|
}
|
||||||
|
|
||||||
// Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
|
// Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
|
||||||
MPM.addPass(OpenMPOptPass());
|
MPM.addPass(OpenMPOptPass());
|
||||||
|
|
||||||
@@ -1632,9 +1640,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
|
|||||||
PGOOpt->ProfileRemappingFile);
|
PGOOpt->ProfileRemappingFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Break up allocas
|
|
||||||
FPM.addPass(SROAPass());
|
|
||||||
|
|
||||||
// LTO provides additional opportunities for tailcall elimination due to
|
// LTO provides additional opportunities for tailcall elimination due to
|
||||||
// link-time inlining, and visibility of nocapture attribute.
|
// link-time inlining, and visibility of nocapture attribute.
|
||||||
FPM.addPass(TailCallElimPass());
|
FPM.addPass(TailCallElimPass());
|
||||||
|
|||||||
@@ -128,6 +128,7 @@ template <typename VTy, typename Ty> struct ValueRAII {
|
|||||||
Val(OldValue), Active(Active) {
|
Val(OldValue), Active(Active) {
|
||||||
if (!Active)
|
if (!Active)
|
||||||
return;
|
return;
|
||||||
|
Ptr = &V.lookup(/* IsReadonly */ false, Ident);
|
||||||
ASSERT(*Ptr == OldValue &&
|
ASSERT(*Ptr == OldValue &&
|
||||||
"ValueRAII initialization with wrong old value!");
|
"ValueRAII initialization with wrong old value!");
|
||||||
*Ptr = NewValue;
|
*Ptr = NewValue;
|
||||||
|
|||||||
@@ -22,6 +22,7 @@
|
|||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
|
#include <unordered_map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "ExclusiveAccess.h"
|
#include "ExclusiveAccess.h"
|
||||||
@@ -302,6 +303,9 @@ typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy;
|
|||||||
struct PendingCtorDtorListsTy {
|
struct PendingCtorDtorListsTy {
|
||||||
std::list<void *> PendingCtors;
|
std::list<void *> PendingCtors;
|
||||||
std::list<void *> PendingDtors;
|
std::list<void *> PendingDtors;
|
||||||
|
|
||||||
|
std::unordered_map<__tgt_device_image *, std::list<void *>> PendingJITCtors;
|
||||||
|
std::unordered_map<__tgt_device_image *, std::list<void *>> PendingJITDtors;
|
||||||
};
|
};
|
||||||
typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy>
|
typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy>
|
||||||
PendingCtorsDtorsPerLibrary;
|
PendingCtorsDtorsPerLibrary;
|
||||||
@@ -461,6 +465,10 @@ struct DeviceTy {
|
|||||||
int32_t destroyEvent(void *Event);
|
int32_t destroyEvent(void *Event);
|
||||||
/// }
|
/// }
|
||||||
|
|
||||||
|
__tgt_target_table *loadJITImage(__tgt_device_image *Image,
|
||||||
|
const char *EntryName, void **TgtArgs,
|
||||||
|
ptrdiff_t *TgtOffsets, int NumArgs);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Call to RTL
|
// Call to RTL
|
||||||
void init(); // To be called only via DeviceTy::initOnce()
|
void init(); // To be called only via DeviceTy::initOnce()
|
||||||
|
|||||||
@@ -197,6 +197,12 @@ struct __tgt_device_info {
|
|||||||
void *Device = nullptr;
|
void *Device = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Descriptor handed to a plugin as the opaque "target entry" pointer when a
/// kernel is launched. The AMDGPU plugin's runRegionLocked casts its
/// tgt_entry_ptr argument to this type.
struct __tgt_kernel_launch_entry {
|
||||||
|
/// Host-side offload entry of the kernel. The AMDGPU plugin asserts it is
/// non-null on the JIT path and uses it as the single entry of the image.
__tgt_offload_entry *HostEntry = nullptr;
|
||||||
|
/// Plugin-specific kernel handle. The AMDGPU plugin treats nullptr here as
/// "this kernel comes from a JIT image and has to be built first".
void *TargetEntry = nullptr;
|
||||||
|
/// Device image the kernel belongs to. The AMDGPU plugin asserts it is
/// non-null on the JIT path.
__tgt_device_image *Image = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
#include <map>
|
#include <map>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <unordered_set>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
// Forward declarations.
|
// Forward declarations.
|
||||||
@@ -184,4 +185,7 @@ struct TableMap {
|
|||||||
};
|
};
|
||||||
typedef std::map<void *, TableMap> HostPtrToTableMapTy;
|
typedef std::map<void *, TableMap> HostPtrToTableMapTy;
|
||||||
|
|
||||||
|
/// A set that stores all registered JIT images.
|
||||||
|
extern std::unordered_set<__tgt_device_image *> RegisteredJITImages;
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ else()
|
|||||||
set(LIBOMPTARGET_DEP_LIBRARIES)
|
set(LIBOMPTARGET_DEP_LIBRARIES)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_library(omptarget.rtl.amdgpu SHARED
|
add_llvm_library(omptarget.rtl.amdgpu SHARED
|
||||||
impl/impl.cpp
|
impl/impl.cpp
|
||||||
impl/interop_hsa.cpp
|
impl/interop_hsa.cpp
|
||||||
impl/data.cpp
|
impl/data.cpp
|
||||||
@@ -100,6 +100,7 @@ target_link_libraries(
|
|||||||
${OPENMP_PTHREAD_LIB}
|
${OPENMP_PTHREAD_LIB}
|
||||||
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
|
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
|
||||||
${LDFLAGS_UNDEFINED}
|
${LDFLAGS_UNDEFINED}
|
||||||
|
LLVM-LIBOMPTARGET-JIT
|
||||||
)
|
)
|
||||||
|
|
||||||
# in case of amdgcn, skip running tests if amdgpu-arch was not built or fails
|
# in case of amdgcn, skip running tests if amdgpu-arch was not built or fails
|
||||||
|
|||||||
@@ -35,8 +35,16 @@
|
|||||||
#include "omptargetplugin.h"
|
#include "omptargetplugin.h"
|
||||||
#include "print_tracing.h"
|
#include "print_tracing.h"
|
||||||
|
|
||||||
|
#include "llvm/ADT/StringRef.h"
|
||||||
#include "llvm/Frontend/OpenMP/OMPConstants.h"
|
#include "llvm/Frontend/OpenMP/OMPConstants.h"
|
||||||
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
|
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
|
||||||
|
#include "llvm/Support/MemoryBuffer.h"
|
||||||
|
#include "llvm/Support/Path.h"
|
||||||
|
#include "llvm/Support/Program.h"
|
||||||
|
|
||||||
|
#include "JIT.h"
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
// hostrpc interface, FIXME: consider moving to its own include these are
|
// hostrpc interface, FIXME: consider moving to its own include these are
|
||||||
// statically linked into amdgpu/plugin if present from hostrpc_services.a,
|
// statically linked into amdgpu/plugin if present from hostrpc_services.a,
|
||||||
@@ -850,8 +858,81 @@ pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
|
|||||||
|
|
||||||
static RTLDeviceInfoTy DeviceInfo;
|
static RTLDeviceInfoTy DeviceInfo;
|
||||||
|
|
||||||
|
static __tgt_target_table *
|
||||||
|
__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
std::unique_ptr<jit::JITEngine> JITEngine;
|
||||||
|
|
||||||
|
class AMDDeviceToolChain : public jit::DeviceToolChain {
|
||||||
|
static std::string getMainExecutable(const char *Name) {
|
||||||
|
void *Ptr = (void *)(intptr_t)&getMainExecutable;
|
||||||
|
auto COWPath = sys::fs::getMainExecutable(Name, Ptr);
|
||||||
|
return sys::path::parent_path(COWPath).str();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get a temporary filename suitable for output.
|
||||||
|
static Error createOutputFile(const Twine &Prefix, StringRef Extension,
|
||||||
|
SmallString<128> &NewFilename) {
|
||||||
|
if (std::error_code EC =
|
||||||
|
sys::fs::createTemporaryFile(Prefix, Extension, NewFilename))
|
||||||
|
return createFileError(NewFilename, EC);
|
||||||
|
return Error::success();
|
||||||
|
}
|
||||||
|
|
||||||
|
Expected<std::string> link(StringRef Input, StringRef Prefix) {
|
||||||
|
ErrorOr<std::string> LLDPath =
|
||||||
|
sys::findProgramByName("lld", {getMainExecutable("lld")});
|
||||||
|
if (!LLDPath)
|
||||||
|
LLDPath = sys::findProgramByName("lld");
|
||||||
|
if (!LLDPath)
|
||||||
|
return createStringError(LLDPath.getError(),
|
||||||
|
"Unable to find 'lld' in path");
|
||||||
|
|
||||||
|
SmallString<128> TempFile;
|
||||||
|
if (Error Err = createOutputFile(Prefix, "o", TempFile))
|
||||||
|
return std::move(Err);
|
||||||
|
|
||||||
|
SmallVector<StringRef, 16> CmdArgs;
|
||||||
|
CmdArgs.push_back(*LLDPath);
|
||||||
|
CmdArgs.push_back("-flavor");
|
||||||
|
CmdArgs.push_back("gnu");
|
||||||
|
CmdArgs.push_back("--no-undefined");
|
||||||
|
CmdArgs.push_back("-shared");
|
||||||
|
CmdArgs.push_back("-o");
|
||||||
|
CmdArgs.push_back(TempFile);
|
||||||
|
CmdArgs.push_back(Input);
|
||||||
|
|
||||||
|
if (sys::ExecuteAndWait(*LLDPath, CmdArgs))
|
||||||
|
return createStringError(inconvertibleErrorCode(), "'lld' failed");
|
||||||
|
|
||||||
|
return static_cast<std::string>(TempFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
std::unique_ptr<MemoryBuffer> run(const std::string &FileName,
|
||||||
|
const jit::DeviceInfo &DI) override {
|
||||||
|
std::string Prefix = "libomptarget-amdgcn-" + DI.MCpu + "-jit";
|
||||||
|
auto FileNameOrErr = link(FileName, Prefix);
|
||||||
|
if (!FileNameOrErr) {
|
||||||
|
Error E = FileNameOrErr.takeError();
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string TempFile = *FileNameOrErr;
|
||||||
|
auto MBOrError = MemoryBuffer::getFile(TempFile, /*IsText=*/false,
|
||||||
|
/*RequiresNullTerminator=*/false);
|
||||||
|
if (std::error_code EC = MBOrError.getError()) {
|
||||||
|
sys::fs::remove(TempFile);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
sys::fs::remove(TempFile);
|
||||||
|
return std::move(*MBOrError);
|
||||||
|
}
|
||||||
|
} AMDDTC;
|
||||||
|
|
||||||
int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
|
int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
|
||||||
__tgt_async_info *AsyncInfo) {
|
__tgt_async_info *AsyncInfo) {
|
||||||
assert(AsyncInfo && "AsyncInfo is nullptr");
|
assert(AsyncInfo && "AsyncInfo is nullptr");
|
||||||
@@ -1090,6 +1171,30 @@ static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
|
|||||||
return packet_id;
|
return packet_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return the target table for the kernel \p Entry of JIT image \p Image on
/// device \p DeviceId, building and loading it when the JIT engine has no
/// table for this kernel yet.
///
/// The kernel description is created from the entry name, the device's GPU
/// name, the launch arguments (\p Args, \p NumArgs) and the launch
/// configuration (\p TeamNum, \p ThreadLimit, \p LoopTripCount).
///
/// \returns the target table, or nullptr when the JIT engine produces no
/// image, the image cannot be loaded, or the new table cannot be registered
/// with the JIT engine.
__tgt_target_table *loadJITImage(int DeviceId, __tgt_device_image *Image,
                                 __tgt_offload_entry *Entry, void **Args,
                                 int NumArgs, int TeamNum, int ThreadLimit,
                                 int LoopTripCount) {
  auto KernelDesc =
      jit::Kernel::create(Image, Entry->name, DeviceInfo.GPUName[DeviceId],
                          Args, NumArgs, TeamNum, ThreadLimit, LoopTripCount);

  // Reuse a table the JIT engine already holds for this kernel.
  auto *KnownTable = JITEngine->getTargetTable(DeviceId, KernelDesc);
  if (KnownTable)
    return KnownTable;

  // Otherwise ask the engine for an image and load it on the device.
  auto *JITImage = JITEngine->getImage(DeviceId, KernelDesc, Image);
  if (JITImage == nullptr)
    return nullptr;

  auto *LoadedTable = __tgt_rtl_load_binary_locked(DeviceId, JITImage);
  if (LoadedTable == nullptr)
    return nullptr;

  // Hand the table out only once the engine has recorded it.
  return JITEngine->insertTargetTable(DeviceId, KernelDesc, LoadedTable)
             ? LoadedTable
             : nullptr;
}
|
||||||
|
|
||||||
int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
|
int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
|
||||||
ptrdiff_t *tgt_offsets, int32_t arg_num,
|
ptrdiff_t *tgt_offsets, int32_t arg_num,
|
||||||
int32_t num_teams, int32_t thread_limit,
|
int32_t num_teams, int32_t thread_limit,
|
||||||
@@ -1111,7 +1216,24 @@ int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
|
|||||||
DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
|
DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
|
||||||
}
|
}
|
||||||
|
|
||||||
KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
|
auto LaunchEntry = reinterpret_cast<__tgt_kernel_launch_entry *>(tgt_entry_ptr);
|
||||||
|
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(LaunchEntry->TargetEntry);
|
||||||
|
// If kernel info is nullptr, it means we are dealing with JIT image.
|
||||||
|
if (KernelInfo == nullptr) {
|
||||||
|
assert(LaunchEntry->Image && LaunchEntry->HostEntry);
|
||||||
|
__tgt_device_image NewImage = *(LaunchEntry->Image);
|
||||||
|
NewImage.EntriesBegin = LaunchEntry->HostEntry;
|
||||||
|
NewImage.EntriesEnd = NewImage.EntriesBegin + 1;
|
||||||
|
auto TargetTable =
|
||||||
|
loadJITImage(device_id, &NewImage, LaunchEntry->HostEntry, ptrs.data(),
|
||||||
|
arg_num, num_teams, thread_limit, loop_tripcount);
|
||||||
|
if (!TargetTable)
|
||||||
|
return OFFLOAD_FAIL;
|
||||||
|
|
||||||
|
KernelInfo = reinterpret_cast<KernelTy *>(TargetTable->EntriesBegin->addr);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(KernelInfo && "KernelInfo should not be nullptr");
|
||||||
|
|
||||||
std::string kernel_name = std::string(KernelInfo->Name);
|
std::string kernel_name = std::string(KernelInfo->Name);
|
||||||
auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
|
auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
|
||||||
@@ -1640,7 +1762,22 @@ hsa_status_t allow_access_to_all_gpu_agents(void *ptr) {
|
|||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
|
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
|
||||||
return elf_machine_id_is_amdgcn(image);
|
if(elf_machine_id_is_amdgcn(image))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
jit::JITEngine::init();
|
||||||
|
|
||||||
|
if (!JITEngine)
|
||||||
|
JITEngine = std::make_unique<jit::JITEngine>("amdgcn", AMDDTC,
|
||||||
|
DeviceInfo.NumberOfDevices);
|
||||||
|
|
||||||
|
if (!jit::JITEngine::isValidModule("amdgcn", image))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (jit::JITEngine::isSpecializationSupported(image))
|
||||||
|
return 2;
|
||||||
|
|
||||||
|
return 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
int __tgt_rtl_number_of_devices() {
|
int __tgt_rtl_number_of_devices() {
|
||||||
@@ -1811,12 +1948,23 @@ int32_t __tgt_rtl_init_device(int device_id) {
|
|||||||
DeviceInfo.GroupsPerDevice[device_id] *
|
DeviceInfo.GroupsPerDevice[device_id] *
|
||||||
DeviceInfo.ThreadsPerGroup[device_id]);
|
DeviceInfo.ThreadsPerGroup[device_id]);
|
||||||
|
|
||||||
|
if (JITEngine) {
|
||||||
|
jit::DeviceInfo DI;
|
||||||
|
DI.Arch = "amdgcn";
|
||||||
|
DI.MCpu = DeviceInfo.GPUName[device_id];
|
||||||
|
DI.ThreadsPerBlock = DeviceInfo.ThreadsPerGroup[device_id];
|
||||||
|
DI.BlocksPerGrid = DeviceInfo.GroupsPerDevice[device_id];
|
||||||
|
DI.WarpSize = 32;
|
||||||
|
DI.NumThreads = DeviceInfo.NumThreads[device_id];
|
||||||
|
DI.NumTeams = DeviceInfo.NumTeams[device_id];
|
||||||
|
DI.EnvNumThreads = DeviceInfo.Env.TeamThreadLimit;
|
||||||
|
DI.EnvNumTeams = DeviceInfo.Env.NumTeams;
|
||||||
|
JITEngine->init(device_id, DI);
|
||||||
|
}
|
||||||
|
|
||||||
return OFFLOAD_SUCCESS;
|
return OFFLOAD_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __tgt_target_table *
|
|
||||||
__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);
|
|
||||||
|
|
||||||
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
|
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
|
||||||
__tgt_device_image *image) {
|
__tgt_device_image *image) {
|
||||||
DeviceInfo.load_run_lock.lock();
|
DeviceInfo.load_run_lock.lock();
|
||||||
|
|||||||
@@ -11,4 +11,5 @@
|
|||||||
##===----------------------------------------------------------------------===##
|
##===----------------------------------------------------------------------===##
|
||||||
|
|
||||||
add_subdirectory(elf_common)
|
add_subdirectory(elf_common)
|
||||||
|
add_subdirectory(JIT)
|
||||||
add_subdirectory(MemoryManager)
|
add_subdirectory(MemoryManager)
|
||||||
|
|||||||
36
openmp/libomptarget/plugins/common/JIT/CMakeLists.txt
Normal file
36
openmp/libomptarget/plugins/common/JIT/CMakeLists.txt
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
##===----------------------------------------------------------------------===##
|
||||||
|
#
|
||||||
|
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
# See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
#
|
||||||
|
##===----------------------------------------------------------------------===##
|
||||||
|
#
|
||||||
|
# JIT module
|
||||||
|
#
|
||||||
|
##===----------------------------------------------------------------------===##
|
||||||
|
|
||||||
|
set(LLVM_LINK_COMPONENTS
|
||||||
|
AllTargetsAsmParsers
|
||||||
|
AllTargetsCodeGens
|
||||||
|
AllTargetsDescs
|
||||||
|
AllTargetsInfos
|
||||||
|
LTO
|
||||||
|
)
|
||||||
|
|
||||||
|
add_llvm_library(LLVM-LIBOMPTARGET-JIT STATIC BUILDTREE_ONLY JIT.cpp)
|
||||||
|
|
||||||
|
# Build the JIT library with PIC to be able to link it with plugin shared libraries.
|
||||||
|
set_property(TARGET LLVM-LIBOMPTARGET-JIT PROPERTY POSITION_INDEPENDENT_CODE ON)
|
||||||
|
|
||||||
|
target_link_libraries(LLVM-LIBOMPTARGET-JIT INTERFACE ${OPENMP_PTHREAD_LIB} ncurses dl)
|
||||||
|
|
||||||
|
# Expose JIT.h directory to the users of this library.
|
||||||
|
target_include_directories(LLVM-LIBOMPTARGET-JIT
|
||||||
|
INTERFACE
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}
|
||||||
|
${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}
|
||||||
|
${LLVM_INCLUDE_DIRS}
|
||||||
|
PRIVATE
|
||||||
|
${LIBOMPTARGET_INCLUDE_DIR}
|
||||||
|
)
|
||||||
1332
openmp/libomptarget/plugins/common/JIT/JIT.cpp
Normal file
1332
openmp/libomptarget/plugins/common/JIT/JIT.cpp
Normal file
File diff suppressed because it is too large
Load Diff
361
openmp/libomptarget/plugins/common/JIT/JIT.h
Normal file
361
openmp/libomptarget/plugins/common/JIT/JIT.h
Normal file
@@ -0,0 +1,361 @@
|
|||||||
|
//===-- JIT.h --- JIT module ----------------------------------------------===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
// JIT module for target plugins.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <fstream>
|
||||||
|
#include <list>
|
||||||
|
#include <memory>
|
||||||
|
#include <mutex>
|
||||||
|
#include <string>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// Forward declaration.
|
||||||
|
struct __tgt_target_table;
|
||||||
|
struct __tgt_device_image;
|
||||||
|
struct __tgt_offload_entry;
|
||||||
|
struct __tgt_async_info;
|
||||||
|
|
||||||
|
namespace llvm {
|
||||||
|
class MemoryBuffer;
|
||||||
|
} // namespace llvm
|
||||||
|
|
||||||
|
namespace jit {
|
||||||
|
class Kernel;
|
||||||
|
|
||||||
|
namespace impl {
|
||||||
|
|
||||||
|
/// Optimization action applied to a kernel, which is in the form:
|
||||||
|
/// operation:index:value
|
||||||
|
///
|
||||||
|
/// 'operation' can be:
|
||||||
|
/// 's': value specialization;
|
||||||
|
/// 'a': alignment specialization;
|
||||||
|
/// 't': number of threads;
|
||||||
|
/// 'T': number of teams.
|
||||||
|
///
|
||||||
|
/// 'index' can be 'n' for those operations that don't require index, or an
|
||||||
|
/// integer number.
|
||||||
|
///
|
||||||
|
/// 'value' can be an action (recursively defined, but in fact we don't
|
||||||
|
/// support it for now), or an integer value.
|
||||||
|
class Action {
|
||||||
|
public:
|
||||||
|
enum class ActionKind : uint8_t {
|
||||||
|
None = 0,
|
||||||
|
Alignment,
|
||||||
|
Specialization,
|
||||||
|
NumTeams,
|
||||||
|
NumThreads,
|
||||||
|
};
|
||||||
|
|
||||||
|
explicit Action(const std::string &S);
|
||||||
|
|
||||||
|
explicit Action(ActionKind AK, uintptr_t V, int Index);
|
||||||
|
|
||||||
|
explicit Action(ActionKind AK, uintptr_t V);
|
||||||
|
|
||||||
|
std::string toString() const;
|
||||||
|
|
||||||
|
bool match(const Kernel &K) const;
|
||||||
|
|
||||||
|
static std::string ActionsToString(const std::vector<Action> &Actions);
|
||||||
|
|
||||||
|
private:
|
||||||
|
enum ValuePos : uint8_t {
|
||||||
|
POS_OpCode = 0,
|
||||||
|
POS_Index = 1,
|
||||||
|
POS_Value = 2,
|
||||||
|
};
|
||||||
|
|
||||||
|
ActionKind Kind;
|
||||||
|
uintptr_t Value;
|
||||||
|
int Index;
|
||||||
|
};
|
||||||
|
|
||||||
|
class KernelSpecialization {
|
||||||
|
/// Kernel entry name.
|
||||||
|
const std::string Name;
|
||||||
|
/// Target architecture.
|
||||||
|
const std::string MCpu;
|
||||||
|
///
|
||||||
|
std::vector<Action> Actions;
|
||||||
|
|
||||||
|
friend class Image;
|
||||||
|
|
||||||
|
public:
|
||||||
|
explicit KernelSpecialization(const std::string &Name,
|
||||||
|
const std::string &MCpu)
|
||||||
|
: Name(Name), MCpu(MCpu) {}
|
||||||
|
|
||||||
|
explicit KernelSpecialization(const std::string &Name,
|
||||||
|
const std::string &MCpu,
|
||||||
|
const std::string &ActionString);
|
||||||
|
|
||||||
|
explicit KernelSpecialization(const std::string &Name,
|
||||||
|
const std::string &MCpu,
|
||||||
|
const std::vector<Action> &A);
|
||||||
|
|
||||||
|
bool match(const Kernel &K) const;
|
||||||
|
|
||||||
|
const std::string &getName() const { return Name; }
|
||||||
|
};
|
||||||
|
|
||||||
|
class SpecializationStatistics {
|
||||||
|
/// Kernel name.
|
||||||
|
const std::string KernelName;
|
||||||
|
///
|
||||||
|
uint64_t ThresholdTotalCount = 20;
|
||||||
|
///
|
||||||
|
float ThresholdRatio = 0.5f;
|
||||||
|
/// Total number of specialization variants that have been generated for the
|
||||||
|
/// corresponding kernel.
|
||||||
|
uint64_t TotalCount = 0;
|
||||||
|
/// Count for each argument.
|
||||||
|
std::vector<uint64_t> ArgCount;
|
||||||
|
/// Count for num_thread.
|
||||||
|
uint64_t NumThreadsCount = 0;
|
||||||
|
/// Count for num_team.
|
||||||
|
uint64_t NumTeamsCount = 0;
|
||||||
|
/// Guard lock.
|
||||||
|
std::mutex Lock;
|
||||||
|
|
||||||
|
friend class StatisticsUpdater;
|
||||||
|
|
||||||
|
public:
|
||||||
|
SpecializationStatistics(const std::string &Name, int NumArgs)
|
||||||
|
: KernelName(Name), ArgCount(NumArgs, 0) {}
|
||||||
|
|
||||||
|
bool reachThreshold(Action::ActionKind Kind, int Index) const;
|
||||||
|
|
||||||
|
bool reachThreshold(Action::ActionKind Kind) const;
|
||||||
|
};
|
||||||
|
|
||||||
|
class TargetTable {
|
||||||
|
const KernelSpecialization *Specialization;
|
||||||
|
__tgt_target_table *Table;
|
||||||
|
|
||||||
|
public:
|
||||||
|
TargetTable(const KernelSpecialization *KS, __tgt_target_table *Table)
|
||||||
|
: Specialization(KS), Table(Table) {}
|
||||||
|
|
||||||
|
bool match(const Kernel &K) const;
|
||||||
|
|
||||||
|
__tgt_target_table *get() const { return Table; }
|
||||||
|
};
|
||||||
|
|
||||||
|
class TargetTableCache {
|
||||||
|
///
|
||||||
|
std::unordered_map<std::string, std::list<TargetTable>> Map;
|
||||||
|
|
||||||
|
public:
|
||||||
|
__tgt_target_table *insert(const KernelSpecialization *KS,
|
||||||
|
__tgt_target_table *Table) {
|
||||||
|
auto &Tables = Map[KS->getName()];
|
||||||
|
Tables.emplace_back(KS, Table);
|
||||||
|
|
||||||
|
return Tables.back().get();
|
||||||
|
}
|
||||||
|
|
||||||
|
__tgt_target_table *get(const Kernel &K) const;
|
||||||
|
};
|
||||||
|
|
||||||
|
class Image {
|
||||||
|
KernelSpecialization Specialization;
|
||||||
|
///
|
||||||
|
const char *Start = nullptr;
|
||||||
|
///
|
||||||
|
const char *End = nullptr;
|
||||||
|
|
||||||
|
void dump(std::ostream &OS) const;
|
||||||
|
|
||||||
|
friend class ImageCache;
|
||||||
|
|
||||||
|
public:
|
||||||
|
Image(const KernelSpecialization &KS, const char *ImageStart,
|
||||||
|
const char *ImageEnd)
|
||||||
|
: Specialization(KS), Start(ImageStart), End(ImageEnd) {}
|
||||||
|
|
||||||
|
///
|
||||||
|
std::pair<void *, void *> get() const {
|
||||||
|
return std::make_pair((void *)Start, (void *)End);
|
||||||
|
}
|
||||||
|
|
||||||
|
///
|
||||||
|
bool match(const Kernel &K) const { return Specialization.match(K); }
|
||||||
|
|
||||||
|
const KernelSpecialization &getKernelSpecialization() const {
|
||||||
|
return Specialization;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class ImageCache {
|
||||||
|
public:
|
||||||
|
ImageCache(const std::string &Arch);
|
||||||
|
|
||||||
|
~ImageCache();
|
||||||
|
|
||||||
|
///
|
||||||
|
const Image *insert(const std::string &Key, const KernelSpecialization &KS,
|
||||||
|
std::unique_ptr<llvm::MemoryBuffer> MB);
|
||||||
|
|
||||||
|
///
|
||||||
|
const Image *get(const std::string &Key, const Kernel &K) const {
|
||||||
|
auto Itr = Map.find(Key);
|
||||||
|
if (Itr == Map.end())
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
auto &L = Itr->second;
|
||||||
|
for (auto &I : L)
|
||||||
|
if (I.match(K))
|
||||||
|
return &I;
|
||||||
|
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
const std::string Arch;
|
||||||
|
///
|
||||||
|
std::list<std::unique_ptr<llvm::MemoryBuffer>> NewBuffer;
|
||||||
|
///
|
||||||
|
std::unordered_map<std::string, std::list<Image>> Map;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace impl
|
||||||
|
|
||||||
|
/// Plain description of a device, filled in by the plugin and handed to the
/// JIT engine.
struct DeviceInfo {
  /// Architecture name, for example nvptx64 or amdgcn.
  std::string Arch;

  /// GPU code name, for example sm_75 on an Nvidia GPU.
  std::string MCpu;

  /// Largest number of registers the device can support.
  uint64_t MaxNumRegs{0};

  /// NOTE(review): presumably the device limit on threads per block -
  /// confirm.
  uint64_t ThreadsPerBlock{0};

  /// NOTE(review): presumably the device limit on blocks per grid - confirm.
  uint64_t BlocksPerGrid{0};

  /// Warp size; defaults to 32.
  uint64_t WarpSize{32};

  /// User-provided values; both default to -1.
  int64_t EnvNumThreads{-1};
  int64_t EnvNumTeams{-1};

  /// Defaults used when the user does not set a value explicitly.
  uint64_t NumThreads{0};
  uint64_t NumTeams{0};
};
|
||||||
|
|
||||||
|
class Kernel {
|
||||||
|
/// Kernel entry name.
|
||||||
|
std::string Name;
|
||||||
|
/// Target architecture where the kernel is about to be launched.
|
||||||
|
std::string MCpu;
|
||||||
|
/// Number of threads.
|
||||||
|
int NumThreads = 0;
|
||||||
|
/// Number of teams.
|
||||||
|
int NumTeams = 0;
|
||||||
|
///
|
||||||
|
int LoopTripCount = 0;
|
||||||
|
/// Number of arguments.
|
||||||
|
int NumArgs = 0;
|
||||||
|
/// Pointer to the kernel arguments.
|
||||||
|
uintptr_t *Args = nullptr;
|
||||||
|
/// If the kernel is specialized, an id will be assigned.
|
||||||
|
uintptr_t Id = 0;
|
||||||
|
|
||||||
|
Kernel() = default;
|
||||||
|
|
||||||
|
public:
|
||||||
|
static Kernel create(__tgt_device_image *Image, const char *Name,
|
||||||
|
const std::string &MCpu, void **Args, int NumArgs,
|
||||||
|
int NumTeams, int NumThreads, int LoopTripCount);
|
||||||
|
|
||||||
|
const std::string &getName() const { return Name; }
|
||||||
|
|
||||||
|
const std::string &getMCpu() const { return MCpu; }
|
||||||
|
|
||||||
|
int getNumThreads() const { return NumThreads; }
|
||||||
|
|
||||||
|
int getNumTeams() const { return NumTeams; }
|
||||||
|
|
||||||
|
uintptr_t getArg(int Index) const {
|
||||||
|
assert(Index < NumArgs && "out of range access");
|
||||||
|
return Args[Index];
|
||||||
|
}
|
||||||
|
|
||||||
|
int getNumArgs() const { return NumArgs; }
|
||||||
|
|
||||||
|
friend class JITEngine;
|
||||||
|
};
|
||||||
|
|
||||||
|
class DeviceToolChain {
|
||||||
|
public:
|
||||||
|
virtual std::unique_ptr<llvm::MemoryBuffer> run(const std::string &FileName,
|
||||||
|
const DeviceInfo &DI) = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
class JITEngine {
|
||||||
|
const std::string Arch;
|
||||||
|
int NumDevices = 0;
|
||||||
|
|
||||||
|
DeviceToolChain &DTC;
|
||||||
|
std::vector<DeviceInfo> DI;
|
||||||
|
std::unique_ptr<impl::ImageCache> IC;
|
||||||
|
std::vector<std::unique_ptr<impl::TargetTableCache>> TTC;
|
||||||
|
|
||||||
|
class StatisticMap {
|
||||||
|
std::unordered_map<std::string,
|
||||||
|
std::unique_ptr<impl::SpecializationStatistics>>
|
||||||
|
Map;
|
||||||
|
std::mutex Mtx;
|
||||||
|
|
||||||
|
public:
|
||||||
|
impl::SpecializationStatistics &get(const std::string &K, int NumArgs) {
|
||||||
|
std::lock_guard<std::mutex> LG(Mtx);
|
||||||
|
auto Itr = Map.find(K);
|
||||||
|
if (Itr != Map.end())
|
||||||
|
return *Itr->second;
|
||||||
|
auto R = Map.insert(
|
||||||
|
{K, std::make_unique<impl::SpecializationStatistics>(K, NumArgs)});
|
||||||
|
return *R.first->second;
|
||||||
|
}
|
||||||
|
} Statistics;
|
||||||
|
|
||||||
|
public:
|
||||||
|
JITEngine(const char *A, DeviceToolChain &DTC, int NumDevices);
|
||||||
|
|
||||||
|
///
|
||||||
|
bool init(int DeviceId, const DeviceInfo &D) {
|
||||||
|
if (DeviceId >= NumDevices)
|
||||||
|
return false;
|
||||||
|
DI[DeviceId] = D;
|
||||||
|
TTC[DeviceId] = std::make_unique<impl::TargetTableCache>();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Look up the target table cache. Return nullptr if there is no cache match
|
||||||
|
/// for that specific kernel.
|
||||||
|
__tgt_target_table *getTargetTable(int DeviceId, const Kernel &K);
|
||||||
|
|
||||||
|
/// Get the device image.
|
||||||
|
__tgt_device_image *getImage(int DeviceId, Kernel &K,
|
||||||
|
__tgt_device_image *Image);
|
||||||
|
/// Get the device image without any kernel specialization.
|
||||||
|
__tgt_device_image *getImage(int DeviceId, __tgt_device_image *Image);
|
||||||
|
|
||||||
|
bool insertTargetTable(int DeviceId, const Kernel &K,
|
||||||
|
__tgt_target_table *Table);
|
||||||
|
|
||||||
|
static bool isValidModule(const std::string &Arch, __tgt_device_image *Image);
|
||||||
|
|
||||||
|
static bool isSpecializationSupported(__tgt_device_image *Image);
|
||||||
|
|
||||||
|
static void init();
|
||||||
|
};
|
||||||
|
} // namespace jit
|
||||||
@@ -37,17 +37,38 @@ if (LIBOMPTARGET_DEP_CUDA_FOUND AND LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
|
|||||||
set(LIBOMPTARGET_CAN_LINK_LIBCUDA TRUE)
|
set(LIBOMPTARGET_CAN_LINK_LIBCUDA TRUE)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
set(LLVM_LINK_COMPONENTS
|
||||||
|
AllTargetsAsmParsers
|
||||||
|
AllTargetsCodeGens
|
||||||
|
AllTargetsDescs
|
||||||
|
AllTargetsInfos
|
||||||
|
LTO
|
||||||
|
)
|
||||||
|
|
||||||
|
set(src_files src/rtl.cpp)
|
||||||
|
|
||||||
if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
|
if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
|
||||||
libomptarget_say("Building CUDA plugin linked against libcuda")
|
libomptarget_say("Building CUDA plugin linked against libcuda")
|
||||||
include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
|
include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
|
||||||
add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
|
set(dependences ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES})
|
||||||
set (LIBOMPTARGET_DEP_LIBRARIES ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES})
|
|
||||||
else()
|
else()
|
||||||
libomptarget_say("Building CUDA plugin for dlopened libcuda")
|
libomptarget_say("Building CUDA plugin for dlopened libcuda")
|
||||||
include_directories(dynamic_cuda)
|
include_directories(dynamic_cuda)
|
||||||
add_library(omptarget.rtl.cuda SHARED src/rtl.cpp dynamic_cuda/cuda.cpp)
|
list(APPEND src_files dynamic_cuda/cuda.cpp)
|
||||||
set (LIBOMPTARGET_DEP_LIBRARIES ${CMAKE_DL_LIBS})
|
set(dependences ${CMAKE_DL_LIBS})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
add_llvm_library(omptarget.rtl.cuda SHARED ${src_files}
|
||||||
|
LINK_LIBS elf_common
|
||||||
|
MemoryManager
|
||||||
|
${LIBOMPTARGET_DEP_LIBRARIES}
|
||||||
|
${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
|
||||||
|
${OPENMP_PTHREAD_LIB}
|
||||||
|
${dependences}
|
||||||
|
LLVM-LIBOMPTARGET-JIT
|
||||||
|
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
|
||||||
|
"-Wl,-z,defs")
|
||||||
|
|
||||||
add_dependencies(omptarget.rtl.cuda omptarget.devicertl.nvptx)
|
add_dependencies(omptarget.rtl.cuda omptarget.devicertl.nvptx)
|
||||||
|
|
||||||
# Install plugin under the lib destination folder.
|
# Install plugin under the lib destination folder.
|
||||||
@@ -58,15 +79,6 @@ target_include_directories(omptarget.rtl.cuda PRIVATE
|
|||||||
${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}
|
${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}
|
||||||
)
|
)
|
||||||
|
|
||||||
target_link_libraries(omptarget.rtl.cuda
|
|
||||||
elf_common
|
|
||||||
MemoryManager
|
|
||||||
${LIBOMPTARGET_DEP_LIBRARIES}
|
|
||||||
${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
|
|
||||||
${OPENMP_PTHREAD_LIB}
|
|
||||||
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
|
|
||||||
"-Wl,-z,defs")
|
|
||||||
|
|
||||||
# Report to the parent scope that we are building a plugin for CUDA.
|
# Report to the parent scope that we are building a plugin for CUDA.
|
||||||
# This controls whether tests are run for the nvptx offloading target
|
# This controls whether tests are run for the nvptx offloading target
|
||||||
# Run them if libcuda is available, or if the user explicitly asked for dlopen
|
# Run them if libcuda is available, or if the user explicitly asked for dlopen
|
||||||
|
|||||||
@@ -13,12 +13,17 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
|
#include <cstdlib>
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
#include <list>
|
#include <list>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
#include <unordered_set>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "Debug.h"
|
#include "Debug.h"
|
||||||
@@ -31,7 +36,12 @@
|
|||||||
|
|
||||||
#include "MemoryManager.h"
|
#include "MemoryManager.h"
|
||||||
|
|
||||||
|
#include "JIT.h"
|
||||||
|
|
||||||
#include "llvm/Frontend/OpenMP/OMPConstants.h"
|
#include "llvm/Frontend/OpenMP/OMPConstants.h"
|
||||||
|
#include "llvm/Support/MemoryBuffer.h"
|
||||||
|
|
||||||
|
using llvm::MemoryBuffer;
|
||||||
|
|
||||||
// Utility for retrieving and printing CUDA error string.
|
// Utility for retrieving and printing CUDA error string.
|
||||||
#ifdef OMPTARGET_DEBUG
|
#ifdef OMPTARGET_DEBUG
|
||||||
@@ -91,6 +101,24 @@ struct KernelTy {
|
|||||||
};
|
};
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
std::unique_ptr<jit::JITEngine> JITEngine;
|
||||||
|
|
||||||
|
/// Tool chain used by the CUDA plugin: it only reads the file handed over by
/// the JIT engine into memory and returns it. The single instance is NVDTC.
class NVDeviceToolChain : public jit::DeviceToolChain {
public:
  /// Read \p FileName into a memory buffer. \p DI is not used.
  /// When the environment variable LIBOMPTARGET_JIT_DUMP_ASM is set (to any
  /// value) the buffer contents are also written to stderr.
  /// \returns the buffer, or nullptr when the file cannot be read.
  std::unique_ptr<llvm::MemoryBuffer> run(const std::string &FileName,
                                          const jit::DeviceInfo &DI) override {
    auto MBOrError = llvm::MemoryBuffer::getFile(
        FileName, /*IsText=*/true, /*RequiresNullTerminator=*/false);
    if (!MBOrError)
      return nullptr;

    if (getenv("LIBOMPTARGET_JIT_DUMP_ASM")) {
      // The buffer was requested without a null terminator, so it must not be
      // printed with "%s"; write exactly getBufferSize() bytes instead.
      const llvm::MemoryBuffer &MB = **MBOrError;
      fputs(">>> ptx:\n", stderr);
      fwrite(MB.getBufferStart(), 1, MB.getBufferSize(), stderr);
      fputc('\n', stderr);
    }

    return std::move(*MBOrError);
  }
} NVDTC;
|
||||||
|
|
||||||
|
std::unordered_set<void *> NonSpecializedImages;
|
||||||
|
|
||||||
bool checkResult(CUresult Err, const char *ErrMsg) {
|
bool checkResult(CUresult Err, const char *ErrMsg) {
|
||||||
if (Err == CUDA_SUCCESS)
|
if (Err == CUDA_SUCCESS)
|
||||||
return true;
|
return true;
|
||||||
@@ -158,9 +186,20 @@ struct DeviceDataTy {
|
|||||||
int ThreadsPerBlock = 0;
|
int ThreadsPerBlock = 0;
|
||||||
int BlocksPerGrid = 0;
|
int BlocksPerGrid = 0;
|
||||||
int WarpSize = 0;
|
int WarpSize = 0;
|
||||||
|
// Maximum number of registers available per block
|
||||||
|
int MaxRegisters = 0;
|
||||||
// OpenMP properties
|
// OpenMP properties
|
||||||
int NumTeams = 0;
|
int NumTeams = 0;
|
||||||
int NumThreads = 0;
|
int NumThreads = 0;
|
||||||
|
|
||||||
|
/// Compute capability of the device as a major/minor pair; defaults to 3.5.
struct ComputeCapabilityTy {
  int Major = 3;
  int Minor = 5;

  /// \returns the capability folded into one integer, Major * 10 + Minor
  /// (35 for the default 3.5).
  int toInt() const { return Major * 10 + Minor; }

  /// \returns "sm_" followed by toInt(), e.g. "sm_35".
  std::string toString() const {
    std::string Result("sm_");
    Result += std::to_string(toInt());
    return Result;
  }
} ComputeCapability;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Resource allocator where \p T is the resource type.
|
/// Resource allocator where \p T is the resource type.
|
||||||
@@ -471,7 +510,6 @@ class DeviceRTLTy {
|
|||||||
E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
|
E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
|
||||||
CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
|
CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
|
||||||
assert(AsyncInfo && "AsyncInfo is nullptr");
|
assert(AsyncInfo && "AsyncInfo is nullptr");
|
||||||
|
|
||||||
@@ -486,6 +524,40 @@ public:
|
|||||||
return reinterpret_cast<CUstream>(AsyncInfo->Queue);
|
return reinterpret_cast<CUstream>(AsyncInfo->Queue);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Ask the JIT engine for the device image of \p Image on device \p DeviceId
/// without any kernel specialization, and hand back whatever it returns.
__tgt_device_image *loadJITImage(int DeviceId, __tgt_device_image *Image) {
  __tgt_device_image *JITImage = JITEngine->getImage(DeviceId, Image);
  return JITImage;
}
|
||||||
|
|
||||||
|
/// Obtain the target table for one kernel entry of a JIT image.
///
/// A jit::Kernel is built from the launch parameters. If the JIT engine
/// already has a target table cached for it, that table is returned.
/// Otherwise the engine is asked for an image for this kernel, the image is
/// loaded through loadBinary() and the resulting table is stored in the
/// engine's cache.
///
/// \returns the target table, or nullptr when the image cannot be produced,
/// cannot be loaded, or the table cannot be inserted into the cache.
__tgt_target_table *loadJITImage(int DeviceId, __tgt_device_image *Image,
                                 __tgt_offload_entry *Entry, void **Args,
                                 int NumArgs, int TeamNum, int ThreadLimit,
                                 int LoopTripCount) {
  auto Kernel = jit::Kernel::create(
      Image, Entry->name, DeviceData[DeviceId].ComputeCapability.toString(),
      Args, NumArgs, TeamNum, ThreadLimit, LoopTripCount);

  // Cache hit: reuse the table that was loaded for a matching kernel. The
  // message used to claim the opposite ("couldn't find") on this path.
  if (auto *TT = JITEngine->getTargetTable(DeviceId, Kernel)) {
    DP("found cached target table for kernel entry " DPxMOD ".\n",
       DPxPTR(Entry));
    return TT;
  }

  // Cache miss: get an image for this kernel from the JIT engine.
  auto *NewImage = JITEngine->getImage(DeviceId, Kernel, Image);
  if (!NewImage) {
    DP("failed to jit image for kernel entry " DPxMOD ".\n", DPxPTR(Entry));
    return nullptr;
  }

  auto *TT = loadBinary(DeviceId, NewImage);
  if (!TT)
    return nullptr;

  // Remember the table so the next launch of a matching kernel can reuse it.
  if (!JITEngine->insertTargetTable(DeviceId, Kernel, TT))
    return nullptr;

  return TT;
}
|
||||||
|
|
||||||
|
public:
|
||||||
// This class should not be copied
|
// This class should not be copied
|
||||||
DeviceRTLTy(const DeviceRTLTy &) = delete;
|
DeviceRTLTy(const DeviceRTLTy &) = delete;
|
||||||
DeviceRTLTy(DeviceRTLTy &&) = delete;
|
DeviceRTLTy(DeviceRTLTy &&) = delete;
|
||||||
@@ -749,6 +821,50 @@ public:
|
|||||||
DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
|
DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get compute capability
|
||||||
|
int SM;
|
||||||
|
Err = cuDeviceGetAttribute(
|
||||||
|
&SM, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
|
||||||
|
if (Err != CUDA_SUCCESS) {
|
||||||
|
DP("Error getting compute capablity major, use default value %d\n",
|
||||||
|
DeviceData[DeviceId].ComputeCapability.Major);
|
||||||
|
} else {
|
||||||
|
DeviceData[DeviceId].ComputeCapability.Major = SM;
|
||||||
|
}
|
||||||
|
Err = cuDeviceGetAttribute(
|
||||||
|
&SM, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
|
||||||
|
if (Err != CUDA_SUCCESS) {
|
||||||
|
DP("Error getting compute capablity minor, use default value %d\n",
|
||||||
|
DeviceData[DeviceId].ComputeCapability.Minor);
|
||||||
|
} else {
|
||||||
|
DeviceData[DeviceId].ComputeCapability.Minor = SM;
|
||||||
|
}
|
||||||
|
int MaxRegs;
|
||||||
|
Err = cuDeviceGetAttribute(
|
||||||
|
&MaxRegs, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device);
|
||||||
|
if (Err != CUDA_SUCCESS) {
|
||||||
|
DP("Error getting max registers per block, use default value %d\n",
|
||||||
|
DeviceData[DeviceId].MaxRegisters);
|
||||||
|
} else {
|
||||||
|
DeviceData[DeviceId].MaxRegisters = MaxRegs;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (JITEngine) {
|
||||||
|
jit::DeviceInfo DI;
|
||||||
|
DI.Arch = "nvptx64";
|
||||||
|
DI.MCpu = DeviceData[DeviceId].ComputeCapability.toString();
|
||||||
|
DI.MaxNumRegs = DeviceData[DeviceId].MaxRegisters;
|
||||||
|
DI.ThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
|
||||||
|
DI.BlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
|
||||||
|
DI.WarpSize = 32;
|
||||||
|
DI.NumThreads = DeviceData[DeviceId].NumThreads;
|
||||||
|
DI.NumTeams = DeviceData[DeviceId].NumTeams;
|
||||||
|
DI.EnvNumThreads = EnvTeamThreadLimit;
|
||||||
|
DI.EnvNumTeams = EnvNumTeams;
|
||||||
|
|
||||||
|
JITEngine->init(DeviceId, DI);
|
||||||
|
}
|
||||||
|
|
||||||
return OFFLOAD_SUCCESS;
|
return OFFLOAD_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -790,14 +906,24 @@ public:
|
|||||||
|
|
||||||
__tgt_target_table *loadBinary(const int DeviceId,
|
__tgt_target_table *loadBinary(const int DeviceId,
|
||||||
const __tgt_device_image *Image) {
|
const __tgt_device_image *Image) {
|
||||||
|
void *ImageStart = Image->ImageStart;
|
||||||
|
if (NonSpecializedImages.find(ImageStart) != NonSpecializedImages.end()) {
|
||||||
|
auto *NewImage =
|
||||||
|
loadJITImage(DeviceId, const_cast<__tgt_device_image *>(Image));
|
||||||
|
if (!NewImage)
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
ImageStart = NewImage->ImageStart;
|
||||||
|
}
|
||||||
|
|
||||||
// Clear the offload table as we are going to create a new one.
|
// Clear the offload table as we are going to create a new one.
|
||||||
clearOffloadEntriesTable(DeviceId);
|
clearOffloadEntriesTable(DeviceId);
|
||||||
|
|
||||||
// Create the module and extract the function pointers.
|
// Create the module and extract the function pointers.
|
||||||
CUmodule Module;
|
CUmodule Module;
|
||||||
DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
|
DP("Load data from image " DPxMOD "\n", DPxPTR(ImageStart));
|
||||||
CUresult Err =
|
CUresult Err =
|
||||||
cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
|
cuModuleLoadDataEx(&Module, ImageStart, 0, nullptr, nullptr);
|
||||||
if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
|
if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
@@ -1073,7 +1199,7 @@ public:
|
|||||||
ptrdiff_t *TgtOffsets, const int ArgNum,
|
ptrdiff_t *TgtOffsets, const int ArgNum,
|
||||||
const int TeamNum, const int ThreadLimit,
|
const int TeamNum, const int ThreadLimit,
|
||||||
const unsigned int LoopTripCount,
|
const unsigned int LoopTripCount,
|
||||||
__tgt_async_info *AsyncInfo) const {
|
__tgt_async_info *AsyncInfo) {
|
||||||
// All args are references.
|
// All args are references.
|
||||||
std::vector<void *> Args(ArgNum);
|
std::vector<void *> Args(ArgNum);
|
||||||
std::vector<void *> Ptrs(ArgNum);
|
std::vector<void *> Ptrs(ArgNum);
|
||||||
@@ -1083,7 +1209,27 @@ public:
|
|||||||
Args[I] = &Ptrs[I];
|
Args[I] = &Ptrs[I];
|
||||||
}
|
}
|
||||||
|
|
||||||
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
|
auto LaunchEntry =
|
||||||
|
reinterpret_cast<__tgt_kernel_launch_entry *>(TgtEntryPtr);
|
||||||
|
KernelTy *KernelInfo =
|
||||||
|
reinterpret_cast<KernelTy *>(LaunchEntry->TargetEntry);
|
||||||
|
// If kernel info is nullptr, it means we are dealing with JIT image.
|
||||||
|
if (KernelInfo == nullptr) {
|
||||||
|
assert(LaunchEntry->Image && LaunchEntry->HostEntry);
|
||||||
|
__tgt_device_image NewImage = *(LaunchEntry->Image);
|
||||||
|
NewImage.EntriesBegin = LaunchEntry->HostEntry;
|
||||||
|
NewImage.EntriesEnd = NewImage.EntriesBegin + 1;
|
||||||
|
auto TargetTable =
|
||||||
|
loadJITImage(DeviceId, &NewImage, LaunchEntry->HostEntry, Ptrs.data(),
|
||||||
|
ArgNum, TeamNum, ThreadLimit, LoopTripCount);
|
||||||
|
if (!TargetTable)
|
||||||
|
return OFFLOAD_FAIL;
|
||||||
|
|
||||||
|
KernelInfo =
|
||||||
|
reinterpret_cast<KernelTy *>(TargetTable->EntriesBegin->addr);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(KernelInfo && "KernelInfo should not be nullptr");
|
||||||
|
|
||||||
const bool IsSPMDGenericMode =
|
const bool IsSPMDGenericMode =
|
||||||
KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
|
KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
|
||||||
@@ -1484,7 +1630,24 @@ extern "C" {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
|
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
|
||||||
return elf_check_machine(image, /* EM_CUDA */ 190);
|
if (elf_check_machine(image, /* EM_CUDA */ 190))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
jit::JITEngine::init();
|
||||||
|
|
||||||
|
if (!JITEngine)
|
||||||
|
JITEngine = std::make_unique<jit::JITEngine>("nvptx64", NVDTC,
|
||||||
|
DeviceRTL.getNumOfDevices());
|
||||||
|
|
||||||
|
if (!jit::JITEngine::isValidModule("nvptx64", image))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (jit::JITEngine::isSpecializationSupported(image))
|
||||||
|
return 2;
|
||||||
|
|
||||||
|
NonSpecializedImages.insert(image->ImageStart);
|
||||||
|
|
||||||
|
return 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
|
int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
|
||||||
|
|||||||
@@ -19,7 +19,10 @@
|
|||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <fstream>
|
||||||
|
#include <memory>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
/// adds requires flags
|
/// adds requires flags
|
||||||
@@ -40,6 +43,7 @@ EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// PM->RTLs.RegisterLib(createBinDescFrom(desc));
|
||||||
PM->RTLs.RegisterLib(desc);
|
PM->RTLs.RegisterLib(desc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -107,6 +107,13 @@ static int InitLibrary(DeviceTy &Device) {
|
|||||||
rc = OFFLOAD_FAIL;
|
rc = OFFLOAD_FAIL;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const bool IsJITImage =
|
||||||
|
RegisteredJITImages.find(img) != RegisteredJITImages.end();
|
||||||
|
|
||||||
|
if (IsJITImage)
|
||||||
|
continue;
|
||||||
|
|
||||||
// 2) load image into the target table.
|
// 2) load image into the target table.
|
||||||
__tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
|
__tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
|
||||||
Device.load_binary(img);
|
Device.load_binary(img);
|
||||||
@@ -1500,16 +1507,6 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
|
|||||||
return OFFLOAD_FAIL;
|
return OFFLOAD_FAIL;
|
||||||
}
|
}
|
||||||
|
|
||||||
// get target table.
|
|
||||||
__tgt_target_table *TargetTable = nullptr;
|
|
||||||
{
|
|
||||||
std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
|
|
||||||
assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
|
|
||||||
"Not expecting a device ID outside the table's bounds!");
|
|
||||||
TargetTable = TM->Table->TargetsTable[DeviceId];
|
|
||||||
}
|
|
||||||
assert(TargetTable && "Global data has not been mapped\n");
|
|
||||||
|
|
||||||
// We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
|
// We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
|
||||||
// need to manifest base pointers prior to launching a kernel. Even if we have
|
// need to manifest base pointers prior to launching a kernel. Even if we have
|
||||||
// mapped an object only partially, e.g. A[N:M], although the kernel is
|
// mapped an object only partially, e.g. A[N:M], although the kernel is
|
||||||
@@ -1536,11 +1533,42 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Launch device execution.
|
__tgt_device_image *Image = TM->Table->TargetsImages[Device.DeviceID];
|
||||||
void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
|
const bool UseJIT =
|
||||||
DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
|
RegisteredJITImages.find(Image) != RegisteredJITImages.end();
|
||||||
TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
|
void *TgtEntryPtr = nullptr;
|
||||||
|
__tgt_kernel_launch_entry LaunchEntry;
|
||||||
|
// get target table if in non-JIT mode.
|
||||||
|
if (UseJIT) {
|
||||||
|
__tgt_offload_entry *Entry = nullptr;
|
||||||
|
__tgt_target_table *HostTable = &TM->Table->HostTable;
|
||||||
|
// Find the entry name from the host entries
|
||||||
|
// TODO: We might want a map for this
|
||||||
|
for (auto Itr = HostTable->EntriesBegin; Itr != HostTable->EntriesEnd;
|
||||||
|
++Itr)
|
||||||
|
if (Itr->addr == HostPtr) {
|
||||||
|
Entry = Itr;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
assert(Entry && "cannot find entry");
|
||||||
|
LaunchEntry.HostEntry = Entry;
|
||||||
|
LaunchEntry.Image = Image;
|
||||||
|
DP("Launching target jit execution %s with pointer " DPxMOD ".\n",
|
||||||
|
Entry->name, DPxPTR(TgtEntryPtr));
|
||||||
|
} else {
|
||||||
|
std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
|
||||||
|
assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
|
||||||
|
"Not expecting a device ID outside the table's bounds!");
|
||||||
|
__tgt_target_table *TargetTable = TM->Table->TargetsTable[DeviceId];
|
||||||
|
assert(TargetTable && "Global data has not been mapped\n");
|
||||||
|
LaunchEntry.TargetEntry = TargetTable->EntriesBegin[TM->Index].addr;
|
||||||
|
DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
|
||||||
|
TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr),
|
||||||
|
TM->Index);
|
||||||
|
}
|
||||||
|
TgtEntryPtr = &LaunchEntry;
|
||||||
|
|
||||||
|
// Launch device execution.
|
||||||
{
|
{
|
||||||
TIMESCOPE_WITH_NAME_AND_IDENT(
|
TIMESCOPE_WITH_NAME_AND_IDENT(
|
||||||
IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
|
IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
|
||||||
|
|||||||
@@ -38,6 +38,8 @@ PluginManager *PM;
|
|||||||
static char *ProfileTraceFile = nullptr;
|
static char *ProfileTraceFile = nullptr;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
std::unordered_set<__tgt_device_image *> RegisteredJITImages;
|
||||||
|
|
||||||
__attribute__((constructor(101))) void init() {
|
__attribute__((constructor(101))) void init() {
|
||||||
DP("Init target library!\n");
|
DP("Init target library!\n");
|
||||||
|
|
||||||
@@ -250,8 +252,7 @@ static void RegisterImageIntoTranslationTable(TranslationTable &TT,
|
|||||||
|
|
||||||
static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
|
static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
|
||||||
__tgt_device_image *img,
|
__tgt_device_image *img,
|
||||||
RTLInfoTy *RTL) {
|
RTLInfoTy *RTL, bool IsJITImage) {
|
||||||
|
|
||||||
for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) {
|
for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) {
|
||||||
DeviceTy &Device = *PM->Devices[RTL->Idx + i];
|
DeviceTy &Device = *PM->Devices[RTL->Idx + i];
|
||||||
Device.PendingGlobalsMtx.lock();
|
Device.PendingGlobalsMtx.lock();
|
||||||
@@ -261,13 +262,21 @@ static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
|
|||||||
if (entry->flags & OMP_DECLARE_TARGET_CTOR) {
|
if (entry->flags & OMP_DECLARE_TARGET_CTOR) {
|
||||||
DP("Adding ctor " DPxMOD " to the pending list.\n",
|
DP("Adding ctor " DPxMOD " to the pending list.\n",
|
||||||
DPxPTR(entry->addr));
|
DPxPTR(entry->addr));
|
||||||
Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr);
|
if (IsJITImage)
|
||||||
|
Device.PendingCtorsDtors[desc].PendingJITCtors[img].push_back(
|
||||||
|
entry->addr);
|
||||||
|
else
|
||||||
|
Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr);
|
||||||
} else if (entry->flags & OMP_DECLARE_TARGET_DTOR) {
|
} else if (entry->flags & OMP_DECLARE_TARGET_DTOR) {
|
||||||
// Dtors are pushed in reverse order so they are executed from end
|
// Dtors are pushed in reverse order so they are executed from end
|
||||||
// to beginning when unregistering the library!
|
// to beginning when unregistering the library!
|
||||||
DP("Adding dtor " DPxMOD " to the pending list.\n",
|
DP("Adding dtor " DPxMOD " to the pending list.\n",
|
||||||
DPxPTR(entry->addr));
|
DPxPTR(entry->addr));
|
||||||
Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr);
|
if (IsJITImage)
|
||||||
|
Device.PendingCtorsDtors[desc].PendingJITDtors[img].push_front(
|
||||||
|
entry->addr);
|
||||||
|
else
|
||||||
|
Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry->flags & OMP_DECLARE_TARGET_LINK) {
|
if (entry->flags & OMP_DECLARE_TARGET_LINK) {
|
||||||
@@ -363,14 +372,21 @@ void RTLsTy::RegisterLib(__tgt_bin_desc *desc) {
|
|||||||
// Scan the RTLs that have associated images until we find one that supports
|
// Scan the RTLs that have associated images until we find one that supports
|
||||||
// the current image.
|
// the current image.
|
||||||
for (auto &R : AllRTLs) {
|
for (auto &R : AllRTLs) {
|
||||||
if (!R.is_valid_binary(img)) {
|
int Ret = R.is_valid_binary(img);
|
||||||
|
if (Ret == 0) {
|
||||||
DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
|
DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
|
||||||
DPxPTR(img->ImageStart), R.RTLName.c_str());
|
DPxPTR(img->ImageStart), R.RTLName.c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
DP("Image " DPxMOD " is compatible with RTL %s!\n",
|
// TODO: should use enum here.
|
||||||
DPxPTR(img->ImageStart), R.RTLName.c_str());
|
const bool IsJITImage = Ret == 2;
|
||||||
|
|
||||||
|
DP("%sImage " DPxMOD " is compatible with RTL %s!\n",
|
||||||
|
IsJITImage ? "JIT " : "", DPxPTR(img->ImageStart), R.RTLName.c_str());
|
||||||
|
|
||||||
|
if (IsJITImage)
|
||||||
|
RegisteredJITImages.insert(img);
|
||||||
|
|
||||||
initRTLonce(R);
|
initRTLonce(R);
|
||||||
|
|
||||||
@@ -395,7 +411,7 @@ void RTLsTy::RegisterLib(__tgt_bin_desc *desc) {
|
|||||||
FoundRTL = &R;
|
FoundRTL = &R;
|
||||||
|
|
||||||
// Load ctors/dtors for static objects
|
// Load ctors/dtors for static objects
|
||||||
RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL);
|
RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL, IsJITImage);
|
||||||
|
|
||||||
// if an RTL was found we are done - proceed to register the next image
|
// if an RTL was found we are done - proceed to register the next image
|
||||||
break;
|
break;
|
||||||
@@ -427,6 +443,9 @@ void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) {
|
|||||||
|
|
||||||
assert(R->isUsed && "Expecting used RTLs.");
|
assert(R->isUsed && "Expecting used RTLs.");
|
||||||
|
|
||||||
|
// FIXME: This is WRONG!!!
|
||||||
|
continue;
|
||||||
|
|
||||||
if (!R->is_valid_binary(img)) {
|
if (!R->is_valid_binary(img)) {
|
||||||
DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n",
|
DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n",
|
||||||
DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
|
DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
|
||||||
|
|||||||
Reference in New Issue
Block a user