Compare commits

...

2 Commits

Author SHA1 Message Date
Shilei Tian
02bc7effcc Init JIT module 2022-05-27 17:44:53 -04:00
Joseph Huber
6f3e60f1c0 [OpenMP] Add flag for embedding bitcode in module for JIT
Summary:
This patch adds the '-fopenmp-target-jit' flag to embed bitcode in the
module when using the new driver.
2022-05-24 13:47:21 -04:00
23 changed files with 2248 additions and 51 deletions

View File

@@ -160,6 +160,8 @@ def err_drv_invalid_Xarch_argument_with_args : Error<
"invalid Xarch argument: '%0', options requiring arguments are unsupported">;
def err_drv_Xopenmp_target_missing_triple : Error<
"cannot deduce implicit triple value for -Xopenmp-target, specify triple using -Xopenmp-target=<triple>">;
// Emitted by the driver when '-fopenmp-target-jit' is requested without
// (offload) LTO: the JIT needs LLVM bitcode embedded in the device image,
// which is only produced when compiling with '-foffload-lto'.
def err_drv_openmp_jit_without_lto : Error<
"cannot enable OpenMP offloading JIT, specify bitcode compilation with '-foffload-lto'">;
def err_drv_invalid_Xopenmp_target_with_args : Error<
"invalid -Xopenmp-target argument: '%0', options requiring arguments are unsupported">;
def err_drv_argument_only_allowed_with : Error<

View File

@@ -2539,6 +2539,10 @@ def fopenmp_target_new_runtime : Flag<["-"], "fopenmp-target-new-runtime">,
Group<f_Group>, Flags<[CC1Option, HelpHidden]>;
def fno_openmp_target_new_runtime : Flag<["-"], "fno-openmp-target-new-runtime">,
Group<f_Group>, Flags<[CC1Option, HelpHidden]>;
// Driver flags toggling JIT compilation for OpenMP offloading.
// Fixed: help-text typo "comilation" -> "compilation"; removed stray space
// inside the Flags<[...]> list to match surrounding option definitions.
def fopenmp_target_jit : Flag<["-"], "fopenmp-target-jit">, Group<f_Group>,
  HelpText<"Enable JIT compilation for OpenMP offloading">, Flags<[NoArgumentUnused]>;
def fno_openmp_target_jit : Flag<["-"], "fno-openmp-target-jit">, Group<f_Group>,
  Flags<[NoArgumentUnused, HelpHidden]>;
defm openmp_optimistic_collapse : BoolFOption<"openmp-optimistic-collapse",
LangOpts<"OpenMPOptimisticCollapse">, DefaultFalse,
PosFlag<SetTrue, [CC1Option]>, NegFlag<SetFalse>, BothFlags<[NoArgumentUnused, HelpHidden]>>;

View File

@@ -8285,6 +8285,12 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
auto OpenMPTCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
ArgStringList CmdArgs;
if (!C.getDriver().isUsingLTO(/* IsOffload */ true) &&
Args.hasFlag(options::OPT_fopenmp_target_jit,
options::OPT_fno_openmp_target_jit, /*Default*/ false)) {
C.getDriver().Diag(clang::diag::err_drv_openmp_jit_without_lto);
}
// Pass the CUDA path to the linker wrapper tool.
for (Action::OffloadKind Kind : {Action::OFK_Cuda, Action::OFK_OpenMP}) {
auto TCRange = C.getOffloadToolChains(Kind);
@@ -8355,6 +8361,11 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
if (!OOpt.empty())
CmdArgs.push_back(Args.MakeArgString(Twine("-opt-level=O") + OOpt));
}
if (Args.hasFlag(options::OPT_fopenmp_target_jit,
options::OPT_fno_openmp_target_jit,
/*Default=*/false))
CmdArgs.push_back(Args.MakeArgString("-target-embed-bc"));
}
CmdArgs.push_back("-host-triple");

View File

@@ -98,6 +98,7 @@ struct LTOCodeGenerator {
void setCpu(StringRef MCpu) { Config.CPU = std::string(MCpu); }
void setAttrs(std::vector<std::string> MAttrs) { Config.MAttrs = MAttrs; }
void setUseDefaultPipeline(bool Value) { Config.UseDefaultPipeline = Value; }
void setOptLevel(unsigned OptLevel);
void setShouldInternalize(bool Value) { ShouldInternalize = Value; }
@@ -193,6 +194,8 @@ struct LTOCodeGenerator {
void resetMergedModule() { MergedModule.reset(); }
void DiagnosticHandler(const DiagnosticInfo &DI);
Module &getMergedModule() const { return *MergedModule; }
private:
/// Verify the merged module on first call.
///

View File

@@ -111,6 +111,8 @@ public:
createInLocalContext(std::unique_ptr<LLVMContext> Context, const void *mem,
size_t length, const TargetOptions &options,
StringRef path);
static ErrorOr<std::unique_ptr<LTOModule>>
clone(const LTOModule &LM, const TargetOptions &options);
const Module &getModule() const { return *Mod; }
Module &getModule() { return *Mod; }

View File

@@ -538,7 +538,8 @@ bool LTOCodeGenerator::optimize() {
this->applyScopeRestrictions();
// Write LTOPostLink flag for passes that require all the modules.
MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1);
if (!MergedModule->getModuleFlag("LTOPostLink"))
MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1);
// Add an appropriate DataLayout instance for this module...
MergedModule->setDataLayout(TargetMach->createDataLayout());

View File

@@ -38,6 +38,7 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
#include <system_error>
using namespace llvm;
@@ -701,3 +702,47 @@ bool LTOModule::hasCtorDtor() const {
}
return false;
}
/// Create a deep copy of \p LM: clone its underlying LLVM module and build a
/// fresh TargetMachine for it with the given \p options.
/// Returns arch_not_found if no backend is registered for the module's triple.
/// NOTE(review): the CPU-defaulting and construction sequence below appears to
/// mirror LTOModule::makeLTOModule — keep the two in sync if either changes.
ErrorOr<std::unique_ptr<LTOModule>>
LTOModule::clone(const LTOModule &LM, const TargetOptions &options) {
// Clone the IR so the new LTOModule owns an independent copy.
auto NM = CloneModule(LM.getModule());
std::string TripleStr = NM->getTargetTriple();
if (TripleStr.empty())
TripleStr = sys::getDefaultTargetTriple();
llvm::Triple Triple(TripleStr);
// find machine architecture for this module
std::string errMsg;
const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
if (!march)
return make_error_code(object::object_error::arch_not_found);
// construct LTOModule, hand over ownership of module and target
SubtargetFeatures Features;
Features.getDefaultSubtargetFeatures(Triple);
std::string FeatureStr = Features.getString();
// Set a default CPU for Darwin triples.
std::string CPU;
if (Triple.isOSDarwin()) {
if (Triple.getArch() == llvm::Triple::x86_64)
CPU = "core2";
else if (Triple.getArch() == llvm::Triple::x86)
CPU = "yonah";
else if (Triple.isArm64e())
CPU = "apple-a12";
else if (Triple.getArch() == llvm::Triple::aarch64 ||
Triple.getArch() == llvm::Triple::aarch64_32)
CPU = "cyclone";
}
// None => default code-generation optimization level for the backend.
TargetMachine *target =
march->createTargetMachine(TripleStr, CPU, FeatureStr, options, None);
// The clone keeps a reference to the ORIGINAL buffer (LM.MBRef); only the
// parsed Module and the TargetMachine are new.
std::unique_ptr<LTOModule> Ret(
new LTOModule(std::move(NM), LM.MBRef, target));
// Populate the symbol and metadata tables of the new module.
Ret->parseSymbols();
Ret->parseMetadata();
return std::move(Ret);
}

View File

@@ -1497,6 +1497,14 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
}
{
FunctionPassManager EarlyFPM;
// Break up allocas
EarlyFPM.addPass(SROAPass());
MPM.addPass(createModuleToFunctionPassAdaptor(
std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses));
}
// Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
MPM.addPass(OpenMPOptPass());
@@ -1632,9 +1640,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
PGOOpt->ProfileRemappingFile);
}
// Break up allocas
FPM.addPass(SROAPass());
// LTO provides additional opportunities for tailcall elimination due to
// link-time inlining, and visibility of nocapture attribute.
FPM.addPass(TailCallElimPass());

View File

@@ -128,6 +128,7 @@ template <typename VTy, typename Ty> struct ValueRAII {
Val(OldValue), Active(Active) {
if (!Active)
return;
Ptr = &V.lookup(/* IsReadonly */ false, Ident);
ASSERT(*Ptr == OldValue &&
"ValueRAII initialization with wrong old value!");
*Ptr = NewValue;

View File

@@ -22,6 +22,7 @@
#include <mutex>
#include <set>
#include <thread>
#include <unordered_map>
#include <vector>
#include "ExclusiveAccess.h"
@@ -302,6 +303,9 @@ typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy;
/// Per-library lists of constructor/destructor entries whose execution on the
/// device is still pending.
struct PendingCtorDtorListsTy {
std::list<void *> PendingCtors;
std::list<void *> PendingDtors;
// JIT images cannot run ctors/dtors until the image is actually compiled and
// loaded, so pending entries are tracked per device image here.
std::unordered_map<__tgt_device_image *, std::list<void *>> PendingJITCtors;
std::unordered_map<__tgt_device_image *, std::list<void *>> PendingJITDtors;
};
typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy>
PendingCtorsDtorsPerLibrary;
@@ -461,6 +465,10 @@ struct DeviceTy {
int32_t destroyEvent(void *Event);
/// }
__tgt_target_table *loadJITImage(__tgt_device_image *Image,
const char *EntryName, void **TgtArgs,
ptrdiff_t *TgtOffsets, int NumArgs);
private:
// Call to RTL
void init(); // To be called only via DeviceTy::initOnce()

View File

@@ -197,6 +197,12 @@ struct __tgt_device_info {
void *Device = nullptr;
};
/// Handle passed to the plugin as the "target entry pointer" for a kernel
/// launch. When TargetEntry is null the plugin resolves the kernel via JIT
/// using HostEntry and Image (see the plugins' runTargetRegion paths).
struct __tgt_kernel_launch_entry {
// Host-side offload entry describing the kernel.
__tgt_offload_entry *HostEntry = nullptr;
// Device-side kernel handle; nullptr means "not yet resolved" (JIT case).
void *TargetEntry = nullptr;
// Device image the kernel comes from; used to JIT-compile when needed.
__tgt_device_image *Image = nullptr;
};
#ifdef __cplusplus
extern "C" {
#endif

View File

@@ -18,6 +18,7 @@
#include <map>
#include <mutex>
#include <string>
#include <unordered_set>
#include <vector>
// Forward declarations.
@@ -184,4 +185,7 @@ struct TableMap {
};
typedef std::map<void *, TableMap> HostPtrToTableMapTy;
/// A set that stores all registered JIT images.
extern std::unordered_set<__tgt_device_image *> RegisteredJITImages;
#endif

View File

@@ -57,7 +57,7 @@ else()
set(LIBOMPTARGET_DEP_LIBRARIES)
endif()
add_library(omptarget.rtl.amdgpu SHARED
add_llvm_library(omptarget.rtl.amdgpu SHARED
impl/impl.cpp
impl/interop_hsa.cpp
impl/data.cpp
@@ -100,6 +100,7 @@ target_link_libraries(
${OPENMP_PTHREAD_LIB}
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
${LDFLAGS_UNDEFINED}
LLVM-LIBOMPTARGET-JIT
)
# in case of amdgcn, skip running tests if amdgpu-arch was not built or fails

View File

@@ -35,8 +35,16 @@
#include "omptargetplugin.h"
#include "print_tracing.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Program.h"
#include "JIT.h"
using namespace llvm;
// hostrpc interface, FIXME: consider moving to its own include these are
// statically linked into amdgpu/plugin if present from hostrpc_services.a,
@@ -850,8 +858,81 @@ pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
static RTLDeviceInfoTy DeviceInfo;
static __tgt_target_table *
__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);
namespace {
std::unique_ptr<jit::JITEngine> JITEngine;
/// AMD-specific device tool chain for the JIT: links the JIT-compiled object
/// file into a loadable shared object with 'lld' and returns its contents.
///
/// Fixes relative to the previous version:
///  - the llvm::Error from a failed link() is now consumed (an unhandled
///    llvm::Error aborts the program in assertion-enabled builds);
///  - the temporary output file is removed when 'lld' fails (was leaked);
///  - the duplicated temporary-file cleanup in run() is collapsed.
class AMDDeviceToolChain : public jit::DeviceToolChain {
  /// Directory containing the current executable; used to prefer an 'lld'
  /// installed next to it before falling back to PATH.
  static std::string getMainExecutable(const char *Name) {
    void *Ptr = (void *)(intptr_t)&getMainExecutable;
    auto COWPath = sys::fs::getMainExecutable(Name, Ptr);
    return sys::path::parent_path(COWPath).str();
  }

  /// Get a temporary filename suitable for output.
  static Error createOutputFile(const Twine &Prefix, StringRef Extension,
                                SmallString<128> &NewFilename) {
    if (std::error_code EC =
            sys::fs::createTemporaryFile(Prefix, Extension, NewFilename))
      return createFileError(NewFilename, EC);
    return Error::success();
  }

  /// Link \p Input into a shared object and return the path of the temporary
  /// file produced, or an Error if 'lld' cannot be found or fails.
  Expected<std::string> link(StringRef Input, StringRef Prefix) {
    ErrorOr<std::string> LLDPath =
        sys::findProgramByName("lld", {getMainExecutable("lld")});
    if (!LLDPath)
      LLDPath = sys::findProgramByName("lld");
    if (!LLDPath)
      return createStringError(LLDPath.getError(),
                               "Unable to find 'lld' in path");

    SmallString<128> TempFile;
    if (Error Err = createOutputFile(Prefix, "o", TempFile))
      return std::move(Err);

    SmallVector<StringRef, 16> CmdArgs;
    CmdArgs.push_back(*LLDPath);
    CmdArgs.push_back("-flavor");
    CmdArgs.push_back("gnu");
    CmdArgs.push_back("--no-undefined");
    CmdArgs.push_back("-shared");
    CmdArgs.push_back("-o");
    CmdArgs.push_back(TempFile);
    CmdArgs.push_back(Input);
    if (sys::ExecuteAndWait(*LLDPath, CmdArgs)) {
      // Do not leak the temporary output when the link fails.
      sys::fs::remove(TempFile);
      return createStringError(inconvertibleErrorCode(), "'lld' failed");
    }
    return static_cast<std::string>(TempFile);
  }

public:
  /// Post-process a JIT-compiled object file \p FileName for the device
  /// described by \p DI. Returns the linked shared object as a memory buffer,
  /// or nullptr on failure.
  std::unique_ptr<MemoryBuffer> run(const std::string &FileName,
                                    const jit::DeviceInfo &DI) override {
    std::string Prefix = "libomptarget-amdgcn-" + DI.MCpu + "-jit";
    auto FileNameOrErr = link(FileName, Prefix);
    if (!FileNameOrErr) {
      // An llvm::Error must be consumed; destroying an unchecked Error aborts
      // in debug builds.
      consumeError(FileNameOrErr.takeError());
      return nullptr;
    }
    std::string TempFile = *FileNameOrErr;
    auto MBOrError = MemoryBuffer::getFile(TempFile, /*IsText=*/false,
                                           /*RequiresNullTerminator=*/false);
    // The temporary shared object is no longer needed once it has been read
    // into memory (or once reading it has failed).
    sys::fs::remove(TempFile);
    if (!MBOrError)
      return nullptr;
    return std::move(*MBOrError);
  }
} AMDDTC;
int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
__tgt_async_info *AsyncInfo) {
assert(AsyncInfo && "AsyncInfo is nullptr");
@@ -1090,6 +1171,30 @@ static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
return packet_id;
}
/// Resolve a JIT-compiled target table for the kernel described by \p Entry
/// with the given launch configuration, compiling and loading a device image
/// on demand. Returns nullptr on any failure.
__tgt_target_table *loadJITImage(int DeviceId, __tgt_device_image *Image,
                                 __tgt_offload_entry *Entry, void **Args,
                                 int NumArgs, int TeamNum, int ThreadLimit,
                                 int LoopTripCount) {
  // Describe this launch so the JIT engine can match cached results.
  auto LaunchKernel =
      jit::Kernel::create(Image, Entry->name, DeviceInfo.GPUName[DeviceId],
                          Args, NumArgs, TeamNum, ThreadLimit, LoopTripCount);

  // Fast path: a previously loaded target table already matches this launch.
  if (auto *Cached = JITEngine->getTargetTable(DeviceId, LaunchKernel))
    return Cached;

  // Slow path: JIT-compile a device image for this kernel/configuration.
  __tgt_device_image *JITedImage =
      JITEngine->getImage(DeviceId, LaunchKernel, Image);
  if (!JITedImage)
    return nullptr;

  // Load the freshly generated image and cache its table for later launches.
  __tgt_target_table *Table =
      __tgt_rtl_load_binary_locked(DeviceId, JITedImage);
  if (!Table || !JITEngine->insertTargetTable(DeviceId, LaunchKernel, Table))
    return nullptr;
  return Table;
}
int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
ptrdiff_t *tgt_offsets, int32_t arg_num,
int32_t num_teams, int32_t thread_limit,
@@ -1111,7 +1216,24 @@ int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
}
KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
auto LaunchEntry = reinterpret_cast<__tgt_kernel_launch_entry *>(tgt_entry_ptr);
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(LaunchEntry->TargetEntry);
// If kernel info is nullptr, it means we are dealing with JIT image.
if (KernelInfo == nullptr) {
assert(LaunchEntry->Image && LaunchEntry->HostEntry);
__tgt_device_image NewImage = *(LaunchEntry->Image);
NewImage.EntriesBegin = LaunchEntry->HostEntry;
NewImage.EntriesEnd = NewImage.EntriesBegin + 1;
auto TargetTable =
loadJITImage(device_id, &NewImage, LaunchEntry->HostEntry, ptrs.data(),
arg_num, num_teams, thread_limit, loop_tripcount);
if (!TargetTable)
return OFFLOAD_FAIL;
KernelInfo = reinterpret_cast<KernelTy *>(TargetTable->EntriesBegin->addr);
}
assert(KernelInfo && "KernelInfo should not be nullptr");
std::string kernel_name = std::string(KernelInfo->Name);
auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
@@ -1640,7 +1762,22 @@ hsa_status_t allow_access_to_all_gpu_agents(void *ptr) {
extern "C" {
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
return elf_machine_id_is_amdgcn(image);
if(elf_machine_id_is_amdgcn(image))
return 1;
jit::JITEngine::init();
if (!JITEngine)
JITEngine = std::make_unique<jit::JITEngine>("amdgcn", AMDDTC,
DeviceInfo.NumberOfDevices);
if (!jit::JITEngine::isValidModule("amdgcn", image))
return 0;
if (jit::JITEngine::isSpecializationSupported(image))
return 2;
return 3;
}
int __tgt_rtl_number_of_devices() {
@@ -1811,12 +1948,23 @@ int32_t __tgt_rtl_init_device(int device_id) {
DeviceInfo.GroupsPerDevice[device_id] *
DeviceInfo.ThreadsPerGroup[device_id]);
if (JITEngine) {
jit::DeviceInfo DI;
DI.Arch = "amdgcn";
DI.MCpu = DeviceInfo.GPUName[device_id];
DI.ThreadsPerBlock = DeviceInfo.ThreadsPerGroup[device_id];
DI.BlocksPerGrid = DeviceInfo.GroupsPerDevice[device_id];
DI.WarpSize = 32;
DI.NumThreads = DeviceInfo.NumThreads[device_id];
DI.NumTeams = DeviceInfo.NumTeams[device_id];
DI.EnvNumThreads = DeviceInfo.Env.TeamThreadLimit;
DI.EnvNumTeams = DeviceInfo.Env.NumTeams;
JITEngine->init(device_id, DI);
}
return OFFLOAD_SUCCESS;
}
static __tgt_target_table *
__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
__tgt_device_image *image) {
DeviceInfo.load_run_lock.lock();

View File

@@ -11,4 +11,5 @@
##===----------------------------------------------------------------------===##
add_subdirectory(elf_common)
add_subdirectory(JIT)
add_subdirectory(MemoryManager)

View File

@@ -0,0 +1,36 @@
##===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##
#
# JIT module
#
##===----------------------------------------------------------------------===##
# LLVM components the JIT needs: every registered backend plus the LTO
# pipeline used to optimize and codegen the embedded bitcode.
set(LLVM_LINK_COMPONENTS
AllTargetsAsmParsers
AllTargetsCodeGens
AllTargetsDescs
AllTargetsInfos
LTO
)
add_llvm_library(LLVM-LIBOMPTARGET-JIT STATIC BUILDTREE_ONLY JIT.cpp)
# Build the JIT library with PIC to be able to link it with plugin shared libraries.
set_property(TARGET LLVM-LIBOMPTARGET-JIT PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(LLVM-LIBOMPTARGET-JIT INTERFACE ${OPENMP_PTHREAD_LIB} ncurses dl)
# Expose JIT.h directory to the users of this library.
target_include_directories(LLVM-LIBOMPTARGET-JIT
INTERFACE
${CMAKE_CURRENT_SOURCE_DIR}
${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}
${LLVM_INCLUDE_DIRS}
PRIVATE
${LIBOMPTARGET_INCLUDE_DIR}
)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,361 @@
//===-- JIT.h --- JIT module ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// JIT module for target plugins.
//
//===----------------------------------------------------------------------===//
#include <cassert>
#include <cstdint>
#include <fstream>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
// Forward declaration.
struct __tgt_target_table;
struct __tgt_device_image;
struct __tgt_offload_entry;
struct __tgt_async_info;
namespace llvm {
class MemoryBuffer;
} // namespace llvm
namespace jit {
class Kernel;
namespace impl {
/// Optimization action applied to a kernel, which is in the form:
/// operation:index:value
///
/// 'operation' can be:
/// 's': value specialization;
/// 'a': alignment specialization;
/// 't': number of threads;
/// 'T': number of teams.
///
/// 'index' can be 'n' for those operations that don't require index, or an
/// integer number.
///
/// 'value' can be an action (recursively defined, but in fact we don't
/// support it for now), or an integer value.
/// One optimization action applied to a kernel; the textual grammar
/// ("operation:index:value") is documented in the comment above this class.
class Action {
public:
/// Kind of optimization the action encodes (maps to the 'operation' field).
enum class ActionKind : uint8_t {
None = 0,
Alignment,
Specialization,
NumTeams,
NumThreads,
};
/// Parse an action from its "operation:index:value" string form.
explicit Action(const std::string &S);
/// Build an action of kind \p AK with value \p V for argument index \p Index.
explicit Action(ActionKind AK, uintptr_t V, int Index);
/// Build an index-less action of kind \p AK with value \p V.
explicit Action(ActionKind AK, uintptr_t V);
/// Render the action back into its string form.
std::string toString() const;
/// Whether this action is consistent with kernel \p K.
bool match(const Kernel &K) const;
/// Render a list of actions as one string.
static std::string ActionsToString(const std::vector<Action> &Actions);
private:
/// Field positions within the colon-separated string form.
enum ValuePos : uint8_t {
POS_OpCode = 0,
POS_Index = 1,
POS_Value = 2,
};
ActionKind Kind;
uintptr_t Value;
int Index;
};
/// Identifies one specialized variant of a kernel: the kernel name, the
/// target CPU, and the set of actions applied to produce the variant.
class KernelSpecialization {
/// Kernel entry name.
const std::string Name;
/// Target architecture.
const std::string MCpu;
/// Actions applied to specialize this kernel variant (empty means the
/// unspecialized/default variant).
std::vector<Action> Actions;
friend class Image;
public:
/// Construct the unspecialized variant (no actions).
explicit KernelSpecialization(const std::string &Name,
const std::string &MCpu)
: Name(Name), MCpu(MCpu) {}
/// Construct from a serialized action string (see Action's grammar).
explicit KernelSpecialization(const std::string &Name,
const std::string &MCpu,
const std::string &ActionString);
/// Construct from an explicit list of actions.
explicit KernelSpecialization(const std::string &Name,
const std::string &MCpu,
const std::vector<Action> &A);
/// Whether this specialization is applicable to kernel launch \p K.
bool match(const Kernel &K) const;
const std::string &getName() const { return Name; }
};
/// Per-kernel counters used to decide whether further specialization is
/// worthwhile (see reachThreshold).
class SpecializationStatistics {
/// Kernel name.
const std::string KernelName;
/// Threshold on TotalCount used by reachThreshold — presumably the maximum
/// number of variants before specialization is cut off; confirm in JIT.cpp.
uint64_t ThresholdTotalCount = 20;
/// Ratio threshold used by reachThreshold — semantics defined in JIT.cpp.
float ThresholdRatio = 0.5f;
/// Total number of specialization variants that have been generated for the
/// corresponding kernel.
uint64_t TotalCount = 0;
/// Count for each argument.
std::vector<uint64_t> ArgCount;
/// Count for num_thread.
uint64_t NumThreadsCount = 0;
/// Count for num_team.
uint64_t NumTeamsCount = 0;
/// Guard lock.
std::mutex Lock;
friend class StatisticsUpdater;
public:
SpecializationStatistics(const std::string &Name, int NumArgs)
: KernelName(Name), ArgCount(NumArgs, 0) {}
/// Whether the counter for action kind \p Kind (and argument \p Index, where
/// applicable) has crossed the configured thresholds.
bool reachThreshold(Action::ActionKind Kind, int Index) const;
bool reachThreshold(Action::ActionKind Kind) const;
};
/// Associates a loaded __tgt_target_table with the specialization it was
/// generated for, so future launches can be matched against it.
class TargetTable {
// Specialization this table was built for (not owned).
const KernelSpecialization *Specialization;
// The loaded target table (not owned).
__tgt_target_table *Table;
public:
TargetTable(const KernelSpecialization *KS, __tgt_target_table *Table)
: Specialization(KS), Table(Table) {}
/// Whether this table's specialization matches kernel launch \p K.
bool match(const Kernel &K) const;
__tgt_target_table *get() const { return Table; }
};
/// Per-device cache of target tables, keyed by kernel name; each kernel may
/// have multiple tables, one per specialization variant.
class TargetTableCache {
/// Map from kernel name to the tables generated for that kernel's variants.
std::unordered_map<std::string, std::list<TargetTable>> Map;
public:
/// Record \p Table as the table for specialization \p KS and return it.
__tgt_target_table *insert(const KernelSpecialization *KS,
__tgt_target_table *Table) {
auto &Tables = Map[KS->getName()];
Tables.emplace_back(KS, Table);
return Tables.back().get();
}
/// Look up a cached table matching launch \p K; defined in JIT.cpp.
__tgt_target_table *get(const Kernel &K) const;
};
/// A JIT-produced device image together with the specialization it encodes.
class Image {
KernelSpecialization Specialization;
/// First byte of the image in memory (not owned; backing storage is kept
/// alive by ImageCache::NewBuffer).
const char *Start = nullptr;
/// One past the last byte of the image.
const char *End = nullptr;
void dump(std::ostream &OS) const;
friend class ImageCache;
public:
Image(const KernelSpecialization &KS, const char *ImageStart,
const char *ImageEnd)
: Specialization(KS), Start(ImageStart), End(ImageEnd) {}
/// Return the (begin, end) pointers of the image.
std::pair<void *, void *> get() const {
return std::make_pair((void *)Start, (void *)End);
}
/// Whether this image's specialization matches kernel launch \p K.
bool match(const Kernel &K) const { return Specialization.match(K); }
const KernelSpecialization &getKernelSpecialization() const {
return Specialization;
}
};
/// Cache of JIT-produced images, keyed by a string key with one entry per
/// specialization variant under each key.
class ImageCache {
public:
ImageCache(const std::string &Arch);
~ImageCache();
/// Take ownership of \p MB and register it under \p Key as the image for
/// specialization \p KS; returns the cached Image.
const Image *insert(const std::string &Key, const KernelSpecialization &KS,
std::unique_ptr<llvm::MemoryBuffer> MB);
/// Linear scan of the variants under \p Key for one matching launch \p K;
/// returns nullptr when the key is unknown or nothing matches.
const Image *get(const std::string &Key, const Kernel &K) const {
auto Itr = Map.find(Key);
if (Itr == Map.end())
return nullptr;
auto &L = Itr->second;
for (auto &I : L)
if (I.match(K))
return &I;
return nullptr;
}
private:
const std::string Arch;
/// Owns the buffers backing the cached Images (their Start/End pointers).
std::list<std::unique_ptr<llvm::MemoryBuffer>> NewBuffer;
/// Key -> list of image variants for that key.
std::unordered_map<std::string, std::list<Image>> Map;
};
} // namespace impl
/// Device properties the plugin feeds to the JIT engine at init time
/// (see the plugins' __tgt_rtl_init_device / initProgram paths).
struct DeviceInfo {
/// Architecture, e.g. nvptx64, amdgcn.
std::string Arch;
/// GPU code name, e.g. sm_75 for Nvidia GPU.
std::string MCpu;
/// Maximum number of registers the device can support.
uint64_t MaxNumRegs = 0;
/// Hardware limits reported by the plugin.
uint64_t ThreadsPerBlock = 0;
uint64_t BlocksPerGrid = 0;
/// Warp/wavefront size; both plugins currently pass 32.
uint64_t WarpSize = 32;
/// Values set by users.
int64_t EnvNumThreads = -1;
int64_t EnvNumTeams = -1;
/// Default values when users don't set explicitly.
uint64_t NumThreads = 0;
uint64_t NumTeams = 0;
};
/// Immutable description of one kernel launch, used as the lookup key for
/// cached images/tables and as input to specialization decisions.
class Kernel {
/// Kernel entry name.
std::string Name;
/// Target architecture where the kernel is about to be launched.
std::string MCpu;
/// Number of threads.
int NumThreads = 0;
/// Number of teams.
int NumTeams = 0;
/// Loop trip count supplied at launch time (0 by default).
int LoopTripCount = 0;
/// Number of arguments.
int NumArgs = 0;
/// Pointer to the kernel arguments.
uintptr_t *Args = nullptr;
/// If the kernel is specialized, an id will be assigned.
uintptr_t Id = 0;
// Only create() may construct a Kernel.
Kernel() = default;
public:
/// Build a Kernel description from the raw launch parameters the plugin
/// receives; defined in JIT.cpp.
static Kernel create(__tgt_device_image *Image, const char *Name,
const std::string &MCpu, void **Args, int NumArgs,
int NumTeams, int NumThreads, int LoopTripCount);
const std::string &getName() const { return Name; }
const std::string &getMCpu() const { return MCpu; }
int getNumThreads() const { return NumThreads; }
int getNumTeams() const { return NumTeams; }
/// Return argument \p Index as an integer value.
/// NOTE(review): negative Index is not checked — callers must pass >= 0.
uintptr_t getArg(int Index) const {
assert(Index < NumArgs && "out of range access");
return Args[Index];
}
int getNumArgs() const { return NumArgs; }
friend class JITEngine;
};
/// Interface a plugin implements to post-process JIT codegen output into a
/// loadable device image (e.g. linking with lld for amdgcn, or a plain
/// read-back of PTX for nvptx64).
class DeviceToolChain {
public:
  // Polymorphic base: a virtual destructor is required so deleting a derived
  // tool chain through a DeviceToolChain pointer is well-defined.
  virtual ~DeviceToolChain() = default;

  /// Transform the codegen output in \p FileName for the device described by
  /// \p DI; returns the final image, or nullptr on failure.
  virtual std::unique_ptr<llvm::MemoryBuffer> run(const std::string &FileName,
                                                  const DeviceInfo &DI) = 0;
};
/// Central JIT driver shared by the plugins: owns per-device info, the image
/// cache, per-device target-table caches, and per-kernel statistics.
class JITEngine {
/// Target architecture string, e.g. "amdgcn" or "nvptx64".
const std::string Arch;
int NumDevices = 0;
/// Plugin-provided post-processing tool chain (not owned).
DeviceToolChain &DTC;
/// Per-device device descriptions, indexed by device id.
std::vector<DeviceInfo> DI;
/// Cache of JIT-produced images.
std::unique_ptr<impl::ImageCache> IC;
/// Per-device target table caches, indexed by device id.
std::vector<std::unique_ptr<impl::TargetTableCache>> TTC;
/// Thread-safe map from kernel name to its specialization statistics.
class StatisticMap {
std::unordered_map<std::string,
std::unique_ptr<impl::SpecializationStatistics>>
Map;
std::mutex Mtx;
public:
/// Return the statistics for kernel \p K, creating them on first use.
impl::SpecializationStatistics &get(const std::string &K, int NumArgs) {
std::lock_guard<std::mutex> LG(Mtx);
auto Itr = Map.find(K);
if (Itr != Map.end())
return *Itr->second;
auto R = Map.insert(
{K, std::make_unique<impl::SpecializationStatistics>(K, NumArgs)});
return *R.first->second;
}
} Statistics;
public:
JITEngine(const char *A, DeviceToolChain &DTC, int NumDevices);
/// Record \p D for device \p DeviceId and set up its target table cache.
/// Returns false when DeviceId is out of range.
/// NOTE(review): negative DeviceId is not rejected — confirm callers.
bool init(int DeviceId, const DeviceInfo &D) {
if (DeviceId >= NumDevices)
return false;
DI[DeviceId] = D;
TTC[DeviceId] = std::make_unique<impl::TargetTableCache>();
return true;
}
/// Look up the target table cache. Return nullptr if there is no cache match
/// for that specific kernel.
__tgt_target_table *getTargetTable(int DeviceId, const Kernel &K);
/// Get the device image.
__tgt_device_image *getImage(int DeviceId, Kernel &K,
__tgt_device_image *Image);
/// Get the device image without any kernel specialization.
__tgt_device_image *getImage(int DeviceId, __tgt_device_image *Image);
/// Cache \p Table as the result for kernel \p K on device \p DeviceId.
bool insertTargetTable(int DeviceId, const Kernel &K,
__tgt_target_table *Table);
/// Whether \p Image contains a module this JIT can handle for \p Arch.
static bool isValidModule(const std::string &Arch, __tgt_device_image *Image);
/// Whether \p Image supports kernel specialization.
static bool isSpecializationSupported(__tgt_device_image *Image);
/// One-time global initialization (e.g. target registration); see JIT.cpp.
static void init();
};
} // namespace jit

View File

@@ -37,17 +37,38 @@ if (LIBOMPTARGET_DEP_CUDA_FOUND AND LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
set(LIBOMPTARGET_CAN_LINK_LIBCUDA TRUE)
endif()
set(LLVM_LINK_COMPONENTS
AllTargetsAsmParsers
AllTargetsCodeGens
AllTargetsDescs
AllTargetsInfos
LTO
)
set(src_files src/rtl.cpp)
if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
libomptarget_say("Building CUDA plugin linked against libcuda")
include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
set (LIBOMPTARGET_DEP_LIBRARIES ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES})
set(dependences ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES})
else()
libomptarget_say("Building CUDA plugin for dlopened libcuda")
include_directories(dynamic_cuda)
add_library(omptarget.rtl.cuda SHARED src/rtl.cpp dynamic_cuda/cuda.cpp)
set (LIBOMPTARGET_DEP_LIBRARIES ${CMAKE_DL_LIBS})
list(APPEND src_files dynamic_cuda/cuda.cpp)
set(dependences ${CMAKE_DL_LIBS})
endif()
add_llvm_library(omptarget.rtl.cuda SHARED ${src_files}
LINK_LIBS elf_common
MemoryManager
${LIBOMPTARGET_DEP_LIBRARIES}
${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
${OPENMP_PTHREAD_LIB}
${dependences}
LLVM-LIBOMPTARGET-JIT
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
"-Wl,-z,defs")
add_dependencies(omptarget.rtl.cuda omptarget.devicertl.nvptx)
# Install plugin under the lib destination folder.
@@ -58,15 +79,6 @@ target_include_directories(omptarget.rtl.cuda PRIVATE
${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}
)
target_link_libraries(omptarget.rtl.cuda
elf_common
MemoryManager
${LIBOMPTARGET_DEP_LIBRARIES}
${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
${OPENMP_PTHREAD_LIB}
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
"-Wl,-z,defs")
# Report to the parent scope that we are building a plugin for CUDA.
# This controls whether tests are run for the nvptx offloading target
# Run them if libcuda is available, or if the user explicitly asked for dlopen

View File

@@ -13,12 +13,17 @@
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <cuda.h>
#include <fstream>
#include <iostream>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "Debug.h"
@@ -31,7 +36,12 @@
#include "MemoryManager.h"
#include "JIT.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Support/MemoryBuffer.h"
using llvm::MemoryBuffer;
// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
@@ -91,6 +101,24 @@ struct KernelTy {
};
namespace {
std::unique_ptr<jit::JITEngine> JITEngine;
/// Nvidia device tool chain for the JIT: no external link step is needed,
/// the JIT-emitted PTX file is simply read back into a memory buffer.
///
/// Fixes relative to the previous version:
///  - the PTX debug dump used fprintf("%s") on a buffer requested with
///    RequiresNullTerminator=false, risking an out-of-bounds read; it now
///    writes exactly getBufferSize() bytes;
///  - the unused local holding the getenv() result is removed.
class NVDeviceToolChain : public jit::DeviceToolChain {
public:
  std::unique_ptr<llvm::MemoryBuffer> run(const std::string &FileName,
                                          const jit::DeviceInfo &DI) override {
    auto MBOrError = llvm::MemoryBuffer::getFile(
        FileName, /*IsText=*/true, /*RequiresNullTerminator=*/false);
    if (!MBOrError)
      return nullptr;
    // Optional debugging aid: dump the generated PTX to stderr.
    if (getenv("LIBOMPTARGET_JIT_DUMP_ASM")) {
      fprintf(stderr, ">>> ptx:\n");
      fwrite((*MBOrError)->getBufferStart(), 1, (*MBOrError)->getBufferSize(),
             stderr);
      fprintf(stderr, "\n");
    }
    return std::move(*MBOrError);
  }
} NVDTC;
std::unordered_set<void *> NonSpecializedImages;
bool checkResult(CUresult Err, const char *ErrMsg) {
if (Err == CUDA_SUCCESS)
return true;
@@ -158,9 +186,20 @@ struct DeviceDataTy {
int ThreadsPerBlock = 0;
int BlocksPerGrid = 0;
int WarpSize = 0;
// Maximum number of registers available per block
int MaxRegisters = 0;
// OpenMP properties
int NumTeams = 0;
int NumThreads = 0;
struct ComputeCapabilityTy {
int Major = 3;
int Minor = 5;
std::string toString() const { return "sm_" + std::to_string(toInt()); }
int toInt() const { return Major * 10 + Minor; }
} ComputeCapability;
};
/// Resource allocator where \p T is the resource type.
@@ -471,7 +510,6 @@ class DeviceRTLTy {
E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
}
public:
CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
assert(AsyncInfo && "AsyncInfo is nullptr");
@@ -486,6 +524,40 @@ public:
return reinterpret_cast<CUstream>(AsyncInfo->Queue);
}
/// Ask the JIT engine for a compiled (non-specialized) device image for
/// \p Image on device \p DeviceId; returns nullptr on failure.
__tgt_device_image *loadJITImage(int DeviceId, __tgt_device_image *Image) {
return JITEngine->getImage(DeviceId, Image);
}
/// Resolve a JIT-compiled target table for the kernel described by \p Entry
/// with the given launch configuration, compiling and loading a new image on
/// demand and caching the result. Returns nullptr on failure.
///
/// Fix: the debug message on the cache-hit path claimed the lookup failed
/// ("couldn't find cached target table") — it now reports the hit.
__tgt_target_table *loadJITImage(int DeviceId, __tgt_device_image *Image,
                                 __tgt_offload_entry *Entry, void **Args,
                                 int NumArgs, int TeamNum, int ThreadLimit,
                                 int LoopTripCount) {
  auto Kernel = jit::Kernel::create(
      Image, Entry->name, DeviceData[DeviceId].ComputeCapability.toString(),
      Args, NumArgs, TeamNum, ThreadLimit, LoopTripCount);
  if (auto *TT = JITEngine->getTargetTable(DeviceId, Kernel)) {
    DP("found cached target table for kernel entry " DPxMOD ".\n",
       DPxPTR(Entry));
    return TT;
  }
  auto *NewImage = JITEngine->getImage(DeviceId, Kernel, Image);
  if (!NewImage) {
    DP("failed to jit image for kernel entry " DPxMOD ".\n", DPxPTR(Entry));
    return nullptr;
  }
  // Load the freshly generated image and remember its table for later
  // launches with a matching configuration.
  auto *TT = loadBinary(DeviceId, NewImage);
  if (!TT)
    return nullptr;
  if (!JITEngine->insertTargetTable(DeviceId, Kernel, TT))
    return nullptr;
  return TT;
}
public:
// This class should not be copied
DeviceRTLTy(const DeviceRTLTy &) = delete;
DeviceRTLTy(DeviceRTLTy &&) = delete;
@@ -749,6 +821,50 @@ public:
DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
}
// Get compute capability
int SM;
Err = cuDeviceGetAttribute(
&SM, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
if (Err != CUDA_SUCCESS) {
DP("Error getting compute capablity major, use default value %d\n",
DeviceData[DeviceId].ComputeCapability.Major);
} else {
DeviceData[DeviceId].ComputeCapability.Major = SM;
}
Err = cuDeviceGetAttribute(
&SM, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
if (Err != CUDA_SUCCESS) {
DP("Error getting compute capablity minor, use default value %d\n",
DeviceData[DeviceId].ComputeCapability.Minor);
} else {
DeviceData[DeviceId].ComputeCapability.Minor = SM;
}
int MaxRegs;
Err = cuDeviceGetAttribute(
&MaxRegs, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device);
if (Err != CUDA_SUCCESS) {
DP("Error getting max registers per block, use default value %d\n",
DeviceData[DeviceId].MaxRegisters);
} else {
DeviceData[DeviceId].MaxRegisters = MaxRegs;
}
if (JITEngine) {
jit::DeviceInfo DI;
DI.Arch = "nvptx64";
DI.MCpu = DeviceData[DeviceId].ComputeCapability.toString();
DI.MaxNumRegs = DeviceData[DeviceId].MaxRegisters;
DI.ThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
DI.BlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
DI.WarpSize = 32;
DI.NumThreads = DeviceData[DeviceId].NumThreads;
DI.NumTeams = DeviceData[DeviceId].NumTeams;
DI.EnvNumThreads = EnvTeamThreadLimit;
DI.EnvNumTeams = EnvNumTeams;
JITEngine->init(DeviceId, DI);
}
return OFFLOAD_SUCCESS;
}
@@ -790,14 +906,24 @@ public:
__tgt_target_table *loadBinary(const int DeviceId,
const __tgt_device_image *Image) {
void *ImageStart = Image->ImageStart;
if (NonSpecializedImages.find(ImageStart) != NonSpecializedImages.end()) {
auto *NewImage =
loadJITImage(DeviceId, const_cast<__tgt_device_image *>(Image));
if (!NewImage)
return nullptr;
ImageStart = NewImage->ImageStart;
}
// Clear the offload table as we are going to create a new one.
clearOffloadEntriesTable(DeviceId);
// Create the module and extract the function pointers.
CUmodule Module;
DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
DP("Load data from image " DPxMOD "\n", DPxPTR(ImageStart));
CUresult Err =
cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
cuModuleLoadDataEx(&Module, ImageStart, 0, nullptr, nullptr);
if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
return nullptr;
@@ -1073,7 +1199,7 @@ public:
ptrdiff_t *TgtOffsets, const int ArgNum,
const int TeamNum, const int ThreadLimit,
const unsigned int LoopTripCount,
__tgt_async_info *AsyncInfo) const {
__tgt_async_info *AsyncInfo) {
// All args are references.
std::vector<void *> Args(ArgNum);
std::vector<void *> Ptrs(ArgNum);
@@ -1083,7 +1209,27 @@ public:
Args[I] = &Ptrs[I];
}
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
auto LaunchEntry =
reinterpret_cast<__tgt_kernel_launch_entry *>(TgtEntryPtr);
KernelTy *KernelInfo =
reinterpret_cast<KernelTy *>(LaunchEntry->TargetEntry);
// If kernel info is nullptr, it means we are dealing with JIT image.
if (KernelInfo == nullptr) {
assert(LaunchEntry->Image && LaunchEntry->HostEntry);
__tgt_device_image NewImage = *(LaunchEntry->Image);
NewImage.EntriesBegin = LaunchEntry->HostEntry;
NewImage.EntriesEnd = NewImage.EntriesBegin + 1;
auto TargetTable =
loadJITImage(DeviceId, &NewImage, LaunchEntry->HostEntry, Ptrs.data(),
ArgNum, TeamNum, ThreadLimit, LoopTripCount);
if (!TargetTable)
return OFFLOAD_FAIL;
KernelInfo =
reinterpret_cast<KernelTy *>(TargetTable->EntriesBegin->addr);
}
assert(KernelInfo && "KernelInfo should not be nullptr");
const bool IsSPMDGenericMode =
KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
@@ -1484,7 +1630,24 @@ extern "C" {
#endif
// Report whether this plugin can handle the given device image.
// Return codes (ad-hoc; the in-tree TODO notes this should become an enum):
//   0 - image is not compatible with this RTL
//   1 - pre-built CUDA ELF image (EM_CUDA), loadable as-is
//   2 - JIT bitcode image that supports kernel specialization
//   3 - JIT bitcode image without specialization support (JITted eagerly)
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  // Fix: the original body had an unconditional
  //   return elf_check_machine(image, /* EM_CUDA */ 190);
  // as its first statement, which made the entire JIT detection path below
  // unreachable dead code. A prebuilt ELF image is reported as kind 1 here
  // instead, so bitcode images still fall through to the JIT checks.
  if (elf_check_machine(image, /* EM_CUDA */ 190))
    return 1;

  // Lazily bring up the JIT infrastructure the first time a non-ELF image
  // is probed. NVDTC is presumably the nvptx64 toolchain descriptor —
  // declared elsewhere in this file.
  jit::JITEngine::init();
  if (!JITEngine)
    JITEngine = std::make_unique<jit::JITEngine>("nvptx64", NVDTC,
                                                 DeviceRTL.getNumOfDevices());

  // Not an ELF image and not valid bitcode for this target: reject.
  if (!jit::JITEngine::isValidModule("nvptx64", image))
    return 0;

  if (jit::JITEngine::isSpecializationSupported(image))
    return 2;

  // Valid bitcode without specialization support: remember its start address
  // so loadBinary() knows to JIT-compile it up front rather than per kernel.
  NonSpecializedImages.insert(image->ImageStart);
  return 3;
}
// Number of CUDA devices the plugin discovered during initialization.
int32_t __tgt_rtl_number_of_devices() {
  return DeviceRTL.getNumOfDevices();
}

View File

@@ -19,7 +19,10 @@
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <memory>
#include <mutex>
#include <vector>
////////////////////////////////////////////////////////////////////////////////
/// adds requires flags
@@ -40,6 +43,7 @@ EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) {
}
}
}
// PM->RTLs.RegisterLib(createBinDescFrom(desc));
PM->RTLs.RegisterLib(desc);
}

View File

@@ -107,6 +107,13 @@ static int InitLibrary(DeviceTy &Device) {
rc = OFFLOAD_FAIL;
break;
}
const bool IsJITImage =
RegisteredJITImages.find(img) != RegisteredJITImages.end();
if (IsJITImage)
continue;
// 2) load image into the target table.
__tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
Device.load_binary(img);
@@ -1500,16 +1507,6 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
return OFFLOAD_FAIL;
}
// get target table.
__tgt_target_table *TargetTable = nullptr;
{
std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
"Not expecting a device ID outside the table's bounds!");
TargetTable = TM->Table->TargetsTable[DeviceId];
}
assert(TargetTable && "Global data has not been mapped\n");
// We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
// need to manifest base pointers prior to launching a kernel. Even if we have
// mapped an object only partially, e.g. A[N:M], although the kernel is
@@ -1536,11 +1533,42 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
}
}
// Launch device execution.
void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
__tgt_device_image *Image = TM->Table->TargetsImages[Device.DeviceID];
const bool UseJIT =
RegisteredJITImages.find(Image) != RegisteredJITImages.end();
void *TgtEntryPtr = nullptr;
__tgt_kernel_launch_entry LaunchEntry;
// get target table if in non-JIT mode.
if (UseJIT) {
__tgt_offload_entry *Entry = nullptr;
__tgt_target_table *HostTable = &TM->Table->HostTable;
// Find the entry name from the host entries
// TODO: We might want a map for this
for (auto Itr = HostTable->EntriesBegin; Itr != HostTable->EntriesEnd;
++Itr)
if (Itr->addr == HostPtr) {
Entry = Itr;
break;
}
assert(Entry && "cannot find entry");
LaunchEntry.HostEntry = Entry;
LaunchEntry.Image = Image;
DP("Launching target jit execution %s with pointer " DPxMOD ".\n",
Entry->name, DPxPTR(TgtEntryPtr));
} else {
std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
"Not expecting a device ID outside the table's bounds!");
__tgt_target_table *TargetTable = TM->Table->TargetsTable[DeviceId];
assert(TargetTable && "Global data has not been mapped\n");
LaunchEntry.TargetEntry = TargetTable->EntriesBegin[TM->Index].addr;
DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr),
TM->Index);
}
TgtEntryPtr = &LaunchEntry;
// Launch device execution.
{
TIMESCOPE_WITH_NAME_AND_IDENT(
IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);

View File

@@ -38,6 +38,8 @@ PluginManager *PM;
static char *ProfileTraceFile = nullptr;
#endif
std::unordered_set<__tgt_device_image *> RegisteredJITImages;
__attribute__((constructor(101))) void init() {
DP("Init target library!\n");
@@ -250,8 +252,7 @@ static void RegisterImageIntoTranslationTable(TranslationTable &TT,
static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
__tgt_device_image *img,
RTLInfoTy *RTL) {
RTLInfoTy *RTL, bool IsJITImage) {
for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) {
DeviceTy &Device = *PM->Devices[RTL->Idx + i];
Device.PendingGlobalsMtx.lock();
@@ -261,13 +262,21 @@ static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
if (entry->flags & OMP_DECLARE_TARGET_CTOR) {
DP("Adding ctor " DPxMOD " to the pending list.\n",
DPxPTR(entry->addr));
Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr);
if (IsJITImage)
Device.PendingCtorsDtors[desc].PendingJITCtors[img].push_back(
entry->addr);
else
Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr);
} else if (entry->flags & OMP_DECLARE_TARGET_DTOR) {
// Dtors are pushed in reverse order so they are executed from end
// to beginning when unregistering the library!
DP("Adding dtor " DPxMOD " to the pending list.\n",
DPxPTR(entry->addr));
Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr);
if (IsJITImage)
Device.PendingCtorsDtors[desc].PendingJITDtors[img].push_front(
entry->addr);
else
Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr);
}
if (entry->flags & OMP_DECLARE_TARGET_LINK) {
@@ -363,14 +372,21 @@ void RTLsTy::RegisterLib(__tgt_bin_desc *desc) {
// Scan the RTLs that have associated images until we find one that supports
// the current image.
for (auto &R : AllRTLs) {
if (!R.is_valid_binary(img)) {
int Ret = R.is_valid_binary(img);
if (Ret == 0) {
DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
DPxPTR(img->ImageStart), R.RTLName.c_str());
continue;
}
DP("Image " DPxMOD " is compatible with RTL %s!\n",
DPxPTR(img->ImageStart), R.RTLName.c_str());
// TODO: should use enum here.
const bool IsJITImage = Ret == 2;
DP("%sImage " DPxMOD " is compatible with RTL %s!\n",
IsJITImage ? "JIT " : "", DPxPTR(img->ImageStart), R.RTLName.c_str());
if (IsJITImage)
RegisteredJITImages.insert(img);
initRTLonce(R);
@@ -395,7 +411,7 @@ void RTLsTy::RegisterLib(__tgt_bin_desc *desc) {
FoundRTL = &R;
// Load ctors/dtors for static objects
RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL);
RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL, IsJITImage);
// if an RTL was found we are done - proceed to register the next image
break;
@@ -427,6 +443,9 @@ void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) {
assert(R->isUsed && "Expecting used RTLs.");
// FIXME: This is WRONG!!!
continue;
if (!R->is_valid_binary(img)) {
DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n",
DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));