Compare commits

...

2 Commits

Author SHA1 Message Date
Shilei Tian
02bc7effcc Init JIT module 2022-05-27 17:44:53 -04:00
Joseph Huber
6f3e60f1c0 [OpenMP] Add flag for embedding bitcode in module for JIT
Summary:
This patch adds the '-fopenmp-target-jit' flag to embed bitcode in the
module when using the new driver.
2022-05-24 13:47:21 -04:00
23 changed files with 2248 additions and 51 deletions

View File

@@ -160,6 +160,8 @@ def err_drv_invalid_Xarch_argument_with_args : Error<
"invalid Xarch argument: '%0', options requiring arguments are unsupported">;
def err_drv_Xopenmp_target_missing_triple : Error<
"cannot deduce implicit triple value for -Xopenmp-target, specify triple using -Xopenmp-target=<triple>">;
// Emitted by the driver when '-fopenmp-target-jit' is requested without
// (offload) LTO: the JIT needs LLVM bitcode embedded in the device image,
// which is only produced when compiling with '-foffload-lto'.
def err_drv_openmp_jit_without_lto : Error<
"cannot enable OpenMP offloading JIT, specify bitcode compilation with '-foffload-lto'">;
def err_drv_invalid_Xopenmp_target_with_args : Error<
"invalid -Xopenmp-target argument: '%0', options requiring arguments are unsupported">;
def err_drv_argument_only_allowed_with : Error<

View File

@@ -2539,6 +2539,10 @@ def fopenmp_target_new_runtime : Flag<["-"], "fopenmp-target-new-runtime">,
Group<f_Group>, Flags<[CC1Option, HelpHidden]>;
def fno_openmp_target_new_runtime : Flag<["-"], "fno-openmp-target-new-runtime">,
Group<f_Group>, Flags<[CC1Option, HelpHidden]>;
// Driver flags toggling JIT compilation for OpenMP offloading.
// Fixed: help-text typo "comilation" -> "compilation"; removed stray space
// inside the Flags<[...]> list to match surrounding option definitions.
def fopenmp_target_jit : Flag<["-"], "fopenmp-target-jit">, Group<f_Group>,
  HelpText<"Enable JIT compilation for OpenMP offloading">, Flags<[NoArgumentUnused]>;
def fno_openmp_target_jit : Flag<["-"], "fno-openmp-target-jit">, Group<f_Group>,
  Flags<[NoArgumentUnused, HelpHidden]>;
defm openmp_optimistic_collapse : BoolFOption<"openmp-optimistic-collapse",
LangOpts<"OpenMPOptimisticCollapse">, DefaultFalse,
PosFlag<SetTrue, [CC1Option]>, NegFlag<SetFalse>, BothFlags<[NoArgumentUnused, HelpHidden]>>;

View File

@@ -8285,6 +8285,12 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
auto OpenMPTCRange = C.getOffloadToolChains<Action::OFK_OpenMP>();
ArgStringList CmdArgs;
if (!C.getDriver().isUsingLTO(/* IsOffload */ true) &&
Args.hasFlag(options::OPT_fopenmp_target_jit,
options::OPT_fno_openmp_target_jit, /*Default*/ false)) {
C.getDriver().Diag(clang::diag::err_drv_openmp_jit_without_lto);
}
// Pass the CUDA path to the linker wrapper tool.
for (Action::OffloadKind Kind : {Action::OFK_Cuda, Action::OFK_OpenMP}) {
auto TCRange = C.getOffloadToolChains(Kind);
@@ -8355,6 +8361,11 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
if (!OOpt.empty())
CmdArgs.push_back(Args.MakeArgString(Twine("-opt-level=O") + OOpt));
}
if (Args.hasFlag(options::OPT_fopenmp_target_jit,
options::OPT_fno_openmp_target_jit,
/*Default=*/false))
CmdArgs.push_back(Args.MakeArgString("-target-embed-bc"));
}
CmdArgs.push_back("-host-triple");

View File

@@ -98,6 +98,7 @@ struct LTOCodeGenerator {
void setCpu(StringRef MCpu) { Config.CPU = std::string(MCpu); }
void setAttrs(std::vector<std::string> MAttrs) { Config.MAttrs = MAttrs; }
void setUseDefaultPipeline(bool Value) { Config.UseDefaultPipeline = Value; }
void setOptLevel(unsigned OptLevel);
void setShouldInternalize(bool Value) { ShouldInternalize = Value; }
@@ -193,6 +194,8 @@ struct LTOCodeGenerator {
void resetMergedModule() { MergedModule.reset(); }
void DiagnosticHandler(const DiagnosticInfo &DI);
Module &getMergedModule() const { return *MergedModule; }
private:
/// Verify the merged module on first call.
///

View File

@@ -111,6 +111,8 @@ public:
createInLocalContext(std::unique_ptr<LLVMContext> Context, const void *mem,
size_t length, const TargetOptions &options,
StringRef path);
static ErrorOr<std::unique_ptr<LTOModule>>
clone(const LTOModule &LM, const TargetOptions &options);
const Module &getModule() const { return *Mod; }
Module &getModule() { return *Mod; }

View File

@@ -538,7 +538,8 @@ bool LTOCodeGenerator::optimize() {
this->applyScopeRestrictions();
// Write LTOPostLink flag for passes that require all the modules.
MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1);
if (!MergedModule->getModuleFlag("LTOPostLink"))
MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1);
// Add an appropriate DataLayout instance for this module...
MergedModule->setDataLayout(TargetMach->createDataLayout());

View File

@@ -38,6 +38,7 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
#include <system_error>
using namespace llvm;
@@ -701,3 +702,47 @@ bool LTOModule::hasCtorDtor() const {
}
return false;
}
/// Create a deep copy of \p LM: clone its underlying LLVM module and build a
/// fresh TargetMachine for it with the given \p options.
/// Returns arch_not_found if no backend is registered for the module's triple.
/// NOTE(review): the CPU-defaulting and construction sequence below appears to
/// mirror LTOModule::makeLTOModule — keep the two in sync if either changes.
ErrorOr<std::unique_ptr<LTOModule>>
LTOModule::clone(const LTOModule &LM, const TargetOptions &options) {
// Clone the IR so the new LTOModule owns an independent copy.
auto NM = CloneModule(LM.getModule());
std::string TripleStr = NM->getTargetTriple();
if (TripleStr.empty())
TripleStr = sys::getDefaultTargetTriple();
llvm::Triple Triple(TripleStr);
// find machine architecture for this module
std::string errMsg;
const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
if (!march)
return make_error_code(object::object_error::arch_not_found);
// construct LTOModule, hand over ownership of module and target
SubtargetFeatures Features;
Features.getDefaultSubtargetFeatures(Triple);
std::string FeatureStr = Features.getString();
// Set a default CPU for Darwin triples.
std::string CPU;
if (Triple.isOSDarwin()) {
if (Triple.getArch() == llvm::Triple::x86_64)
CPU = "core2";
else if (Triple.getArch() == llvm::Triple::x86)
CPU = "yonah";
else if (Triple.isArm64e())
CPU = "apple-a12";
else if (Triple.getArch() == llvm::Triple::aarch64 ||
Triple.getArch() == llvm::Triple::aarch64_32)
CPU = "cyclone";
}
// None => default code-generation optimization level for the backend.
TargetMachine *target =
march->createTargetMachine(TripleStr, CPU, FeatureStr, options, None);
// The clone keeps a reference to the ORIGINAL buffer (LM.MBRef); only the
// parsed Module and the TargetMachine are new.
std::unique_ptr<LTOModule> Ret(
new LTOModule(std::move(NM), LM.MBRef, target));
// Populate the symbol and metadata tables of the new module.
Ret->parseSymbols();
Ret->parseMetadata();
return std::move(Ret);
}

View File

@@ -1497,6 +1497,14 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
}
{
FunctionPassManager EarlyFPM;
// Break up allocas
EarlyFPM.addPass(SROAPass());
MPM.addPass(createModuleToFunctionPassAdaptor(
std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses));
}
// Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
MPM.addPass(OpenMPOptPass());
@@ -1632,9 +1640,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
PGOOpt->ProfileRemappingFile);
}
// Break up allocas
FPM.addPass(SROAPass());
// LTO provides additional opportunities for tailcall elimination due to
// link-time inlining, and visibility of nocapture attribute.
FPM.addPass(TailCallElimPass());

View File

@@ -128,6 +128,7 @@ template <typename VTy, typename Ty> struct ValueRAII {
Val(OldValue), Active(Active) {
if (!Active)
return;
Ptr = &V.lookup(/* IsReadonly */ false, Ident);
ASSERT(*Ptr == OldValue &&
"ValueRAII initialization with wrong old value!");
*Ptr = NewValue;

View File

@@ -22,6 +22,7 @@
#include <mutex>
#include <set>
#include <thread>
#include <unordered_map>
#include <vector>
#include "ExclusiveAccess.h"
@@ -302,6 +303,9 @@ typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy;
/// Per-library lists of constructor/destructor entries whose execution on the
/// device is still pending.
struct PendingCtorDtorListsTy {
std::list<void *> PendingCtors;
std::list<void *> PendingDtors;
// JIT images cannot run ctors/dtors until the image is actually compiled and
// loaded, so pending entries are tracked per device image here.
std::unordered_map<__tgt_device_image *, std::list<void *>> PendingJITCtors;
std::unordered_map<__tgt_device_image *, std::list<void *>> PendingJITDtors;
};
typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy>
PendingCtorsDtorsPerLibrary;
@@ -461,6 +465,10 @@ struct DeviceTy {
int32_t destroyEvent(void *Event);
/// }
__tgt_target_table *loadJITImage(__tgt_device_image *Image,
const char *EntryName, void **TgtArgs,
ptrdiff_t *TgtOffsets, int NumArgs);
private:
// Call to RTL
void init(); // To be called only via DeviceTy::initOnce()

View File

@@ -197,6 +197,12 @@ struct __tgt_device_info {
void *Device = nullptr;
};
/// Handle passed to the plugin as the "target entry pointer" for a kernel
/// launch. When TargetEntry is null the plugin resolves the kernel via JIT
/// using HostEntry and Image (see the plugins' runTargetRegion paths).
struct __tgt_kernel_launch_entry {
// Host-side offload entry describing the kernel.
__tgt_offload_entry *HostEntry = nullptr;
// Device-side kernel handle; nullptr means "not yet resolved" (JIT case).
void *TargetEntry = nullptr;
// Device image the kernel comes from; used to JIT-compile when needed.
__tgt_device_image *Image = nullptr;
};
#ifdef __cplusplus
extern "C" {
#endif

View File

@@ -18,6 +18,7 @@
#include <map>
#include <mutex>
#include <string>
#include <unordered_set>
#include <vector>
// Forward declarations.
@@ -184,4 +185,7 @@ struct TableMap {
};
typedef std::map<void *, TableMap> HostPtrToTableMapTy;
/// A set that stores all registered JIT images.
extern std::unordered_set<__tgt_device_image *> RegisteredJITImages;
#endif

View File

@@ -57,7 +57,7 @@ else()
set(LIBOMPTARGET_DEP_LIBRARIES)
endif()
add_library(omptarget.rtl.amdgpu SHARED
add_llvm_library(omptarget.rtl.amdgpu SHARED
impl/impl.cpp
impl/interop_hsa.cpp
impl/data.cpp
@@ -100,6 +100,7 @@ target_link_libraries(
${OPENMP_PTHREAD_LIB}
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
${LDFLAGS_UNDEFINED}
LLVM-LIBOMPTARGET-JIT
)
# in case of amdgcn, skip running tests if amdgpu-arch was not built or fails

View File

@@ -35,8 +35,16 @@
#include "omptargetplugin.h"
#include "print_tracing.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Program.h"
#include "JIT.h"
using namespace llvm;
// hostrpc interface, FIXME: consider moving to its own include these are
// statically linked into amdgpu/plugin if present from hostrpc_services.a,
@@ -850,8 +858,81 @@ pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
static RTLDeviceInfoTy DeviceInfo;
static __tgt_target_table *
__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);
namespace {
std::unique_ptr<jit::JITEngine> JITEngine;
/// AMD-specific device tool chain for the JIT: links the JIT-compiled object
/// file into a loadable shared object with 'lld' and returns its contents.
///
/// Fixes relative to the previous version:
///  - the llvm::Error from a failed link() is now consumed (an unhandled
///    llvm::Error aborts the program in assertion-enabled builds);
///  - the temporary output file is removed when 'lld' fails (was leaked);
///  - the duplicated temporary-file cleanup in run() is collapsed.
class AMDDeviceToolChain : public jit::DeviceToolChain {
  /// Directory containing the current executable; used to prefer an 'lld'
  /// installed next to it before falling back to PATH.
  static std::string getMainExecutable(const char *Name) {
    void *Ptr = (void *)(intptr_t)&getMainExecutable;
    auto COWPath = sys::fs::getMainExecutable(Name, Ptr);
    return sys::path::parent_path(COWPath).str();
  }

  /// Get a temporary filename suitable for output.
  static Error createOutputFile(const Twine &Prefix, StringRef Extension,
                                SmallString<128> &NewFilename) {
    if (std::error_code EC =
            sys::fs::createTemporaryFile(Prefix, Extension, NewFilename))
      return createFileError(NewFilename, EC);
    return Error::success();
  }

  /// Link \p Input into a shared object and return the path of the temporary
  /// file produced, or an Error if 'lld' cannot be found or fails.
  Expected<std::string> link(StringRef Input, StringRef Prefix) {
    ErrorOr<std::string> LLDPath =
        sys::findProgramByName("lld", {getMainExecutable("lld")});
    if (!LLDPath)
      LLDPath = sys::findProgramByName("lld");
    if (!LLDPath)
      return createStringError(LLDPath.getError(),
                               "Unable to find 'lld' in path");

    SmallString<128> TempFile;
    if (Error Err = createOutputFile(Prefix, "o", TempFile))
      return std::move(Err);

    SmallVector<StringRef, 16> CmdArgs;
    CmdArgs.push_back(*LLDPath);
    CmdArgs.push_back("-flavor");
    CmdArgs.push_back("gnu");
    CmdArgs.push_back("--no-undefined");
    CmdArgs.push_back("-shared");
    CmdArgs.push_back("-o");
    CmdArgs.push_back(TempFile);
    CmdArgs.push_back(Input);
    if (sys::ExecuteAndWait(*LLDPath, CmdArgs)) {
      // Do not leak the temporary output when the link fails.
      sys::fs::remove(TempFile);
      return createStringError(inconvertibleErrorCode(), "'lld' failed");
    }
    return static_cast<std::string>(TempFile);
  }

public:
  /// Post-process a JIT-compiled object file \p FileName for the device
  /// described by \p DI. Returns the linked shared object as a memory buffer,
  /// or nullptr on failure.
  std::unique_ptr<MemoryBuffer> run(const std::string &FileName,
                                    const jit::DeviceInfo &DI) override {
    std::string Prefix = "libomptarget-amdgcn-" + DI.MCpu + "-jit";
    auto FileNameOrErr = link(FileName, Prefix);
    if (!FileNameOrErr) {
      // An llvm::Error must be consumed; destroying an unchecked Error aborts
      // in debug builds.
      consumeError(FileNameOrErr.takeError());
      return nullptr;
    }
    std::string TempFile = *FileNameOrErr;
    auto MBOrError = MemoryBuffer::getFile(TempFile, /*IsText=*/false,
                                           /*RequiresNullTerminator=*/false);
    // The temporary shared object is no longer needed once it has been read
    // into memory (or once reading it has failed).
    sys::fs::remove(TempFile);
    if (!MBOrError)
      return nullptr;
    return std::move(*MBOrError);
  }
} AMDDTC;
int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
__tgt_async_info *AsyncInfo) {
assert(AsyncInfo && "AsyncInfo is nullptr");
@@ -1090,6 +1171,30 @@ static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
return packet_id;
}
/// Resolve a JIT-compiled target table for the kernel described by \p Entry
/// with the given launch configuration, compiling and loading a device image
/// on demand. Returns nullptr on any failure.
__tgt_target_table *loadJITImage(int DeviceId, __tgt_device_image *Image,
                                 __tgt_offload_entry *Entry, void **Args,
                                 int NumArgs, int TeamNum, int ThreadLimit,
                                 int LoopTripCount) {
  // Describe this launch so the JIT engine can match cached results.
  auto LaunchKernel =
      jit::Kernel::create(Image, Entry->name, DeviceInfo.GPUName[DeviceId],
                          Args, NumArgs, TeamNum, ThreadLimit, LoopTripCount);

  // Fast path: a previously loaded target table already matches this launch.
  if (auto *Cached = JITEngine->getTargetTable(DeviceId, LaunchKernel))
    return Cached;

  // Slow path: JIT-compile a device image for this kernel/configuration.
  __tgt_device_image *JITedImage =
      JITEngine->getImage(DeviceId, LaunchKernel, Image);
  if (!JITedImage)
    return nullptr;

  // Load the freshly generated image and cache its table for later launches.
  __tgt_target_table *Table =
      __tgt_rtl_load_binary_locked(DeviceId, JITedImage);
  if (!Table || !JITEngine->insertTargetTable(DeviceId, LaunchKernel, Table))
    return nullptr;
  return Table;
}
int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
ptrdiff_t *tgt_offsets, int32_t arg_num,
int32_t num_teams, int32_t thread_limit,
@@ -1111,7 +1216,24 @@ int32_t runRegionLocked(int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
DP("Offseted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
}
KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
auto LaunchEntry = reinterpret_cast<__tgt_kernel_launch_entry *>(tgt_entry_ptr);
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(LaunchEntry->TargetEntry);
// If kernel info is nullptr, it means we are dealing with JIT image.
if (KernelInfo == nullptr) {
assert(LaunchEntry->Image && LaunchEntry->HostEntry);
__tgt_device_image NewImage = *(LaunchEntry->Image);
NewImage.EntriesBegin = LaunchEntry->HostEntry;
NewImage.EntriesEnd = NewImage.EntriesBegin + 1;
auto TargetTable =
loadJITImage(device_id, &NewImage, LaunchEntry->HostEntry, ptrs.data(),
arg_num, num_teams, thread_limit, loop_tripcount);
if (!TargetTable)
return OFFLOAD_FAIL;
KernelInfo = reinterpret_cast<KernelTy *>(TargetTable->EntriesBegin->addr);
}
assert(KernelInfo && "KernelInfo should not be nullptr");
std::string kernel_name = std::string(KernelInfo->Name);
auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
@@ -1640,7 +1762,22 @@ hsa_status_t allow_access_to_all_gpu_agents(void *ptr) {
extern "C" {
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
return elf_machine_id_is_amdgcn(image);
if(elf_machine_id_is_amdgcn(image))
return 1;
jit::JITEngine::init();
if (!JITEngine)
JITEngine = std::make_unique<jit::JITEngine>("amdgcn", AMDDTC,
DeviceInfo.NumberOfDevices);
if (!jit::JITEngine::isValidModule("amdgcn", image))
return 0;
if (jit::JITEngine::isSpecializationSupported(image))
return 2;
return 3;
}
int __tgt_rtl_number_of_devices() {
@@ -1811,12 +1948,23 @@ int32_t __tgt_rtl_init_device(int device_id) {
DeviceInfo.GroupsPerDevice[device_id] *
DeviceInfo.ThreadsPerGroup[device_id]);
if (JITEngine) {
jit::DeviceInfo DI;
DI.Arch = "amdgcn";
DI.MCpu = DeviceInfo.GPUName[device_id];
DI.ThreadsPerBlock = DeviceInfo.ThreadsPerGroup[device_id];
DI.BlocksPerGrid = DeviceInfo.GroupsPerDevice[device_id];
DI.WarpSize = 32;
DI.NumThreads = DeviceInfo.NumThreads[device_id];
DI.NumTeams = DeviceInfo.NumTeams[device_id];
DI.EnvNumThreads = DeviceInfo.Env.TeamThreadLimit;
DI.EnvNumTeams = DeviceInfo.Env.NumTeams;
JITEngine->init(device_id, DI);
}
return OFFLOAD_SUCCESS;
}
static __tgt_target_table *
__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
__tgt_device_image *image) {
DeviceInfo.load_run_lock.lock();

View File

@@ -11,4 +11,5 @@
##===----------------------------------------------------------------------===##
add_subdirectory(elf_common)
add_subdirectory(JIT)
add_subdirectory(MemoryManager)

View File

@@ -0,0 +1,36 @@
##===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##
#
# JIT module
#
##===----------------------------------------------------------------------===##
# LLVM components the JIT needs: every registered backend plus the LTO
# pipeline used to optimize and codegen the embedded bitcode.
set(LLVM_LINK_COMPONENTS
AllTargetsAsmParsers
AllTargetsCodeGens
AllTargetsDescs
AllTargetsInfos
LTO
)
add_llvm_library(LLVM-LIBOMPTARGET-JIT STATIC BUILDTREE_ONLY JIT.cpp)
# Build the JIT library with PIC to be able to link it with plugin shared libraries.
set_property(TARGET LLVM-LIBOMPTARGET-JIT PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(LLVM-LIBOMPTARGET-JIT INTERFACE ${OPENMP_PTHREAD_LIB} ncurses dl)
# Expose JIT.h directory to the users of this library.
target_include_directories(LLVM-LIBOMPTARGET-JIT
INTERFACE
${CMAKE_CURRENT_SOURCE_DIR}
${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}
${LLVM_INCLUDE_DIRS}
PRIVATE
${LIBOMPTARGET_INCLUDE_DIR}
)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,361 @@
//===-- JIT.h --- JIT module ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// JIT module for target plugins.
//
//===----------------------------------------------------------------------===//
#include <cassert>
#include <cstdint>
#include <fstream>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
// Forward declaration.
struct __tgt_target_table;
struct __tgt_device_image;
struct __tgt_offload_entry;
struct __tgt_async_info;
namespace llvm {
class MemoryBuffer;
} // namespace llvm
namespace jit {
class Kernel;
namespace impl {
/// Optimization action applied to a kernel, which is in the form:
/// operation:index:value
///
/// 'operation' can be:
/// 's': value specialization;
/// 'a': alignment specialization;
/// 't': number of threads;
/// 'T': number of teams.
///
/// 'index' can be 'n' for those operations that don't require index, or an
/// integer number.
///
/// 'value' can be an action (recursively defined, but in fact we don't
/// support it for now), or an integer value.
/// One optimization action applied to a kernel; the textual grammar
/// ("operation:index:value") is documented in the comment above this class.
class Action {
public:
/// Kind of optimization the action encodes (maps to the 'operation' field).
enum class ActionKind : uint8_t {
None = 0,
Alignment,
Specialization,
NumTeams,
NumThreads,
};
/// Parse an action from its "operation:index:value" string form.
explicit Action(const std::string &S);
/// Build an action of kind \p AK with value \p V for argument index \p Index.
explicit Action(ActionKind AK, uintptr_t V, int Index);
/// Build an index-less action of kind \p AK with value \p V.
explicit Action(ActionKind AK, uintptr_t V);
/// Render the action back into its string form.
std::string toString() const;
/// Whether this action is consistent with kernel \p K.
bool match(const Kernel &K) const;
/// Render a list of actions as one string.
static std::string ActionsToString(const std::vector<Action> &Actions);
private:
/// Field positions within the colon-separated string form.
enum ValuePos : uint8_t {
POS_OpCode = 0,
POS_Index = 1,
POS_Value = 2,
};
ActionKind Kind;
uintptr_t Value;
int Index;
};
/// Identifies one specialized variant of a kernel: the kernel name, the
/// target CPU, and the set of actions applied to produce the variant.
class KernelSpecialization {
/// Kernel entry name.
const std::string Name;
/// Target architecture.
const std::string MCpu;
/// Actions applied to specialize this kernel variant (empty means the
/// unspecialized/default variant).
std::vector<Action> Actions;
friend class Image;
public:
/// Construct the unspecialized variant (no actions).
explicit KernelSpecialization(const std::string &Name,
const std::string &MCpu)
: Name(Name), MCpu(MCpu) {}
/// Construct from a serialized action string (see Action's grammar).
explicit KernelSpecialization(const std::string &Name,
const std::string &MCpu,
const std::string &ActionString);
/// Construct from an explicit list of actions.
explicit KernelSpecialization(const std::string &Name,
const std::string &MCpu,
const std::vector<Action> &A);
/// Whether this specialization is applicable to kernel launch \p K.
bool match(const Kernel &K) const;
const std::string &getName() const { return Name; }
};
/// Per-kernel counters used to decide whether further specialization is
/// worthwhile (see reachThreshold).
class SpecializationStatistics {
/// Kernel name.
const std::string KernelName;
/// Threshold on TotalCount used by reachThreshold — presumably the maximum
/// number of variants before specialization is cut off; confirm in JIT.cpp.
uint64_t ThresholdTotalCount = 20;
/// Ratio threshold used by reachThreshold — semantics defined in JIT.cpp.
float ThresholdRatio = 0.5f;
/// Total number of specialization variants that have been generated for the
/// corresponding kernel.
uint64_t TotalCount = 0;
/// Count for each argument.
std::vector<uint64_t> ArgCount;
/// Count for num_thread.
uint64_t NumThreadsCount = 0;
/// Count for num_team.
uint64_t NumTeamsCount = 0;
/// Guard lock.
std::mutex Lock;
friend class StatisticsUpdater;
public:
SpecializationStatistics(const std::string &Name, int NumArgs)
: KernelName(Name), ArgCount(NumArgs, 0) {}
/// Whether the counter for action kind \p Kind (and argument \p Index, where
/// applicable) has crossed the configured thresholds.
bool reachThreshold(Action::ActionKind Kind, int Index) const;
bool reachThreshold(Action::ActionKind Kind) const;
};
/// Associates a loaded __tgt_target_table with the specialization it was
/// generated for, so future launches can be matched against it.
class TargetTable {
// Specialization this table was built for (not owned).
const KernelSpecialization *Specialization;
// The loaded target table (not owned).
__tgt_target_table *Table;
public:
TargetTable(const KernelSpecialization *KS, __tgt_target_table *Table)
: Specialization(KS), Table(Table) {}
/// Whether this table's specialization matches kernel launch \p K.
bool match(const Kernel &K) const;
__tgt_target_table *get() const { return Table; }
};
/// Per-device cache of target tables, keyed by kernel name; each kernel may
/// have multiple tables, one per specialization variant.
class TargetTableCache {
/// Map from kernel name to the tables generated for that kernel's variants.
std::unordered_map<std::string, std::list<TargetTable>> Map;
public:
/// Record \p Table as the table for specialization \p KS and return it.
__tgt_target_table *insert(const KernelSpecialization *KS,
__tgt_target_table *Table) {
auto &Tables = Map[KS->getName()];
Tables.emplace_back(KS, Table);
return Tables.back().get();
}
/// Look up a cached table matching launch \p K; defined in JIT.cpp.
__tgt_target_table *get(const Kernel &K) const;
};
/// A JIT-produced device image together with the specialization it encodes.
class Image {
KernelSpecialization Specialization;
/// First byte of the image in memory (not owned; backing storage is kept
/// alive by ImageCache::NewBuffer).
const char *Start = nullptr;
/// One past the last byte of the image.
const char *End = nullptr;
void dump(std::ostream &OS) const;
friend class ImageCache;
public:
Image(const KernelSpecialization &KS, const char *ImageStart,
const char *ImageEnd)
: Specialization(KS), Start(ImageStart), End(ImageEnd) {}
/// Return the (begin, end) pointers of the image.
std::pair<void *, void *> get() const {
return std::make_pair((void *)Start, (void *)End);
}
/// Whether this image's specialization matches kernel launch \p K.
bool match(const Kernel &K) const { return Specialization.match(K); }
const KernelSpecialization &getKernelSpecialization() const {
return Specialization;
}
};
/// Cache of JIT-produced images, keyed by a string key with one entry per
/// specialization variant under each key.
class ImageCache {
public:
ImageCache(const std::string &Arch);
~ImageCache();
/// Take ownership of \p MB and register it under \p Key as the image for
/// specialization \p KS; returns the cached Image.
const Image *insert(const std::string &Key, const KernelSpecialization &KS,
std::unique_ptr<llvm::MemoryBuffer> MB);
/// Linear scan of the variants under \p Key for one matching launch \p K;
/// returns nullptr when the key is unknown or nothing matches.
const Image *get(const std::string &Key, const Kernel &K) const {
auto Itr = Map.find(Key);
if (Itr == Map.end())
return nullptr;
auto &L = Itr->second;
for (auto &I : L)
if (I.match(K))
return &I;
return nullptr;
}
private:
const std::string Arch;
/// Owns the buffers backing the cached Images (their Start/End pointers).
std::list<std::unique_ptr<llvm::MemoryBuffer>> NewBuffer;
/// Key -> list of image variants for that key.
std::unordered_map<std::string, std::list<Image>> Map;
};
} // namespace impl
/// Device properties the plugin feeds to the JIT engine at init time
/// (see the plugins' __tgt_rtl_init_device / initProgram paths).
struct DeviceInfo {
/// Architecture, e.g. nvptx64, amdgcn.
std::string Arch;
/// GPU code name, e.g. sm_75 for Nvidia GPU.
std::string MCpu;
/// Maximum number of registers the device can support.
uint64_t MaxNumRegs = 0;
/// Hardware limits reported by the plugin.
uint64_t ThreadsPerBlock = 0;
uint64_t BlocksPerGrid = 0;
/// Warp/wavefront size; both plugins currently pass 32.
uint64_t WarpSize = 32;
/// Values set by users.
int64_t EnvNumThreads = -1;
int64_t EnvNumTeams = -1;
/// Default values when users don't set explicitly.
uint64_t NumThreads = 0;
uint64_t NumTeams = 0;
};
/// Immutable description of one kernel launch, used as the lookup key for
/// cached images/tables and as input to specialization decisions.
class Kernel {
/// Kernel entry name.
std::string Name;
/// Target architecture where the kernel is about to be launched.
std::string MCpu;
/// Number of threads.
int NumThreads = 0;
/// Number of teams.
int NumTeams = 0;
/// Loop trip count supplied at launch time (0 by default).
int LoopTripCount = 0;
/// Number of arguments.
int NumArgs = 0;
/// Pointer to the kernel arguments.
uintptr_t *Args = nullptr;
/// If the kernel is specialized, an id will be assigned.
uintptr_t Id = 0;
// Only create() may construct a Kernel.
Kernel() = default;
public:
/// Build a Kernel description from the raw launch parameters the plugin
/// receives; defined in JIT.cpp.
static Kernel create(__tgt_device_image *Image, const char *Name,
const std::string &MCpu, void **Args, int NumArgs,
int NumTeams, int NumThreads, int LoopTripCount);
const std::string &getName() const { return Name; }
const std::string &getMCpu() const { return MCpu; }
int getNumThreads() const { return NumThreads; }
int getNumTeams() const { return NumTeams; }
/// Return argument \p Index as an integer value.
/// NOTE(review): negative Index is not checked — callers must pass >= 0.
uintptr_t getArg(int Index) const {
assert(Index < NumArgs && "out of range access");
return Args[Index];
}
int getNumArgs() const { return NumArgs; }
friend class JITEngine;
};
/// Interface a plugin implements to post-process JIT codegen output into a
/// loadable device image (e.g. linking with lld for amdgcn, or a plain
/// read-back of PTX for nvptx64).
class DeviceToolChain {
public:
  // Polymorphic base: a virtual destructor is required so deleting a derived
  // tool chain through a DeviceToolChain pointer is well-defined.
  virtual ~DeviceToolChain() = default;

  /// Transform the codegen output in \p FileName for the device described by
  /// \p DI; returns the final image, or nullptr on failure.
  virtual std::unique_ptr<llvm::MemoryBuffer> run(const std::string &FileName,
                                                  const DeviceInfo &DI) = 0;
};
/// Central JIT driver shared by the plugins: owns per-device info, the image
/// cache, per-device target-table caches, and per-kernel statistics.
class JITEngine {
/// Target architecture string, e.g. "amdgcn" or "nvptx64".
const std::string Arch;
int NumDevices = 0;
/// Plugin-provided post-processing tool chain (not owned).
DeviceToolChain &DTC;
/// Per-device device descriptions, indexed by device id.
std::vector<DeviceInfo> DI;
/// Cache of JIT-produced images.
std::unique_ptr<impl::ImageCache> IC;
/// Per-device target table caches, indexed by device id.
std::vector<std::unique_ptr<impl::TargetTableCache>> TTC;
/// Thread-safe map from kernel name to its specialization statistics.
class StatisticMap {
std::unordered_map<std::string,
std::unique_ptr<impl::SpecializationStatistics>>
Map;
std::mutex Mtx;
public:
/// Return the statistics for kernel \p K, creating them on first use.
impl::SpecializationStatistics &get(const std::string &K, int NumArgs) {
std::lock_guard<std::mutex> LG(Mtx);
auto Itr = Map.find(K);
if (Itr != Map.end())
return *Itr->second;
auto R = Map.insert(
{K, std::make_unique<impl::SpecializationStatistics>(K, NumArgs)});
return *R.first->second;
}
} Statistics;
public:
JITEngine(const char *A, DeviceToolChain &DTC, int NumDevices);
/// Record \p D for device \p DeviceId and set up its target table cache.
/// Returns false when DeviceId is out of range.
/// NOTE(review): negative DeviceId is not rejected — confirm callers.
bool init(int DeviceId, const DeviceInfo &D) {
if (DeviceId >= NumDevices)
return false;
DI[DeviceId] = D;
TTC[DeviceId] = std::make_unique<impl::TargetTableCache>();
return true;
}
/// Look up the target table cache. Return nullptr if there is no cache match
/// for that specific kernel.
__tgt_target_table *getTargetTable(int DeviceId, const Kernel &K);
/// Get the device image.
__tgt_device_image *getImage(int DeviceId, Kernel &K,
__tgt_device_image *Image);
/// Get the device image without any kernel specialization.
__tgt_device_image *getImage(int DeviceId, __tgt_device_image *Image);
/// Cache \p Table as the result for kernel \p K on device \p DeviceId.
bool insertTargetTable(int DeviceId, const Kernel &K,
__tgt_target_table *Table);
/// Whether \p Image contains a module this JIT can handle for \p Arch.
static bool isValidModule(const std::string &Arch, __tgt_device_image *Image);
/// Whether \p Image supports kernel specialization.
static bool isSpecializationSupported(__tgt_device_image *Image);
/// One-time global initialization (e.g. target registration); see JIT.cpp.
static void init();
};
} // namespace jit

View File

@@ -37,17 +37,38 @@ if (LIBOMPTARGET_DEP_CUDA_FOUND AND LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
set(LIBOMPTARGET_CAN_LINK_LIBCUDA TRUE)
endif()
set(LLVM_LINK_COMPONENTS
AllTargetsAsmParsers
AllTargetsCodeGens
AllTargetsDescs
AllTargetsInfos
LTO
)
set(src_files src/rtl.cpp)
if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
libomptarget_say("Building CUDA plugin linked against libcuda")
include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
set (LIBOMPTARGET_DEP_LIBRARIES ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES})
set(dependences ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES})
else()
libomptarget_say("Building CUDA plugin for dlopened libcuda")
include_directories(dynamic_cuda)
add_library(omptarget.rtl.cuda SHARED src/rtl.cpp dynamic_cuda/cuda.cpp)
set (LIBOMPTARGET_DEP_LIBRARIES ${CMAKE_DL_LIBS})
list(APPEND src_files dynamic_cuda/cuda.cpp)
set(dependences ${CMAKE_DL_LIBS})
endif()
add_llvm_library(omptarget.rtl.cuda SHARED ${src_files}
LINK_LIBS elf_common
MemoryManager
${LIBOMPTARGET_DEP_LIBRARIES}
${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
${OPENMP_PTHREAD_LIB}
${dependences}
LLVM-LIBOMPTARGET-JIT
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
"-Wl,-z,defs")
add_dependencies(omptarget.rtl.cuda omptarget.devicertl.nvptx)
# Install plugin under the lib destination folder.
@@ -58,15 +79,6 @@ target_include_directories(omptarget.rtl.cuda PRIVATE
${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}
)
target_link_libraries(omptarget.rtl.cuda
elf_common
MemoryManager
${LIBOMPTARGET_DEP_LIBRARIES}
${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
${OPENMP_PTHREAD_LIB}
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
"-Wl,-z,defs")
# Report to the parent scope that we are building a plugin for CUDA.
# This controls whether tests are run for the nvptx offloading target
# Run them if libcuda is available, or if the user explicitly asked for dlopen

View File

@@ -13,12 +13,17 @@
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <cuda.h>
#include <fstream>
#include <iostream>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "Debug.h"
@@ -31,7 +36,12 @@
#include "MemoryManager.h"
#include "JIT.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Support/MemoryBuffer.h"
using llvm::MemoryBuffer;
// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
@@ -91,6 +101,24 @@ struct KernelTy {
};
namespace {
std::unique_ptr<jit::JITEngine> JITEngine;
/// Nvidia device tool chain for the JIT: no external link step is needed,
/// the JIT-emitted PTX file is simply read back into a memory buffer.
///
/// Fixes relative to the previous version:
///  - the PTX debug dump used fprintf("%s") on a buffer requested with
///    RequiresNullTerminator=false, risking an out-of-bounds read; it now
///    writes exactly getBufferSize() bytes;
///  - the unused local holding the getenv() result is removed.
class NVDeviceToolChain : public jit::DeviceToolChain {
public:
  std::unique_ptr<llvm::MemoryBuffer> run(const std::string &FileName,
                                          const jit::DeviceInfo &DI) override {
    auto MBOrError = llvm::MemoryBuffer::getFile(
        FileName, /*IsText=*/true, /*RequiresNullTerminator=*/false);
    if (!MBOrError)
      return nullptr;
    // Optional debugging aid: dump the generated PTX to stderr.
    if (getenv("LIBOMPTARGET_JIT_DUMP_ASM")) {
      fprintf(stderr, ">>> ptx:\n");
      fwrite((*MBOrError)->getBufferStart(), 1, (*MBOrError)->getBufferSize(),
             stderr);
      fprintf(stderr, "\n");
    }
    return std::move(*MBOrError);
  }
} NVDTC;
std::unordered_set<void *> NonSpecializedImages;
bool checkResult(CUresult Err, const char *ErrMsg) {
if (Err == CUDA_SUCCESS)
return true;
@@ -158,9 +186,20 @@ struct DeviceDataTy {
int ThreadsPerBlock = 0;
int BlocksPerGrid = 0;
int WarpSize = 0;
// Maximum number of registers available per block
int MaxRegisters = 0;
// OpenMP properties
int NumTeams = 0;
int NumThreads = 0;
struct ComputeCapabilityTy {
int Major = 3;
int Minor = 5;
std::string toString() const { return "sm_" + std::to_string(toInt()); }
int toInt() const { return Major * 10 + Minor; }
} ComputeCapability;
};
/// Resource allocator where \p T is the resource type.
@@ -471,7 +510,6 @@ class DeviceRTLTy {
E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
}
public:
CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
assert(AsyncInfo && "AsyncInfo is nullptr");
@@ -486,6 +524,40 @@ public:
return reinterpret_cast<CUstream>(AsyncInfo->Queue);
}
/// Ask the JIT engine for a compiled (non-specialized) device image for
/// \p Image on device \p DeviceId; returns nullptr on failure.
__tgt_device_image *loadJITImage(int DeviceId, __tgt_device_image *Image) {
return JITEngine->getImage(DeviceId, Image);
}
/// Resolve a JIT-compiled target table for the kernel described by \p Entry
/// with the given launch configuration, compiling and loading a new image on
/// demand and caching the result. Returns nullptr on failure.
///
/// Fix: the debug message on the cache-hit path claimed the lookup failed
/// ("couldn't find cached target table") — it now reports the hit.
__tgt_target_table *loadJITImage(int DeviceId, __tgt_device_image *Image,
                                 __tgt_offload_entry *Entry, void **Args,
                                 int NumArgs, int TeamNum, int ThreadLimit,
                                 int LoopTripCount) {
  auto Kernel = jit::Kernel::create(
      Image, Entry->name, DeviceData[DeviceId].ComputeCapability.toString(),
      Args, NumArgs, TeamNum, ThreadLimit, LoopTripCount);
  if (auto *TT = JITEngine->getTargetTable(DeviceId, Kernel)) {
    DP("found cached target table for kernel entry " DPxMOD ".\n",
       DPxPTR(Entry));
    return TT;
  }
  auto *NewImage = JITEngine->getImage(DeviceId, Kernel, Image);
  if (!NewImage) {
    DP("failed to jit image for kernel entry " DPxMOD ".\n", DPxPTR(Entry));
    return nullptr;
  }
  // Load the freshly generated image and remember its table for later
  // launches with a matching configuration.
  auto *TT = loadBinary(DeviceId, NewImage);
  if (!TT)
    return nullptr;
  if (!JITEngine->insertTargetTable(DeviceId, Kernel, TT))
    return nullptr;
  return TT;
}
public:
// This class should not be copied
DeviceRTLTy(const DeviceRTLTy &) = delete;
DeviceRTLTy(DeviceRTLTy &&) = delete;
@@ -749,6 +821,50 @@ public:
DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
}
// Get compute capability
int SM;
Err = cuDeviceGetAttribute(
&SM, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
if (Err != CUDA_SUCCESS) {
DP("Error getting compute capablity major, use default value %d\n",
DeviceData[DeviceId].ComputeCapability.Major);
} else {
DeviceData[DeviceId].ComputeCapability.Major = SM;
}
Err = cuDeviceGetAttribute(
&SM, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
if (Err != CUDA_SUCCESS) {
DP("Error getting compute capablity minor, use default value %d\n",
DeviceData[DeviceId].ComputeCapability.Minor);
} else {
DeviceData[DeviceId].ComputeCapability.Minor = SM;
}
int MaxRegs;
Err = cuDeviceGetAttribute(
&MaxRegs, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device);
if (Err != CUDA_SUCCESS) {
DP("Error getting max registers per block, use default value %d\n",
DeviceData[DeviceId].MaxRegisters);
} else {
DeviceData[DeviceId].MaxRegisters = MaxRegs;
}
if (JITEngine) {
jit::DeviceInfo DI;
DI.Arch = "nvptx64";
DI.MCpu = DeviceData[DeviceId].ComputeCapability.toString();
DI.MaxNumRegs = DeviceData[DeviceId].MaxRegisters;
DI.ThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
DI.BlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
DI.WarpSize = 32;
DI.NumThreads = DeviceData[DeviceId].NumThreads;
DI.NumTeams = DeviceData[DeviceId].NumTeams;
DI.EnvNumThreads = EnvTeamThreadLimit;
DI.EnvNumTeams = EnvNumTeams;
JITEngine->init(DeviceId, DI);
}
return OFFLOAD_SUCCESS;
}
@@ -790,14 +906,24 @@ public:
__tgt_target_table *loadBinary(const int DeviceId,
const __tgt_device_image *Image) {
void *ImageStart = Image->ImageStart;
if (NonSpecializedImages.find(ImageStart) != NonSpecializedImages.end()) {
auto *NewImage =
loadJITImage(DeviceId, const_cast<__tgt_device_image *>(Image));
if (!NewImage)
return nullptr;
ImageStart = NewImage->ImageStart;
}
// Clear the offload table as we are going to create a new one.
clearOffloadEntriesTable(DeviceId);
// Create the module and extract the function pointers.
CUmodule Module;
DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
DP("Load data from image " DPxMOD "\n", DPxPTR(ImageStart));
CUresult Err =
cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
cuModuleLoadDataEx(&Module, ImageStart, 0, nullptr, nullptr);
if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
return nullptr;
@@ -1073,7 +1199,7 @@ public:
ptrdiff_t *TgtOffsets, const int ArgNum,
const int TeamNum, const int ThreadLimit,
const unsigned int LoopTripCount,
__tgt_async_info *AsyncInfo) const {
__tgt_async_info *AsyncInfo) {
// All args are references.
std::vector<void *> Args(ArgNum);
std::vector<void *> Ptrs(ArgNum);
@@ -1083,7 +1209,27 @@ public:
Args[I] = &Ptrs[I];
}
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
auto LaunchEntry =
reinterpret_cast<__tgt_kernel_launch_entry *>(TgtEntryPtr);
KernelTy *KernelInfo =
reinterpret_cast<KernelTy *>(LaunchEntry->TargetEntry);
// If kernel info is nullptr, it means we are dealing with JIT image.
if (KernelInfo == nullptr) {
assert(LaunchEntry->Image && LaunchEntry->HostEntry);
__tgt_device_image NewImage = *(LaunchEntry->Image);
NewImage.EntriesBegin = LaunchEntry->HostEntry;
NewImage.EntriesEnd = NewImage.EntriesBegin + 1;
auto TargetTable =
loadJITImage(DeviceId, &NewImage, LaunchEntry->HostEntry, Ptrs.data(),
ArgNum, TeamNum, ThreadLimit, LoopTripCount);
if (!TargetTable)
return OFFLOAD_FAIL;
KernelInfo =
reinterpret_cast<KernelTy *>(TargetTable->EntriesBegin->addr);
}
assert(KernelInfo && "KernelInfo should not be nullptr");
const bool IsSPMDGenericMode =
KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
@@ -1484,7 +1630,24 @@ extern "C" {
#endif
// Report whether this plugin can handle the given device image.
// Return codes (ad-hoc; the in-tree TODO notes this should become an enum):
//   0 - image is not compatible with this RTL
//   1 - pre-built CUDA ELF image (EM_CUDA), loadable as-is
//   2 - JIT bitcode image that supports kernel specialization
//   3 - JIT bitcode image without specialization support (JITted eagerly)
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  // Fix: the original body had an unconditional
  //   return elf_check_machine(image, /* EM_CUDA */ 190);
  // as its first statement, which made the entire JIT detection path below
  // unreachable dead code. A prebuilt ELF image is reported as kind 1 here
  // instead, so bitcode images still fall through to the JIT checks.
  if (elf_check_machine(image, /* EM_CUDA */ 190))
    return 1;

  // Lazily bring up the JIT infrastructure the first time a non-ELF image
  // is probed. NVDTC is presumably the nvptx64 toolchain descriptor —
  // declared elsewhere in this file.
  jit::JITEngine::init();
  if (!JITEngine)
    JITEngine = std::make_unique<jit::JITEngine>("nvptx64", NVDTC,
                                                 DeviceRTL.getNumOfDevices());

  // Not an ELF image and not valid bitcode for this target: reject.
  if (!jit::JITEngine::isValidModule("nvptx64", image))
    return 0;

  if (jit::JITEngine::isSpecializationSupported(image))
    return 2;

  // Valid bitcode without specialization support: remember its start address
  // so loadBinary() knows to JIT-compile it up front rather than per kernel.
  NonSpecializedImages.insert(image->ImageStart);
  return 3;
}
// Number of CUDA devices the plugin discovered during initialization.
int32_t __tgt_rtl_number_of_devices() {
  return DeviceRTL.getNumOfDevices();
}

View File

@@ -19,7 +19,10 @@
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <memory>
#include <mutex>
#include <vector>
////////////////////////////////////////////////////////////////////////////////
/// adds requires flags
@@ -40,6 +43,7 @@ EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) {
}
}
}
// PM->RTLs.RegisterLib(createBinDescFrom(desc));
PM->RTLs.RegisterLib(desc);
}

View File

@@ -107,6 +107,13 @@ static int InitLibrary(DeviceTy &Device) {
rc = OFFLOAD_FAIL;
break;
}
const bool IsJITImage =
RegisteredJITImages.find(img) != RegisteredJITImages.end();
if (IsJITImage)
continue;
// 2) load image into the target table.
__tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
Device.load_binary(img);
@@ -1500,16 +1507,6 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
return OFFLOAD_FAIL;
}
// get target table.
__tgt_target_table *TargetTable = nullptr;
{
std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
"Not expecting a device ID outside the table's bounds!");
TargetTable = TM->Table->TargetsTable[DeviceId];
}
assert(TargetTable && "Global data has not been mapped\n");
// We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
// need to manifest base pointers prior to launching a kernel. Even if we have
// mapped an object only partially, e.g. A[N:M], although the kernel is
@@ -1536,11 +1533,42 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
}
}
// Launch device execution.
void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
__tgt_device_image *Image = TM->Table->TargetsImages[Device.DeviceID];
const bool UseJIT =
RegisteredJITImages.find(Image) != RegisteredJITImages.end();
void *TgtEntryPtr = nullptr;
__tgt_kernel_launch_entry LaunchEntry;
// get target table if in non-JIT mode.
if (UseJIT) {
__tgt_offload_entry *Entry = nullptr;
__tgt_target_table *HostTable = &TM->Table->HostTable;
// Find the entry name from the host entries
// TODO: We might want a map for this
for (auto Itr = HostTable->EntriesBegin; Itr != HostTable->EntriesEnd;
++Itr)
if (Itr->addr == HostPtr) {
Entry = Itr;
break;
}
assert(Entry && "cannot find entry");
LaunchEntry.HostEntry = Entry;
LaunchEntry.Image = Image;
DP("Launching target jit execution %s with pointer " DPxMOD ".\n",
Entry->name, DPxPTR(TgtEntryPtr));
} else {
std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
"Not expecting a device ID outside the table's bounds!");
__tgt_target_table *TargetTable = TM->Table->TargetsTable[DeviceId];
assert(TargetTable && "Global data has not been mapped\n");
LaunchEntry.TargetEntry = TargetTable->EntriesBegin[TM->Index].addr;
DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr),
TM->Index);
}
TgtEntryPtr = &LaunchEntry;
// Launch device execution.
{
TIMESCOPE_WITH_NAME_AND_IDENT(
IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);

View File

@@ -38,6 +38,8 @@ PluginManager *PM;
static char *ProfileTraceFile = nullptr;
#endif
std::unordered_set<__tgt_device_image *> RegisteredJITImages;
__attribute__((constructor(101))) void init() {
DP("Init target library!\n");
@@ -250,8 +252,7 @@ static void RegisterImageIntoTranslationTable(TranslationTable &TT,
static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
__tgt_device_image *img,
RTLInfoTy *RTL) {
RTLInfoTy *RTL, bool IsJITImage) {
for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) {
DeviceTy &Device = *PM->Devices[RTL->Idx + i];
Device.PendingGlobalsMtx.lock();
@@ -261,13 +262,21 @@ static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
if (entry->flags & OMP_DECLARE_TARGET_CTOR) {
DP("Adding ctor " DPxMOD " to the pending list.\n",
DPxPTR(entry->addr));
Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr);
if (IsJITImage)
Device.PendingCtorsDtors[desc].PendingJITCtors[img].push_back(
entry->addr);
else
Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr);
} else if (entry->flags & OMP_DECLARE_TARGET_DTOR) {
// Dtors are pushed in reverse order so they are executed from end
// to beginning when unregistering the library!
DP("Adding dtor " DPxMOD " to the pending list.\n",
DPxPTR(entry->addr));
Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr);
if (IsJITImage)
Device.PendingCtorsDtors[desc].PendingJITDtors[img].push_front(
entry->addr);
else
Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr);
}
if (entry->flags & OMP_DECLARE_TARGET_LINK) {
@@ -363,14 +372,21 @@ void RTLsTy::RegisterLib(__tgt_bin_desc *desc) {
// Scan the RTLs that have associated images until we find one that supports
// the current image.
for (auto &R : AllRTLs) {
if (!R.is_valid_binary(img)) {
int Ret = R.is_valid_binary(img);
if (Ret == 0) {
DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
DPxPTR(img->ImageStart), R.RTLName.c_str());
continue;
}
DP("Image " DPxMOD " is compatible with RTL %s!\n",
DPxPTR(img->ImageStart), R.RTLName.c_str());
// TODO: should use enum here.
const bool IsJITImage = Ret == 2;
DP("%sImage " DPxMOD " is compatible with RTL %s!\n",
IsJITImage ? "JIT " : "", DPxPTR(img->ImageStart), R.RTLName.c_str());
if (IsJITImage)
RegisteredJITImages.insert(img);
initRTLonce(R);
@@ -395,7 +411,7 @@ void RTLsTy::RegisterLib(__tgt_bin_desc *desc) {
FoundRTL = &R;
// Load ctors/dtors for static objects
RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL);
RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL, IsJITImage);
// if an RTL was found we are done - proceed to register the next image
break;
@@ -427,6 +443,9 @@ void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) {
assert(R->isUsed && "Expecting used RTLs.");
// FIXME: This is WRONG!!!
continue;
if (!R->is_valid_binary(img)) {
DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n",
DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));