Compare commits

...

1 Commit

Author       SHA1        Message                                                       Date
Shilei Tian  92fc907ea7  [OpenMP] Add the initial support for direct gpu compilation  2023-06-08 22:48:46 -04:00
59 changed files with 5897 additions and 196 deletions

auto-host-rpc/device.ll (new file, 65 lines)
View File

@@ -0,0 +1,65 @@
; ModuleID = 'test-openmp-nvptx64-nvidia-cuda-sm_75.bc'
source_filename = "/home/shiltian/Documents/vscode/llvm-project/auto-host-rpc/test.c"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.ddd = type { i32, i32, float }
@__omp_rtl_debug_kind = weak_odr hidden local_unnamed_addr constant i32 0
@__omp_rtl_assume_teams_oversubscription = weak_odr hidden local_unnamed_addr constant i32 0
@__omp_rtl_assume_threads_oversubscription = weak_odr hidden local_unnamed_addr constant i32 0
@__omp_rtl_assume_no_thread_state = weak_odr hidden local_unnamed_addr constant i32 0
@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden local_unnamed_addr constant i32 0
@.str = private unnamed_addr constant [9 x i8] c"main.cpp\00", align 1
@.str1 = private unnamed_addr constant [2 x i8] c"r\00", align 1
@.str2 = private unnamed_addr constant [3 x i8] c"%d\00", align 1
@.str3 = private unnamed_addr constant [7 x i8] c"%f%d%s\00", align 1
@.str4 = private unnamed_addr constant [6 x i8] c"hello\00", align 1
; Function Attrs: convergent nounwind
define hidden void @foo() local_unnamed_addr #0 {
entry:
%d = tail call align 16 dereferenceable_or_null(12) ptr @__kmpc_alloc_shared(i64 12) #4
%call = tail call noalias ptr @fopen(ptr noundef nonnull @.str, ptr noundef nonnull @.str1) #5
%call1 = tail call i32 (ptr, ptr, ...) @fprintf(ptr noundef %call, ptr noundef nonnull @.str2, i32 noundef 6) #5
%call2 = tail call i32 (ptr, ptr, ...) @fprintf(ptr noundef %call, ptr noundef nonnull @.str3, double noundef 6.000000e+00, i32 noundef 1, ptr noundef nonnull @.str4) #5
%a = getelementptr inbounds %struct.ddd, ptr %d, i64 0, i32 1
%call3 = tail call i32 (ptr, ptr, ...) @fscanf(ptr noundef %call, ptr noundef nonnull @.str2, ptr noundef nonnull %a) #5
tail call void @__kmpc_free_shared(ptr %d, i64 12)
ret void
}
; Function Attrs: nofree nosync nounwind allocsize(0)
declare ptr @__kmpc_alloc_shared(i64) local_unnamed_addr #1
; Function Attrs: convergent
declare noalias ptr @fopen(ptr noundef, ptr noundef) local_unnamed_addr #2
; Function Attrs: convergent
declare i32 @fprintf(ptr noundef, ptr noundef, ...) local_unnamed_addr #2
; Function Attrs: convergent
declare i32 @fscanf(ptr noundef, ptr noundef, ...) local_unnamed_addr #2
; Function Attrs: nosync nounwind
declare void @__kmpc_free_shared(ptr allocptr nocapture, i64) local_unnamed_addr #3
attributes #0 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx77,+sm_75" }
attributes #1 = { nofree nosync nounwind allocsize(0) }
attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx77,+sm_75" }
attributes #3 = { nosync nounwind }
attributes #4 = { nounwind }
attributes #5 = { convergent nounwind }
!llvm.module.flags = !{!0, !1, !2, !3, !4, !5}
!llvm.ident = !{!6, !7}
!nvvm.annotations = !{}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 7]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 7, !"openmp", i32 50}
!3 = !{i32 7, !"openmp-device", i32 50}
!4 = !{i32 8, !"PIC Level", i32 2}
!5 = !{i32 7, !"frame-pointer", i32 2}
!6 = !{!"clang version 16.0.0"}
!7 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
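
Note: this is the pre-transformation device IR; fopen, fprintf, and fscanf are still bare declarations on an nvptx64 target that has no such library. For orientation, here is a hedged C++-level sketch of the device wrapper that the rewrite in main.cpp (next file) generates for the fopen call, following the device_wrapper protocol comment in AutoHostRPC. The __kmpc_host_rpc_* declarations are externs standing in for the runtime entry points this patch assumes:

#include <cstdint>

extern "C" {
// Stand-ins for the device runtime entry points assumed by this patch.
void *__kmpc_host_rpc_get_desc(int32_t call_no, int32_t num_args, void *arg_info);
void __kmpc_host_rpc_add_arg(void *desc, int64_t value, int64_t size);
void __kmpc_host_rpc_send_and_wait(void *desc);
int64_t __kmpc_host_rpc_get_ret_val(void *desc);
}

// Sketch of the generated device wrapper for fopen: every argument is packed
// into an i64, the descriptor is shipped to the host, and the i64 result is
// converted back to the callee's return type.
extern "C" void *__kmpc_host_rpc_wrapper_fopen(int32_t call_no, void *arg_info,
                                               const char *path,
                                               const char *mode) {
  void *desc = __kmpc_host_rpc_get_desc(call_no, /*num_args=*/2, arg_info);
  __kmpc_host_rpc_add_arg(desc, (int64_t)(uintptr_t)path, /*size=*/0);
  __kmpc_host_rpc_add_arg(desc, (int64_t)(uintptr_t)mode, /*size=*/0);
  __kmpc_host_rpc_send_and_wait(desc);
  return (void *)(uintptr_t)__kmpc_host_rpc_get_ret_val(desc);
}

Note that getDeviceWrapperFunction passes 0 as the size operand of every add_arg call; per-argument sizes travel separately in the arg_info table.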

auto-host-rpc/main.cpp (new file, 560 lines)
View File

@@ -0,0 +1,560 @@
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetOptions.h"
#include <string>
using namespace llvm;
namespace {
static LLVMContext Context;
static codegen::RegisterCodeGenFlags RCGF;
static constexpr const char *InternalPrefix[] = {"__kmp", "llvm.", "nvvm.",
"omp_"};
std::string typeToString(Type *T) {
if (T->is16bitFPTy())
return "f16";
if (T->isFloatTy())
return "f32";
if (T->isDoubleTy())
return "f64";
if (T->isPointerTy())
return "ptr";
if (T->isStructTy())
return std::string(T->getStructName());
if (T->isIntegerTy())
return "i" + std::to_string(T->getIntegerBitWidth());
llvm_unreachable("unknown type");
}
} // namespace
namespace llvm {
enum class HostRPCRuntimeFunction {
#define __OMPRTL_HOST_RPC(_ENUM) OMPRTL_##_ENUM
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_ret_val),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val),
__OMPRTL_HOST_RPC(__last),
#undef __OMPRTL_HOST_RPC
};
#define __OMPRTL_HOST_RPC(_ENUM) \
auto OMPRTL_##_ENUM = HostRPCRuntimeFunction::OMPRTL_##_ENUM;
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_ret_val)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val)
#undef __OMPRTL_HOST_RPC
enum OMPHostRPCArgType {
// No need to copy.
OMP_HOST_RPC_ARG_SCALAR = 0,
OMP_HOST_RPC_ARG_PTR = 1,
// Copy to host.
OMP_HOST_RPC_ARG_PTR_COPY_TO = 2,
// Copy to device
OMP_HOST_RPC_ARG_PTR_COPY_FROM = 3,
// TODO: Do we have a tofrom pointer?
OMP_HOST_RPC_ARG_PTR_COPY_TOFROM = 4,
};
// struct HostRPCArgInfo {
// // OMPHostRPCArgType
// int64_t Type;
// int64_t Size;
// };
class AutoHostRPC {
LLVMContext &Context;
// Device module
Module &DM;
// Host module
Module &HM;
// Types
Type *Int8PtrTy;
Type *VoidTy;
Type *Int32Ty;
Type *Int64Ty;
StructType *ArgInfoTy;
struct CallSiteInfo {
CallInst *CI = nullptr;
SmallVector<Type *> Params;
};
struct HostRPCArgInfo {
// OMPHostRPCArgType
Constant *Type;
Value *Size;
};
//
SmallVector<Function *> HostEntryTable;
EnumeratedArray<Function *, HostRPCRuntimeFunction,
HostRPCRuntimeFunction::OMPRTL___last>
RFIs;
static std::string getWrapperFunctionName(Function *F, CallSiteInfo &CSI) {
std::string Name = "__kmpc_host_rpc_wrapper_" + std::string(F->getName());
if (!F->isVarArg())
return Name;
for (unsigned I = F->getFunctionType()->getNumParams();
I < CSI.Params.size(); ++I) {
Name.push_back('_');
Name.append(typeToString(CSI.Params[I]));
}
return Name;
}
static bool isInternalFunction(Function &F) {
auto Name = F.getName();
for (auto P : InternalPrefix)
if (Name.startswith(P))
return true;
return false;
}
Value *convertToInt64Ty(IRBuilder<> &Builder, Value *V);
Value *convertFromInt64TyTo(IRBuilder<> &Builder, Value *V, Type *TargetTy);
// int device_wrapper(call_no, arg_info, ...) {
// void *desc = __kmpc_host_rpc_get_desc(call_no, num_args, arg_info);
// __kmpc_host_rpc_add_arg(desc, arg1, sizeof(arg1));
// __kmpc_host_rpc_add_arg(desc, arg2, sizeof(arg2));
// ...
// __kmpc_host_rpc_send_and_wait(desc);
// int r = (int)__kmpc_host_rpc_get_ret_val(desc);
// return r;
// }
Function *getDeviceWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
// void host_wrapper(desc) {
// int arg1 = (int)__kmpc_host_rpc_get_arg(desc, 0);
// float arg2 = (float)__kmpc_host_rpc_get_arg(desc, 1);
// char *arg3 = (char *)__kmpc_host_rpc_get_arg(desc, 2);
// ...
// int r = actual_call(arg1, arg2, arg3, ...);
// __kmpc_host_rpc_set_ret_val(desc, (int64_t)r);
// }
Function *getHostWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
bool rewriteWithHostRPC(Function *F);
public:
AutoHostRPC(Module &DeviceModule, Module &HostModule)
: Context(DeviceModule.getContext()), DM(DeviceModule), HM(HostModule) {
assert(&DeviceModule.getContext() == &HostModule.getContext() &&
"device and host modules have different contexts");
#define __OMP_TYPE(TYPE) TYPE = Type::get##TYPE(Context)
__OMP_TYPE(Int8PtrTy);
__OMP_TYPE(VoidTy);
__OMP_TYPE(Int32Ty);
__OMP_TYPE(Int64Ty);
#undef __OMP_TYPE
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...) \
{ \
SmallVector<Type *> Params{__VA_ARGS__}; \
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
RFIs[OMPRTL_##_ENUM] = Function::Create( \
FT, GlobalValue::LinkageTypes::InternalLinkage, #_ENUM, MOD); \
}
__OMP_RTL(__kmpc_host_rpc_get_desc, DM, false, Int8PtrTy, Int32Ty, Int32Ty,
Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_add_arg, DM, false, VoidTy, Int8PtrTy, Int64Ty,
Int64Ty)
__OMP_RTL(__kmpc_host_rpc_send_and_wait, DM, false, VoidTy, Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_get_ret_val, DM, false, Int64Ty, Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
__OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
Int64Ty)
#undef __OMP_RTL
ArgInfoTy = StructType::create({Int64Ty, Int64Ty}, "struct.arg_info_t");
}
bool run();
};
Value *AutoHostRPC::convertToInt64Ty(IRBuilder<> &Builder, Value *V) {
Type *T = V->getType();
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreatePtrToInt(V, Int64Ty);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, Int64Ty, false);
if (T->isFloatingPointTy()) {
if (T->isFloatTy()) {
// After FPToSI the value is an i32, so widen it with an integer cast;
// a second FPToSI on an integer operand would be invalid IR.
V = Builder.CreateFPToSI(V, Int32Ty);
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
}
return Builder.CreateFPToSI(V, Int64Ty);
}
llvm_unreachable("unknown cast to int64_t");
}
Value *AutoHostRPC::convertFromInt64TyTo(IRBuilder<> &Builder, Value *V,
Type *T) {
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreateIntToPtr(V, T);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, T, /* isSigned */ true);
if (T->isFloatingPointTy()) {
if (T->isFloatTy())
V = Builder.CreateIntCast(V, Int32Ty, /* isSigned */ true);
V = Builder.CreateSIToFP(V, T);
return V;
}
llvm_unreachable("unknown cast from int64_t");
}
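// Illustrative round trip under this encoding: a device-side float 3.5f is
// narrowed with fptosi to i32 3, widened to i64, and the host wrapper
// reconstructs it with sitofp as 3.0f, so the fractional part is dropped.
// A bitcast-based packing would be lossless; the symmetric FPToSI/SIToFP
// pair is simply what this prototype chose.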
bool AutoHostRPC::run() {
bool Changed = false;
SmallVector<Function *> WorkList;
for (Function &F : DM) {
// If the function is already defined, it definitely does not require RPC.
if (!F.isDeclaration())
continue;
// If it is an internal function, skip it as well.
if (isInternalFunction(F))
continue;
// If there is no use of the function, skip it.
if (F.use_empty())
continue;
WorkList.push_back(&F);
}
if (WorkList.empty())
return Changed;
for (Function *F : WorkList)
Changed |= rewriteWithHostRPC(F);
return Changed;
}
bool AutoHostRPC::rewriteWithHostRPC(Function *F) {
bool Changed = false;
SmallVector<CallInst *> WorkList;
for (User *U : F->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
WorkList.push_back(CI);
}
if (WorkList.empty())
return Changed;
for (CallInst *CI : WorkList) {
CallSiteInfo CSI;
CSI.CI = CI;
unsigned NumArgs = CI->arg_size();
for (unsigned I = 0; I < NumArgs; ++I)
CSI.Params.push_back(CI->getArgOperand(I)->getType());
std::string WrapperName = getWrapperFunctionName(F, CSI);
Function *DeviceWrapperFn = getDeviceWrapperFunction(WrapperName, F, CSI);
Function *HostWrapperFn = getHostWrapperFunction(WrapperName, F, CSI);
int32_t WrapperNumber = -1;
for (unsigned I = 0; I < HostEntryTable.size(); ++I) {
if (HostEntryTable[I] == HostWrapperFn) {
WrapperNumber = I;
break;
}
}
if (WrapperNumber == -1) {
WrapperNumber = HostEntryTable.size();
HostEntryTable.push_back(HostWrapperFn);
}
DataLayout DL = DM.getDataLayout();
IRBuilder<> Builder(CI);
auto CheckIfIdentifierPtr = [](const Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
// Indirect calls have no statically known callee.
Function *Callee = CI->getCalledFunction();
if (!Callee)
return false;
return Callee->getName().startswith("__kmpc_host_rpc_wrapper_");
};
auto CheckIfAlloca = [](const Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
Function *Callee = CI->getCalledFunction();
if (!Callee)
return false;
return Callee->getName() == "__kmpc_alloc_shared" ||
Callee->getName() == "malloc";
};
SmallVector<HostRPCArgInfo> ArgInfos;
bool IsConstantArgInfo = true;
for (Value *Op : CI->args()) {
if (!Op->getType()->isPointerTy()) {
HostRPCArgInfo AI{
ConstantInt::get(Int64Ty,
OMPHostRPCArgType::OMP_HOST_RPC_ARG_SCALAR),
ConstantInt::getNullValue(Int64Ty)};
ArgInfos.push_back(std::move(AI));
continue;
}
Value *SizeVal = nullptr;
OMPHostRPCArgType ArgType = OMP_HOST_RPC_ARG_PTR_COPY_TOFROM;
SmallVector<const Value *> Objects;
getUnderlyingObjects(Op, Objects);
// TODO: Handle phi node
if (Objects.size() != 1)
llvm_unreachable("we can't handle phi node yet");
auto *Obj = Objects.front();
if (CheckIfIdentifierPtr(Obj)) {
ArgType = OMP_HOST_RPC_ARG_SCALAR;
SizeVal = ConstantInt::getNullValue(Int64Ty);
} else if (CheckIfAlloca(Obj)) {
auto *CI = dyn_cast<CallInst>(Obj);
SizeVal = CI->getOperand(0);
if (!isa<Constant>(SizeVal))
IsConstantArgInfo = false;
} else {
if (auto *GV = dyn_cast<GlobalVariable>(Obj)) {
SizeVal = ConstantInt::get(Int64Ty,
DL.getTypeStoreSize(GV->getValueType()));
if (GV->isConstant())
ArgType = OMP_HOST_RPC_ARG_PTR_COPY_TO;
if (GV->isConstant() && GV->hasInitializer()) {
// TODO: If the global variable is constant, we can do some
// optimization.
}
} else {
// TODO: fix that when it occurs
llvm_unreachable("cannot handle unknown type");
}
}
HostRPCArgInfo AI{ConstantInt::get(Int64Ty, ArgType), SizeVal};
ArgInfos.push_back(std::move(AI));
}
Value *ArgInfo = nullptr;
if (!IsConstantArgInfo) {
ArgInfo = Builder.CreateAlloca(
ArgInfoTy, ConstantInt::get(Int64Ty, NumArgs), "arg_info");
for (unsigned I = 0; I < NumArgs; ++I) {
Value *AII = GetElementPtrInst::Create(
ArrayType::get(ArgInfoTy, NumArgs), ArgInfo,
{ConstantInt::getNullValue(Int64Ty), ConstantInt::get(Int64Ty, I)});
// Struct field addresses need a leading zero index plus an i32 field index.
Value *AIIType = GetElementPtrInst::Create(
ArgInfoTy, AII,
{ConstantInt::getNullValue(Int64Ty), ConstantInt::get(Int32Ty, 0)});
Value *AIISize = GetElementPtrInst::Create(
ArgInfoTy, AII,
{ConstantInt::getNullValue(Int64Ty), ConstantInt::get(Int32Ty, 1)});
Builder.Insert(AII);
Builder.Insert(AIIType);
Builder.Insert(AIISize);
Builder.CreateStore(ArgInfos[I].Type, AIIType);
Builder.CreateStore(ArgInfos[I].Size, AIISize);
}
} else {
SmallVector<Constant *> ArgInfoInitVar;
for (auto &AI : ArgInfos) {
auto *CS =
ConstantStruct::get(ArgInfoTy, {AI.Type, cast<Constant>(AI.Size)});
ArgInfoInitVar.push_back(CS);
}
Constant *ArgInfoInit = ConstantArray::get(
ArrayType::get(ArgInfoTy, NumArgs), ArgInfoInitVar);
ArgInfo = new GlobalVariable(
DM, ArrayType::get(ArgInfoTy, NumArgs), /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info");
}
SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
ArgInfo};
for (Value *Op : CI->args())
Args.push_back(Op);
CallInst *NewCall = Builder.CreateCall(DeviceWrapperFn, Args);
CI->replaceAllUsesWith(NewCall);
CI->eraseFromParent();
}
F->eraseFromParent();
return true;
}
Function *AutoHostRPC::getDeviceWrapperFunction(StringRef WrapperName,
Function *F,
CallSiteInfo &CSI) {
Function *WrapperFn = DM.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
// return_type device_wrapper(int32_t call_no, void *arg_info, ...)
SmallVector<Type *> Params{Int32Ty, Int8PtrTy};
Params.append(CSI.Params);
Type *RetTy = F->getReturnType();
FunctionType *FT = FunctionType::get(RetTy, Params, /*isVarArg*/ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::WeakODRLinkage,
WrapperName, DM);
// Emit the body of the device wrapper
IRBuilder<> Builder(Context);
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
// skip call_no and arg_info.
constexpr const unsigned NumArgSkipped = 2;
Value *Desc = nullptr;
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
Desc = Builder.CreateCall(
Fn,
{WrapperFn->getArg(0),
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
WrapperFn->getArg(1)},
"desc");
}
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
Value *V = convertToInt64Ty(Builder, WrapperFn->getArg(I));
Builder.CreateCall(Fn, {Desc, V, ConstantInt::getNullValue(Int64Ty)});
}
}
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});
if (RetTy->isVoidTy()) {
Builder.CreateRetVoid();
return WrapperFn;
}
Value *RetVal =
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_ret_val], {Desc});
if (RetTy != RetVal->getType())
RetVal = convertFromInt64TyTo(Builder, RetVal, RetTy);
Builder.CreateRet(RetVal);
return WrapperFn;
}
Function *AutoHostRPC::getHostWrapperFunction(StringRef WrapperName,
Function *F, CallSiteInfo &CSI) {
Function *WrapperFn = HM.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
SmallVector<Type *> Params{Int8PtrTy};
FunctionType *FT = FunctionType::get(VoidTy, Params, /* isVarArg */ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage,
WrapperName, HM);
Value *Desc = WrapperFn->getArg(0);
// Emit the body of the host wrapper
IRBuilder<> Builder(Context);
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
SmallVector<Value *> Args;
for (unsigned I = 0; I < CSI.CI->arg_size(); ++I) {
Value *V = Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_arg],
{Desc, ConstantInt::get(Int32Ty, I)});
Args.push_back(convertFromInt64TyTo(Builder, V, CSI.Params[I]));
}
// The host callee that will be called eventually by the host wrapper.
Function *HostCallee = HM.getFunction(F->getName());
if (!HostCallee)
HostCallee = Function::Create(F->getFunctionType(), F->getLinkage(),
F->getName(), HM);
Value *RetVal = Builder.CreateCall(HostCallee, Args);
// Guard against void callees; convertToInt64Ty cannot handle a void value.
if (!RetVal->getType()->isVoidTy()) {
RetVal = convertToInt64Ty(Builder, RetVal);
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_set_ret_val], {Desc, RetVal});
}
Builder.CreateRetVoid();
return WrapperFn;
}
} // namespace llvm
int main(int argc, char *argv[]) {
InitializeAllTargets();
InitializeAllTargetMCs();
InitializeAllAsmPrinters();
InitializeAllAsmParsers();
SMDiagnostic Err;
std::unique_ptr<Module> DM = parseIRFile("device.ll", Err, Context);
if (!DM)
return 1;
Module HM("host-rpc.bc", Context);
// get the right target triple
HM.setTargetTriple(Triple::normalize("x86_64"));
AutoHostRPC RPC(*DM, HM);
(void)RPC.run();
DM->dump();
// HM.dump();
return 0;
}
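
For symmetry, a hedged sketch of what getHostWrapperFunction emits into the host module for the same fopen call: each i64-encoded argument is unpacked, the real libc function is invoked, and the return value is stored back into the descriptor. Note the pointer arguments cross the device/host boundary by value, which presupposes unified or managed memory (or the copy handling described by arg_info). The runtime declarations are again stand-in externs:

#include <cstdint>
#include <cstdio>

extern "C" {
// Stand-ins for the host runtime entry points assumed by this patch.
int64_t __kmpc_host_rpc_get_arg(void *desc, int32_t index);
void __kmpc_host_rpc_set_ret_val(void *desc, int64_t value);
}

extern "C" void __kmpc_host_rpc_wrapper_fopen(void *desc) {
  // Unpack the i64-encoded arguments in call order.
  const char *path = (const char *)(uintptr_t)__kmpc_host_rpc_get_arg(desc, 0);
  const char *mode = (const char *)(uintptr_t)__kmpc_host_rpc_get_arg(desc, 1);
  // Perform the actual call on the host and hand the result back.
  FILE *f = fopen(path, mode);
  __kmpc_host_rpc_set_ret_val(desc, (int64_t)(uintptr_t)f);
}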

auto-host-rpc/test.c (new file, 19 lines)
View File

@@ -0,0 +1,19 @@
#include <stdio.h>
#pragma omp begin declare target device_type(nohost)
struct ddd {
int num;
int a;
float b;
};
void foo() {
FILE *fp = fopen("main.cpp", "r");
struct ddd d;
fprintf(fp, "%d", 6);
fprintf(fp, "%f%d%s", 6.0f, 1, "hello");
fscanf(fp, "%d", &d.a);
}
#pragma omp end declare target
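
Given typeToString and getWrapperFunctionName in main.cpp above, the two variadic fprintf calls get distinct wrappers, because the trailing argument types are mangled into the name (6.0f is promoted to double by default argument promotion). The expected names, derived from that code rather than quoted from the patch:

// fprintf(fp, "%d", 6)                    -> __kmpc_host_rpc_wrapper_fprintf_i32
// fprintf(fp, "%f%d%s", 6.0f, 1, "hello") -> __kmpc_host_rpc_wrapper_fprintf_f64_i32_ptr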

View File

@@ -253,6 +253,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Globalize to global space for the globalized variables")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")

View File

@@ -2744,6 +2744,10 @@ def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested
HelpText<"Assert no nested parallel regions in the GPU">,
MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
def fopenmp_globalize_to_global_space : Flag<["-"], "fopenmp-globalize-to-global-space">,
HelpText<"Globalize to global space for the globalized variables">,
MarshallingInfoFlag<LangOpts<"OpenMPGlobalizeToGlobalSpace">>;
} // let Group = f_Group
} // let Flags = [CC1Option, FC1Option, NoArgumentUnused, HelpHidden]

View File

@@ -2476,50 +2476,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
(IPD->getParameterKind() == ImplicitParamDecl::ThreadPrivateVar);
}
Address DeclPtr = Address::invalid();
Address AllocaPtr = Address::invalid();
bool DoStore = false;
bool IsScalar = hasScalarEvaluationKind(Ty);
bool UseIndirectDebugAddress = false;
// If we already have a pointer to the argument, reuse the input pointer.
if (Arg.isIndirect()) {
// If we have a prettier pointer type at this point, bitcast to that.
DeclPtr = Arg.getIndirectAddress();
DeclPtr = Builder.CreateElementBitCast(DeclPtr, ConvertTypeForMem(Ty),
D.getName());
// Indirect argument is in alloca address space, which may be different
// from the default address space.
auto AllocaAS = CGM.getASTAllocaAddressSpace();
auto *V = DeclPtr.getPointer();
AllocaPtr = DeclPtr;
// For truly ABI indirect arguments -- those that are not `byval` -- store
// the address of the argument on the stack to preserve debug information.
ABIArgInfo ArgInfo = CurFnInfo->arguments()[ArgNo - 1].info;
if (ArgInfo.isIndirect())
UseIndirectDebugAddress = !ArgInfo.getIndirectByVal();
if (UseIndirectDebugAddress) {
auto PtrTy = getContext().getPointerType(Ty);
AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy),
D.getName() + ".indirect_addr");
EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy);
}
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
auto DestLangAS =
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
if (SrcLangAS != DestLangAS) {
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
CGM.getDataLayout().getAllocaAddrSpace());
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
auto *T = DeclPtr.getElementType()->getPointerTo(DestAS);
DeclPtr =
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
*this, V, SrcLangAS, DestLangAS, T, true),
DeclPtr.isKnownNonNull());
}
auto PushCleanupIfNeeded = [this, Ty, &D](Address DeclPtr) {
// Push a destructor cleanup for this parameter if the ABI requires it.
// Don't push a cleanup in a thunk for a method that will also emit a
// cleanup.
@@ -2535,87 +2492,126 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
EHStack.stable_begin();
}
}
} else {
// Check if the parameter address is controlled by OpenMP runtime.
Address OpenMPLocalAddr =
getLangOpts().OpenMP
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
: Address::invalid();
if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
DeclPtr = OpenMPLocalAddr;
AllocaPtr = DeclPtr;
};
Address DeclPtr = Address::invalid();
Address AllocaPtr = Address::invalid();
Address OpenMPLocalAddr =
getLangOpts().OpenMP
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
: Address::invalid();
bool DoStore = false;
bool IsScalar = hasScalarEvaluationKind(Ty);
bool UseIndirectDebugAddress = false;
if (OpenMPLocalAddr.isValid()) {
DeclPtr = OpenMPLocalAddr;
AllocaPtr = DeclPtr;
LValue Dst = MakeAddrLValue(DeclPtr, Ty);
if (Arg.isIndirect()) {
LValue Src = MakeAddrLValue(Arg.getIndirectAddress(), Ty);
callCStructCopyConstructor(Dst, Src);
PushCleanupIfNeeded(Arg.getIndirectAddress());
} else {
// Otherwise, create a temporary to hold the value.
EmitStoreOfScalar(Arg.getDirectValue(), Dst, /* isInitialization */ true);
}
} else {
// If we already have a pointer to the argument, reuse the input pointer.
if (Arg.isIndirect()) {
// If we have a prettier pointer type at this point, bitcast to that.
DeclPtr = Arg.getIndirectAddress();
DeclPtr = Builder.CreateElementBitCast(DeclPtr, ConvertTypeForMem(Ty),
D.getName());
// Indirect argument is in alloca address space, which may be different
// from the default address space.
auto AllocaAS = CGM.getASTAllocaAddressSpace();
auto *V = DeclPtr.getPointer();
AllocaPtr = DeclPtr;
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
auto DestLangAS =
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
if (SrcLangAS != DestLangAS) {
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
CGM.getDataLayout().getAllocaAddrSpace());
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
auto *T = DeclPtr.getElementType()->getPointerTo(DestAS);
DeclPtr =
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
*this, V, SrcLangAS, DestLangAS, T, true),
DeclPtr.isKnownNonNull());
}
PushCleanupIfNeeded(DeclPtr);
} else {
// Create a temporary to hold the value.
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
D.getName() + ".addr", &AllocaPtr);
DoStore = true;
}
DoStore = true;
}
llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
LValue lv = MakeAddrLValue(DeclPtr, Ty);
if (IsScalar) {
Qualifiers qs = Ty.getQualifiers();
if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
// We honor __attribute__((ns_consumed)) for types with lifetime.
// For __strong, it's handled by just skipping the initial retain;
// otherwise we have to balance out the initial +1 with an extra
// cleanup to do the release at the end of the function.
bool isConsumed = D.hasAttr<NSConsumedAttr>();
LValue lv = MakeAddrLValue(DeclPtr, Ty);
if (IsScalar) {
Qualifiers qs = Ty.getQualifiers();
if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
// We honor __attribute__((ns_consumed)) for types with lifetime.
// For __strong, it's handled by just skipping the initial retain;
// otherwise we have to balance out the initial +1 with an extra
// cleanup to do the release at the end of the function.
bool isConsumed = D.hasAttr<NSConsumedAttr>();
// If a parameter is pseudo-strong then we can omit the implicit retain.
if (D.isARCPseudoStrong()) {
assert(lt == Qualifiers::OCL_Strong &&
"pseudo-strong variable isn't strong?");
assert(qs.hasConst() && "pseudo-strong variable should be const!");
lt = Qualifiers::OCL_ExplicitNone;
}
// If a parameter is pseudo-strong then we can omit the implicit retain.
if (D.isARCPseudoStrong()) {
assert(lt == Qualifiers::OCL_Strong &&
"pseudo-strong variable isn't strong?");
assert(qs.hasConst() && "pseudo-strong variable should be const!");
lt = Qualifiers::OCL_ExplicitNone;
}
// Load objects passed indirectly.
if (Arg.isIndirect() && !ArgVal)
ArgVal = Builder.CreateLoad(DeclPtr);
// Load objects passed indirectly.
if (Arg.isIndirect() && !ArgVal)
ArgVal = Builder.CreateLoad(DeclPtr);
if (lt == Qualifiers::OCL_Strong) {
if (!isConsumed) {
if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
// use objc_storeStrong(&dest, value) for retaining the
// object. But first, store a null into 'dest' because
// objc_storeStrong attempts to release its old value.
llvm::Value *Null = CGM.EmitNullConstant(D.getType());
EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
DoStore = false;
if (lt == Qualifiers::OCL_Strong) {
if (!isConsumed) {
if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
// use objc_storeStrong(&dest, value) for retaining the
// object. But first, store a null into 'dest' because
// objc_storeStrong attempts to release its old value.
llvm::Value *Null = CGM.EmitNullConstant(D.getType());
EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
DoStore = false;
} else
// Don't use objc_retainBlock for block pointers, because we
// don't want to Block_copy something just because we got it
// as a parameter.
ArgVal = EmitARCRetainNonBlock(ArgVal);
}
} else {
// Push the cleanup for a consumed parameter.
if (isConsumed) {
ARCPreciseLifetime_t precise =
(D.hasAttr<ObjCPreciseLifetimeAttr>() ? ARCPreciseLifetime
: ARCImpreciseLifetime);
EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(),
ArgVal, precise);
}
if (lt == Qualifiers::OCL_Weak) {
EmitARCInitWeak(DeclPtr, ArgVal);
DoStore = false; // The weak init is a store, no need to do two.
}
else
// Don't use objc_retainBlock for block pointers, because we
// don't want to Block_copy something just because we got it
// as a parameter.
ArgVal = EmitARCRetainNonBlock(ArgVal);
}
} else {
// Push the cleanup for a consumed parameter.
if (isConsumed) {
ARCPreciseLifetime_t precise = (D.hasAttr<ObjCPreciseLifetimeAttr>()
? ARCPreciseLifetime : ARCImpreciseLifetime);
EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(), ArgVal,
precise);
}
if (lt == Qualifiers::OCL_Weak) {
EmitARCInitWeak(DeclPtr, ArgVal);
DoStore = false; // The weak init is a store, no need to do two.
}
// Enter the cleanup scope.
EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
}
// Enter the cleanup scope.
EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
}
}
// Store the initial value into the alloca.
if (DoStore)
EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
// Store the initial value into the alloca.
if (DoStore)
EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
}
setAddrOfLocalVar(&D, DeclPtr);

View File

@@ -1059,10 +1059,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for the variable to be globalized
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
? OMPRTL_malloc
: OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
// FIXME: We should use the variables actual alignment as an argument.
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment,
@@ -1099,10 +1101,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for this VLA object to be globalized.
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
? OMPRTL_malloc
: OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
VoidPtr->addRetAttr(
llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
CGM.getContext().getTargetInfo().getNewAlign()));
@@ -1130,20 +1134,29 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
// Deallocate the memory for each globalized VLA object
for (const auto &AddrSizePair :
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{AddrSizePair.first});
else
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
}
// Deallocate the memory for each globalized value
for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
const auto *VD = cast<VarDecl>(Rec.first);
I->getSecond().MappedParams->restore(CGF);
llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
CGF.getTypeSize(VD->getType())};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
FreeArgs);
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{Rec.second.GlobalizedVal});
else
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
OMPRTL___kmpc_free_shared),
{Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())});
}
}
}

View File

@@ -6209,6 +6209,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism");
if (Args.hasArg(options::OPT_fopenmp_offload_mandatory))
CmdArgs.push_back("-fopenmp-offload-mandatory");
if (Args.hasArg(options::OPT_fopenmp_globalize_to_global_space))
CmdArgs.push_back("-fopenmp-globalize-to-global-space");
break;
default:
// By default, if Clang doesn't know how to generate useful OpenMP code

View File

@@ -0,0 +1,33 @@
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
// expected-no-diagnostics
extern int printf(const char *, ...);
struct S {
int a;
float b;
};
// CHECK: define{{.*}}void @test(%struct.S* noundef byval(%struct.S) align {{[0-9]+}} [[arg:%[0-9a-zA-Z]+]])
// CHECK: [[g:%[0-9a-zA-Z]+]] = call align {{[0-9]+}} i8* @__kmpc_alloc_shared
// CHECK: bitcast i8* [[g]] to %struct.S*
// CHECK: bitcast %struct.S* [[arg]] to i8**
// CHECK: call void [[cc:@__copy_constructor[_0-9a-zA-Z]+]]
// CHECK: void [[cc]]
void test(struct S s) {
#pragma omp parallel for
for (int i = 0; i < s.a; ++i) {
printf("%i : %i : %f\n", i, s.a, s.b);
}
}
void foo() {
#pragma omp target teams num_teams(1)
{
struct S s;
s.a = 7;
s.b = 11;
test(s);
}
}

View File

@@ -141,6 +141,13 @@ static constexpr OptTable::Info InfoTable[] = {
#undef OPTION
};
/// Host RPC module that will be shared with the corresponding pass.
Module *HostModule = nullptr;
/// We only need to generate the host RPC module once.
bool IsHostModuleGenerated = false;
/// Host RPC object file.
StringRef HostRPCObjFile;
class WrapperOptTable : public opt::GenericOptTable {
public:
WrapperOptTable() : opt::GenericOptTable(InfoTable) {}
@@ -498,10 +505,12 @@ std::vector<std::string> getTargetFeatures(ArrayRef<OffloadFile> InputFiles) {
return UnifiedFeatures;
}
template <typename ModuleHook = function_ref<bool(size_t, const Module &)>>
template <typename PreHookTy = function_ref<bool(size_t, const Module &)>,
typename PostHookTy = function_ref<bool(size_t, const Module &)>>
std::unique_ptr<lto::LTO> createLTO(
const ArgList &Args, const std::vector<std::string> &Features,
ModuleHook Hook = [](size_t, const Module &) { return true; }) {
PreHookTy PreHook = [](size_t, const Module &) { return true; },
PostHookTy PostHook = [](size_t, const Module &) { return true; }) {
const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
StringRef Arch = Args.getLastArgValue(OPT_arch_EQ);
lto::Config Conf;
@@ -555,9 +564,10 @@ std::unique_ptr<lto::LTO> createLTO(
return true;
};
}
Conf.PostOptModuleHook = Hook;
Conf.CGFileType =
(Triple.isNVPTX() || SaveTemps) ? CGFT_AssemblyFile : CGFT_ObjectFile;
Conf.PreOptModuleHook = PreHook;
Conf.PostOptModuleHook = PostHook;
Conf.CGFileType = Triple.isNVPTX() ? CGFT_AssemblyFile : CGFT_ObjectFile;
// TODO: Handle remark files
Conf.HasWholeProgramVisibility = Args.hasArg(OPT_whole_program);
@@ -573,6 +583,58 @@ bool isValidCIdentifier(StringRef S) {
[](char C) { return C == '_' || isAlnum(C); });
}
bool writeHostModule(std::string &FileName) {
if (!HostModule)
return false;
if (HostModule->getFunctionList().empty())
return false;
auto HostTriple = HostModule->getTargetTriple();
FileName =
sys::path::filename(ExecutableName).str() + "-host-rpc-" + HostTriple;
auto TempFileOrErr = createOutputFile(FileName, "bc");
if (!TempFileOrErr)
reportError(TempFileOrErr.takeError());
int FD = -1;
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
reportError(errorCodeToError(EC));
auto Out = std::make_unique<llvm::raw_fd_ostream>(FD, true);
WriteBitcodeToFile(*HostModule, *Out);
return true;
}
std::unique_ptr<lto::LTO> createHostRPCLTO(StringRef HostTriple) {
const llvm::Triple Triple(HostTriple);
lto::Config Conf;
lto::ThinBackend Backend;
Backend =
lto::createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
// TODO: host arch?
// Conf.CPU = Arch.str();
Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple);
// TODO: host features?
// Conf.MAttrs = Features;
Conf.CGOptLevel = *CodeGenOpt::getLevel(3);
Conf.OptLevel = 3;
Conf.UseDefaultPipeline = true;
Conf.DefaultTriple = Triple.getTriple();
LTOError = false;
Conf.DiagHandler = diagnosticHandler;
Conf.PTO.LoopVectorization = Conf.OptLevel > 1;
Conf.PTO.SLPVectorization = Conf.OptLevel > 1;
Conf.CGFileType = CGFT_ObjectFile;
Conf.HasWholeProgramVisibility = false;
return std::make_unique<lto::LTO>(std::move(Conf), Backend);
}
Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
SmallVectorImpl<StringRef> &OutputFiles,
const ArgList &Args) {
@@ -645,12 +707,51 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
BitcodeOutput.push_back(*TempFileOrErr);
return false;
};
auto AddHostModuleAddr = [&](size_t, const Module &M) {
if (!HostModule)
return true;
Module &CM = const_cast<Module &>(M);
auto *MD = CM.getOrInsertNamedMetadata("llvm.hostrpc.hostmodule");
MD->clearOperands();
MD->addOperand(MDTuple::get(
CM.getContext(), {ConstantAsMetadata::get(ConstantInt::get(
Type::getInt64Ty(CM.getContext()),
reinterpret_cast<uintptr_t>(HostModule)))}));
return true;
};
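// Note: this smuggles the in-process host Module* to the HostRPC pass by
// storing the pointer value as an i64 constant in named metadata; it only
// works because the pass runs in the same address space as the linker wrapper.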
// We assume visibility of the whole program if every input file was bitcode.
auto Features = getTargetFeatures(BitcodeInputFiles);
auto LTOBackend = Args.hasArg(OPT_embed_bitcode)
? createLTO(Args, Features, OutputBitcode)
: createLTO(Args, Features);
auto LTOBackend =
Args.hasArg(OPT_embed_bitcode)
? createLTO(Args, Features, AddHostModuleAddr, OutputBitcode)
: createLTO(Args, Features, AddHostModuleAddr);
LLVMContext &Ctx = LTOBackend->getContext();
StringRef HostTriple =
Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple());
std::unique_ptr<Module> HostModulePtr;
if (!IsHostModuleGenerated) {
HostModulePtr = std::make_unique<Module>(
sys::path::filename(ExecutableName).str() + "-host-rpc.bc", Ctx);
HostModule = HostModulePtr.get();
HostModulePtr->setTargetTriple(HostTriple);
std::string Msg;
const Target *T =
TargetRegistry::lookupTarget(HostModule->getTargetTriple(), Msg);
if (!T)
return createStringError(inconvertibleErrorCode(), Msg);
auto Options =
codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple(HostTriple));
StringRef CPU = "";
StringRef Features = "";
std::unique_ptr<TargetMachine> TM(
T->createTargetMachine(HostTriple, CPU, Features, Options, Reloc::PIC_,
HostModule->getCodeModel()));
HostModule->setDataLayout(TM->createDataLayout());
}
// We need to resolve the symbols so the LTO backend knows which symbols need
// to be kept or can be internalized. This is a simplified symbol resolution
@@ -743,6 +844,57 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
if (Error Err = LTOBackend->run(AddStream))
return Err;
std::string HostModuleTempFile;
bool ValidHostModule = writeHostModule(HostModuleTempFile);
// Reset the HostModule pointer.
HostModulePtr.reset();
HostModule = nullptr;
// TODO: this is really redundant code.
if (ValidHostModule) {
auto HostLTO = createHostRPCLTO(HostTriple);
std::string HostBitCodeFile = HostModuleTempFile + ".bc";
auto BufferOrError = MemoryBuffer::getFile(HostBitCodeFile);
if (!BufferOrError)
reportError(createFileError(HostBitCodeFile, BufferOrError.getError()));
Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr =
llvm::lto::InputFile::create(*BufferOrError.get());
if (!BitcodeFileOrErr)
return BitcodeFileOrErr.takeError();
const auto Symbols = (*BitcodeFileOrErr)->symbols();
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
size_t Idx = 0;
for (auto &Sym : Symbols) {
(void)Sym;
lto::SymbolResolution &Res = Resolutions[Idx++];
Res.ExportDynamic = true;
Res.VisibleToRegularObj = true;
Res.LinkerRedefined = false;
Res.Prevailing = true;
}
if (Error Err = HostLTO->add(std::move(*BitcodeFileOrErr), Resolutions))
return Err;
auto RPCAddStream =
[&](size_t Task,
const Twine &ModuleName) -> std::unique_ptr<CachedFileStream> {
int FD = -1;
auto TempFileOrErr = createOutputFile(
sys::path::filename(ExecutableName) + "-host-rpc-" + HostTriple, "o");
if (!TempFileOrErr)
reportError(TempFileOrErr.takeError());
HostRPCObjFile = *TempFileOrErr;
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
reportError(errorCodeToError(EC));
return std::make_unique<CachedFileStream>(
std::make_unique<llvm::raw_fd_ostream>(FD, true));
};
if (Error Err = HostLTO->run(RPCAddStream))
return Err;
}
if (LTOError)
return createStringError(inconvertibleErrorCode(),
"Errors encountered inside the LTO pipeline.");
@@ -1105,6 +1257,9 @@ linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
WrappedOutput.push_back(*OutputOrErr);
}
if (!HostRPCObjFile.empty())
WrappedOutput.push_back(HostRPCObjFile);
return WrappedOutput;
}

View File

@@ -25,6 +25,19 @@ enum OMPTgtExecModeFlags : unsigned char {
OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
};
enum OMPTgtHostRPCArgType {
// No need to copy.
OMP_HOST_RPC_ARG_SCALAR = 0,
OMP_HOST_RPC_ARG_PTR = 1,
// Copy to the host.
OMP_HOST_RPC_ARG_COPY_TO = OMP_HOST_RPC_ARG_PTR | (1 << 1),
// Copy back from the host.
OMP_HOST_RPC_ARG_COPY_FROM = OMP_HOST_RPC_ARG_PTR | (1 << 2),
// Copy both to and from the host.
OMP_HOST_RPC_ARG_COPY_TOFROM =
OMP_HOST_RPC_ARG_COPY_TO | OMP_HOST_RPC_ARG_COPY_FROM,
};
} // end namespace omp
} // end namespace llvm
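
A small, hedged illustration of how these copy-direction bits compose and how a runtime consumer (not part of this hunk) could test them:

#include <cassert>

// Mirrors the OMPTgtHostRPCArgType values defined above.
enum HostRPCArgType {
  ArgScalar = 0,
  ArgPtr = 1,
  ArgCopyTo = ArgPtr | (1 << 1),
  ArgCopyFrom = ArgPtr | (1 << 2),
  ArgCopyToFrom = ArgCopyTo | ArgCopyFrom, // == 0b111
};

int main() {
  int T = ArgCopyToFrom;
  assert(T & ArgPtr);    // every copy kind implies a pointer argument
  assert(T & (1 << 1));  // needs a copy to the host before the call
  assert(T & (1 << 2));  // needs a copy back after the call returns
  return 0;
}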

View File

@@ -222,7 +222,9 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
__OMP_RTL(omp_get_thread_num, false, Int32, )
__OMP_RTL(omp_get_bulk_thread_num, false, Int32, )
__OMP_RTL(omp_get_num_threads, false, Int32, )
__OMP_RTL(omp_get_bulk_num_threads, false, Int32, )
__OMP_RTL(omp_get_max_threads, false, Int32, )
__OMP_RTL(omp_in_parallel, false, Int32, )
__OMP_RTL(omp_get_dynamic, false, Int32, )
@@ -471,6 +473,8 @@ __OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr, Int32,
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
__OMP_RTL(malloc, false, VoidPtr, SizeTy)
__OMP_RTL(free, false, Void, VoidPtr)
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
@@ -484,6 +488,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
__OMP_RTL(__kmpc_launch_parallel_51_kernel, false, Void, Int8Ptr, Int32, Int32,
Int32, VoidPtrPtr, Int64)
__OMP_RTL(__last, false, Void, )
#undef __OMP_RTL
@@ -668,6 +675,8 @@ __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_bulk_thread_num, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_bulk_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_dynamic, GetterAttrs, SExt, ParamAttrs())

View File

@@ -288,6 +288,9 @@ public:
/// by LTO but might not be visible from bitcode symbol table.
static ArrayRef<const char*> getRuntimeLibcallSymbols();
/// Returns the context.
LLVMContext &getContext() { return RegularLTO.Ctx; }
private:
Config Conf;

View File

@@ -0,0 +1,31 @@
//===- Transforms/IPO/HostRPC.h - Automatic host RPC ------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_IPO_HOSTRPC_H
#define LLVM_TRANSFORMS_IPO_HOSTRPC_H
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/IR/PassManager.h"
namespace llvm {
class HostRPCPass : public PassInfoMixin<HostRPCPass> {
public:
HostRPCPass() : LTOPhase(ThinOrFullLTOPhase::None) {}
HostRPCPass(ThinOrFullLTOPhase LTOPhase) : LTOPhase(LTOPhase) {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
private:
const ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None;
};
} // namespace llvm
#endif // LLVM_TRANSFORMS_IPO_HOSTRPC_H

View File

@@ -0,0 +1,30 @@
//===--------------- CanonicalizeMainFunction.h -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility function to canonicalize main function.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
#define LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
#include "llvm/IR/PassManager.h"
namespace llvm {
/// A pass that canonicalizes the main function in a module.
class CanonicalizeMainFunctionPass
: public PassInfoMixin<CanonicalizeMainFunctionPass> {
public:
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
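
The pass body is not included in this commit, so purely as a hypothetical illustration of the header comment above: canonicalization would take a module entry point such as

int main(void) { return 0; }

and rewrite it to the canonical signature, leaving the new parameters unused:

int main(int argc, char *argv[]) { return 0; }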

View File

@@ -108,6 +108,7 @@
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include "llvm/Transforms/IPO/IROutliner.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
@@ -228,6 +229,7 @@
#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/Debugify.h"
#include "llvm/Transforms/Utils/EntryExitInstrumenter.h"

View File

@@ -51,6 +51,7 @@
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include "llvm/Transforms/IPO/IROutliner.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
@@ -120,6 +121,7 @@
#include "llvm/Transforms/Utils/AddDiscriminators.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
@@ -176,9 +178,9 @@ static cl::opt<bool> EnablePostPGOLoopRotation(
"enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden,
cl::desc("Run the loop rotation transformation after PGO instrumentation"));
static cl::opt<bool> EnableGlobalAnalyses(
"enable-global-analyses", cl::init(true), cl::Hidden,
cl::desc("Enable inter-procedural analyses"));
static cl::opt<bool>
EnableGlobalAnalyses("enable-global-analyses", cl::init(true), cl::Hidden,
cl::desc("Enable inter-procedural analyses"));
static cl::opt<bool>
RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
@@ -273,6 +275,13 @@ cl::opt<bool> EnableMemProfContextDisambiguation(
"enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
static cl::opt<bool> EnableHostRPC("enable-host-rpc", cl::init(false),
cl::Hidden, cl::desc("Enable HostRPC pass"));
static cl::opt<bool> EnableCanonicalizeMainFunction(
"enable-canonicalize-main-function", cl::init(false), cl::Hidden,
cl::desc("Enable CanonicalizeMainFunction pass"));
PipelineTuningOptions::PipelineTuningOptions() {
LoopInterleaving = true;
LoopVectorization = true;
@@ -1042,6 +1051,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
}
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
// Try to perform OpenMP specific optimizations on the module. This is a
// (quick!) no-op if there are no OpenMP runtime calls present in the module.
MPM.addPass(OpenMPOptPass());
@@ -1061,11 +1073,10 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
// and prior to optimizing globals.
// FIXME: This position in the pipeline hasn't been carefully considered in
// years, it should be re-analyzed.
MPM.addPass(IPSCCPPass(
IPSCCPOptions(/*AllowFuncSpec=*/
Level != OptimizationLevel::Os &&
Level != OptimizationLevel::Oz &&
!isLTOPreLink(Phase))));
MPM.addPass(IPSCCPPass(IPSCCPOptions(/*AllowFuncSpec=*/
Level != OptimizationLevel::Os &&
Level != OptimizationLevel::Oz &&
!isLTOPreLink(Phase))));
// Attach metadata to indirect call sites indicating the set of functions
// they may target at run-time. This should follow IPSCCP.
@@ -1471,7 +1482,7 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
ModulePassManager
PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
if (Level == OptimizationLevel::O0)
return buildO0DefaultPipeline(Level, /*LTOPreLink*/true);
return buildO0DefaultPipeline(Level, /*LTOPreLink*/ true);
ModulePassManager MPM;
@@ -1594,6 +1605,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// in the current module.
MPM.addPass(CrossDSOCFIPass());
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
if (Level == OptimizationLevel::O0) {
// The WPD and LowerTypeTest passes need to run at -O0 to lower type
// metadata and intrinsics.
@@ -1624,6 +1638,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));
if (EnableHostRPC)
MPM.addPass(HostRPCPass(ThinOrFullLTOPhase::FullLTOPostLink));
// Remove unused virtual tables to improve the quality of code generated by
// whole-program devirtualization and bitset lowering.
MPM.addPass(GlobalDCEPass());
@@ -1996,6 +2013,9 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
return MPM;
}
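
With the cl::opt flags above and the pass-name registrations in the PassRegistry.def hunk that follows, the new passes can be exercised from opt. Illustrative invocations (assumed, not taken from the patch):

opt -passes=host-rpc device-linked.bc -S -o device-rpc.ll
opt -passes=canonicalize-main-function input.bc -o canon.bc
opt -passes='lto<O3>' -enable-host-rpc linked.bc -o out.bc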

View File

@@ -47,6 +47,7 @@ MODULE_PASS("openmp-opt", OpenMPOptPass())
MODULE_PASS("openmp-opt-postlink", OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink))
MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass())
MODULE_PASS("canonicalize-main-function", CanonicalizeMainFunctionPass())
MODULE_PASS("cg-profile", CGProfilePass())
MODULE_PASS("check-debugify", NewPMCheckDebugifyPass())
MODULE_PASS("constmerge", ConstantMergePass())
@@ -128,6 +129,7 @@ MODULE_PASS("sanmd-module", SanitizerBinaryMetadataPass())
MODULE_PASS("memprof-module", ModuleMemProfilerPass())
MODULE_PASS("poison-checking", PoisonCheckingPass())
MODULE_PASS("pseudo-probe-update", PseudoProbeUpdatePass())
MODULE_PASS("host-rpc", HostRPCPass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS

View File

@@ -383,12 +383,25 @@ static bool getPotentialCopiesOfMemoryValue(
dbgs() << "Underlying object is a valid nullptr, giving up.\n";);
return false;
}
// TODO: Use assumed noalias return.
if (!isa<AllocaInst>(&Obj) && !isa<GlobalVariable>(&Obj) &&
!(IsLoad ? isAllocationFn(&Obj, TLI) : isNoAliasCall(&Obj))) {
LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << Obj
<< "\n";);
return false;
if (!isa<AllocaInst>(&Obj) && !isa<GlobalVariable>(&Obj)) {
auto *CB = dyn_cast<CallBase>(&Obj);
bool R = false;
if (CB) {
auto &AAN =
A.getOrCreateAAFor<AANoAlias>(IRPosition::callsite_returned(*CB),
&QueryingAA, DepClassTy::OPTIONAL);
if (AAN.isValidState())
R = AAN.isAssumedNoAlias();
else
R = isNoAliasCall(&Obj);
if (!R)
R = isAllocationFn(&Obj, TLI);
}
if (!R) {
LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << Obj
<< "\n";);
return false;
}
}
if (auto *GV = dyn_cast<GlobalVariable>(&Obj))
if (!GV->hasLocalLinkage() &&
@@ -2081,7 +2094,8 @@ void Attributor::runTillFixpoint() {
LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: "
<< IterationCounter << "/" << MaxIterations
<< " iterations\n");
<< " iterations. #Changed AAs: " << ChangedAAs.size()
<< "\n");
// Reset abstract arguments not settled in a sound fixpoint by now. This
// happens when we stopped the fixpoint iteration early. Note that only the
@@ -2096,13 +2110,21 @@ void Attributor::runTillFixpoint() {
AbstractState &State = ChangedAA->getState();
if (!State.isAtFixpoint()) {
LLVM_DEBUG(
dbgs() << "\n[Attributor] Invalidate AA " << *ChangedAA
<< " as it is not at fix point after max iterations.\n");
State.indicatePessimisticFixpoint();
NumAttributesTimedOut++;
}
for (auto &DepIt : ChangedAA->Deps)
for (auto &DepIt : ChangedAA->Deps) {
LLVM_DEBUG(dbgs() << "\n[Attributor] Invalidate dependent AA "
<< *(DepIt.getPointer()) << " of AA " << *ChangedAA
<< "\n");
ChangedAAs.push_back(cast<AbstractAttribute>(DepIt.getPointer()));
}
ChangedAA->Deps.clear();
}

View File

@@ -1592,6 +1592,8 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
// might change while we iterate through a loop. For now, we give up if
// the PHI is not invariant.
if (isa<PHINode>(Usr)) {
if (!Usr->getType()->isPointerTy())
return false;
// Note the order here, the Usr access might change the map, CurPtr is
// already in it though.
bool IsFirstPHIUser = !OffsetInfoMap.count(Usr);
@@ -1814,6 +1816,31 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
return false;
}
if (auto *II = dyn_cast<ICmpInst>(Usr)) {
auto CheckIfUsedAsPred = [&](const Use &U, bool &Follow) {
const auto *UU = U.getUser();
if (isa<SelectInst>(UU))
return true;
if (isa<BranchInst>(UU))
return true;
LLVM_DEBUG(dbgs() << "[AAPointerInfo] ICmpInst user not handled " << *UU
<< "\n");
return false;
};
if (!A.checkForAllUses(CheckIfUsedAsPred, *this, *II,
/* CheckBBLivenessOnly */ true,
DepClassTy::OPTIONAL,
/* IgnoreDroppableUses */ true)) {
LLVM_DEBUG(
dbgs() << "[AAPointerInfo] Check for all uses failed for ICmpInst "
<< *II << "\n");
return false;
}
return true;
}
LLVM_DEBUG(dbgs() << "[AAPointerInfo] User not handled " << *Usr << "\n");
return false;
};
@@ -1932,7 +1959,6 @@ struct AAPointerInfoCallSiteArgument final : AAPointerInfoFloating {
const auto &NoCaptureAA =
A.getAAFor<AANoCapture>(*this, getIRPosition(), DepClassTy::OPTIONAL);
if (!NoCaptureAA.isAssumedNoCapture())
return indicatePessimisticFixpoint();
@@ -11827,10 +11853,35 @@ struct AAUnderlyingObjectsImpl
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override {}
// TODO: This is a temporary solution for terminals.
bool checkIfTerminals(Attributor &A, Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
Function *Scope = CI->getFunction();
const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*Scope);
LibFunc TLIFn;
if (TLI && TLI->getLibFunc(*CI, TLIFn)) {
if (TLIFn == LibFunc::LibFunc_malloc ||
TLIFn == LibFunc::LibFunc___kmpc_alloc_shared)
return true;
}
return false;
}
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
auto &Ptr = getAssociatedValue();
if (checkIfTerminals(A, &Ptr)) {
bool Changed = false;
Changed |= InterAssumedUnderlyingObjects.insert(&Ptr);
Changed |= IntraAssumedUnderlyingObjects.insert(&Ptr);
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
}
auto DoUpdate = [&](SmallSetVector<Value *, 8> &UnderlyingObjects,
AA::ValueScope Scope) {
bool UsedAssumedInformation = false;
@@ -11846,6 +11897,10 @@ struct AAUnderlyingObjectsImpl
for (unsigned I = 0; I < Values.size(); ++I) {
auto &VAC = Values[I];
auto *Obj = VAC.getValue();
if (checkIfTerminals(A, Obj)) {
Changed |= UnderlyingObjects.insert(Obj);
continue;
}
Value *UO = getUnderlyingObject(Obj);
if (UO && UO != VAC.getValue() && SeenObjects.insert(UO).second) {
const auto &OtherAA = A.getAAFor<AAUnderlyingObjects>(
@@ -11876,6 +11931,16 @@ struct AAUnderlyingObjectsImpl
continue;
}
if (auto *LI = dyn_cast<LoadInst>(Obj)) {
LLVM_DEBUG({
dbgs() << "[AAUnderlyingObjects] for CtxI ";
getCtxI()->print(dbgs());
dbgs() << " at position " << getIRPosition() << " has LoadInst ";
LI->print(dbgs());
dbgs() << '\n';
});
}
Changed |= UnderlyingObjects.insert(Obj);
}
@@ -11886,6 +11951,8 @@ struct AAUnderlyingObjectsImpl
Changed |= DoUpdate(IntraAssumedUnderlyingObjects, AA::Intraprocedural);
Changed |= DoUpdate(InterAssumedUnderlyingObjects, AA::Interprocedural);
LLVM_DEBUG(dumpState(dbgs()));
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
}
@@ -11924,6 +11991,19 @@ private:
return Changed;
}
void dumpState(raw_ostream &O) {
O << "Underlying objects:\nintra procedureal:\n";
for (auto *Obj : IntraAssumedUnderlyingObjects) {
Obj->print(O);
O << '\n';
}
O << "inter procedureal:\n";
for (auto *Obj : InterAssumedUnderlyingObjects) {
Obj->print(O);
O << '\n';
}
}
/// All the underlying objects collected so far via the intraprocedural scope.
SmallSetVector<Value *, 8> IntraAssumedUnderlyingObjects;
/// All the underlying objects collected so far via the interprocedural scope.

View File

@@ -19,6 +19,7 @@ add_llvm_component_library(LLVMipo
GlobalDCE.cpp
GlobalOpt.cpp
GlobalSplit.cpp
HostRPC.cpp
HotColdSplitting.cpp
IPO.cpp
IROutliner.cpp

View File

@@ -0,0 +1,887 @@
//===- Transforms/IPO/HostRPC.cpp - Automatic host RPC ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements automatic host RPC: calls to functions that are not
// available on the device are rewritten to be forwarded to the host.
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include <cstdint>
#define DEBUG_TYPE "host-rpc"
using namespace llvm;
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
static cl::opt<bool>
UseDummyHostModule("host-rpc-use-dummy-host-module", cl::init(false),
cl::Hidden,
cl::desc("Use dummy host module if there no host module "
"attached to the device module"));
namespace {
enum class HostRPCRuntimeFunction {
#define __OMPRTL_HOST_RPC(_ENUM) OMPRTL_##_ENUM
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper),
__OMPRTL_HOST_RPC(__last),
#undef __OMPRTL_HOST_RPC
};
#define __OMPRTL_HOST_RPC(_ENUM) \
auto OMPRTL_##_ENUM = HostRPCRuntimeFunction::OMPRTL_##_ENUM;
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper)
#undef __OMPRTL_HOST_RPC
// TODO: Remove those functions that are implemented in the device runtime.
static constexpr const char *InternalPrefix[] = {
"__kmp", "llvm.", "nvm.",
"omp_", "vprintf", "malloc",
"free", "__keep_alive", "__llvm_omp_vprintf"};
bool isInternalFunction(Function &F) {
auto Name = F.getName();
for (auto *P : InternalPrefix)
if (Name.startswith(P))
return true;
return false;
}
std::string typeToString(Type *T) {
if (T->is16bitFPTy())
return "f16";
if (T->isFloatTy())
return "f32";
if (T->isDoubleTy())
return "f64";
if (T->isPointerTy())
return "ptr";
if (T->isStructTy())
return std::string(T->getStructName());
if (T->isIntegerTy())
return "i" + std::to_string(T->getIntegerBitWidth());
LLVM_DEBUG(dbgs() << "[HostRPC] unknown type " << *T
<< " for typeToString.\n";);
llvm_unreachable("unknown type");
}
class HostRPC {
/// LLVM context instance
LLVMContext &Context;
/// Device module.
Module &M;
/// Host module
Module &HM;
/// Data layout of the device module.
DataLayout DL;
IRBuilder<> Builder;
/// External functions we are operating on.
SmallSetVector<Function *, 8> FunctionWorkList;
/// Attributor instance.
Attributor &A;
// Types
Type *Int8PtrTy;
Type *VoidTy;
Type *Int32Ty;
Type *Int64Ty;
StructType *ArgInfoTy;
// Values
Constant *NullPtr;
Constant *NullInt64;
struct CallSiteInfo {
CallInst *CI = nullptr;
SmallVector<Type *> Params;
};
struct HostRPCArgInfo {
Value *BasePtr = nullptr;
Constant *Type = nullptr;
Value *Size = nullptr;
};
/// Table of host wrapper functions, indexed by the wrapper (call) number.
SmallVector<Function *> HostEntryTable;
EnumeratedArray<Function *, HostRPCRuntimeFunction,
HostRPCRuntimeFunction::OMPRTL___last>
RFIs;
SmallVector<std::pair<CallInst *, CallInst *>> CallInstMap;
Constant *getConstantInt64(uint64_t Val) {
return ConstantInt::get(Int64Ty, Val);
}
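// For a variadic callee the wrapper name also encodes the types of the
// trailing arguments. As an illustration (hypothetical call site, not taken
// from this patch), fprintf(fp, "%d", 42) yields
// "__kmpc_host_rpc_wrapper_fprintf_i32": only the arguments past the fixed
// parameters are appended via typeToString.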
static std::string getWrapperFunctionName(Function *F, CallSiteInfo &CSI) {
std::string Name = "__kmpc_host_rpc_wrapper_" + std::string(F->getName());
if (!F->isVarArg())
return Name;
for (unsigned I = F->getFunctionType()->getNumParams();
I < CSI.Params.size(); ++I) {
Name.push_back('_');
Name.append(typeToString(CSI.Params[I]));
}
return Name;
}
void registerAAs();
Value *convertToInt64Ty(Value *V);
Value *convertFromInt64TyTo(Value *V, Type *TargetTy);
Constant *convertToInt64Ty(Constant *C);
Constant *convertFromInt64TyTo(Constant *C, Type *T);
// int device_wrapper(call_no, arg_info, ...) {
// void *desc = __kmpc_host_rpc_get_desc(call_no, num_args, arg_info);
// __kmpc_host_rpc_add_arg(desc, arg1, sizeof(arg1));
// __kmpc_host_rpc_add_arg(desc, arg2, sizeof(arg2));
// ...
// int r = (int)__kmpc_host_rpc_send_and_wait(desc);
// return r;
// }
Function *getDeviceWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
// void host_wrapper(desc) {
// int arg1 = (int)__kmpc_host_rpc_get_arg(desc, 0);
// float arg2 = (float)__kmpc_host_rpc_get_arg(desc, 1);
// char *arg3 = (char *)__kmpc_host_rpc_get_arg(desc, 2);
// ...
// int r = actual_call(arg1, arg2, arg3, ...);
//   __kmpc_host_rpc_set_ret_val(desc, (int64_t)r);
// }
Function *getHostWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
bool rewriteWithHostRPC(Function *F);
void emitHostWrapperInvoker();
bool recollectInformation();
public:
HostRPC(Module &DeviceModule, Module &HostModule, Attributor &A)
: Context(DeviceModule.getContext()), M(DeviceModule), HM(HostModule),
DL(M.getDataLayout()), Builder(Context), A(A) {
assert(&M.getContext() == &HM.getContext() &&
"device and host modules have different context");
#define __OMP_TYPE(TYPE) TYPE = Type::get##TYPE(Context)
__OMP_TYPE(Int8PtrTy);
__OMP_TYPE(VoidTy);
__OMP_TYPE(Int32Ty);
__OMP_TYPE(Int64Ty);
#undef __OMP_TYPE
NullPtr = Constant::getNullValue(Int8PtrTy);
NullInt64 = Constant::getNullValue(Int64Ty);
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...) \
{ \
SmallVector<Type *> Params{__VA_ARGS__}; \
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
Function *F = (MOD).getFunction(#_ENUM); \
if (!F) \
F = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage, \
#_ENUM, (MOD)); \
RFIs[OMPRTL_##_ENUM] = F; \
}
__OMP_RTL(__kmpc_host_rpc_get_desc, M, false, Int8PtrTy, Int32Ty, Int32Ty,
Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_add_arg, M, false, VoidTy, Int8PtrTy, Int64Ty,
Int32Ty)
__OMP_RTL(__kmpc_host_rpc_send_and_wait, M, false, Int64Ty, Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
__OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
Int64Ty)
__OMP_RTL(__kmpc_host_rpc_invoke_host_wrapper, HM, false, VoidTy, Int32Ty,
Int8PtrTy)
#undef __OMP_RTL
ArgInfoTy = StructType::create({Int64Ty, Int64Ty, Int64Ty, Int8PtrTy},
"struct.arg_info_t");
}
bool run();
};
Value *HostRPC::convertToInt64Ty(Value *V) {
if (auto *C = dyn_cast<Constant>(V))
return convertToInt64Ty(C);
Type *T = V->getType();
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreatePtrToInt(V, Int64Ty);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
if (T->isFloatingPointTy()) {
V = Builder.CreateBitCast(
V, Type::getIntNTy(V->getContext(), T->getScalarSizeInBits()));
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
}
llvm_unreachable("unknown cast to int64_t");
}
Value *HostRPC::convertFromInt64TyTo(Value *V, Type *T) {
if (auto *C = dyn_cast<Constant>(V))
return convertFromInt64TyTo(C, T);
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreateIntToPtr(V, T);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, T, /* isSigned */ true);
if (T->isFloatingPointTy()) {
V = Builder.CreateIntCast(
V, Type::getIntNTy(V->getContext(), T->getScalarSizeInBits()),
/* isSigned */ true);
return Builder.CreateBitCast(V, T);
}
llvm_unreachable("unknown cast from int64_t");
}
Constant *HostRPC::convertToInt64Ty(Constant *C) {
Type *T = C->getType();
if (T == Int64Ty)
return C;
if (T->isPointerTy())
return ConstantExpr::getPtrToInt(C, Int64Ty);
if (T->isIntegerTy())
return ConstantExpr::getIntegerCast(C, Int64Ty, /* isSigned */ true);
if (T->isFloatingPointTy()) {
C = ConstantExpr::getBitCast(
C, Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()));
return ConstantExpr::getIntegerCast(C, Int64Ty, /* isSigned */ true);
}
llvm_unreachable("unknown cast to int64_t");
}
Constant *HostRPC::convertFromInt64TyTo(Constant *C, Type *T) {
if (T == Int64Ty)
return C;
if (T->isPointerTy())
return ConstantExpr::getIntToPtr(C, T);
if (T->isIntegerTy())
return ConstantExpr::getIntegerCast(C, T, /* isSigned */ true);
if (T->isFloatingPointTy()) {
C = ConstantExpr::getIntegerCast(
C, Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()),
/* isSigned */ true);
return ConstantExpr::getBitCast(C, T);
}
llvm_unreachable("unknown cast from int64_t");
}
void HostRPC::registerAAs() {
for (auto *F : FunctionWorkList)
for (User *U : F->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
for (unsigned I = 0; I < CI->arg_size(); ++I) {
Value *Operand = CI->getArgOperand(I);
if (!Operand->getType()->isPointerTy())
continue;
A.getOrCreateAAFor<AAUnderlyingObjects>(
IRPosition::callsite_argument(*CI, I),
/* QueryingAA */ nullptr, DepClassTy::NONE);
}
}
}
bool HostRPC::recollectInformation() {
FunctionWorkList.clear();
for (Function &F : M) {
// If the function is already defined, it definitely does not require RPC.
if (!F.isDeclaration())
continue;
// If it is an internal function, skip it as well.
if (isInternalFunction(F))
continue;
// If there is no use of the function, skip it.
if (F.use_empty())
continue;
FunctionWorkList.insert(&F);
}
return !FunctionWorkList.empty();
}
bool HostRPC::run() {
bool Changed = false;
if (!recollectInformation())
return Changed;
Changed = true;
// Add a couple of assumptions to the RPC functions so that AAs do not give
// up just because the implementations of those functions are unknown.
for (Function &F : M) {
if (!F.isDeclaration())
continue;
F.addFnAttr(Attribute::NoRecurse);
for (auto &Arg : F.args())
if (Arg.getType()->isPointerTy())
Arg.addAttr(Attribute::NoCapture);
if (!F.isVarArg())
continue;
for (User *U : F.users()) {
auto *CB = dyn_cast<CallBase>(U);
if (!CB)
continue;
for (unsigned I = F.getFunctionType()->getNumParams(); I < CB->arg_size();
++I) {
Value *Arg = CB->getArgOperand(I);
if (Arg->getType()->isPointerTy())
CB->addParamAttr(I, Attribute::NoCapture);
}
}
}
LLVM_DEBUG(M.dump());
registerAAs();
ChangeStatus Status = A.run();
if (!recollectInformation())
return Status == ChangeStatus::CHANGED;
for (Function *F : FunctionWorkList)
Changed |= rewriteWithHostRPC(F);
if (!Changed)
return Changed;
for (auto Itr = CallInstMap.rbegin(); Itr != CallInstMap.rend(); ++Itr) {
auto *CI = Itr->first;
auto *NewCI = Itr->second;
CI->replaceAllUsesWith(NewCI);
CI->eraseFromParent();
}
for (Function *F : FunctionWorkList)
if (F->user_empty())
F->eraseFromParent();
emitHostWrapperInvoker();
return Changed;
}
bool HostRPC::rewriteWithHostRPC(Function *F) {
SmallVector<CallInst *> WorkList;
for (User *U : F->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
WorkList.push_back(CI);
}
if (WorkList.empty())
return false;
for (CallInst *CI : WorkList) {
CallSiteInfo CSI;
CSI.CI = CI;
unsigned NumArgs = CI->arg_size();
for (unsigned I = 0; I < NumArgs; ++I)
CSI.Params.push_back(CI->getArgOperand(I)->getType());
std::string WrapperName = getWrapperFunctionName(F, CSI);
Function *DeviceWrapperFn = getDeviceWrapperFunction(WrapperName, F, CSI);
Function *HostWrapperFn = getHostWrapperFunction(WrapperName, F, CSI);
int32_t WrapperNumber = -1;
for (unsigned I = 0; I < HostEntryTable.size(); ++I) {
if (HostEntryTable[I] == HostWrapperFn) {
WrapperNumber = I;
break;
}
}
if (WrapperNumber == -1) {
WrapperNumber = HostEntryTable.size();
HostEntryTable.push_back(HostWrapperFn);
}
auto CheckIfIdentifierPtr = [this](const Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
Function *Callee = CI->getCalledFunction();
// Indirect calls have no statically known callee.
if (!Callee)
return false;
if (this->FunctionWorkList.count(Callee))
return true;
return Callee->getName().startswith("__kmpc_host_rpc_wrapper_");
};
auto CheckIfDynAlloc = [](Value *V) -> CallInst * {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return nullptr;
Function *Callee = CI->getCalledFunction();
if (!Callee)
return nullptr;
auto Name = Callee->getName();
if (Name == "malloc" || Name == "__kmpc_alloc_shared")
return CI;
return nullptr;
};
auto CheckIfStdIO = [](Value *V) -> GlobalVariable * {
auto *LI = dyn_cast<LoadInst>(V);
if (!LI)
return nullptr;
auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
if (!GV)
return nullptr;
auto Name = GV->getName();
if (Name == "stdout" || Name == "stderr" || Name == "stdin")
return GV;
return nullptr;
};
auto CheckIfGlobalVariable = [](Value *V) {
if (auto *GV = dyn_cast<GlobalVariable>(V))
return GV;
if (auto *LI = dyn_cast<LoadInst>(V))
if (auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()))
return GV;
return static_cast<GlobalVariable *>(nullptr);
};
auto CheckIfNullPtr = [](Value *V) {
if (!V->getType()->isPointerTy())
return false;
return V == Constant::getNullValue(V->getType());
};
auto HandleDirectUse = [&](Value *Ptr, HostRPCArgInfo &AI,
bool IsPointer = false) {
AI.BasePtr = Ptr;
AI.Type = getConstantInt64(IsPointer ? ArgType::OMP_HOST_RPC_ARG_PTR
: ArgType::OMP_HOST_RPC_ARG_SCALAR);
AI.Size = NullInt64;
};
SmallVector<SmallVector<HostRPCArgInfo>> ArgInfo;
bool IsConstantArgInfo = true;
for (unsigned I = 0; I < CI->arg_size(); ++I) {
ArgInfo.emplace_back();
auto &AII = ArgInfo.back();
Value *Operand = CI->getArgOperand(I);
// Check if scalar type.
if (!Operand->getType()->isPointerTy()) {
AII.emplace_back();
HandleDirectUse(Operand, AII.back());
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
continue;
}
if (CheckIfNullPtr(Operand))
continue;
auto Pred = [&](Value &Obj) {
if (CheckIfNullPtr(&Obj))
return true;
bool IsConstantArgument = false;
if (!F->isVarArg() &&
F->hasParamAttribute(I, Attribute::AttrKind::ReadOnly))
IsConstantArgument = true;
HostRPCArgInfo AI;
if (auto *IO = CheckIfStdIO(&Obj)) {
HandleDirectUse(IO, AI, /* IsPointer */ true);
} else if (CheckIfIdentifierPtr(&Obj)) {
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
HandleDirectUse(Operand, AI, /* IsPointer */ true);
} else if (auto *GV = CheckIfGlobalVariable(&Obj)) {
AI.BasePtr = GV;
AI.Size = getConstantInt64(DL.getTypeStoreSize(GV->getValueType()));
AI.Type =
getConstantInt64(GV->isConstant() || IsConstantArgument
? ArgType::OMP_HOST_RPC_ARG_COPY_TO
: ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM);
} else if (CheckIfDynAlloc(&Obj)) {
// This case is handled at runtime, so there is nothing to do here.
return true;
} else if (isa<AllocaInst>(&Obj)) {
llvm_unreachable("alloca instruction needs to be handled!");
} else {
LLVM_DEBUG({
dbgs() << "[HostRPC] warning: call site " << *CI << ", operand "
<< *Operand << ", underlying object " << Obj
<< " cannot be handled.\n";
});
return true;
}
AII.push_back(std::move(AI));
return true;
};
auto &AAUO = A.getOrCreateAAFor<AAUnderlyingObjects>(
IRPosition::callsite_argument(*CI, I), nullptr, DepClassTy::NONE);
if (!AAUO.forallUnderlyingObjects(Pred))
llvm_unreachable("internal error");
}
// Reset the insert point to the call site.
Builder.SetInsertPoint(CI);
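// Build the argument descriptor: each call argument gets a (possibly empty)
// singly linked list of struct.arg_info_t nodes {base, type, size, next},
// and arg_info itself is an array with one list head per argument. If all
// entries are constant, the descriptor is emitted as a global instead.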
Value *ArgInfoVal = nullptr;
if (!IsConstantArgInfo) {
ArgInfoVal = Builder.CreateAlloca(Int8PtrTy, getConstantInt64(NumArgs),
"arg_info");
for (unsigned I = 0; I < NumArgs; ++I) {
auto &AII = ArgInfo[I];
Value *Next = NullPtr;
for (auto &AI : AII) {
Value *AIV = Builder.CreateAlloca(ArgInfoTy);
Value *AIIArg =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(0)});
Builder.Insert(AIIArg);
Builder.CreateStore(convertToInt64Ty(AI.BasePtr), AIIArg);
Value *AIIType =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(1)});
Builder.Insert(AIIType);
Builder.CreateStore(AI.Type, AIIType);
Value *AIISize =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(2)});
Builder.Insert(AIISize);
Builder.CreateStore(AI.Size, AIISize);
Value *AIINext =
GetElementPtrInst::Create(Int8PtrTy, AIV, {getConstantInt64(3)});
Builder.Insert(AIINext);
Builder.CreateStore(Next, AIINext);
Next = AIV;
}
Value *AIIV = GetElementPtrInst::Create(Int8PtrTy, ArgInfoVal,
{getConstantInt64(I)});
Builder.Insert(AIIV);
Builder.CreateStore(Next, AIIV);
}
} else {
SmallVector<Constant *> ArgInfoInitVar;
for (auto &AII : ArgInfo) {
Constant *Last = NullPtr;
for (auto &AI : AII) {
auto *Arg = cast<Constant>(AI.BasePtr);
auto *CS =
ConstantStruct::get(ArgInfoTy, {convertToInt64Ty(Arg), AI.Type,
cast<Constant>(AI.Size), Last});
auto *GV = new GlobalVariable(
M, ArgInfoTy, /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, CS);
Last = GV;
}
ArgInfoInitVar.push_back(Last);
}
Constant *ArgInfoInit = ConstantArray::get(
ArrayType::get(Int8PtrTy, NumArgs), ArgInfoInitVar);
ArgInfoVal = new GlobalVariable(
M, ArrayType::get(Int8PtrTy, NumArgs), /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info");
}
SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
ArgInfoVal};
for (Value *Operand : CI->args())
Args.push_back(Operand);
CallInst *NewCall = Builder.CreateCall(DeviceWrapperFn, Args);
CallInstMap.emplace_back(CI, NewCall);
}
return true;
}
Function *HostRPC::getDeviceWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI) {
Function *WrapperFn = M.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
// return_type device_wrapper(int32_t call_no, void *arg_info, ...)
SmallVector<Type *> Params{Int32Ty, Int8PtrTy};
Params.append(CSI.Params);
Type *RetTy = F->getReturnType();
FunctionType *FT = FunctionType::get(RetTy, Params, /*isVarArg*/ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::InternalLinkage,
WrapperName, M);
// Emit the body of the device wrapper
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
// Skip call_no and arg_info.
constexpr const unsigned NumArgSkipped = 2;
Value *Desc = nullptr;
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
Desc = Builder.CreateCall(
Fn,
{WrapperFn->getArg(0),
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
WrapperFn->getArg(1)},
"desc");
}
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
Value *V = convertToInt64Ty(WrapperFn->getArg(I));
Builder.CreateCall(
Fn, {Desc, V, ConstantInt::get(Int32Ty, I - NumArgSkipped)});
}
}
Value *RetVal =
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});
if (RetTy->isVoidTy()) {
Builder.CreateRetVoid();
return WrapperFn;
}
if (RetTy != RetVal->getType())
RetVal = convertFromInt64TyTo(RetVal, RetTy);
Builder.CreateRet(RetVal);
return WrapperFn;
}
Function *HostRPC::getHostWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI) {
Function *WrapperFn = HM.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
SmallVector<Type *> Params{Int8PtrTy};
FunctionType *FT = FunctionType::get(VoidTy, Params, /* isVarArg */ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage,
WrapperName, HM);
Value *Desc = WrapperFn->getArg(0);
// Emit the body of the host wrapper
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
SmallVector<Value *> Args;
for (unsigned I = 0; I < CSI.CI->arg_size(); ++I) {
Value *V = Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_arg],
{Desc, ConstantInt::get(Int32Ty, I)});
Args.push_back(convertFromInt64TyTo(V, CSI.Params[I]));
}
// The host callee that will be called eventually by the host wrapper.
Function *HostCallee = HM.getFunction(F->getName());
if (!HostCallee)
HostCallee = Function::Create(F->getFunctionType(), F->getLinkage(),
F->getName(), HM);
Value *RetVal = Builder.CreateCall(HostCallee, Args);
if (!RetVal->getType()->isVoidTy()) {
RetVal = convertToInt64Ty(RetVal);
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_set_ret_val],
{Desc, RetVal});
}
Builder.CreateRetVoid();
return WrapperFn;
}
void HostRPC::emitHostWrapperInvoker() {
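// The emitted invoker dispatches on the call number; its shape is roughly:
//   void __kmpc_host_rpc_invoke_host_wrapper(int32_t call_no, void *desc) {
//     switch (call_no) {
//     case 0: host_wrapper_0(desc); break;
//     case 1: host_wrapper_1(desc); break;
//     default: return;
//     }
//   }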
IRBuilder<> Builder(Context);
unsigned NumEntries = HostEntryTable.size();
Function *F = RFIs[OMPRTL___kmpc_host_rpc_invoke_host_wrapper];
F->setDLLStorageClass(
GlobalValue::DLLStorageClassTypes::DLLExportStorageClass);
Value *CallNo = F->getArg(0);
Value *Desc = F->getArg(1);
SmallVector<BasicBlock *> SwitchBBs;
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", F);
BasicBlock *ReturnBB = BasicBlock::Create(Context, "return", F);
// Emit code for the return bb.
Builder.SetInsertPoint(ReturnBB);
Builder.CreateRetVoid();
// Create BB for each host entry and emit function call.
for (unsigned I = 0; I < NumEntries; ++I) {
BasicBlock *BB = BasicBlock::Create(Context, "invoke.bb", F, ReturnBB);
SwitchBBs.push_back(BB);
Builder.SetInsertPoint(BB);
Builder.CreateCall(HostEntryTable[I], {Desc});
Builder.CreateBr(ReturnBB);
}
// Emit code for the entry BB.
Builder.SetInsertPoint(EntryBB);
SwitchInst *Switch = Builder.CreateSwitch(CallNo, ReturnBB, NumEntries);
for (unsigned I = 0; I < NumEntries; ++I)
Switch->addCase(ConstantInt::get(cast<IntegerType>(Int32Ty), I),
SwitchBBs[I]);
}
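// The driver is expected to stash the address of the in-memory host module in
// the "llvm.hostrpc.hostmodule" named metadata as an integer constant; it is
// decoded back into a Module pointer here and the metadata is dropped.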
Module *getHostModule(Module &M) {
auto *MD = M.getNamedMetadata("llvm.hostrpc.hostmodule");
if (!MD || MD->getNumOperands() == 0)
return nullptr;
auto *Node = MD->getOperand(0);
assert(Node->getNumOperands() == 1 && "invalid named metadata");
auto *CAM = dyn_cast<ConstantAsMetadata>(Node->getOperand(0));
if (!CAM)
return nullptr;
auto *CI = cast<ConstantInt>(CAM->getValue());
Module *Mod = reinterpret_cast<Module *>(CI->getZExtValue());
M.eraseNamedMetadata(MD);
return Mod;
}
} // namespace
PreservedAnalyses HostRPCPass::run(Module &M, ModuleAnalysisManager &AM) {
std::unique_ptr<Module> DummyHostModule;
Module *HostModule = nullptr;
if (UseDummyHostModule) {
DummyHostModule =
std::make_unique<Module>("dummy-host-rpc.bc", M.getContext());
HostModule = DummyHostModule.get();
} else {
HostModule = getHostModule(M);
}
if (!HostModule)
return PreservedAnalyses::all();
bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
// The pass only runs when it is invoked directly (with the dummy host module)
// or at LTO link time.
if (!UseDummyHostModule && !PostLink)
return PreservedAnalyses::all();
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
AnalysisGetter AG(FAM);
CallGraphUpdater CGUpdater;
BumpPtrAllocator Allocator;
AttributorConfig AC(CGUpdater);
AC.DefaultInitializeLiveInternals = false;
AC.RewriteSignatures = false;
AC.PassName = DEBUG_TYPE;
AC.MaxFixpointIterations = 1024;
InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
SetVector<Function *> Functions;
Attributor A(Functions, InfoCache, AC);
HostRPC RPC(M, *HostModule, A);
bool Changed = RPC.run();
LLVM_DEBUG({
if (Changed && UseDummyHostModule) {
M.dump();
HostModule->dump();
}
});
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

View File

@@ -53,6 +53,7 @@
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <optional>
@@ -135,6 +136,15 @@ static cl::opt<bool>
cl::desc("Enables more verbose remarks."), cl::Hidden,
cl::init(false));
static cl::opt<bool>
EnableParallel51Split("openmp-opt-enable-parallel-51-split",
cl::desc("Enable the kernel split at parallel_51."),
cl::Hidden, cl::init(false));
static cl::opt<bool> EnableParallel51SplitMultiTeams(
"openmp-opt-enable-parallel-51-split-multi-teams",
cl::desc("Enable multi-teams support for the parallel 51 kernel."),
cl::Hidden, cl::init(false));
static cl::opt<unsigned>
SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
cl::desc("Maximal number of attributor iterations."),
@@ -812,6 +822,28 @@ private:
}
};
bool canUseMultiTeam(Function *OutlinedFn, OMPInformationCache &OMPInfoCache) {
// TODO: Analyze OutlinedFn to prove that multi-team execution is safe; for
// now this is gated solely by the command line flag.
return EnableParallel51SplitMultiTeams;
}
void collectReachingKernels(KernelSet &Kernels, Function *F,
SmallVector<Kernel> &ReachingKernels,
SmallPtrSetImpl<Function *> &Visited) {
// Guard against cycles among mutually recursive functions.
if (!Visited.insert(F).second)
return;
if (Kernels.count(F)) {
if (F->hasFnAttribute("omp_parallel_51_kernel"))
ReachingKernels.push_back(F);
return;
}
for (User *U : F->users()) {
auto *I = dyn_cast<Instruction>(U);
if (!I)
continue;
collectReachingKernels(Kernels, I->getFunction(), ReachingKernels, Visited);
}
}
struct OpenMPOpt {
using OptimizationRemarkGetter =
@@ -841,6 +873,11 @@ struct OpenMPOpt {
<< OMPInfoCache.ModuleSlice.size() << " functions\n");
if (IsModulePass) {
if (EnableParallel51Split)
Changed |= splitKernels();
OMPInfoCache.recollectUses();
Changed |= runAttributor(IsModulePass);
// Recollect uses, in case Attributor deleted any.
@@ -940,6 +977,307 @@ struct OpenMPOpt {
}
private:
/// Create a new kernel for each function call to __kmpc_parallel_51.
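/// The generated kernel has roughly this shape (illustrative):
///   void __omp_offloading_parallel_51_from_<fn>(int32_t gtid,
///       int32_t if_expr, int32_t num_threads, void **args, int64_t nargs) {
///     <SPMD target init>
///     __kmpc_parallel_51(ident, gtid, if_expr, num_threads, proc_bind, fn,
///                        wrapper_fn, args, nargs);
///     <target deinit>
///   }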
bool splitKernels() {
bool Changed = false;
OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
if (!Parallel51RFI.Declaration)
return Changed;
SmallVector<CallInst *> WorkItems;
for (User *U : Parallel51RFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
Function *F = CI->getFunction();
// If the parent function already resulted from a kernel split, skip it.
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
WorkItems.push_back(CI);
}
if (WorkItems.empty())
return Changed;
Changed = true;
auto CreateNewKernel = [](LLVMContext &Ctx, Module &M,
OpenMPIRBuilder &IRBuilder, Function *Parallel51,
const CallInst *CI) {
SmallVector<Type *, 4> ArgTypes;
// int32_t gtid
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// int32_t if_expr
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// int32_t num_threads
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// void **args
ArgTypes.push_back(Type::getInt8PtrTy(Ctx));
// int64_t nargs
ArgTypes.push_back(Type::getInt64Ty(Ctx));
FunctionType *FT =
FunctionType::get(Type::getVoidTy(Ctx), ArgTypes, false);
std::string KernelName = "__omp_offloading_parallel_51_from_";
KernelName += CI->getFunction()->getName();
// Sanitize the kernel name by replacing '.' with '_'.
std::replace(KernelName.begin(), KernelName.end(), '.', '_');
Function *K =
Function::Create(FT, GlobalValue::WeakODRLinkage, KernelName, &M);
K->setVisibility(GlobalValue::ProtectedVisibility);
// exec mode global variable
GlobalVariable *ModeGV = new GlobalVariable(
M, Type::getInt8Ty(Ctx), /*isConstant=*/true,
GlobalValue::WeakAnyLinkage,
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD),
K->getName() + "_exec_mode");
appendToCompilerUsed(M, {ModeGV});
// Attach "kernel" metadata to the new kernel.
NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
Metadata *MDVals[] = {ConstantAsMetadata::get(K),
MDString::get(Ctx, "kernel"),
ConstantAsMetadata::get(ConstantInt::get(
llvm::Type::getInt32Ty(Ctx), 1))};
MD->addOperand(MDNode::get(Ctx, MDVals));
// Set kernel attributes.
K->setAttributes(
AttributeList::get(Ctx, AttributeList::FunctionIndex,
CI->getFunction()->getAttributes().getFnAttrs()));
K->removeFnAttr("omp_target_thread_limit");
K->removeFnAttr("omp_target_num_teams");
K->addFnAttr("omp_parallel_51_kernel");
K->addFnAttr("kernel");
K->getArg(3)->addAttr(Attribute::NoAlias);
auto *EntryBB = BasicBlock::Create(Ctx, "entry", K);
OpenMPIRBuilder::LocationDescription TargetInitLoc(
{EntryBB, EntryBB->end()});
auto IP = IRBuilder.createTargetInit(TargetInitLoc, /* IsSPMD */ true);
BasicBlock *UserCodeBB = IP.getBlock();
auto UseDirectIfPossible = [](Value *LHS, Value *RHS) -> Value * {
if (isa<Constant>(LHS))
return LHS;
return RHS;
};
Value *Ident = CI->getOperand(0);
Value *TId = UseDirectIfPossible(CI->getOperand(1), K->getArg(0));
Value *IfExpr = UseDirectIfPossible(CI->getOperand(2), K->getArg(1));
Value *NumThreads = UseDirectIfPossible(CI->getOperand(3), K->getArg(2));
Value *ProcBind = CI->getOperand(4);
Value *Fn = CI->getOperand(5);
Value *WrapperFn = CI->getOperand(6);
Value *Args = K->getArg(3);
Value *NArgs = UseDirectIfPossible(CI->getOperand(8), K->getArg(4));
(void)CallInst::Create(Parallel51,
{Ident, TId, IfExpr, NumThreads, ProcBind, Fn,
WrapperFn, Args, NArgs},
"", UserCodeBB);
OpenMPIRBuilder::LocationDescription TargetDeInitLoc(
{UserCodeBB, UserCodeBB->end()});
IRBuilder.createTargetDeinit(TargetDeInitLoc, /* IsSPMD */ true);
IRBuilder.Builder.CreateRetVoid();
return K;
};
auto EnableMultiTeam = [](Function *OutlinedFn) {
SmallVector<CallInst *> WorkItems;
for (BasicBlock &BB : *OutlinedFn)
for (Instruction &I : BB) {
CallInst *C = dyn_cast<CallInst>(&I);
if (!C)
continue;
Function *Callee = C->getCalledFunction();
// Skip indirect calls, which have no statically known callee.
if (!Callee)
continue;
auto CalleeName = Callee->getName();
if (CalleeName == "__kmpc_for_static_init_4" ||
CalleeName == "__kmpc_for_static_init_4u" ||
CalleeName == "__kmpc_for_static_init_8" ||
CalleeName == "__kmpc_for_static_init_8u")
WorkItems.push_back(C);
}
for (CallInst *C : WorkItems) {
constexpr const unsigned SchedTypeArgNum = 2;
Value *SchedTypeVal = C->getOperand(SchedTypeArgNum);
ConstantInt *SchedTypeCI = cast<ConstantInt>(SchedTypeVal);
int32_t SchedType = SchedTypeCI->getSExtValue();
Value *NewSchedType = SchedTypeVal;
if (SchedType == /*kmp_sched_static_chunk*/ 33)
NewSchedType = ConstantInt::get(
SchedTypeCI->getType(),
/*kmp_sched_distr_parallel_static_noloop_chunk*/ 100);
else if (SchedType == /*kmp_sched_static_nochunk*/ 34)
NewSchedType = ConstantInt::get(
SchedTypeCI->getType(),
/*kmp_sched_distr_parallel_static_noloop_nochunk*/ 101);
C->setOperand(SchedTypeArgNum, NewSchedType);
}
};
auto &Ctx = M.getContext();
for (CallInst *CI : WorkItems) {
Function *K = CreateNewKernel(Ctx, M, OMPInfoCache.OMPBuilder,
Parallel51RFI.Declaration, CI);
OMPInfoCache.Kernels.insert(K);
constexpr const unsigned OutlinedFnArgNum = 5;
Function *OutlinedFn =
dyn_cast<Function>(CI->getOperand(OutlinedFnArgNum));
assert(OutlinedFn && "arg fn is not a function");
if (canUseMultiTeam(OutlinedFn, OMPInfoCache))
EnableMultiTeam(OutlinedFn);
IRBuilder<> Builder(CI);
FunctionCallee Callee =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_launch_parallel_51_kernel);
Constant *KernelName = ConstantDataArray::getString(Ctx, K->getName());
GlobalVariable *KernelNameVar =
new GlobalVariable(M, KernelName->getType(), /*isConstant=*/true,
GlobalValue::WeakAnyLinkage, KernelName);
Value *Args[6] = {KernelNameVar, CI->getArgOperand(1),
CI->getArgOperand(2), CI->getArgOperand(3),
CI->getArgOperand(7), CI->getArgOperand(8)};
CallInst *C = Builder.CreateCall(Callee, Args);
OMPInfoCache.setCallingConvention(Callee, C);
CI->eraseFromParent();
}
// Check all the uses of __kmpc_parallel_51 again. If it is only used in
// split parallel kernels, we can mark the main kernels as SPMD mode as well.
bool OnlyUsedByParallelKernel = true;
for (User *U : Parallel51RFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
Function *F = CI->getFunction();
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
OnlyUsedByParallelKernel = false;
break;
}
if (OnlyUsedByParallelKernel) {
for (auto K : OMPInfoCache.Kernels) {
Attribute Attr = K->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
GlobalVariable *ExecMode =
M.getGlobalVariable((K->getName() + "_exec_mode").str());
assert(ExecMode && "kernel without exec mode");
assert(ExecMode->getInitializer() &&
"ExecMode doesn't have initializer!");
assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
"ExecMode is not an integer!");
const int8_t ExecModeVal =
cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
// Kernel is already in SPMD mode, skip.
if (ExecModeVal & OMP_TGT_EXEC_MODE_SPMD)
continue;
auto *NewExecModeC =
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD);
ExecMode->setInitializer(NewExecModeC);
OMPInformationCache::RuntimeFunctionInfo &TargetInitRFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
OMPInformationCache::RuntimeFunctionInfo &TargetDeInitRFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
CallInst *InitCI = nullptr;
CallInst *DeInitCI = nullptr;
for (User *U : TargetInitRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (CI->getFunction() == K) {
InitCI = CI;
break;
}
}
for (User *U : TargetDeInitRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (CI->getFunction() == K) {
DeInitCI = CI;
break;
}
}
assert(InitCI && DeInitCI && "kernel without init and deinit");
InitCI->setArgOperand(1, NewExecModeC);
InitCI->setArgOperand(2,
ConstantInt::getNullValue(Type::getInt1Ty(Ctx)));
DeInitCI->setArgOperand(1, NewExecModeC);
}
}
// Check the use of omp_get_thread_num and omp_get_num_threads.
OMPInformationCache::RuntimeFunctionInfo &ThreadNumRFI =
OMPInfoCache.RFIs[OMPRTL_omp_get_thread_num];
OMPInformationCache::RuntimeFunctionInfo &NumThreadsRFI =
OMPInfoCache.RFIs[OMPRTL_omp_get_num_threads];
auto CheckIfOnlyUsedByParallelKernel = [&](CallInst *CI) {
SmallVector<Kernel> ReachingKernels;
SmallPtrSet<Function *, 8> Visited;
collectReachingKernels(OMPInfoCache.Kernels, CI->getFunction(),
ReachingKernels, Visited);
return ReachingKernels.size() < 2;
};
SmallVector<CallInst *> WorkList;
for (User *U : ThreadNumRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (!CheckIfOnlyUsedByParallelKernel(CI))
continue;
WorkList.push_back(CI);
}
for (User *U : NumThreadsRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (!CheckIfOnlyUsedByParallelKernel(CI))
continue;
WorkList.push_back(CI);
}
for (CallInst *CI : WorkList) {
Function *Callee = nullptr;
if (CI->getCalledFunction() == ThreadNumRFI.Declaration)
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_thread_num].Declaration;
if (CI->getCalledFunction() == NumThreadsRFI.Declaration)
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_num_threads].Declaration;
assert(Callee && "unknown callee");
auto &Builder = OMPInfoCache.OMPBuilder.Builder;
Builder.SetInsertPoint(CI);
Value *C = Builder.CreateCall(Callee);
CI->replaceAllUsesWith(C);
CI->eraseFromParent();
}
return Changed;
}
/// Merge parallel regions when it is safe.
bool mergeParallelRegions() {
const unsigned CallbackCalleeOperand = 2;
@@ -3062,7 +3400,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
if (EDAA.getState().isValidState()) {
const auto &CalleeED = EDAA.getFunctionExecutionDomain();
ED.IsReachedFromAlignedBarrierOnly =
CalleeED.IsReachedFromAlignedBarrierOnly;
AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
ED.EncounteredNonLocalSideEffect |=
@@ -3553,7 +3891,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
[&](const IRPosition &IRP, const AbstractAttribute *AA,
bool &UsedAssumedInformation) -> std::optional<Value *> {
return nullptr;
};
Attributor::SimplifictionCallbackTy ModeSimplifyCB =

View File

@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMTransformUtils
CallGraphUpdater.cpp
CanonicalizeAliases.cpp
CanonicalizeFreezeInLoops.cpp
CanonicalizeMainFunction.cpp
CloneFunction.cpp
CloneModule.cpp
CodeExtractor.cpp

View File

@@ -0,0 +1,103 @@
//===-------------- CanonicalizeMainFunction.cpp ----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility pass to canonicalize the main function.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "canonicalize-main-function"
static cl::opt<std::string>
MainFunctionName("canonical-main-function-name",
cl::desc("New main function name"),
cl::value_desc("main function name"));
bool rewriteMainFunction(Function &F) {
if (F.arg_size() == 2 && F.getReturnType()->isIntegerTy(32))
return false;
auto &Ctx = F.getContext();
auto &DL = F.getParent()->getDataLayout();
auto *Int32Ty = IntegerType::getInt32Ty(Ctx);
auto *PtrTy = PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace());
FunctionType *NewFnTy =
FunctionType::get(Int32Ty, {Int32Ty, PtrTy}, /* isVarArg */ false);
Function *NewFn =
Function::Create(NewFnTy, F.getLinkage(), F.getAddressSpace(), "");
F.getParent()->getFunctionList().insert(F.getIterator(), NewFn);
NewFn->takeName(&F);
NewFn->copyAttributesFrom(&F);
NewFn->setSubprogram(F.getSubprogram());
F.setSubprogram(nullptr);
NewFn->splice(NewFn->begin(), &F);
if (!F.getReturnType()->isIntegerTy(32)) {
SmallVector<ReturnInst *> WorkList;
for (BasicBlock &BB : *NewFn)
for (Instruction &I : BB) {
auto *RI = dyn_cast<ReturnInst>(&I);
if (!RI)
continue;
assert(RI->getReturnValue() == nullptr &&
"a void main function must not return a value");
WorkList.push_back(RI);
}
for (auto *RI : WorkList) {
(void)ReturnInst::Create(Ctx, ConstantInt::getNullValue(Int32Ty), RI);
RI->eraseFromParent();
}
}
if (F.arg_size() == NewFn->arg_size())
for (unsigned I = 0; I < NewFn->arg_size(); ++I) {
Argument *OldArg = F.getArg(I);
Argument *NewArg = NewFn->getArg(I);
NewArg->takeName(OldArg);
OldArg->replaceAllUsesWith(NewArg);
}
return true;
}
PreservedAnalyses CanonicalizeMainFunctionPass::run(Module &M,
ModuleAnalysisManager &AM) {
Function *MainFunc = nullptr;
for (Function &F : M)
if (F.getName() == "main") {
assert(MainFunc == nullptr && "more than one main function");
MainFunc = &F;
}
if (MainFunc == nullptr)
return PreservedAnalyses::all();
bool Changed = false;
if (!MainFunctionName.empty() && MainFunc->getName() != MainFunctionName) {
MainFunc->setName(MainFunctionName);
Changed = true;
}
if (rewriteMainFunction(*MainFunc)) {
MainFunc->eraseFromParent();
Changed = true;
}
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

View File

@@ -87,6 +87,7 @@ set(include_files
${include_directory}/Interface.h
${include_directory}/LibC.h
${include_directory}/Mapping.h
${include_directory}/Memory.h
${include_directory}/State.h
${include_directory}/Synchronization.h
${include_directory}/Types.h
@@ -96,6 +97,7 @@ set(include_files
set(src_files
${source_directory}/Configuration.cpp
${source_directory}/Debug.cpp
${source_directory}/HostRPC.cpp
${source_directory}/Kernel.cpp
${source_directory}/LibC.cpp
${source_directory}/Mapping.cpp
@@ -109,6 +111,14 @@ set(src_files
${source_directory}/Workshare.cpp
)
if (LIBOMPTARGET_DEVICE_BUILTIN_ALLOCATOR)
list(APPEND src_files ${source_directory}/BuiltinAllocator.cpp)
elseif (LIBOMPTARGET_GENERIC_ALLOCATOR)
list(APPEND src_files ${source_directory}/GenericAllocator.cpp)
else()
list(APPEND src_files ${source_directory}/TeamAllocator.cpp)
endif()
set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512)
set(link_opt_flags -O3 -openmp-opt-disable -attributor-enable=module)
set(link_export_flag -passes=internalize -internalize-public-api-file=${source_directory}/exports)
@@ -123,11 +133,20 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
-nocudalib -nogpulib -nostdinc
-fopenmp -fopenmp-cuda-mode
-Wno-unknown-cuda-version
-DOMPTARGET_DEVICE_RUNTIME
-I${include_directory}
-I${devicertl_base_directory}/../include
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
)
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_DEBUG)
list(APPEND bc_flags "-DHOSTRPC_DEBUG")
endif()
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_PROFILING)
list(APPEND bc_flags "-DHOSTRPC_PROFILING")
endif()
# first create an object target
add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
function(compileDeviceRTLLibrary target_cpu target_name target_triple)

View File

@@ -32,6 +32,12 @@ void __assert_fail(const char *assertion, const char *file, unsigned line,
__assert_assume(expr); \
}
#define assert(expr) \
{ \
if (!(expr)) \
__assert_fail(#expr, __FILE__, __LINE__, __PRETTY_FUNCTION__); \
}
///}
#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);

View File

@@ -355,6 +355,22 @@ int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal);
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
///}
/// Host RPC
///
///{
void *__kmpc_host_rpc_get_desc(int32_t CallNo, int32_t NumArgs, void *ArgInfo);
void __kmpc_host_rpc_add_arg(void *Desc, int64_t Arg, int32_t ArgNum);
int64_t __kmpc_host_rpc_send_and_wait(void *Desc);
///}
/// Launch parallel kernel
///
///{
void __kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
int32_t if_expr, int32_t num_threads,
void **args, int64_t nargs);
///}
}
#endif

View File

@@ -14,11 +14,115 @@
#include "Types.h"
struct FILE;
extern FILE *stdin;
extern FILE *stdout;
extern FILE *stderr;
#ifndef _ASM_GENERIC_ERRNO_BASE_H
#define _ASM_GENERIC_ERRNO_BASE_H
#define EPERM 1 /* Operation not permitted */
#define ENOENT 2 /* No such file or directory */
#define ESRCH 3 /* No such process */
#define EINTR 4 /* Interrupted system call */
#define EIO 5 /* I/O error */
#define ENXIO 6 /* No such device or address */
#define E2BIG 7 /* Argument list too long */
#define ENOEXEC 8 /* Exec format error */
#define EBADF 9 /* Bad file number */
#define ECHILD 10 /* No child processes */
#define EAGAIN 11 /* Try again */
#define ENOMEM 12 /* Out of memory */
#define EACCES 13 /* Permission denied */
#define EFAULT 14 /* Bad address */
#define ENOTBLK 15 /* Block device required */
#define EBUSY 16 /* Device or resource busy */
#define EEXIST 17 /* File exists */
#define EXDEV 18 /* Cross-device link */
#define ENODEV 19 /* No such device */
#define ENOTDIR 20 /* Not a directory */
#define EISDIR 21 /* Is a directory */
#define EINVAL 22 /* Invalid argument */
#define ENFILE 23 /* File table overflow */
#define EMFILE 24 /* Too many open files */
#define ENOTTY 25 /* Not a typewriter */
#define ETXTBSY 26 /* Text file busy */
#define EFBIG 27 /* File too large */
#define ENOSPC 28 /* No space left on device */
#define ESPIPE 29 /* Illegal seek */
#define EROFS 30 /* Read-only file system */
#define EMLINK 31 /* Too many links */
#define EPIPE 32 /* Broken pipe */
#define EDOM 33 /* Math argument out of domain of func */
#define ERANGE 34 /* Math result not representable */
#endif
#define errno (*__errno_location())
extern "C" {
int memcmp(const void *lhs, const void *rhs, size_t count);
int printf(const char *format, ...);
long strtol(const char *str, char **str_end, int base);
int strcmp(const char *lhs, const char *rhs);
void *calloc(size_t num, size_t size);
int strcasecmp(const char *string1, const char *string2);
void exit(int exit_code);
size_t strlen(const char *str);
int atoi(const char *str);
char *strcpy(char *dest, const char *src);
int stat(const char *path, struct stat *buf);
int *__errno_location();
char *strcat(char *dest, const char *src);
void perror(const char *s);
int strncmp(const char *lhs, const char *rhs, size_t count);
char *strncpy(char *dest, const char *src, size_t count);
char *strchr(const char *str, int ch);
char *strtok(char *str, const char *delim);
const unsigned short **__ctype_b_loc(void);
void *realloc(void *ptr, size_t new_size);
void qsort(void *const pbase, size_t total_elems, size_t size,
int (*comp)(const void *, const void *));
int gettimeofday(struct timeval *tv, struct timezone *tz);
char *__xpg_basename(const char *path);
void srand(unsigned seed);
int rand();
int abs(int n);
void *memcpy(void *dest, const void *src, size_t count);
double atof(const char *str);
double strtod(const char *str, char **ptr);
}
#endif

View File

@@ -0,0 +1,39 @@
//===--- Memory.h - OpenMP device runtime memory allocator ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares the interface of the device runtime memory allocator.
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_MEMORY_H
#define OMPTARGET_MEMORY_H
#include "Types.h"
extern "C" {
__attribute__((leaf)) void *malloc(size_t Size);
__attribute__((leaf)) void free(void *Ptr);
}
namespace ompx {
namespace memory {
struct MemoryAllocationInfo {
void *BasePtr = nullptr;
size_t Size = 0;
bool isValid() const { return BasePtr; }
};
/// Get the memory allocation information if pointer \p P is in the range of
/// one of the buffers allocated by \p malloc.
MemoryAllocationInfo getMemoryAllocationInfo(void *P);
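/// A sketch of the intended use (illustrative only; transferToHost is a
/// hypothetical consumer):
///   MemoryAllocationInfo MAI = getMemoryAllocationInfo(Ptr);
///   if (MAI.isValid())
///     transferToHost(MAI.BasePtr, MAI.Size);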
} // namespace memory
} // namespace ompx
#endif

View File

@@ -29,6 +29,9 @@ enum OrderingTy {
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering);
/// Atomically add \p V to \p *Addr with system scope (hence the Sys suffix).
int32_t addSys(int32_t *Addr, int32_t V);
/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
/// result is stored in \p *Addr;
/// {
@@ -128,6 +131,60 @@ void system(atomic::OrderingTy Ordering);
} // namespace fence
namespace atomic {
/// Atomically load \p Addr with \p Ordering semantics.
uint32_t load(uint32_t *Addr, int Ordering);
/// Atomically load \p Addr with \p Ordering semantics.
uint64_t load(uint64_t *Addr, int Ordering);
/// Atomically store \p V to \p Addr with \p Ordering semantics.
void store(uint32_t *Addr, uint32_t V, int Ordering);
void store(uint64_t *Addr, uint64_t V, int Ordering);
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
uint32_t add(uint32_t *Addr, uint32_t V, int Ordering);
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);
} // namespace atomic
namespace mutex {
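/// A simple ticket lock: each waiter atomically draws the next ticket and
/// spins until NowServing reaches it, which yields FIFO acquisition order.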
class TicketLock {
uint64_t NowServing = 0;
uint64_t NextTicket = 0;
public:
TicketLock() = default;
TicketLock(const TicketLock &) = delete;
TicketLock(TicketLock &&) = delete;
void lock();
void unlock();
};
template <typename T> class LockGuard {
T &Lock;
public:
explicit LockGuard(T &L) : Lock(L) { Lock.lock(); }
~LockGuard() { Lock.unlock(); }
};
} // namespace mutex
} // namespace ompx
#endif

View File

@@ -36,6 +36,7 @@ using size_t = decltype(sizeof(char));
// TODO: Properly implement this
using intptr_t = int64_t;
using uintptr_t = uint64_t;
using ptrdiff_t = intptr_t;
static_assert(sizeof(int8_t) == 1, "type size mismatch");
static_assert(sizeof(uint8_t) == 1, "type size mismatch");
@@ -83,6 +84,11 @@ enum kmp_sched_t {
kmp_sched_distr_static_nochunk = 92,
kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
// The following two are for direct GPU compilation, where there is no outer
// loop to distribute the workload among teams but we still need to do so.
kmp_sched_distr_parallel_static_noloop_chunk = 100,
kmp_sched_distr_parallel_static_noloop_nochunk = 101,
kmp_sched_default = kmp_sched_static_nochunk,
kmp_sched_unordered_first = kmp_sched_static_chunk,
kmp_sched_unordered_last = kmp_sched_auto,

View File

@@ -16,6 +16,9 @@
#pragma omp begin declare target device_type(nohost)
extern "C" double omp_get_wtime();
extern "C" int printf(const char *, ...);
namespace ompx {
namespace utils {
@@ -37,8 +40,8 @@ template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
}
/// Advance \p Ptr by \p Bytes bytes.
template <typename Ty1, typename Ty2> inline Ty1 advance(Ty1 Ptr, Ty2 Bytes) {
return reinterpret_cast<Ty1>(reinterpret_cast<char *>(Ptr) + Bytes);
}
/// Return the first bit set in \p V.
@@ -82,12 +85,39 @@ template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
return *((DstTy *)(&V));
}
template <typename Ty = char>
ptrdiff_t getPtrDiff(const void *End, const void *Begin) {
return reinterpret_cast<const Ty *>(End) -
reinterpret_cast<const Ty *>(Begin);
}
inline bool isInRange(void *Ptr, void *BasePtr, int64_t Offset) {
ptrdiff_t Diff = getPtrDiff(Ptr, BasePtr);
return Diff >= 0 && Diff < Offset;
}
inline intptr_t ptrtoint(void *Ptr) { return reinterpret_cast<intptr_t>(Ptr); }
template <typename T> T min(T a, T b) { return a < b ? a : b; }
/// A pointer variable that has by design an `undef` value. Use with care.
__attribute__((loader_uninitialized)) static void *const UndefPtr;
#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
class SimpleProfiler {
const char *HeadLine = nullptr;
double Start;
public:
SimpleProfiler(const char *HL) : HeadLine(HL), Start(omp_get_wtime()) {}
~SimpleProfiler() {
double End = omp_get_wtime();
printf("%s --> %lf s.\n", HeadLine, End - Start);
}
};
} // namespace utils
} // namespace ompx

View File

@@ -0,0 +1,40 @@
//===------- BuiltinAllocator.cpp - Builtin GPU memory allocator -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A trivial allocator that forwards to the target's builtin malloc and free.
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Memory.h"
#include "Utils.h"
using namespace ompx;
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) { return {}; }
void init() {}
} // namespace memory
} // namespace ompx
extern "C" {
void *realloc(void *ptr, size_t new_size) {
void *NewPtr = malloc(new_size);
if (!NewPtr)
return nullptr;
if (ptr) {
// The builtin allocator does not track allocation sizes, so we can only
// copy new_size bytes; this over-reads if the old buffer was smaller.
__builtin_memcpy(NewPtr, ptr, new_size);
free(ptr);
}
return NewPtr;
}
}
#pragma omp end declare target

View File

@@ -0,0 +1,17 @@
target_sources(omptarget.devicertl PRIVATE
Configuration.cpp
Debug.cpp
HostRPC.cpp
GenericAllocator.cpp
Kernel.cpp
LibC.cpp
Mapping.cpp
Misc.cpp
Parallelism.cpp
Reduction.cpp
State.cpp
Synchronization.cpp
Tasking.cpp
Utils.cpp
WarpAllocator.cpp
Workshare.cpp)

View File

@@ -0,0 +1,308 @@
//===------- GenericAllocator.cpp - Generic GPU memory allocator -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A generic free-list based GPU memory allocator backed by a preallocated heap.
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Utils.h"
using namespace ompx;
char *CONSTANT(omptarget_device_heap_buffer)
__attribute__((used, retain, weak, visibility("protected")));
size_t CONSTANT(omptarget_device_heap_size)
__attribute__((used, retain, weak, visibility("protected")));
namespace {
size_t HeapCurPos = 0;
mutex::TicketLock HeapLock;
mutex::TicketLock AllocationListLock;
mutex::TicketLock FreeListLock;
constexpr const size_t Alignment = 16;
struct SimpleLinkListNode {
template <typename T> friend struct SimpleLinkList;
protected:
SimpleLinkListNode *Prev = nullptr;
SimpleLinkListNode *Next = nullptr;
bool operator>(const SimpleLinkListNode &RHS) const {
const auto This = reinterpret_cast<int64_t>(this);
const auto RHSThis = reinterpret_cast<int64_t>(&RHS);
return This > RHSThis;
}
SimpleLinkListNode() = default;
SimpleLinkListNode(const SimpleLinkListNode &) = delete;
SimpleLinkListNode(SimpleLinkListNode &&) = delete;
};
struct AllocationMetadata final : SimpleLinkListNode {
public:
size_t getUserSize() const;
void *getUserAddr() const {
return const_cast<void *>(reinterpret_cast<const void *>(this + 1));
}
size_t getSize() const { return Size; }
void setSize(size_t V) { Size = V; }
bool isInRange(void *Ptr) {
return utils::isInRange(Ptr, this + 1, getUserSize());
}
static AllocationMetadata *getFromUserAddr(void *P) {
return getFromAddr(P) - 1;
}
static AllocationMetadata *getFromAddr(void *P) {
return reinterpret_cast<AllocationMetadata *>(P);
}
private:
size_t Size = 0;
int64_t Reserved = 0;
};
constexpr const size_t AllocationMetadataSize = sizeof(AllocationMetadata);
static_assert(AllocationMetadataSize == 32,
"expect the metadata size to be 32");
size_t AllocationMetadata::getUserSize() const {
return Size - AllocationMetadataSize;
}
template <typename T> struct SimpleLinkList {
struct iterator {
friend SimpleLinkList;
T *operator->() { return reinterpret_cast<T *>(Node); }
T &operator*() { return reinterpret_cast<T &>(*Node); }
iterator operator++() {
Node = Node->Next;
return *this;
}
bool operator==(const iterator &RHS) const { return Node == RHS.Node; }
bool operator!=(const iterator &RHS) const { return !(*this == RHS); }
iterator next() const {
iterator Itr;
Itr.Node = Node->Next;
return Itr;
}
private:
SimpleLinkListNode *Node = nullptr;
};
iterator begin() {
iterator Itr;
Itr.Node = Head.Next;
return Itr;
}
iterator end() { return {}; }
void insert(SimpleLinkListNode *Node) { insertImpl(&Head, Node); }
void remove(iterator Itr) {
SimpleLinkListNode *Node = Itr.Node;
remove(Node);
}
void remove(SimpleLinkListNode *Node) { removeImpl(Node); }
bool empty() const { return Head.Next == nullptr; }
private:
static void insertImpl(SimpleLinkListNode *Current,
SimpleLinkListNode *Node) {
SimpleLinkListNode *OldNext = Current->Next;
Node->Prev = Current;
Node->Next = OldNext;
if (OldNext)
OldNext->Prev = Node;
Current->Next = Node;
}
static void removeImpl(SimpleLinkListNode *Node) {
SimpleLinkListNode *Prev = Node->Prev;
SimpleLinkListNode *Next = Node->Next;
Prev->Next = Next;
if (Next)
Next->Prev = Prev;
Node->Prev = nullptr;
Node->Next = nullptr;
}
/// Check if the node is in the list.
bool checkExist(SimpleLinkListNode *Node) const {
SimpleLinkListNode *P = Head.Next;
while (P) {
if (P == Node)
return true;
P = P->Next;
}
return false;
}
static bool checkSanity(SimpleLinkListNode *Current,
SimpleLinkListNode *Next) {
return Current->Next == Next && (!Next || Next->Prev == Current);
}
static bool checkSanity(SimpleLinkListNode *Prev, SimpleLinkListNode *Current,
SimpleLinkListNode *Next) {
return Prev->Next == Current && Current->Next == Next &&
Current->Prev == Prev && (!Next || Next->Prev == Current);
}
static bool checkDangling(SimpleLinkListNode *Node) {
return Node->Next == nullptr && Node->Prev == nullptr;
}
SimpleLinkListNode Head;
};
SimpleLinkList<AllocationMetadata> AllocationList;
SimpleLinkList<AllocationMetadata> FreeList;
constexpr const int SplitRatio = 5;
#ifndef LIBOMPTARGET_MEMORY_PROFILING
struct MemoryProfiler {
MemoryProfiler(const char *S) {}
~MemoryProfiler() {}
};
#else
struct MemoryProfiler : utils::SimpleProfiler {
MemoryProfiler(const char *S) : utils::SimpleProfiler(S) {}
~MemoryProfiler() { utils::SimpleProfiler::~SimpleProfiler(); }
};
#endif
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
mutex::LockGuard ALG(AllocationListLock);
for (auto Itr = AllocationList.begin(); Itr != AllocationList.end(); ++Itr) {
AllocationMetadata *MD = &(*Itr);
if (MD->isInRange(P))
return {MD->getUserAddr(), MD->getUserSize()};
}
return {};
}
} // namespace memory
} // namespace ompx
extern "C" {
void *memset(void *dest, int ch, size_t count);
void *malloc(size_t Size) {
MemoryProfiler Profiler(__FUNCTION__);
Size = utils::align_up(Size + AllocationMetadataSize, Alignment);
AllocationMetadata *MD = nullptr;
{
mutex::LockGuard FLG(FreeListLock);
auto Itr = FreeList.begin();
for (; Itr != FreeList.end(); ++Itr) {
if (Itr->getSize() >= Size)
break;
}
bool Found = Itr != FreeList.end() && (Itr->getSize() / Size < SplitRatio);
if (Found) {
MD = &(*Itr);
FreeList.remove(Itr);
}
}
if (MD) {
mutex::LockGuard ALG(AllocationListLock);
AllocationList.insert(MD);
return MD->getUserAddr();
}
{
mutex::LockGuard LG(HeapLock);
if (Size + HeapCurPos < omptarget_device_heap_size) {
void *R = omptarget_device_heap_buffer + HeapCurPos;
(void)atomic::add(&HeapCurPos, Size, atomic::acq_rel);
MD = AllocationMetadata::getFromAddr(R);
}
}
if (MD) {
// We need to reset the metadata header in case of any dirty data.
memset(MD, 0, AllocationMetadataSize);
MD->setSize(Size);
mutex::LockGuard ALG(AllocationListLock);
AllocationList.insert(MD);
return MD->getUserAddr();
}
printf("out of heap memory! size=%lu, cur=%lu.\n", Size, HeapCurPos);
printf("%s:%d\n", __FILE__, __LINE__);
__builtin_trap();
}
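// Reuse-policy sketch (hypothetical sizes): with SplitRatio == 5, a request
// whose padded size is 96 bytes reuses the first free block of at least 96
// bytes only if that block is less than 5x too big (e.g. a 256-byte block,
// since 256 / 96 < 5). A 1024-byte free block fails the ratio test and the
// request falls through to the bump pointer instead, so tiny requests never
// pin a huge block without splitting it.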
void free(void *P) {
MemoryProfiler Profiler(__FUNCTION__);
if (!P)
return;
auto *MD = AllocationMetadata::getFromUserAddr(P);
{
mutex::LockGuard ALG(AllocationListLock);
AllocationList.remove(MD);
}
{
mutex::LockGuard FLG(FreeListLock);
FreeList.insert(MD);
}
}
void *realloc(void *ptr, size_t new_size) {
MemoryProfiler Profiler(__FUNCTION__);
void *NewPtr = malloc(new_size);
if (!NewPtr)
return nullptr;
// realloc(nullptr, n) must behave like malloc(n).
if (!ptr)
return NewPtr;
auto *OldMD = AllocationMetadata::getFromUserAddr(ptr);
assert(ptr == OldMD->getUserAddr());
__builtin_memcpy(NewPtr, ptr, utils::min(OldMD->getUserSize(), new_size));
free(ptr);
return NewPtr;
}
void __kmpc_target_init_allocator() {}
}
#pragma omp end declare target

View File

@@ -0,0 +1,420 @@
//===------- HostRPC.cpp - Implementation of host RPC ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "HostRPC.h"
#include "Debug.h"
#include "LibC.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#ifdef HOSTRPC_DEBUG
#define DEBUG_PREFIX "host-rpc-device"
#define DP(FMT, ...) \
{ printf("%s --> " FMT, DEBUG_PREFIX, __VA_ARGS__); }
#else
#define DP(FMT, ...)
#endif
using namespace ompx;
using namespace hostrpc;
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
Descriptor *omptarget_hostrpc_descriptor
__attribute__((used, retain, weak, visibility("protected")));
int32_t *omptarget_hostrpc_futex
__attribute__((used, retain, weak, visibility("protected")));
char *omptarget_hostrpc_memory_buffer
__attribute__((used, retain, weak, visibility("protected")));
size_t omptarget_hostrpc_memory_buffer_size
__attribute__((used, retain, weak, visibility("protected")));
#ifdef HOSTRPC_PROFILING
int32_t HostRPCId;
double GetDescStart;
double GetDescEnd;
double AddArgStart;
double AddArgEnd;
double IssueAndWaitStart;
double IssueAndWaitEnd;
double CopyBackStart;
double CopyBackEnd;
#endif
namespace {
size_t HostRPCMemoryBufferCurrentPosition = 0;
constexpr const size_t Alignment = 16;
// FIXME: For now we only allow one thread requesting host RPC.
mutex::TicketLock HostRPCLock;
void *HostRPCMemAlloc(size_t Size) {
Size = utils::align_up(Size, Alignment);
if (Size + HostRPCMemoryBufferCurrentPosition <
omptarget_hostrpc_memory_buffer_size) {
void *R =
omptarget_hostrpc_memory_buffer + HostRPCMemoryBufferCurrentPosition;
atomic::add(&HostRPCMemoryBufferCurrentPosition, Size, atomic::acq_rel);
return R;
}
printf("%s:%d\n", __FILE__, __LINE__);
__builtin_trap();
return nullptr;
}
// For now we just reset the buffer.
void HostRPCMemReset() { HostRPCMemoryBufferCurrentPosition = 0; }
static_assert(sizeof(intptr_t) == sizeof(int64_t), "pointer size mismatch");
struct HostRPCArgInfo {
void *BasePtr;
int64_t Type;
int64_t Size;
HostRPCArgInfo *Next;
};
struct HostRPCPointerMapEntry {
void *BasePtr;
void *MappedBasePtr;
int64_t Size;
int64_t Kind;
};
void *getMappedPointer(Descriptor *D, void *BasePtr, int64_t Size,
int64_t Offset, int64_t Kind) {
assert(D->ArgMap && "ArgMap should not be nullptr");
HostRPCPointerMapEntry *MapTable =
reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
int I = 0;
for (; I < D->NumArgs && MapTable[I].BasePtr; ++I)
if (MapTable[I].BasePtr == BasePtr)
return utils::advance(MapTable[I].MappedBasePtr, Offset);
MapTable[I].BasePtr = BasePtr;
MapTable[I].MappedBasePtr = HostRPCMemAlloc(Size);
MapTable[I].Size = Size;
MapTable[I].Kind = Kind;
if (Kind & ArgType::OMP_HOST_RPC_ARG_COPY_TO) {
__builtin_memcpy(MapTable[I].MappedBasePtr, BasePtr, Size);
DP("getMappedPointer: copy %ld bytes memory from %p to %p.\n", Size,
BasePtr, MapTable[I].MappedBasePtr);
}
return utils::advance(MapTable[I].MappedBasePtr, Offset);
}
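// Mapping sketch: the first time a base pointer is seen it gets a staging
// buffer in host RPC memory (populated for COPY_TO kinds); later lookups of
// the same base return the same staging buffer plus the offset, so two
// arguments pointing into one buffer map consistently:
//
//   void *A = getMappedPointer(D, Base, 64, 0, Kind);  // allocates + copies
//   void *B = getMappedPointer(D, Base, 64, 16, Kind); // B == (char *)A + 16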
void copybackIfNeeded(Descriptor *D) {
if (!D->ArgMap)
return;
auto *MapTable = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
for (int I = 0; I < D->NumArgs && MapTable[I].BasePtr; ++I)
if (MapTable[I].Kind & ArgType::OMP_HOST_RPC_ARG_COPY_FROM) {
__builtin_memcpy(MapTable[I].BasePtr, MapTable[I].MappedBasePtr,
MapTable[I].Size);
DP("copybackIfNeeded: copy %ld bytes memory from %p to %p.\n",
MapTable[I].Size, MapTable[I].MappedBasePtr, MapTable[I].BasePtr);
}
}
} // namespace
extern "C" {
__attribute__((noinline, used)) void *
__kmpc_host_rpc_get_desc(int32_t CallId, int32_t NumArgs, void *ArgInfo) {
assert(omptarget_hostrpc_descriptor && omptarget_hostrpc_futex &&
"no host rpc pointer");
DP("device: stdin=%p, stdout=%p, stderr=%p\n", stdin, stdout, stderr);
DP("get desc for request (id=%d), NumArgs=%d, ArgInfo=%p.\n", CallId, NumArgs,
ArgInfo);
#ifdef HOSTRPC_DEBUG
{
void **AIs = reinterpret_cast<void **>(ArgInfo);
for (int I = 0; I < NumArgs; ++I)
DP("ArgInfo[%d]=%p.\n", I, AIs[I]);
}
#endif
HostRPCLock.lock();
#ifdef HOSTRPC_PROFILING
HostRPCId = CallId;
GetDescStart = omp_get_wtime();
#endif
// TODO: change it after we support a queue-like data structure.
Descriptor *D = omptarget_hostrpc_descriptor;
D->Id = CallId;
D->ArgInfo = reinterpret_cast<void **>(ArgInfo);
D->NumArgs = NumArgs;
D->Status = EXEC_STAT_CREATED;
D->ReturnValue = 0;
D->Args =
reinterpret_cast<Argument *>(HostRPCMemAlloc(sizeof(Argument) * NumArgs));
D->ArgMap = HostRPCMemAlloc(sizeof(HostRPCPointerMapEntry) * NumArgs);
assert((!NumArgs || (D->Args && D->ArgMap)) && "out of host rpc memory!");
// Reset the map table.
auto *ArgMap = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
for (int I = 0; I < NumArgs; ++I)
ArgMap[I].BasePtr = nullptr;
#ifdef HOSTRPC_PROFILING
GetDescEnd = omp_get_wtime();
AddArgStart = omp_get_wtime();
#endif
return D;
}
__attribute__((noinline, used)) void
__kmpc_host_rpc_add_arg(void *Desc, int64_t ArgVal, int32_t ArgNum) {
auto *D = reinterpret_cast<Descriptor *>(Desc);
assert(ArgNum < D->NumArgs && "out-of-range arguments");
Argument &ArgInDesc = D->Args[ArgNum];
DP("add arg (no=%d), arg=%lx to request (id=%d).\n", ArgNum, ArgVal, D->Id);
// This early exit handles both a null pointer and a zero scalar: a zero
// value is encoded the same way whether it is a pointer or a scalar.
if (ArgVal == 0) {
ArgInDesc.Value = 0;
ArgInDesc.ArgType = Type::ARG_LITERAL;
DP("arg (no=%d) is null, done.\n", ArgNum);
return;
}
void *ArgPtr = reinterpret_cast<void *>(ArgVal);
if (ArgPtr == stdin || ArgPtr == stdout || ArgPtr == stderr) {
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is stdin/stdout/stderr, done.\n", ArgNum);
return;
}
const auto *AI = reinterpret_cast<HostRPCArgInfo *>(D->ArgInfo[ArgNum]);
DP("try to find arg (no=%d) from args AI=%p\n", ArgNum, AI);
if (AI) {
// Let's first check if Arg is a scalar.
if (AI->Type == ArgType::OMP_HOST_RPC_ARG_SCALAR) {
assert(AI->BasePtr == ArgPtr && "invalid scalar argument info");
assert(AI->Next == nullptr && "invalid scalar argument info");
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_LITERAL;
DP("arg (no=%d) is scalar, done.\n", ArgNum);
return;
}
// Then let's see if it is a literal pointer that we don't need to copy.
auto *P = AI;
while (P) {
if (P->Type == ArgType::OMP_HOST_RPC_ARG_PTR && P->BasePtr == ArgPtr) {
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is literal pointer, done.\n", ArgNum);
return;
}
P = P->Next;
}
// Next we check if it is within the range of any buffer described in
// argument info.
P = AI;
while (P) {
if ((P->Type & ArgType::OMP_HOST_RPC_ARG_PTR) && P->Size) {
if (utils::isInRange(ArgPtr, P->BasePtr, P->Size)) {
auto Size = P->Size;
auto Offset = utils::getPtrDiff(ArgPtr, P->BasePtr);
ArgInDesc.Value = utils::ptrtoint(
getMappedPointer(D, P->BasePtr, Size, Offset, P->Type));
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("found a match for arg (no=%d). done.\n", ArgNum);
return;
}
}
P = P->Next;
}
}
// If we cannot find a match in the argument info, we assume the pointer
// comes from a dynamic allocation.
memory::MemoryAllocationInfo MAI = memory::getMemoryAllocationInfo(ArgPtr);
if (MAI.isValid()) {
auto Size = MAI.Size;
auto Offset = utils::getPtrDiff(ArgPtr, MAI.BasePtr);
ArgInDesc.Value = utils::ptrtoint(
getMappedPointer(D, MAI.BasePtr, Size, Offset,
/* Kind */ ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM));
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is from malloc. done.\n", ArgNum);
return;
}
printf("request (id=%d) arg (no=%d, val=%p) is unknown. send it to host "
"directly.\n",
D->Id, ArgNum, ArgPtr);
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
}
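// Lowering sketch (hypothetical call id and arg info): the compiler is
// expected to rewrite a device-side call such as fprintf(F, "%d", X) into
// the three-step protocol below; __kmpc_launch_parallel_51_kernel further
// down follows exactly this pattern by hand.
//
//   void *D = __kmpc_host_rpc_get_desc(CALLID_fprintf, 3, ArgInfo);
//   __kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(F), 0);
//   __kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(Fmt), 1);
//   __kmpc_host_rpc_add_arg(D, X, 2);
//   int64_t Ret = __kmpc_host_rpc_send_and_wait(D);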
__attribute__((noinline, used)) int64_t
__kmpc_host_rpc_send_and_wait(void *Desc) {
auto *D = reinterpret_cast<Descriptor *>(Desc);
int32_t Id = D->Id;
#ifdef HOSTRPC_PROFILING
AddArgEnd = omp_get_wtime();
IssueAndWaitStart = omp_get_wtime();
#endif
atomic::add(omptarget_hostrpc_futex, 1U, atomic::acq_rel);
// A system fence is required to make sure futex on the host is also
// updated if USM is supported.
fence::system(atomic::seq_cst);
DP("sent request (id=%d) to host. waiting for finish.\n", Id);
unsigned NS = 8;
while (atomic::addSys(omptarget_hostrpc_futex, 0)) {
asm volatile("nanosleep.u32 %0;" : : "r"(NS));
// if (NS < 64)
// NS *= 2;
// fence::system(atomic::seq_cst);
}
#ifdef HOSTRPC_PROFILING
IssueAndWaitEnd = omp_get_wtime();
#endif
DP("finish waiting for request (id=%d).\n", Id);
int64_t Ret = D->ReturnValue;
assert((!D->NumArgs || D->ArgMap) && "arg map should not be nullptr");
#ifdef HOSTRPC_PROFILING
CopyBackStart = omp_get_wtime();
#endif
if (D->ArgMap) {
DP("copy memory back for request (id=%d).\n", Id);
copybackIfNeeded(D);
DP("finish copy memory back for request (id=%d).\n", Id);
}
#ifdef HOSTRPC_PROFILING
CopyBackEnd = omp_get_wtime();
#endif
HostRPCMemReset();
// We can unlock now since everything transient has been copied back.
// TODO: If we have a queue, we don't need this step.
HostRPCLock.unlock();
DP("request (id=%d) is done with return code=%lx.\n", Id, Ret);
#ifdef HOSTRPC_PROFILING
printf("[host-rpc-profiling-device] id=%d, init=%lf, add_arg=%lf, wait=%lf, "
"copy=%lf.\n",
HostRPCId, GetDescEnd - GetDescStart, AddArgEnd - AddArgStart,
IssueAndWaitEnd - IssueAndWaitStart, CopyBackEnd - CopyBackStart);
#endif
return Ret;
}
__attribute__((noinline, used)) void
__kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
int32_t if_expr, int32_t num_threads,
void **args, int64_t nargs) {
constexpr const int64_t NumArgs = 6;
HostRPCArgInfo ArgInfoArray[NumArgs];
void *ArgInfo[NumArgs];
for (unsigned I = 0; I < NumArgs; ++I) {
ArgInfoArray[I].BasePtr = 0;
ArgInfoArray[I].Type = ArgType::OMP_HOST_RPC_ARG_SCALAR;
ArgInfoArray[I].Size = 0;
ArgInfoArray[I].Next = nullptr;
ArgInfo[I] = &ArgInfoArray[I];
}
auto *D = (Descriptor *)__kmpc_host_rpc_get_desc(0, NumArgs, (void *)ArgInfo);
// Set up arg info struct.
ArgInfoArray[0].BasePtr = const_cast<char *>(name);
ArgInfoArray[0].Type = ArgType::OMP_HOST_RPC_ARG_COPY_TO;
ArgInfoArray[0].Size = strlen(name) + 1;
ArgInfoArray[1].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(gtid));
ArgInfoArray[2].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(if_expr));
ArgInfoArray[3].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(num_threads));
ArgInfoArray[5].BasePtr = reinterpret_cast<void *>(nargs);
// We need to treat args differently from the scalar arguments because the
// args buffer itself might live on the stack, so we stage a copy first.
ArgInfoArray[4].Size = sizeof(void *) * nargs;
void *Args = nullptr;
if (nargs) {
Args = HostRPCMemAlloc(ArgInfoArray[4].Size);
__builtin_memcpy(Args, args, ArgInfoArray[4].Size);
}
ArgInfoArray[4].BasePtr = Args;
D->Id = CALLID___kmpc_launch_parallel_51_kernel;
__kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(name), 0);
__kmpc_host_rpc_add_arg(D, gtid, 1);
__kmpc_host_rpc_add_arg(D, if_expr, 2);
__kmpc_host_rpc_add_arg(D, num_threads, 3);
__kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(Args), 4);
__kmpc_host_rpc_add_arg(D, nargs, 5);
(void)__kmpc_host_rpc_send_and_wait(D);
}
}
#pragma omp end declare target

View File

@@ -13,6 +13,7 @@
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"

View File

@@ -7,6 +7,12 @@
//===----------------------------------------------------------------------===//
#include "LibC.h"
#include "Debug.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Utils.h"
using namespace ompx;
#pragma omp begin declare target device_type(nohost)
@@ -32,8 +38,324 @@ int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
return -1;
}
} // namespace impl
#pragma omp end declare variant
// Dummy variable for stdio.
int StdInDummyVar;
int StdOutDummyVar;
int StdErrDummyVar;
struct FILE;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdin =
(FILE *)&StdInDummyVar;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdout =
(FILE *)&StdOutDummyVar;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stderr =
(FILE *)&StdErrDummyVar;
typedef int (*__compar_fn_t)(const void *, const void *);
typedef int (*__compar_d_fn_t)(const void *, const void *, void *);
namespace {
const int32_t ToLowerMapTable[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 'a', 'b',
'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 91, 92, 93, 94, 95, 96,
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 123, 124, 125, 126,
127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const int32_t *ToLowerPTable = ToLowerMapTable + 128;
#define X(x) (((x) / 256 | (x)*256) % 65536)
static const unsigned short BLocTable[] = {
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x320), X(0x220), X(0x220),
X(0x220), X(0x220), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x160),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8),
X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x8d5), X(0x8d5), X(0x8d5),
X(0x8d5), X(0x8d5), X(0x8d5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x200), 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
};
#undef X
static const unsigned short *BLocTablePTable = BLocTable + 128;
int ErrNo __attribute__((used, retain));
static unsigned long int RandomNext = 1;
#define SWAP(a, b, size) \
do { \
size_t __size = (size); \
char *__a = (a), *__b = (b); \
do { \
char __tmp = *__a; \
*__a++ = *__b; \
*__b++ = __tmp; \
} while (--__size > 0); \
} while (0)
/* Discontinue quicksort algorithm when partition gets below this size.
This particular magic number was chosen to work best on a Sun 4/260. */
#define MAX_THRESH 4
/* Stack node declarations used to store unfulfilled partition obligations. */
typedef struct {
char *lo;
char *hi;
} stack_node;
/* The next 4 #defines implement a very fast in-line stack abstraction. */
/* The stack needs log (total_elements) entries (we could even subtract
log(MAX_THRESH)). Since total_elements has type size_t, we get as
upper bound for log (total_elements):
bits per byte (CHAR_BIT) * sizeof(size_t). */
#define CHAR_BIT 8
#define STACK_SIZE (CHAR_BIT * sizeof(size_t))
#define PUSH(low, high) ((void)((top->lo = (low)), (top->hi = (high)), ++top))
#define POP(low, high) ((void)(--top, (low = top->lo), (high = top->hi)))
#define STACK_NOT_EMPTY (stack < top)
/* Order size using quicksort. This implementation incorporates
four optimizations discussed in Sedgewick:
1. Non-recursive, using an explicit stack of pointers that store the
next array partition to sort. To save time, this maximum amount
of space required to store an array of SIZE_MAX is allocated on the
stack. Assuming a 32-bit (64 bit) integer for size_t, this needs
only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes).
Pretty cheap, actually.
2. Chose the pivot element using a median-of-three decision tree.
This reduces the probability of selecting a bad pivot value and
eliminates certain extraneous comparisons.
3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
insertion sort to order the MAX_THRESH items within each partition.
This is a big win, since insertion sort is faster for small, mostly
sorted array segments.
4. The larger of the two sub-partitions is always pushed onto the
stack first, with the algorithm then concentrating on the
smaller partition. This *guarantees* no more than log (total_elems)
stack size is needed (actually O(1) in this case)! */
void _quicksort(void *const pbase, size_t total_elems, size_t size,
__compar_d_fn_t cmp, void *arg) {
char *base_ptr = (char *)pbase;
const size_t max_thresh = MAX_THRESH * size;
if (total_elems == 0)
/* Avoid lossage with unsigned arithmetic below. */
return;
if (total_elems > MAX_THRESH) {
char *lo = base_ptr;
char *hi = &lo[size * (total_elems - 1)];
stack_node stack[STACK_SIZE];
stack_node *top = stack;
PUSH(nullptr, nullptr);
while (STACK_NOT_EMPTY) {
char *left_ptr;
char *right_ptr;
/* Select median value from among LO, MID, and HI. Rearrange
LO and HI so the three values are sorted. This lowers the
probability of picking a pathological pivot value and
skips a comparison for both the LEFT_PTR and RIGHT_PTR in
the while loops. */
char *mid = lo + size * ((hi - lo) / size >> 1);
if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
SWAP(mid, lo, size);
if ((*cmp)((void *)hi, (void *)mid, arg) < 0)
SWAP(mid, hi, size);
else
goto jump_over;
if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
SWAP(mid, lo, size);
jump_over:;
left_ptr = lo + size;
right_ptr = hi - size;
/* Here's the famous ``collapse the walls'' section of quicksort.
Gotta like those tight inner loops! They are the main reason
that this algorithm runs much faster than others. */
do {
while ((*cmp)((void *)left_ptr, (void *)mid, arg) < 0)
left_ptr += size;
while ((*cmp)((void *)mid, (void *)right_ptr, arg) < 0)
right_ptr -= size;
if (left_ptr < right_ptr) {
SWAP(left_ptr, right_ptr, size);
if (mid == left_ptr)
mid = right_ptr;
else if (mid == right_ptr)
mid = left_ptr;
left_ptr += size;
right_ptr -= size;
} else if (left_ptr == right_ptr) {
left_ptr += size;
right_ptr -= size;
break;
}
} while (left_ptr <= right_ptr);
/* Set up pointers for next iteration. First determine whether
left and right partitions are below the threshold size. If so,
ignore one or both. Otherwise, push the larger partition's
bounds on the stack and continue sorting the smaller one. */
if ((size_t)(right_ptr - lo) <= max_thresh) {
if ((size_t)(hi - left_ptr) <= max_thresh)
/* Ignore both small partitions. */
POP(lo, hi);
else
/* Ignore small left partition. */
lo = left_ptr;
} else if ((size_t)(hi - left_ptr) <= max_thresh)
/* Ignore small right partition. */
hi = right_ptr;
else if ((right_ptr - lo) > (hi - left_ptr)) {
/* Push larger left partition indices. */
PUSH(lo, right_ptr);
lo = left_ptr;
} else {
/* Push larger right partition indices. */
PUSH(left_ptr, hi);
hi = right_ptr;
}
}
}
/* Once the BASE_PTR array is partially sorted by quicksort the rest
is completely sorted using insertion sort, since this is efficient
for partitions below MAX_THRESH size. BASE_PTR points to the beginning
of the array to sort, and END_PTR points at the very last element in
the array (*not* one beyond it!). */
#define min(x, y) ((x) < (y) ? (x) : (y))
{
char *const end_ptr = &base_ptr[size * (total_elems - 1)];
char *tmp_ptr = base_ptr;
char *thresh = min(end_ptr, base_ptr + max_thresh);
char *run_ptr;
/* Find smallest element in first threshold and place it at the
array's beginning. This is the smallest array element,
and the operation speeds up insertion sort's inner loop. */
for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
if ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
tmp_ptr = run_ptr;
if (tmp_ptr != base_ptr)
SWAP(tmp_ptr, base_ptr, size);
/* Insertion sort, running from left-hand-side up to right-hand-side. */
run_ptr = base_ptr + size;
while ((run_ptr += size) <= end_ptr) {
tmp_ptr = run_ptr - size;
while ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
tmp_ptr -= size;
tmp_ptr += size;
if (tmp_ptr != run_ptr) {
char *trav;
trav = run_ptr + size;
while (--trav >= run_ptr) {
char c = *trav;
char *hi, *lo;
for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
*hi = *lo;
*hi = c;
}
}
}
}
}
} // namespace
extern "C" {
int memcmp(const void *lhs, const void *rhs, size_t count) {
@@ -51,6 +373,415 @@ int memcmp(const void *lhs, const void *rhs, size_t count) {
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
return impl::omp_vprintf(Format, Arguments, Size);
}
// -----------------------------------------------------------------------------
#ifndef ULONG_MAX
#define ULONG_MAX ((unsigned long)(~0L)) /* 0xFFFFFFFF */
#endif
#ifndef LONG_MAX
#define LONG_MAX ((long)(ULONG_MAX >> 1)) /* 0x7FFFFFFF */
#endif
#ifndef LONG_MIN
#define LONG_MIN ((long)(~LONG_MAX)) /* 0x80000000 */
#endif
#ifndef ISDIGIT
#define ISDIGIT(C) ((C) >= '0' && (C) <= '9')
#endif
#ifndef ISSPACE
#define ISSPACE(C) ((C) == ' ')
#endif
#ifndef ISALPHA
#define ISALPHA(C) (((C) >= 'a' && (C) <= 'z') || ((C) >= 'A' && (C) <= 'Z'))
#endif
#ifndef ISUPPER
#define ISUPPER(C) ((C) >= 'A' && (C) <= 'Z')
#endif
long strtol(const char *nptr, char **endptr, int base) {
const char *s = nptr;
unsigned long acc;
int c;
unsigned long cutoff;
int neg = 0, any, cutlim;
/*
* Skip white space and pick up leading +/- sign if any.
* If base is 0, allow 0x for hex and 0 for octal, else
* assume decimal; if base is already 16, allow 0x.
*/
do {
c = *s++;
} while (ISSPACE(c));
if (c == '-') {
neg = 1;
c = *s++;
} else if (c == '+')
c = *s++;
if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X')) {
c = s[1];
s += 2;
base = 16;
}
if (base == 0)
base = c == '0' ? 8 : 10;
/*
* Compute the cutoff value between legal numbers and illegal
* numbers. That is the largest legal value, divided by the
* base. An input number that is greater than this value, if
* followed by a legal input character, is too big. One that
* is equal to this value may be valid or not; the limit
* between valid and invalid numbers is then based on the last
* digit. For instance, if the range for longs is
* [-2147483648..2147483647] and the input base is 10,
* cutoff will be set to 214748364 and cutlim to either
* 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
* a value > 214748364, or equal but the next digit is > 7 (or 8),
* the number is too big, and we will return a range error.
*
* Set any if any `digits' consumed; make it negative to indicate
* overflow.
*/
cutoff = neg ? -(unsigned long)LONG_MIN : LONG_MAX;
cutlim = cutoff % (unsigned long)base;
cutoff /= (unsigned long)base;
for (acc = 0, any = 0;; c = *s++) {
if (ISDIGIT(c))
c -= '0';
else if (ISALPHA(c))
c -= ISUPPER(c) ? 'A' - 10 : 'a' - 10;
else
break;
if (c >= base)
break;
if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
any = -1;
else {
any = 1;
acc *= base;
acc += c;
}
}
if (any < 0) {
acc = neg ? LONG_MIN : LONG_MAX;
errno = ERANGE;
} else if (neg)
acc = -acc;
if (endptr != 0)
*endptr = const_cast<char *>(any ? s - 1 : nptr);
return (acc);
}
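// Examples: strtol("42", nullptr, 10) == 42; strtol("0x1A", nullptr, 0) ==
// 26 (the 0x prefix selects base 16); strtol("077", nullptr, 0) == 63; on
// overflow the result saturates to LONG_MAX/LONG_MIN and errno is set to
// ERANGE.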
int strcmp(const char *lhs, const char *rhs) {
// Advance while both strings have equal, non-NUL characters, then return
// the difference at the first mismatch (0 when both end together).
while (*lhs != '\0' && *lhs == *rhs) {
++lhs;
++rhs;
}
return (unsigned char)*lhs - (unsigned char)*rhs;
}
void *calloc(size_t num, size_t size) {
// Note: no overflow check on num * size; callers are trusted here.
size_t bytes = num * size;
char *p = (char *)malloc(bytes);
if (!p)
return p;
char *q = p;
// Zero word-sized chunks first, then finish the tail byte by byte so we
// never write past the allocation when bytes is not a multiple of
// sizeof(int).
while (bytes - (size_t)(q - p) >= sizeof(int)) {
*(int *)q = 0;
q += sizeof(int);
}
while ((size_t)(q - p) < bytes) {
*q = 0;
q++;
}
return p;
}
void exit(int exit_code) { asm volatile("exit;"); }
size_t strlen(const char *str) {
// Count every character up to (but not including) the terminating NUL;
// leading spaces count too.
size_t r = 0;
while (*str != '\0') {
++r;
++str;
}
return r;
}
char *strcpy(char *dest, const char *src) {
char *pd = dest;
const char *ps = src;
while (*ps != '\0')
*(pd++) = *(ps++);
*pd = '\0';
return dest;
}
int *__errno_location() { return &ErrNo; }
char *strcat(char *dest, const char *src) {
char *pd = dest;
const char *ps = src;
while (*pd != '\0')
++pd;
while (*ps != '\0')
*(pd++) = *(ps++);
*pd = '\0';
return dest;
}
void perror(const char *s) { printf("%s", s); }
int strncmp(const char *lhs, const char *rhs, size_t count) {
size_t c = 0;
while (c < count && *lhs != '\0' && *rhs != '\0') {
if (*lhs != *rhs)
return (unsigned char)*lhs - (unsigned char)*rhs;
++lhs;
++rhs;
++c;
}
if (c == count)
return 0;
// One (or both) strings ended early; the difference of the terminators
// orders the shorter string first.
return (unsigned char)*lhs - (unsigned char)*rhs;
}
char *strncpy(char *dest, const char *src, size_t count) {
char *pd = dest;
const char *ps = src;
size_t c = 0;
while (*ps != '\0' && c < count) {
*(pd++) = *(ps++);
++c;
}
// Pad with NULs up to count, as the standard requires.
while (c < count) {
*(pd++) = '\0';
++c;
}
return dest;
}
char *strchr(const char *s, int c) {
do {
if (*s == c)
return const_cast<char *>(s);
} while (*s++);
return nullptr;
}
char *strtok(char *str, const char *delim) {
static char *s = nullptr;
char *tok;
if (str == nullptr) {
if (s == nullptr)
return nullptr;
} else
s = str;
for (size_t i = 0; (*s != '\0'); s++) {
for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
;
if (delim[i] == '\0')
break;
}
if (*s == '\0')
return s = nullptr;
tok = s++;
for (size_t i = 0; (*s != '\0'); s++) {
for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
;
if (delim[i] != '\0')
break;
}
if (*s != '\0') {
*s = '\0';
s++;
}
return tok;
}
void srand(unsigned seed) { RandomNext = seed; }
int rand() {
RandomNext = RandomNext * 1103515245 + 12345;
return static_cast<unsigned int>((RandomNext / 65536)) % 2147483647;
}
int abs(int n) { return n > 0 ? n : -n; }
void *memcpy(void *dest, const void *src, size_t count) {
__builtin_memcpy(dest, src, count);
return dest;
}
unsigned long strtoul(const char *str, char **str_end, int base) {
// Note: only base 10 is handled here; the base parameter is ignored and
// only leading spaces are skipped.
unsigned long res = 0;
while (*str == ' ')
++str;
while (*str >= '0' && *str <= '9') {
res = res * 10 + *str - '0';
++str;
}
if (str_end)
*str_end = const_cast<char *>(str);
return res;
}
// Conversion helper shared by strtod and atof. atof is defined in terms of
// strtod, so the conversion must not call back into atof (that would recurse
// forever); this helper handles sign, integer part, fraction, and decimal
// exponent. INF/NAN spellings are recognized by strtod only for end-pointer
// purposes and convert to 0.0 here.
static double __strtod_convert(const char *p) {
while (ISSPACE(*p))
++p;
int neg = 0;
if (*p == '+' || *p == '-')
neg = (*p++ == '-');
double value = 0.0;
while (ISDIGIT(*p))
value = value * 10.0 + (*p++ - '0');
if (*p == '.') {
++p;
double scale = 0.1;
while (ISDIGIT(*p)) {
value += (*p++ - '0') * scale;
scale *= 0.1;
}
}
if (*p == 'e' || *p == 'E') {
const char *q = p + 1;
int exp_neg = 0;
if (*q == '+' || *q == '-')
exp_neg = (*q++ == '-');
if (ISDIGIT(*q)) {
int exp = 0;
while (ISDIGIT(*q))
exp = exp * 10 + (*q++ - '0');
double factor = exp_neg ? 0.1 : 10.0;
while (exp--)
value *= factor;
}
}
return neg ? -value : value;
}
double atof(const char *s) { return strtod(s, nullptr); }
int atoi(const char *str) { return (int)strtol(str, nullptr, 10); }
double strtod(const char *str, char **ptr) {
char *p;
if (ptr == (char **)0)
return __strtod_convert(str);
p = const_cast<char *>(str);
while (ISSPACE(*p))
++p;
if (*p == '+' || *p == '-')
++p;
/* INF or INFINITY. */
if ((p[0] == 'i' || p[0] == 'I') && (p[1] == 'n' || p[1] == 'N') &&
(p[2] == 'f' || p[2] == 'F')) {
if ((p[3] == 'i' || p[3] == 'I') && (p[4] == 'n' || p[4] == 'N') &&
(p[5] == 'i' || p[5] == 'I') && (p[6] == 't' || p[6] == 'T') &&
(p[7] == 'y' || p[7] == 'Y')) {
*ptr = p + 8;
return __strtod_convert(str);
} else {
*ptr = p + 3;
return __strtod_convert(str);
}
}
/* NAN or NAN(foo). */
if ((p[0] == 'n' || p[0] == 'N') && (p[1] == 'a' || p[1] == 'A') &&
(p[2] == 'n' || p[2] == 'N')) {
p += 3;
if (*p == '(') {
++p;
while (*p != '\0' && *p != ')')
++p;
if (*p == ')')
++p;
}
*ptr = p;
return __strtod_convert(str);
}
/* digits, with 0 or 1 periods in it. */
if (ISDIGIT(*p) || *p == '.') {
int got_dot = 0;
while (ISDIGIT(*p) || (!got_dot && *p == '.')) {
if (*p == '.')
got_dot = 1;
++p;
}
/* Exponent. */
if (*p == 'e' || *p == 'E') {
int i;
i = 1;
if (p[i] == '+' || p[i] == '-')
++i;
if (ISDIGIT(p[i])) {
while (ISDIGIT(p[i]))
++i;
*ptr = p + i;
return __strtod_convert(str);
}
}
*ptr = p;
return __strtod_convert(str);
}
/* Didn't find any digits. Doesn't look like a number. */
*ptr = const_cast<char *>(str);
return 0.0;
}
void *memset(void *dest, int ch, size_t count) {
auto *P = reinterpret_cast<unsigned char *>(dest);
for (size_t I = 0; I < count; ++I, ++P)
*P = (unsigned char)ch;
return dest;
}
int tolower(int ch) {
if (ch >= 'A' && ch <= 'Z')
return ch + 32;
return ch;
}
char *strstr(const char *s1, const char *s2) {
const size_t len = strlen(s2);
while (*s1) {
// Use strncmp rather than memcmp so we never read past the end of s1.
if (!strncmp(s1, s2, len))
return const_cast<char *>(s1);
++s1;
}
return nullptr;
}
char *__xpg_basename(const char *path) { return const_cast<char *>(path); }
const unsigned short **__ctype_b_loc(void) { return &BLocTablePTable; }
const int32_t **__ctype_tolower_loc(void) { return &ToLowerPTable; }
void qsort(void *b, size_t n, size_t s, __compar_fn_t cmp) {
return _quicksort(b, n, s, (__compar_d_fn_t)cmp, nullptr);
}
int strcasecmp(const char *s1, const char *s2) {
const unsigned char *p1 = (const unsigned char *)s1;
const unsigned char *p2 = (const unsigned char *)s2;
int result;
if (p1 == p2)
return 0;
while ((result = tolower(*p1) - tolower(*p2++)) == 0)
if (*p1++ == '\0')
break;
return result;
}
}
#pragma omp end declare target

View File

@@ -212,9 +212,14 @@ uint32_t mapping::getThreadIdInBlock() {
uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
uint32_t mapping::getBlockSize(bool IsSPMD) {
uint32_t BlockSize =
mapping::getNumberOfProcessorElements() - (!IsSPMD * impl::getWarpSize());
return BlockSize;
uint32_t BlockSize = mapping::getNumberOfProcessorElements();
if (IsSPMD)
return BlockSize;
uint32_t WarpSize = impl::getWarpSize();
bool IsFullLastWarp = !(BlockSize % WarpSize);
if (OMP_LIKELY(IsFullLastWarp))
return BlockSize - WarpSize;
return BlockSize - BlockSize % WarpSize;
}
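// Example: with 1024 processor elements and a warp size of 32, generic mode
// reserves the last (full) warp for the main thread, giving 992. With 1000
// elements the last warp is partial (1000 % 32 == 8), so only those 8
// threads are reserved and the result is again 992.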
uint32_t mapping::getBlockSize() {
return mapping::getBlockSize(mapping::isSPMDMode());

View File

@@ -47,6 +47,9 @@ using namespace ompx;
namespace {
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
if (NumThreadsClause != -1)
return NumThreadsClause;
uint32_t NThreadsICV =
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
uint32_t NumThreads = mapping::getBlockSize();

View File

@@ -13,6 +13,7 @@
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
@@ -36,36 +37,6 @@ extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
namespace {
/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{
extern "C" {
__attribute__((leaf)) void *malloc(uint64_t Size);
__attribute__((leaf)) void free(void *Ptr);
}
///}
/// AMDGCN implementations of the shuffle sync idiom.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
extern "C" {
void *malloc(uint64_t Size) {
// TODO: Use some preallocated space for dynamic malloc.
return nullptr;
}
void free(void *Ptr) {}
}
#pragma omp end declare variant
///}
/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
@@ -261,7 +232,9 @@ void state::enterDataEnvironment(IdentTy *Ident) {
static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
uint32_t Bytes = sizeof(ThreadStates[0]) * mapping::getBlockSize();
uint32_t Bytes =
sizeof(ThreadStates[0]) *
(mapping::getBlockSize() + (mapping::isSPMDMode() ? 0 : 1));
void *ThreadStatesPtr =
memory::allocGlobal(Bytes, "Thread state array allocation");
if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
@@ -349,6 +322,15 @@ int omp_get_thread_num(void) {
return omp_get_ancestor_thread_num(omp_get_level());
}
int omp_get_bulk_thread_num() {
ASSERT(mapping::isSPMDMode());
int BId = mapping::getBlockId();
int BSize = mapping::getBlockSize(/* IsSPMD */ true);
int TId = mapping::getThreadIdInBlock();
int Id = BId * BSize + TId;
return returnValIfLevelIsActive(omp_get_level(), Id, 0);
}
int omp_get_team_size(int Level) {
return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
}
@@ -357,6 +339,11 @@ int omp_get_num_threads(void) {
return omp_get_level() > 1 ? 1 : state::ParallelTeamSize;
}
int omp_get_bulk_num_threads(void) {
ASSERT(mapping::isSPMDMode());
return mapping::getKernelSize();
}
int omp_get_thread_limit(void) { return mapping::getBlockSize(); }
int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

View File

@@ -115,6 +115,8 @@ uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
}
///}
int32_t atomicAddSys(int32_t *Address, int32_t Val);
// Forward declarations to be defined for AMDGCN and NVPTX.
uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering);
void namedBarrierInit();
@@ -313,6 +315,10 @@ uint32_t atomicInc(uint32_t *Address, uint32_t Val,
return __nvvm_atom_inc_gen_ui(Address, Val);
}
int32_t atomicAddSys(int32_t *Address, int32_t Val) {
return __nvvm_atom_sys_add_gen_i(Address, Val);
}
void namedBarrierInit() {}
void namedBarrier() {
@@ -484,10 +490,26 @@ uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
return impl::atomicInc(Addr, V, Ordering);
}
int32_t atomic::addSys(int32_t *Addr, int32_t Val) {
return impl::atomicAddSys(Addr, Val);
}
void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
void mutex::TicketLock::lock() {
uint64_t MyTicket = atomic::add(&NextTicket, 1, atomic::seq_cst);
while (atomic::load(&NowServing, atomic::aquire) != MyTicket)
;
fence::kernel(atomic::aquire);
}
void mutex::TicketLock::unlock() {
fence::kernel(atomic::release);
atomic::add(&NowServing, 1, atomic::seq_cst);
}
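// Usage sketch: tickets are served in FIFO order, so waiters cannot starve.
//
//   mutex::TicketLock L;
//   {
//     mutex::LockGuard LG(L); // lock() takes a ticket and spins until served
//     // ... critical section ...
//   }                         // unlock() advances NowServing by one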
extern "C" {
void __kmpc_ordered(IdentTy *Loc, int32_t TId) { FunctionTracingRAII(); }

View File

@@ -0,0 +1,304 @@
//===------- WarpAllocator.cpp - Warp memory allocator --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A warp-level allocator that statically partitions the device heap among
// the lanes of a warp and, in this variant, among team slots within each
// lane. Allocation bumps a per-partition top-of-stack on the fast path and
// falls back to a first-fit scan with block splitting when the partition's
// watermark is exhausted.
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
using namespace ompx;
char *CONSTANT(omptarget_device_heap_buffer)
__attribute__((used, retain, weak, visibility("protected")));
size_t CONSTANT(omptarget_device_heap_size)
__attribute__((used, retain, weak, visibility("protected")));
namespace {
constexpr const size_t Alignment = 16;
constexpr const size_t FirstThreadRatio = 40;
constexpr const size_t SplitThreshold = Alignment * 4;
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator;
class WarpAllocatorEntry {
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> friend struct WarpAllocator;
/// If Size is less than 0, the entry is allocated (in use).
int64_t Size = 0;
/// PrevSize is also supposed to be greater than or equal to 0. When it is 0,
/// it is the first entry of the buffer.
int64_t PrevSize = 0;
public:
bool isFirst() const { return !PrevSize; }
size_t getSize() const { return abs(Size); }
void setSize(size_t V) { Size = V; }
void setPrevSize(WarpAllocatorEntry *Prev) {
PrevSize = Prev ? Prev->getSize() : 0;
}
size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }
// Note: isUsed cannot simply be !isUnused (or vice versa) because when Size
// is 0 the entry is uninitialized and is neither used nor unused.
bool isUsed() const { return Size < 0; }
bool isUnused() const { return Size > 0; }
void setUsed() {
assert(isUnused() && "the entry is in use");
Size *= -1;
}
void setUnused() {
assert(isUsed() && "the entry is not in use");
Size *= -1;
}
char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }
WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }
static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }
static WarpAllocatorEntry *fromPtr(void *Ptr) {
return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
}
};
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
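// Encoding sketch: each 16-byte header immediately precedes its user data,
// the sign of Size marks the in-use state, and PrevSize == 0 marks the first
// entry, so the partition forms an implicit doubly linked list:
//
//   [Size=-96, Prev=0][80 user bytes][Size=48, Prev=96][32 user bytes]...
//    ^ first, in use                  ^ free; getPrev() steps back 96 bytes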
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
void init() {
if (mapping::isSPMDMode() &&
(mapping::getThreadIdInBlock() || mapping::getBlockId()))
return;
size_t HeapSize = omptarget_device_heap_size;
FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
size_t OtherThreadHeapSize =
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
size_t TeamHeapSize = FirstThreadHeapSize / TEAM_SIZE;
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
FirstTeamSize = TeamHeapSize;
char *LastLimit = omptarget_device_heap_buffer;
for (int I = 0; I < WARP_SIZE; ++I) {
for (int J = 0; J < TEAM_SIZE; ++J) {
Entries[I][J] = nullptr;
Limits[I][J] = LastLimit + TeamHeapSize * (J + 1);
}
LastLimit += I ? OtherThreadHeapSize : FirstThreadHeapSize;
Limits[I][TEAM_SIZE - 1] = LastLimit;
TeamHeapSize = OtherThreadHeapSize / TEAM_SIZE;
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
}
}
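// Partitioning sketch (hypothetical 64 MiB heap, WARP_SIZE == 32,
// FirstThreadRatio == 40): lane 0 receives ~25.6 MiB and each of the other
// 31 lanes receives ~1.24 MiB; every lane's region is further divided into
// TEAM_SIZE team slots, with all boundaries rounded down to the 16-byte
// Alignment.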
void *allocate(size_t Size) {
int32_t TeamSlot = getTeamSlot();
int32_t TIdInWarp = mapping::getThreadIdInWarp();
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
// Error out early if the requested size is larger than the entire block.
if (Size > getBlockSize(TIdInWarp, TeamSlot))
return nullptr;
WarpAllocatorEntry *E = nullptr;
{
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
auto *LastEntry = Entries[TIdInWarp][TeamSlot];
auto *NewWatermark = (LastEntry ? LastEntry->getEndPtr()
: getBlockBegin(TIdInWarp, TeamSlot)) +
Size;
if (NewWatermark >= Limits[TIdInWarp][TeamSlot]) {
E = findMemorySlow(Size, TIdInWarp, TeamSlot);
} else {
E = LastEntry ? LastEntry->getNext()
: WarpAllocatorEntry::fromPtr(
getBlockBegin(TIdInWarp, TeamSlot));
E->setSize(Size);
E->setPrevSize(LastEntry);
Entries[TIdInWarp][TeamSlot] = E;
}
if (!E)
return nullptr;
E->setUsed();
}
return E->getUserPtr();
}
void deallocate(void *Ptr) {
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
auto TeamSlot = getTeamSlot();
auto TIdInWarp = mapping::getThreadIdInWarp();
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
E->setUnused();
// Is last entry?
if (E == Entries[TIdInWarp][TeamSlot]) {
do {
E = E->getPrev();
} while (!E->isFirst() && !E->isUsed());
Entries[TIdInWarp][TeamSlot] = E;
}
}
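// Deallocation sketch: freeing only flips the sign bit; the top-of-stack
// (Entries[...]) retreats lazily, and only when the top-most entry itself is
// freed, stopping at the last in-use entry or at the first entry:
//
//   A = allocate(..); B = allocate(..);
//   deallocate(A);    // A marked unused, top stays at B
//   deallocate(B);    // top walks back over B toward the block start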
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
if (!utils::isInRange(P, omptarget_device_heap_buffer,
omptarget_device_heap_size))
return {};
auto TeamSlot = getTeamSlot();
auto TIdInWarp = mapping::getThreadIdInWarp();
for (int I = TIdInWarp; I < TIdInWarp + WARP_SIZE; ++I) {
int TId = I % WARP_SIZE;
for (int J = TeamSlot; J < TeamSlot + TEAM_SIZE; ++J) {
int SId = J % TEAM_SIZE;
if (P < getBlockBegin(TId, SId) || P >= getBlockEnd(TId, SId))
continue;
// Index with the wrapped ids so we stay inside the fixed-size arrays.
mutex::LockGuard LG(Locks[TId][SId]);
WarpAllocatorEntry *E = Entries[TId][SId];
if (!E)
return {};
if (E->getEndPtr() <= P)
return {};
do {
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
if (!E->isUsed())
return {};
return {E->getUserPtr(), E->getUserSize()};
}
E = E->getPrev();
} while (!E->isFirst());
}
}
return {};
}
private:
char *getBlockBegin(int32_t TIdInWarp, int32_t TeamSlot) const {
if (TeamSlot)
return Limits[TIdInWarp][TeamSlot - 1];
if (TIdInWarp)
return Limits[TIdInWarp - 1][TEAM_SIZE - 1];
return omptarget_device_heap_buffer;
}
char *getBlockEnd(int32_t TIdInWarp, int32_t TeamSlot) const {
return Limits[TIdInWarp][TeamSlot];
}
size_t getBlockSize(int32_t TIdInWarp, int32_t TeamSlot) const {
return getBlockEnd(TIdInWarp, TeamSlot) -
getBlockBegin(TIdInWarp, TeamSlot);
}
static int32_t getTeamSlot() { return mapping::getBlockId() % TEAM_SIZE; }
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp,
int32_t TeamSlot) {
char *Ptr = getBlockBegin(TIdInWarp, TeamSlot);
char *Limit = getBlockEnd(TIdInWarp, TeamSlot);
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
do {
if (!E->isUsed() && E->getSize() >= Size)
break;
E = E->getNext();
if (reinterpret_cast<char *>(E) + Size > Limit)
return nullptr;
} while (1);
size_t OldSize = E->getSize();
if (OldSize - Size >= SplitThreshold) {
auto *OldNext = E->getNext();
E->setSize(Size);
auto *LeftOverE = E->getNext();
LeftOverE->setPrevSize(E);
LeftOverE->setSize(OldSize - Size);
OldNext->setPrevSize(LeftOverE);
}
return E;
}
WarpAllocatorEntry *Entries[WARP_SIZE][TEAM_SIZE];
char *Limits[WARP_SIZE][TEAM_SIZE];
mutex::TicketLock Locks[WARP_SIZE][TEAM_SIZE];
size_t FirstThreadHeapSize;
size_t FirstTeamSize;
};
WarpAllocator<32, 16> Allocator;
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
return Allocator.getMemoryAllocationInfo(P);
}
} // namespace memory
} // namespace ompx
extern "C" {
void *malloc(size_t Size) {
if (!Size)
return nullptr;
void *P = Allocator.allocate(Size);
assert(P && "allocator out of memory");
assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
"misaligned address");
return P;
}
void free(void *P) {
if (!P)
return;
Allocator.deallocate(P);
}
void *realloc(void *ptr, size_t new_size) {
void *NewPtr = malloc(new_size);
if (!NewPtr)
return nullptr;
// realloc(nullptr, n) must behave like malloc(n).
if (!ptr)
return NewPtr;
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
__builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
free(ptr);
return NewPtr;
}
void __kmpc_target_init_allocator() { Allocator.init(); }
}
#pragma omp end declare target

View File

@@ -0,0 +1,275 @@
//===------- WarpAllocator.cpp - Warp memory allocator --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A variant of the warp allocator that partitions the device heap among the
// lanes of a warp only (no team dimension), with bump allocation on the fast
// path and a first-fit scan with block splitting on the slow path.
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
using namespace ompx;
char *CONSTANT(omptarget_device_heap_buffer)
__attribute__((used, retain, weak, visibility("protected")));
size_t CONSTANT(omptarget_device_heap_size)
__attribute__((used, retain, weak, visibility("protected")));
namespace {
constexpr const size_t Alignment = 16;
constexpr const size_t FirstThreadRatio = 40;
constexpr const size_t SplitThreshold = Alignment * 16;
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
template <int32_t WARP_SIZE> struct WarpAllocator;
class WarpAllocatorEntry {
template <int32_t WARP_SIZE> friend struct WarpAllocator;
/// If Size is less than 0, the entry is allocated (in use).
int64_t Size = 0;
/// PrevSize is also supposed to be greater than or equal to 0. When it is 0,
/// it is the first entry of the buffer.
int64_t PrevSize = 0;
public:
bool isFirst() const { return !PrevSize; }
size_t getSize() const { return abs(Size); }
void setSize(size_t V) { Size = V; }
void setPrevSize(WarpAllocatorEntry *Prev) {
PrevSize = Prev ? Prev->getSize() : 0;
}
size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }
// Note: isUsed cannot simply be !isUnused (or vice versa) because when Size
// is 0 the entry is uninitialized and is neither used nor unused.
bool isUsed() const { return Size < 0; }
bool isUnused() const { return Size > 0; }
void setUsed() {
assert(isUnused() && "the entry is in use");
Size *= -1;
}
void setUnused() {
assert(isUsed() && "the entry is not in use");
Size *= -1;
}
char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }
WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }
static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }
static WarpAllocatorEntry *fromPtr(void *Ptr) {
return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
}
};
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
template <int32_t WARP_SIZE> struct WarpAllocator {
void init() {
if (mapping::isSPMDMode() &&
(mapping::getThreadIdInBlock() || mapping::getBlockId()))
return;
size_t HeapSize = omptarget_device_heap_size;
size_t FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
size_t OtherThreadHeapSize =
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
for (int I = 0; I < WARP_SIZE; ++I) {
Entries[I] = nullptr;
size_t PrivateOffset = OtherThreadHeapSize * I + FirstThreadHeapSize;
Limits[I] = omptarget_device_heap_buffer + PrivateOffset;
}
}
void *allocate(size_t Size) {
int32_t TIdInWarp = mapping::getThreadIdInWarp();
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
// Error out early if the requested size is larger than the entire block.
if (Size > getBlockSize(TIdInWarp))
return nullptr;
WarpAllocatorEntry *E = nullptr;
{
mutex::LockGuard LG(Locks[TIdInWarp]);
auto *LastEntry = Entries[TIdInWarp];
auto *NewWatermark =
(LastEntry ? LastEntry->getEndPtr() : getBlockBegin(TIdInWarp)) +
Size;
if (NewWatermark >= Limits[TIdInWarp]) {
E = findMemorySlow(Size, TIdInWarp);
} else {
E = LastEntry ? LastEntry->getNext()
: WarpAllocatorEntry::fromPtr(getBlockBegin(TIdInWarp));
E->setSize(Size);
E->setPrevSize(LastEntry);
Entries[TIdInWarp] = E;
}
}
if (!E)
return nullptr;
assert(E->isUnused() && "entry is not set to use properly");
E->setUsed();
return E->getUserPtr();
}
void deallocate(void *Ptr) {
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
auto TIdInWarp = mapping::getThreadIdInWarp();
mutex::LockGuard LG(Locks[TIdInWarp]);
E->setUnused();
// Is last entry?
if (E == Entries[TIdInWarp]) {
do {
E = E->getPrev();
} while (!E->isFirst() && !E->isUsed());
Entries[TIdInWarp] = E;
}
}
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
if (!utils::isInRange(P, omptarget_device_heap_buffer,
omptarget_device_heap_size))
return {};
auto TIdInWarp = mapping::getThreadIdInWarp();
// Index with the wrapped lane id so we stay inside the WARP_SIZE arrays.
for (int I = TIdInWarp; I < TIdInWarp + WARP_SIZE; ++I) {
int TId = I % WARP_SIZE;
if (P < getBlockBegin(TId) || P >= getBlockEnd(TId))
continue;
mutex::LockGuard LG(Locks[TId]);
WarpAllocatorEntry *E = Entries[TId];
if (!E)
return {};
if (E->getEndPtr() <= P)
return {};
do {
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
if (!E->isUsed())
return {};
return {E->getUserPtr(), E->getUserSize()};
}
E = E->getPrev();
} while (!E->isFirst());
}
return {};
}
private:
char *getBlockBegin(int32_t TIdInWarp) const {
return TIdInWarp ? Limits[TIdInWarp - 1] : omptarget_device_heap_buffer;
}
char *getBlockEnd(int32_t TIdInWarp) const { return Limits[TIdInWarp]; }
size_t getBlockSize(int32_t TIdInWarp) const {
return getBlockEnd(TIdInWarp) - getBlockBegin(TIdInWarp);
}
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp) {
char *Ptr = getBlockBegin(TIdInWarp);
char *Limit = getBlockEnd(TIdInWarp);
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
do {
if (!E->isUsed() && E->getSize() >= Size)
break;
E = E->getNext();
if (reinterpret_cast<char *>(E) + Size > Limit)
return nullptr;
} while (1);
size_t OldSize = E->getSize();
if (OldSize - Size >= SplitThreshold) {
auto *OldNext = E->getNext();
E->setSize(Size);
auto *LeftOverE = E->getNext();
LeftOverE->setPrevSize(E);
LeftOverE->setSize(OldSize - Size);
OldNext->setPrevSize(LeftOverE);
}
return E;
}
WarpAllocatorEntry *Entries[WARP_SIZE];
char *Limits[WARP_SIZE];
mutex::TicketLock Locks[WARP_SIZE];
};
WarpAllocator<32> Allocator;
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
return Allocator.getMemoryAllocationInfo(P);
}
} // namespace memory
} // namespace ompx
extern "C" {
void *malloc(size_t Size) {
if (!Size)
return nullptr;
void *P = Allocator.allocate(Size);
assert(P && "allocator out of memory");
assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
"misaligned address");
return P;
}
void free(void *P) {
if (!P)
return;
Allocator.deallocate(P);
}
void *realloc(void *ptr, size_t new_size) {
void *NewPtr = malloc(new_size);
if (!NewPtr)
return nullptr;
// realloc(nullptr, n) must behave like malloc(n).
if (!ptr)
return NewPtr;
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
__builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
free(ptr);
return NewPtr;
}
void __kmpc_target_init_allocator() { Allocator.init(); }
}
#pragma omp end declare target

View File

@@ -111,6 +111,38 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
stride = loopSize; // make sure we only do 1 chunk per warp
}
static void ForStaticNoLoopNoChunk(int &last, T &lb, T &ub, ST &stride,
ST &chunk, T entityId,
T numberOfEntities) {
T loopSize = ub - lb + 1;
chunk = loopSize / numberOfEntities;
// We have more entities than iterations.
if (chunk == 0) {
chunk = 1;
lb = lb + entityId * chunk;
T inputUb = ub;
ub = lb + chunk - 1;
last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
return;
}
T leftOver = loopSize - chunk * numberOfEntities;
if (entityId < leftOver) {
chunk++;
lb = lb + entityId * chunk;
} else {
lb = lb + entityId * chunk + leftOver;
}
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
}
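// Worked example: 10 iterations over 4 entities gives chunk == 2 with a
// leftover of 2, so entities 0 and 1 take 3 iterations each ([0,2], [3,5])
// and entities 2 and 3 take 2 each ([6,7], [8,9]); only entity 3 sees
// last == 1.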
////////////////////////////////////////////////////////////////////////////////
// Support for Static Init
@@ -184,6 +216,19 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_parallel_static_noloop_chunk: {
ForStaticChunk(lastiter, lb, ub, stride, chunk,
numberOfActiveOMPThreads * omp_get_team_num() + gtid,
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_parallel_static_noloop_nochunk: {
ForStaticNoLoopNoChunk(lastiter, lb, ub, stride, chunk,
numberOfActiveOMPThreads * omp_get_team_num() +
gtid,
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
default: {
// ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,

View File

@@ -6,6 +6,62 @@ _ZN4ompx*
IsSPMDMode
memcmp
printf
__assert_fail
__ctype_b_loc
__ctype_tolower_loc
__xpg_basename
abs
atof
atoi
calloc
clock
clock_gettime
exit
fclose
feof
fflush
fgets
fopen
fprintf
fputs
fread
free
fscanf
fseek
ftell
fwrite
getc
gettimeofday
gmtime
malloc
memcpy
memset
pclose
perror
popen
qsort
rand
realloc
rewind
sprintf
srand
sscanf
stat
strcasecmp
strcat
strchr
strcmp
strcpy
strftime
strlen
strncmp
strncpy
strstr
strtod
strtok
strtol
strtoul
time
tolower

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=False)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=True)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,46 @@
//===------- Main.c - Direct compilation program start point ------ C -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <string.h>
extern int __user_main(int, char *[]);
extern void __kmpc_target_init_allocator(void);
#ifndef LOADER_THREAD_LIMIT
#ifdef SINGLE_THREAD_EXECUTION
#define LOADER_THREAD_LIMIT 1
#else
#define LOADER_THREAD_LIMIT 1024
#endif
#endif
#ifndef LOADER_NUM_TEAMS
#define LOADER_NUM_TEAMS 1
#endif
int main(int argc, char *argv[]) {
#pragma omp target enter data map(to: argv[:argc])
for (int I = 0; I < argc; ++I) {
#pragma omp target enter data map(to: argv[I][:strlen(argv[I]) + 1])
}
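// Note: map(to: argv[:argc]) above only transfers the array of pointers;
// each string's characters (including the terminating NUL, hence the +1)
// must be mapped separately, which is what the loop above does.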
int Ret = 0;
#pragma omp target enter data map(to: Ret)
#pragma omp target teams num_teams(1) thread_limit(1)
{ __kmpc_target_init_allocator(); }
#pragma omp target teams num_teams(LOADER_NUM_TEAMS) thread_limit(LOADER_THREAD_LIMIT)
{
Ret = __user_main(argc, argv);
}
#pragma omp target exit data map(from: Ret)
return Ret;
}

View File

@@ -0,0 +1,14 @@
//===------- UserWrapper.h - User code wrapper --------------- C++ --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
#define OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
#pragma omp begin declare target device_type(nohost)
#endif
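Because the driver force-includes this header (via the -include flag in the wrapper script below), every user translation unit is conceptually compiled as if wrapped like this (a sketch; where the region is closed is an assumption, presumably at the end of the translation unit):

#pragma omp begin declare target device_type(nohost)
// ... entire user source file: all functions are compiled as device-only,
// and no host fallback versions are emitted ...
#pragma omp end declare target // assumed to be implied at end of the TU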


@@ -0,0 +1,146 @@
import argparse
import os
import pathlib
import subprocess
import sys
import tempfile

cwd = os.path.dirname(os.path.realpath(__file__))

source_file_suffix = [".c", ".cpp", ".cu", ".hip", ".cc", ".cxx"]


def print_version():
    cp = subprocess.run(["clang", "--version"])
    if cp.returncode != 0:
        sys.exit(cp.returncode)


def compile_loader(loader_name, targets, macros, verbose, dry_run):
    cmd = [
        "clang",
        "-c",
        "-fopenmp",
        "-foffload-lto",
        "-fopenmp-offload-mandatory",
        "-o",
        loader_name,
    ]
    cmd += macros
    if targets:
        for arch in targets:
            cmd.append("--offload-arch={}".format(arch))
    else:
        cmd.append("--offload-arch=native")
    cmd.append(os.path.join(cwd, "Main.c"))
    if verbose:
        print(" ".join(cmd), file=sys.stderr)
    if dry_run:
        print(" ".join(cmd), file=sys.stderr)
        return
    cp = subprocess.run(cmd)
    if cp.returncode != 0:
        sys.exit(cp.returncode)


def invoke_clang(is_cpp, args, targets, verbose, dry_run):
    cmd = [
        "clang++" if is_cpp else "clang",
        "-fopenmp",
        "-foffload-lto",
        "-fopenmp-offload-mandatory",
        "-fopenmp-globalize-to-global-space",
        "-include",
        os.path.join(cwd, "UserWrapper.h"),
        "--save-temps",
        "-rdynamic",
        "-mllvm",
        "-enable-host-rpc",
        "-mllvm",
        "-openmp-opt-disable-state-machine-rewrite",
        "-mllvm",
        "-enable-canonicalize-main-function",
        "-mllvm",
        "-canonical-main-function-name=__user_main",
    ]
    if targets:
        for arch in targets:
            cmd.append("--offload-arch={}".format(arch))
    else:
        cmd.append("--offload-arch=native")
    cmd += args
    if verbose:
        print(" ".join(cmd))
    if dry_run:
        print(" ".join(cmd))
        return
    cp = subprocess.run(cmd)
    if cp.returncode != 0:
        sys.exit(cp.returncode)


def run(is_cpp=False):
    parser = argparse.ArgumentParser(
        prog="clang-gpu", description="clang LLVM GPU compiler"
    )
    parser.add_argument(
        "-c",
        action="store_true",
        help="Only run preprocess, compile, and assemble steps",
    )
    parser.add_argument(
        "-v", action="store_true", help="Show commands to run and use verbose output"
    )
    parser.add_argument(
        "--version", action="store_true", help="Print version information"
    )
    parser.add_argument(
        "-###",
        action="store_true",
        help="Print (but do not run) the commands to run for this compilation",
    )
    parser.add_argument(
        "--offload-arch",
        type=str,
        help=(
            "Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g."
            " sm_35). If 'native' is used the compiler will detect locally installed"
            " architectures. For HIP offloading, the device architecture can be"
            " followed by target ID features delimited by a colon (e.g."
            " gfx908:xnack+:sramecc-). May be specified more than once."
        ),
        nargs="*",
    )
    args, fwd_args = parser.parse_known_args()

    if args.version:
        print_version()
        return

    if args.c:
        fwd_args.append("-c")
    if args.v:
        fwd_args.append("-v")

    dry_run = vars(args)["###"]

    loader_name = None
    temp_files = []
    if not args.c:
        tf = tempfile.NamedTemporaryFile()
        loader_name = "{}.o".format(tf.name)
        # Forward any user-provided -D macros so the LOADER_* defaults in
        # Main.c can be overridden.
        macros = []
        for arg in fwd_args:
            if arg.startswith("-D"):
                macros.append(arg)
        compile_loader(loader_name, args.offload_arch, macros, args.v, dry_run)
        fwd_args.append(loader_name)
        temp_files.append(loader_name)

    invoke_clang(is_cpp, fwd_args, args.offload_arch, args.v, dry_run)

    for f in temp_files:
        if os.path.isfile(f):
            os.unlink(f)
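The -mllvm flags above drive the source-level effects worth spelling out. A sketch of the main-function canonicalization (illustrative; the real rewrite happens on LLVM IR, not on source code):

// Before: user code as written.
int main(int argc, char *argv[]) { return argc - 1; }

// After -enable-canonicalize-main-function with
// -canonical-main-function-name=__user_main, the definition is effectively:
extern "C" int __user_main(int argc, char *argv[]) { return argc - 1; }
// The loader object compiled from Main.c above supplies the real main(),
// which maps argv to the device and launches __user_main inside a
// target teams region.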


@@ -0,0 +1,67 @@
//===------- HostRPC.h - Host RPC ---------------------------- C++ --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
#define OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
#ifdef OMPTARGET_DEVICE_RUNTIME
#include "Types.h"
#else
#include <cstdint>
#endif
namespace hostrpc {

/// RPC call identifiers. Note: negative values only; non-negative values are
/// reserved for compiler-generated functions.
enum CallId {
  CALLID___kmpc_launch_parallel_51_kernel = -1,
  CALLID_invalid = -2147483648,
};

/// Execution status.
enum ExecutionStatus {
  EXEC_STAT_CREATED = 0,
  EXEC_STAT_DONE = 1,
};

/// Argument kind.
enum Type {
  ARG_LITERAL = 0,
  ARG_POINTER = 1,
};

struct Argument {
  intptr_t Value;
  int64_t ArgType;
};

struct Descriptor {
  // The following members are used by both host and device.
  int32_t Id;
  struct Argument *Args;
  int64_t NumArgs;
  volatile int64_t Status;
  volatile int64_t ReturnValue;
  // The following members are only used by the device.
  void **ArgInfo;
  void *ArgMap;
};

/// A wrapper of hostrpc::Descriptor that is only used between the plugins and
/// libomptarget. It carries the three stdio global variables.
struct DescriptorWrapper {
  Descriptor D;
  void *StdIn = nullptr;
  void *StdOut = nullptr;
  void *StdErr = nullptr;
};

} // namespace hostrpc
#endif
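The device-side client is not part of this header, but the host loop in the CUDA plugin below implies the handshake. A heavily hedged sketch of a device-side call (everything besides the hostrpc types is a hypothetical stand-in; the real pointers come from the omptarget_hostrpc_descriptor and omptarget_hostrpc_futex globals the plugin installs):

#include "HostRPC.h" // the header above

// Hypothetical device-side client sketch, not code from this patch.
int64_t invokeHostRPC(hostrpc::Descriptor *Desc, int32_t *Futex, int32_t Id,
                      hostrpc::Argument *Args, int64_t NumArgs) {
  Desc->Id = Id;
  Desc->Args = Args;
  Desc->NumArgs = NumArgs;
  Desc->Status = hostrpc::EXEC_STAT_CREATED;
  // Publish the request: the host server polls this word, acts when it
  // becomes non-zero, and decrements it once the request is handled.
  __atomic_fetch_add(Futex, 1, __ATOMIC_RELEASE);
  while (Desc->Status != hostrpc::EXEC_STAT_DONE)
    ; // spin until the host marks the request done
  return Desc->ReturnValue;
}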


@@ -433,6 +433,10 @@ int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
void __tgt_set_info_flag(uint32_t);
int __tgt_print_device_info(int64_t DeviceId);
// Host RPC support
int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNo);
void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal);
#ifdef __cplusplus
}
#endif
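These two entry points are what host-side wrappers use to unpack a request and publish its result. A hedged sketch of what a generated wrapper for fclose might look like (the wrapper name is hypothetical; the dispatch through __kmpc_host_rpc_invoke_host_wrapper is inferred from the invoker lookup in the plugin code below):

#include <cstdint>
#include <cstdio>

extern "C" int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNo);
extern "C" void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal);

// Hypothetical generated wrapper: unpack the FILE* argument, run the real
// libc call on the host, and store the return value for the device.
extern "C" void __host_wrapper_fclose(void *Wrapper) {
  FILE *F = reinterpret_cast<FILE *>(__kmpc_host_rpc_get_arg(Wrapper, 0));
  __kmpc_host_rpc_set_ret_val(Wrapper, fclose(F));
}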


@@ -57,7 +57,7 @@ target_include_directories(omptarget.rtl.cuda PRIVATE ${LIBOMPTARGET_INCLUDE_DIR
# Install plugin under the lib destination folder.
install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
set_target_properties(omptarget.rtl.cuda PROPERTIES
INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
CXX_VISIBILITY_PRESET protected)


@@ -11,19 +11,29 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DynamicLibrary.h"
#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <vector>
#include <sys/stat.h>
#include <sys/time.h>
#include "Debug.h"
#include "DeviceEnvironment.h"
#include "HostRPC.h"
#include "omptarget.h"
#include "omptargetplugin.h"
@@ -39,6 +49,7 @@
#include "llvm/Frontend/OpenMP/OMPConstants.h"
using namespace llvm;
using namespace std::chrono_literals;
// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
@@ -98,6 +109,8 @@ struct KernelTy {
};
namespace {
std::atomic<bool> IsHostRPCEnabled{false};
bool checkResult(CUresult Err, const char *ErrMsg) {
if (Err == CUDA_SUCCESS)
return true;
@@ -107,6 +120,369 @@ bool checkResult(CUresult Err, const char *ErrMsg) {
return false;
}
class ArgumentExtractor {
  hostrpc::Descriptor &D;

public:
  ArgumentExtractor(hostrpc::Descriptor &D) : D(D) {}

  template <typename T> T getArg(unsigned Idx) {
    assert(Idx < D.NumArgs && "unexpected argument index");
    if constexpr (std::is_pointer<T>::value)
      return reinterpret_cast<T>(D.Args[Idx].Value);
    else
      return static_cast<T>(D.Args[Idx].Value);
  }
};

struct Parallel51KernelInfo {
  std::string Name;
  CUfunction Kernel;
  int MaxNumThreads;
};

std::unordered_map<std::string, Parallel51KernelInfo> RPCKernelInfo;

template <typename T> void *toVoidPtr(T Val) {
  if constexpr (std::is_pointer<T>::value)
    return reinterpret_cast<void *>(Val);
  else
    return reinterpret_cast<void *>(static_cast<intptr_t>(Val));
}
bool handle___kmpc_launch_parallel_51_kernel(hostrpc::Descriptor &D,
                                             CUmodule Module,
                                             CUstream Stream) {
  assert(D.NumArgs == 6);

  ArgumentExtractor AE(D);
  auto KernelName = AE.getArg<const char *>(0);
  auto GTid = AE.getArg<int32_t>(1);
  auto IfExpr = AE.getArg<int32_t>(2);
  auto NumThreads = AE.getArg<int32_t>(3);
  auto Args = AE.getArg<void **>(4);
  auto NArgs = AE.getArg<int64_t>(5);

  DP("[host-rpc] got a parallel_51 kernel request: %s.\n", KernelName);

  CUresult Err;
  CUfunction Func;
  int MaxNumThreads;
  // Cache the looked-up function and its max block size per kernel name.
  auto Itr = RPCKernelInfo.find(KernelName);
  if (Itr == RPCKernelInfo.end()) {
    Err = cuModuleGetFunction(&Func, Module, KernelName);
    if (Err != CUDA_SUCCESS) {
      REPORT("Loading '%s' failed\n", KernelName);
      CUDA_ERR_STRING(Err);
      return false;
    }
    Err = cuFuncGetAttribute(&MaxNumThreads,
                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
    if (Err != CUDA_SUCCESS) {
      REPORT("Failed to get max threads per block for kernel '%s'\n",
             KernelName);
      CUDA_ERR_STRING(Err);
      return false;
    }
    Parallel51KernelInfo Info{KernelName, Func, MaxNumThreads};
    RPCKernelInfo[KernelName] = std::move(Info);
  } else {
    Func = Itr->second.Kernel;
    MaxNumThreads = Itr->second.MaxNumThreads;
  }

  if (NumThreads > 0 && NumThreads < MaxNumThreads)
    MaxNumThreads = NumThreads;

  void *LaunchArgs[5] = {toVoidPtr(GTid), toVoidPtr(IfExpr),
                         toVoidPtr(NumThreads), Args, toVoidPtr(NArgs)};
  // cuLaunchKernel expects an array of pointers to the argument values.
  void *Params[5];
  for (int I = 0; I < 5; ++I)
    Params[I] = &LaunchArgs[I];

  int GridDimX = 1024;
  if (const char *EnvStr = getenv("LIBOMPTARGET_PARALLEL_KERNEL_GRID_SIZE"))
    GridDimX = std::stoi(EnvStr);

  Err = cuLaunchKernel(Func, /* gridDimX */ GridDimX, /* gridDimY */ 1,
                       /* gridDimZ */ 1, MaxNumThreads,
                       /* blockDimY */ 1, /* blockDimZ */ 1, 0, Stream, Params,
                       nullptr);
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to launch the new kernel %s\n", KernelName);
    CUDA_ERR_STRING(Err);
    return false;
  }

  Err = cuStreamSynchronize(Stream);
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to synchronize the stream " DPxMOD "\n", DPxPTR(Stream));
    CUDA_ERR_STRING(Err);
    return false;
  }

  DP("[host-rpc] successfully launched new kernel %s.\n", KernelName);

  return true;
}
const char *DescriptorVarName = "omptarget_hostrpc_descriptor";
const char *FutexVarName = "omptarget_hostrpc_futex";
const char *MemBufVarName = "omptarget_hostrpc_memory_buffer";
const char *MemBufSizeVarName = "omptarget_hostrpc_memory_buffer_size";
bool initHostRPCServer(CUmodule Module, CUcontext Context,
                       CUdeviceptr &Descriptor, CUdeviceptr &Futex) {
  DP("start to init host RPC server...\n");

  CUresult Err = cuCtxSetCurrent(Context);
  if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
    return false;

  // Look up a module global and verify it has the expected size. `Err` is
  // captured by reference so the failure paths below report the right error.
  auto CheckGlobal = [Module, &Err](CUdeviceptr &Ptr, const char *Name,
                                    size_t Size) {
    size_t CUSize;
    Err = cuModuleGetGlobal(&Ptr, &CUSize, Module, Name);
    if (Err != CUDA_SUCCESS)
      return false;
    if (CUSize != Size)
      return false;
    return true;
  };

  CUdeviceptr DescriptorVar;
  CUdeviceptr FutexVar;
  CUdeviceptr MemBufVar;
  CUdeviceptr MemBufSizeVar;
  if (!CheckGlobal(DescriptorVar, DescriptorVarName,
                   sizeof(hostrpc::Descriptor *))) {
    REPORT("Loading global '%s' failed\n", DescriptorVarName);
    CUDA_ERR_STRING(Err);
    return false;
  }
  if (!CheckGlobal(FutexVar, FutexVarName, sizeof(int32_t *))) {
    REPORT("Loading global '%s' failed\n", FutexVarName);
    CUDA_ERR_STRING(Err);
    return false;
  }
  if (!CheckGlobal(MemBufVar, MemBufVarName, sizeof(char *))) {
    REPORT("Loading global '%s' failed\n", MemBufVarName);
    CUDA_ERR_STRING(Err);
    return false;
  }
  if (!CheckGlobal(MemBufSizeVar, MemBufSizeVarName, sizeof(size_t))) {
    REPORT("Loading global '%s' failed\n", MemBufSizeVarName);
    CUDA_ERR_STRING(Err);
    return false;
  }

  // The descriptor and the futex word live in managed memory so both sides
  // can read and write them directly.
  Err = cuMemAllocManaged(&Descriptor, sizeof(hostrpc::Descriptor),
                          CU_MEM_ATTACH_GLOBAL);
  if (Err != CUDA_SUCCESS) {
    DP("Failed to allocate USM for descriptor.\n");
    CUDA_ERR_STRING(Err);
    return false;
  }
  Err = cuMemAdvise(Descriptor, sizeof(hostrpc::Descriptor),
                    CU_MEM_ADVISE_SET_ACCESSED_BY, 0);
  if (Err != CUDA_SUCCESS) {
    DP("Failed to cuMemAdvise USM for descriptor.\n");
    CUDA_ERR_STRING(Err);
    return false;
  }

  Err = cuMemAllocManaged(&Futex, sizeof(int32_t), CU_MEM_ATTACH_GLOBAL);
  if (Err != CUDA_SUCCESS) {
    DP("Failed to allocate USM for futex.\n");
    CUDA_ERR_STRING(Err);
    return false;
  }
  Err = cuMemAdvise(Futex, sizeof(int32_t), CU_MEM_ADVISE_SET_ACCESSED_BY, 0);
  if (Err != CUDA_SUCCESS) {
    DP("Failed to cuMemAdvise USM for futex.\n");
    CUDA_ERR_STRING(Err);
    return false;
  }
  *reinterpret_cast<int32_t *>(Futex) = 0;

  CUdeviceptr MemBuf;
  // 128MB
  const size_t Size = 134217728;
  Err = cuMemAllocManaged(&MemBuf, Size, CU_MEM_ATTACH_GLOBAL);
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to allocate USM for host rpc buffer\n");
    CUDA_ERR_STRING(Err);
    return false;
  }
  Err = cuMemAdvise(MemBuf, Size, CU_MEM_ADVISE_SET_ACCESSED_BY, 0);
  if (Err != CUDA_SUCCESS) {
    DP("Failed to cuMemAdvise USM for membuf.\n");
    CUDA_ERR_STRING(Err);
    return false;
  }

  // Publish the allocations to the device-side globals.
  Err = cuMemcpyHtoD(DescriptorVar, &Descriptor, sizeof(hostrpc::Descriptor *));
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to set %s to " DPxMOD ".\n", DescriptorVarName,
           DPxPTR(Descriptor));
    CUDA_ERR_STRING(Err);
    return false;
  }
  Err = cuMemcpyHtoD(FutexVar, &Futex, sizeof(int32_t *));
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to set %s to " DPxMOD ".\n", FutexVarName, DPxPTR(Futex));
    CUDA_ERR_STRING(Err);
    return false;
  }
  Err = cuMemcpyHtoD(MemBufVar, &MemBuf, sizeof(char *));
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to set %s to " DPxMOD ".\n", MemBufVarName, DPxPTR(MemBuf));
    CUDA_ERR_STRING(Err);
    return false;
  }
  Err = cuMemcpyHtoD(MemBufSizeVar, &Size, sizeof(size_t));
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to set %s to %zu.\n", MemBufSizeVarName, Size);
    CUDA_ERR_STRING(Err);
    return false;
  }

  return true;
}
class HostRPCInvokerWrapper {
  void (*Invoker)(int32_t, void *) = nullptr;
  std::unique_ptr<llvm::sys::DynamicLibrary> DL;
  std::once_flag Flag;

  // Lazily resolve __kmpc_host_rpc_invoke_host_wrapper from the host
  // program itself (hence -rdynamic in the driver).
  void initInvoker() {
    std::string ErrMsg;
    DL = std::make_unique<sys::DynamicLibrary>(
        sys::DynamicLibrary::getPermanentLibrary(nullptr, &ErrMsg));
    assert(DL->isValid() && "invalid DL");
    *((void **)&Invoker) =
        DL->getAddressOfSymbol("__kmpc_host_rpc_invoke_host_wrapper");
    assert(Invoker && "Invoker is nullptr");
  }

public:
  void invoke(int32_t CallNo, void *Desc) {
    std::call_once(Flag, &HostRPCInvokerWrapper::initInvoker, this);
    Invoker(CallNo, Desc);
  }
};
void runHostRPCServer(CUmodule Module, CUcontext Context,
                      CUdeviceptr DescriptorPtr, CUdeviceptr FutexPtr) {
  CUresult Err = cuCtxSetCurrent(Context);
  if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
    return;

  CUstream Stream;
  Err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
  if (Err != CUDA_SUCCESS) {
    REPORT("Failed to create a stream to launch a new kernel\n");
    CUDA_ERR_STRING(Err);
    return;
  }

  HostRPCInvokerWrapper Invoker;
  hostrpc::DescriptorWrapper Wrapper;

  auto CheckGlobal = [Module](CUdeviceptr &Ptr, const char *Name,
                              size_t Size) {
    size_t CUSize;
    CUresult Err = cuModuleGetGlobal(&Ptr, &CUSize, Module, Name);
    if (Err != CUDA_SUCCESS)
      return false;
    if (CUSize != Size)
      return false;
    return true;
  };

  // Record the device-side stdio placeholder addresses so host wrappers can
  // remap them to the host's stdin/stdout/stderr.
  {
    CUdeviceptr DevPtr;
    if (CheckGlobal(DevPtr, "StdInDummyVar", sizeof(int)))
      Wrapper.StdIn = reinterpret_cast<void *>(DevPtr);
    if (CheckGlobal(DevPtr, "StdOutDummyVar", sizeof(int)))
      Wrapper.StdOut = reinterpret_cast<void *>(DevPtr);
    if (CheckGlobal(DevPtr, "StdErrDummyVar", sizeof(int)))
      Wrapper.StdErr = reinterpret_cast<void *>(DevPtr);
    DP("[host-rpc] host: stdin=%p, stdout=%p, stderr=%p\n", Wrapper.StdIn,
       Wrapper.StdOut, Wrapper.StdErr);
  }

  while (IsHostRPCEnabled) {
    // Wait until the device publishes a request by bumping the futex word.
    if (__atomic_load_n(reinterpret_cast<uint32_t *>(FutexPtr),
                        __ATOMIC_ACQUIRE) == 0) {
      // std::this_thread::sleep_for(50ns);
      continue;
    }

    auto HostRPCD2HStart = std::chrono::high_resolution_clock::now();
    // Get the descriptor.
    Wrapper.D = *reinterpret_cast<hostrpc::Descriptor *>(DescriptorPtr);
    auto HostRPCD2HEnd = std::chrono::high_resolution_clock::now();

    // If the client still didn't reset the descriptor, skip it.
    if (Wrapper.D.Status == hostrpc::EXEC_STAT_DONE) {
      // std::this_thread::sleep_for(50ns);
      continue;
    }

    DP("[host-rpc] got a request (id=%d).\n", Wrapper.D.Id);

    auto HostRPCHandleStart = std::chrono::high_resolution_clock::now();
    bool HandleResult = false;
    switch (Wrapper.D.Id) {
    case hostrpc::CALLID___kmpc_launch_parallel_51_kernel:
      HandleResult =
          handle___kmpc_launch_parallel_51_kernel(Wrapper.D, Module, Stream);
      break;
    default:
      // Compiler-generated calls are dispatched through libomptarget.
      Invoker.invoke(Wrapper.D.Id, &Wrapper);
      HandleResult = true;
    }
    Wrapper.D.Status = hostrpc::EXEC_STAT_DONE;
    if (!HandleResult)
      Wrapper.D.ReturnValue = 0;
    auto HostRPCHandleEnd = std::chrono::high_resolution_clock::now();

    DP("[host-rpc] finished request (id=%d) with retval 0x%lx.\n",
       Wrapper.D.Id, Wrapper.D.ReturnValue);

    auto HostRPCH2DStart = std::chrono::high_resolution_clock::now();
    // Update the descriptor and futex word on the device.
    *reinterpret_cast<hostrpc::Descriptor *>(DescriptorPtr) = Wrapper.D;
    __atomic_fetch_sub(reinterpret_cast<uint32_t *>(FutexPtr), 1,
                       __ATOMIC_ACQ_REL);
    auto HostRPCH2DEnd = std::chrono::high_resolution_clock::now();

    DP("[host-rpc-profiling-host] id=%d, D2H: %ld, handle=%ld, H2D=%ld.\n",
       Wrapper.D.Id, (HostRPCD2HEnd - HostRPCD2HStart).count(),
       (HostRPCHandleEnd - HostRPCHandleStart).count(),
       (HostRPCH2DEnd - HostRPCH2DStart).count());
  }
}
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
@@ -366,6 +742,9 @@ class DeviceRTLTy {
  std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
  std::mutex PeerAccessMatrixLock;

  std::vector<std::vector<std::thread>> HostRPCServers;
  std::vector<std::vector<int>> HostRPCInitState;

  /// A class responsible for interacting with the device's native runtime
  /// library to allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
@@ -528,6 +907,8 @@ public:
    Modules.resize(NumberOfDevices);
    StreamPool.resize(NumberOfDevices);
    EventPool.resize(NumberOfDevices);
    HostRPCServers.resize(NumberOfDevices);
    HostRPCInitState.resize(NumberOfDevices);
    PeerAccessMatrix.resize(NumberOfDevices);
    for (auto &V : PeerAccessMatrix)
      V.resize(NumberOfDevices, PeerAccessState::Unkown);
@@ -764,6 +1145,11 @@ public:
}
  int deinitDevice(const int DeviceId) {
    // Stop the host RPC server threads before tearing the device down.
    IsHostRPCEnabled = false;
    for (auto &T : HostRPCServers[DeviceId])
      T.join();

    auto IsInitialized = InitializedFlags[DeviceId];
    if (!IsInitialized)
      return OFFLOAD_SUCCESS;
@@ -968,6 +1354,107 @@ public:
}
}
    // Initialize the device heap buffer.
    {
      const char *BufferVarName = "omptarget_device_heap_buffer";
      const char *SizeVarName = "omptarget_device_heap_size";
      CUdeviceptr BufferVarPtr;
      CUdeviceptr SizeVarPtr;
      size_t BufferVarSize;
      size_t SizeVarSize;
      Err = cuModuleGetGlobal(&BufferVarPtr, &BufferVarSize, Module,
                              BufferVarName);
      if (Err == CUDA_SUCCESS) {
        if (BufferVarSize != sizeof(uint64_t)) {
          REPORT("Global heap buffer pointer '%s' - size mismatch (%zu "
                 "!= %zu)\n",
                 BufferVarName, BufferVarSize, sizeof(uint64_t));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }
        Err = cuModuleGetGlobal(&SizeVarPtr, &SizeVarSize, Module,
                                SizeVarName);
        if (Err == CUDA_SUCCESS) {
          if (SizeVarSize != sizeof(uint64_t)) {
            REPORT("Global heap size variable '%s' - size mismatch (%zu "
                   "!= %zu)\n",
                   SizeVarName, SizeVarSize, sizeof(uint64_t));
            CUDA_ERR_STRING(Err);
            return nullptr;
          }
          size_t FreeGPUMemory = 0;
          size_t TotalGPUMemory = 0;
          // Fall back to 12GB if the free memory cannot be queried.
          size_t HeapSize = 12884901888U;
          Err = cuMemGetInfo(&FreeGPUMemory, &TotalGPUMemory);
          if (Err == CUDA_SUCCESS)
            HeapSize = FreeGPUMemory * 50 / 100; // use half of free memory
          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
               "Allocate %zu bytes device memory for heap.\n", HeapSize);
          CUdeviceptr BufferPtr;
          Err = cuMemAlloc(&BufferPtr, HeapSize);
          if (Err != CUDA_SUCCESS) {
            REPORT("Error when allocating heap buffer, size = %zu\n",
                   HeapSize);
            CUDA_ERR_STRING(Err);
            return nullptr;
          }
          Err = cuMemcpyHtoD(BufferVarPtr, &BufferPtr, BufferVarSize);
          if (Err != CUDA_SUCCESS) {
            REPORT("Error when copying data from host to device. Pointers: "
                   "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                   DPxPTR(&BufferPtr), DPxPTR(BufferVarPtr), BufferVarSize);
            CUDA_ERR_STRING(Err);
            return nullptr;
          }
          Err = cuMemcpyHtoD(SizeVarPtr, &HeapSize, SizeVarSize);
          if (Err != CUDA_SUCCESS) {
            REPORT("Error when copying data from host to device. Pointers: "
                   "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                   DPxPTR(&HeapSize), DPxPTR(SizeVarPtr), SizeVarSize);
            CUDA_ERR_STRING(Err);
            return nullptr;
          }
          DP("Successfully set heap buffer. omptarget_device_heap_buffer "
             "= " DPxMOD ", omptarget_device_heap_size = %zu\n",
             DPxPTR(BufferPtr), HeapSize);
        } else {
          DP("Finding global heap size variable '%s' - symbol missing.\n",
             SizeVarName);
          DP("Continuing, assuming this image does not require heap "
             "allocation.\n");
        }
      } else {
        DP("Finding global heap buffer pointer '%s' - symbol missing.\n",
           BufferVarName);
        DP("Continuing, assuming this image does not require heap "
           "allocation.\n");
      }
    }

    // Start the host RPC server thread.
    IsHostRPCEnabled = true;
    CUdeviceptr DescriptorPtr;
    CUdeviceptr FutexPtr;
    if (!initHostRPCServer(Module, DeviceData[DeviceId].Context,
                           DescriptorPtr, FutexPtr)) {
      REPORT("Failed to init host RPC server\n");
      return nullptr;
    }
    HostRPCServers[DeviceId].emplace_back(runHostRPCServer, Module,
                                          DeviceData[DeviceId].Context,
                                          DescriptorPtr, FutexPtr);

    return getOffloadEntriesTable(DeviceId);
  }
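For context, the two heap globals initialized above back the device-side allocator that the loader primes via __kmpc_target_init_allocator() (called from Main.c above). That allocator's implementation is not part of this diff, so the following bump-allocator sketch is purely an assumption about its general shape, with a hypothetical device_heap_alloc helper:

#include <cstdint>

// Hypothetical device-side view; the real allocator is not shown in this
// patch. The two globals are the ones the plugin fills with cuMemcpyHtoD.
extern "C" {
void *omptarget_device_heap_buffer;
uint64_t omptarget_device_heap_size;
}

static char *HeapNext;

extern "C" void __kmpc_target_init_allocator(void) {
  HeapNext = static_cast<char *>(omptarget_device_heap_buffer);
}

// Minimal bump allocation out of the plugin-provided heap; no free and no
// synchronization, illustrative only.
extern "C" void *device_heap_alloc(uint64_t Size) {
  char *P = HeapNext;
  uint64_t Used = P - static_cast<char *>(omptarget_device_heap_buffer);
  if (Used + Size > omptarget_device_heap_size)
    return nullptr;
  HeapNext = P + Size;
  return P;
}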
@@ -1560,8 +2047,8 @@ int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *Image,
                                       __tgt_image_info *Info) {
  if (!__tgt_rtl_is_valid_binary(Image))
    return false;
  // if (!__tgt_rtl_is_valid_binary(Image))
  //   return false;

  // A subarchitecture was not specified. Assume it is compatible.
  if (!Info || !Info->Arch)


@@ -64,7 +64,8 @@ VERS1.0 {
    __tgt_interop_init;
    __tgt_interop_use;
    __tgt_interop_destroy;
    ompt_libomptarget_connect;
    __kmpc_host_rpc_get_arg;
    __kmpc_host_rpc_set_ret_val;
  local:
    *;
};


@@ -16,6 +16,7 @@
#include "private.h"
#include "rtl.h"
#include "HostRPC.h"
#include "Utilities.h"
#include <cassert>
@@ -435,3 +436,33 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
  delete AsyncInfo;
  *AsyncHandle = nullptr;
}

// Host RPC support functions.

EXTERN int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNum) {
  auto *W = reinterpret_cast<hostrpc::DescriptorWrapper *>(Wrapper);
  auto &SD = W->D;
  assert(ArgNum < SD.NumArgs && "out-of-range argument");
  int64_t ArgVal = SD.Args[ArgNum].Value;
  void *ArgPtr = reinterpret_cast<void *>(ArgVal);
  DP("[host-rpc] get argno=%d arg=%lx...\n", ArgNum, ArgVal);
  // Remap the device-side stdio placeholder addresses to the host's own
  // standard streams.
  if (W->StdIn && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
      ArgPtr == W->StdIn)
    return (int64_t)stdin;
  if (W->StdOut && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
      ArgPtr == W->StdOut)
    return (int64_t)stdout;
  if (W->StdErr && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
      ArgPtr == W->StdErr)
    return (int64_t)stderr;
  return ArgVal;
}

EXTERN void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal) {
  auto &SD = reinterpret_cast<hostrpc::DescriptorWrapper *>(Wrapper)->D;
  SD.ReturnValue = RetVal;
}