Compare commits

...

4 Commits

Author SHA1 Message Date
Joseph Huber
b19051093e [libc] Export the RPC interface from libc
Summary:
This patch adds new extensions that allow us to export the RPC interface
from `libc` to other programs. This should allow external users of the
GPU `libc` to interface with the RPC client (which more or less behaves
like syscalls in this context). This is done by wrapping the interface
into a C-style function call.

Obviously, this approach is far less safe than the carefully crafted C++
interface. For example, we now expose the internal packet buffer, and it
is theoretically possible to open a single port with conflicting opcodes
and break the whole interface. So, extra care will be needed when
interacting with this. However, the usage is very similar, as shown by
the new test.

This somewhat stretches the concept of `libc` just doing `libc` things,
but I think this is important enough to justify it. It is difficult to
split this out, as we do not want to have the situation where we have
multiple RPC clients running at one time, so for now it makes sense to
just leave it in `libc`.
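
For illustration, a device-side round trip through the exported interface
looks roughly like the following sketch (the opcode and its host-side
handler come from the new test, so treat this as illustrative rather than a
stable contract):

  uint64_t cnt = 0;
  size_t size = sizeof(uint64_t);
  rpc_port_t port = rpc_open_port(RPC_TEST_EXTERNAL);
  rpc_send_n(&port, &cnt, size);   // ship the value to the host
  rpc_recv_n(&port, &cnt, &size);  // read back the updated value
  rpc_close_port(&port);           // return the port for reuse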
2024-04-24 17:35:53 -07:00
Nicolas Marie
de7beadb39 Fix compilation issues after rebase of llvm-test-suite-gpu 2024-03-21 12:07:38 -07:00
Nicolas Marie
25d5970544 Revert "[LTO] Remove Config.UseDefaultPipeline (#82587)"
This reverts commit ec24094b56.
We do need Config.UseDefaultPipeline.
2024-03-21 11:54:26 -07:00
Shilei Tian
20f98716b0 [OpenMP] Add the initial support for direct gpu compilation
Rebase from llvm-test-suite-gpu, fix rebase conflicts in:

clang/include/clang/Driver/Options.td
clang/lib/CodeGen/CGDecl.cpp
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/lib/Driver/ToolChains/Clang.cpp
clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
llvm/lib/Passes/PassRegistry.def
llvm/lib/Transforms/IPO/AttributorAttributes.cpp
llvm/lib/Transforms/IPO/OpenMPOpt.cpp
openmp/libomptarget/DeviceRTL/src/Mapping.cpp
openmp/libomptarget/DeviceRTL/src/State.cpp
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
openmp/libomptarget/DeviceRTL/src/Utils.cpp
openmp/libomptarget/DeviceRTL/src/exports
openmp/libomptarget/include/omptarget.h
openmp/libomptarget/src/exports
openmp/libomptarget/src/interface.cpp
2024-03-21 11:51:44 -07:00
81 changed files with 5795 additions and 184 deletions

auto-host-rpc/device.ll Normal file

@@ -0,0 +1,65 @@
; ModuleID = 'test-openmp-nvptx64-nvidia-cuda-sm_75.bc'
source_filename = "/home/shiltian/Documents/vscode/llvm-project/auto-host-rpc/test.c"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.ddd = type { i32, i32, float }
@__omp_rtl_debug_kind = weak_odr hidden local_unnamed_addr constant i32 0
@__omp_rtl_assume_teams_oversubscription = weak_odr hidden local_unnamed_addr constant i32 0
@__omp_rtl_assume_threads_oversubscription = weak_odr hidden local_unnamed_addr constant i32 0
@__omp_rtl_assume_no_thread_state = weak_odr hidden local_unnamed_addr constant i32 0
@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden local_unnamed_addr constant i32 0
@.str = private unnamed_addr constant [9 x i8] c"main.cpp\00", align 1
@.str1 = private unnamed_addr constant [2 x i8] c"r\00", align 1
@.str2 = private unnamed_addr constant [3 x i8] c"%d\00", align 1
@.str3 = private unnamed_addr constant [7 x i8] c"%f%d%s\00", align 1
@.str4 = private unnamed_addr constant [6 x i8] c"hello\00", align 1
; Function Attrs: convergent nounwind
define hidden void @foo() local_unnamed_addr #0 {
entry:
%d = tail call align 16 dereferenceable_or_null(12) ptr @__kmpc_alloc_shared(i64 12) #4
%call = tail call noalias ptr @fopen(ptr noundef nonnull @.str, ptr noundef nonnull @.str1) #5
%call1 = tail call i32 (ptr, ptr, ...) @fprintf(ptr noundef %call, ptr noundef nonnull @.str2, i32 noundef 6) #5
%call2 = tail call i32 (ptr, ptr, ...) @fprintf(ptr noundef %call, ptr noundef nonnull @.str3, double noundef 6.000000e+00, i32 noundef 1, ptr noundef nonnull @.str4) #5
%a = getelementptr inbounds %struct.ddd, ptr %d, i64 0, i32 1
%call3 = tail call i32 (ptr, ptr, ...) @fscanf(ptr noundef %call, ptr noundef nonnull @.str2, ptr noundef nonnull %a) #5
tail call void @__kmpc_free_shared(ptr %d, i64 12)
ret void
}
; Function Attrs: nofree nosync nounwind allocsize(0)
declare ptr @__kmpc_alloc_shared(i64) local_unnamed_addr #1
; Function Attrs: convergent
declare noalias ptr @fopen(ptr noundef, ptr noundef) local_unnamed_addr #2
; Function Attrs: convergent
declare i32 @fprintf(ptr noundef, ptr noundef, ...) local_unnamed_addr #2
; Function Attrs: convergent
declare i32 @fscanf(ptr noundef, ptr noundef, ...) local_unnamed_addr #2
; Function Attrs: nosync nounwind
declare void @__kmpc_free_shared(ptr allocptr nocapture, i64) local_unnamed_addr #3
attributes #0 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx77,+sm_75" }
attributes #1 = { nofree nosync nounwind allocsize(0) }
attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_75" "target-features"="+ptx77,+sm_75" }
attributes #3 = { nosync nounwind }
attributes #4 = { nounwind }
attributes #5 = { convergent nounwind }
!llvm.module.flags = !{!0, !1, !2, !3, !4, !5}
!llvm.ident = !{!6, !7}
!nvvm.annotations = !{}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 7]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 7, !"openmp", i32 50}
!3 = !{i32 7, !"openmp-device", i32 50}
!4 = !{i32 8, !"PIC Level", i32 2}
!5 = !{i32 7, !"frame-pointer", i32 2}
!6 = !{!"clang version 16.0.0"}
!7 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}

auto-host-rpc/main.cpp Normal file

@@ -0,0 +1,560 @@
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetOptions.h"
#include <string>
using namespace llvm;
namespace {
static LLVMContext Context;
static codegen::RegisterCodeGenFlags RCGF;
static constexpr const char *InternalPrefix[] = {"__kmp", "llvm.", "nvm.",
"omp_"};
std::string typeToString(Type *T) {
if (T->is16bitFPTy())
return "f16";
if (T->isFloatTy())
return "f32";
if (T->isDoubleTy())
return "f64";
if (T->isPointerTy())
return "ptr";
if (T->isStructTy())
return std::string(T->getStructName());
if (T->isIntegerTy())
return "i" + std::to_string(T->getIntegerBitWidth());
llvm_unreachable("unknown type");
}
} // namespace
namespace llvm {
enum class HostRPCRuntimeFunction {
#define __OMPRTL_HOST_RPC(_ENUM) OMPRTL_##_ENUM
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_ret_val),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val),
__OMPRTL_HOST_RPC(__last),
#undef __OMPRTL_HOST_RPC
};
#define __OMPRTL_HOST_RPC(_ENUM) \
auto OMPRTL_##_ENUM = HostRPCRuntimeFunction::OMPRTL_##_ENUM;
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_ret_val)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val)
#undef __OMPRTL_HOST_RPC
enum OMPHostRPCArgType {
// No need to copy.
OMP_HOST_RPC_ARG_SCALAR = 0,
OMP_HOST_RPC_ARG_PTR = 1,
// Copy to host.
OMP_HOST_RPC_ARG_PTR_COPY_TO = 2,
// Copy to device
OMP_HOST_RPC_ARG_PTR_COPY_FROM = 3,
// TODO: Do we have a tofrom pointer?
OMP_HOST_RPC_ARG_PTR_COPY_TOFROM = 4,
};
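// Classification example (based on the bundled test.c): the FILE* returned by
// the already-rewritten fopen wrapper is an opaque host identifier, so it is
// passed as OMP_HOST_RPC_ARG_SCALAR, while a pointer into a
// __kmpc_alloc_shared allocation (such as &d.a) keeps the default
// OMP_HOST_RPC_ARG_PTR_COPY_TOFROM with the allocation size.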
// struct HostRPCArgInfo {
// // OMPHostRPCArgType
// int64_t Type;
// int64_t Size;
// };
class AutoHostRPC {
LLVMContext &Context;
// Device module
Module &DM;
// Host module
Module &HM;
// Types
Type *Int8PtrTy;
Type *VoidTy;
Type *Int32Ty;
Type *Int64Ty;
StructType *ArgInfoTy;
struct CallSiteInfo {
CallInst *CI = nullptr;
SmallVector<Type *> Params;
};
struct HostRPCArgInfo {
// OMPHostRPCArgType
Constant *Type;
Value *Size;
};
// Table of host wrapper functions, indexed by wrapper number.
SmallVector<Function *> HostEntryTable;
EnumeratedArray<Function *, HostRPCRuntimeFunction,
HostRPCRuntimeFunction::OMPRTL___last>
RFIs;
static std::string getWrapperFunctionName(Function *F, CallSiteInfo &CSI) {
std::string Name = "__kmpc_host_rpc_wrapper_" + std::string(F->getName());
if (!F->isVarArg())
return Name;
for (unsigned I = F->getFunctionType()->getNumParams();
I < CSI.Params.size(); ++I) {
Name.push_back('_');
Name.append(typeToString(CSI.Params[I]));
}
return Name;
}
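// For example, the test's variadic call fprintf(fp, "%d", 6) passes one
// argument beyond fprintf's two fixed parameters, so its wrapper is named
// "__kmpc_host_rpc_wrapper_fprintf_i32": only the variadic tail is mangled,
// giving each distinct variadic signature its own wrapper.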
static bool isInternalFunction(Function &F) {
auto Name = F.getName();
for (auto P : InternalPrefix)
if (Name.startswith(P))
return true;
return false;
}
Value *convertToInt64Ty(IRBuilder<> &Builder, Value *V);
Value *convertFromInt64TyTo(IRBuilder<> &Builder, Value *V, Type *TargetTy);
// int device_wrapper(call_no, arg_info, ...) {
// void *desc = __kmpc_host_rpc_get_desc(call_no, num_args, arg_info);
// __kmpc_host_rpc_add_arg(desc, arg1, sizeof(arg1));
// __kmpc_host_rpc_add_arg(desc, arg2, sizeof(arg2));
// ...
// __kmpc_host_rpc_send_and_wait(desc);
// int r = (int)__kmpc_host_rpc_get_ret_val(desc);
// return r;
// }
Function *getDeviceWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
// void host_wrapper(desc) {
// int arg1 = (int)__kmpc_host_rpc_get_arg(desc, 0);
// float arg2 = (float)__kmpc_host_rpc_get_arg(desc, 1);
// char *arg3 = (char *)__kmpc_host_rpc_get_arg(desc, 2);
// ...
// int r = actual_call(arg1, arg2, arg3, ...);
// __kmpc_host_rpc_set_ret_val(desc, (int64_t)r);
// }
Function *getHostWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
bool rewriteWithHostRPC(Function *F);
public:
AutoHostRPC(Module &DeviceModule, Module &HostModule)
: Context(DeviceModule.getContext()), DM(DeviceModule), HM(HostModule) {
assert(&DeviceModule.getContext() == &HostModule.getContext() &&
"device and host modules have different context");
#define __OMP_TYPE(TYPE) TYPE = Type::get##TYPE(Context)
__OMP_TYPE(Int8PtrTy);
__OMP_TYPE(VoidTy);
__OMP_TYPE(Int32Ty);
__OMP_TYPE(Int64Ty);
#undef __OMP_TYPE
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...) \
{ \
SmallVector<Type *> Params{__VA_ARGS__}; \
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
RFIs[OMPRTL_##_ENUM] = Function::Create( \
FT, GlobalValue::LinkageTypes::InternalLinkage, #_ENUM, MOD); \
}
__OMP_RTL(__kmpc_host_rpc_get_desc, DM, false, Int8PtrTy, Int32Ty, Int32Ty,
Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_add_arg, DM, false, VoidTy, Int8PtrTy, Int64Ty,
Int64Ty)
__OMP_RTL(__kmpc_host_rpc_send_and_wait, DM, false, VoidTy, Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_get_ret_val, DM, false, Int64Ty, Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
__OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
Int64Ty)
#undef __OMP_RTL
ArgInfoTy = StructType::create({Int64Ty, Int64Ty}, "struct.arg_info_t");
}
bool run();
};
Value *AutoHostRPC::convertToInt64Ty(IRBuilder<> &Builder, Value *V) {
Type *T = V->getType();
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreatePtrToInt(V, Int64Ty);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, Int64Ty, false);
if (T->isFloatingPointTy()) {
if (T->isFloatTy())
V = Builder.CreateFPToSI(V, Int32Ty);
return Builder.CreateFPToSI(V, Int64Ty);
}
llvm_unreachable("unknown cast to int64_t");
}
Value *AutoHostRPC::convertFromInt64TyTo(IRBuilder<> &Builder, Value *V,
Type *T) {
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreateIntToPtr(V, T);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, T, /* isSigned */ true);
if (T->isFloatingPointTy()) {
if (T->isFloatTy())
V = Builder.CreateIntCast(V, Int32Ty, /* isSigned */ true);
V = Builder.CreateSIToFP(V, T);
return V;
}
llvm_unreachable("unknown cast from int64_t");
}
bool AutoHostRPC::run() {
bool Changed = false;
SmallVector<Function *> WorkList;
for (Function &F : DM) {
// If the function is already defined, it definitely does not require RPC.
if (!F.isDeclaration())
continue;
// If it is an internal function, skip it as well.
if (isInternalFunction(F))
continue;
// If there is no use of the function, skip it.
if (F.use_empty())
continue;
WorkList.push_back(&F);
}
if (WorkList.empty())
return Changed;
for (Function *F : WorkList)
Changed |= rewriteWithHostRPC(F);
return Changed;
}
bool AutoHostRPC::rewriteWithHostRPC(Function *F) {
bool Changed = false;
SmallVector<CallInst *> WorkList;
for (User *U : F->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
WorkList.push_back(CI);
}
if (WorkList.empty())
return Changed;
for (CallInst *CI : WorkList) {
CallSiteInfo CSI;
CSI.CI = CI;
unsigned NumArgs = CI->arg_size();
for (unsigned I = 0; I < NumArgs; ++I)
CSI.Params.push_back(CI->getArgOperand(I)->getType());
std::string WrapperName = getWrapperFunctionName(F, CSI);
Function *DeviceWrapperFn = getDeviceWrapperFunction(WrapperName, F, CSI);
Function *HostWrapperFn = getHostWrapperFunction(WrapperName, F, CSI);
int32_t WrapperNumber = -1;
for (unsigned I = 0; I < HostEntryTable.size(); ++I) {
if (HostEntryTable[I] == HostWrapperFn) {
WrapperNumber = I;
break;
}
}
if (WrapperNumber == -1) {
WrapperNumber = HostEntryTable.size();
HostEntryTable.push_back(HostWrapperFn);
}
DataLayout DL = DM.getDataLayout();
IRBuilder<> Builder(CI);
auto CheckIfIdentifierPtr = [](const Value *V) {
  auto *CI = dyn_cast<CallInst>(V);
  if (!CI)
    return false;
  // Indirect calls have no statically known callee.
  Function *Callee = CI->getCalledFunction();
  if (!Callee)
    return false;
  return Callee->getName().startswith("__kmpc_host_rpc_wrapper_");
};
auto CheckIfAlloca = [](const Value *V) {
  auto *CI = dyn_cast<CallInst>(V);
  if (!CI)
    return false;
  Function *Callee = CI->getCalledFunction();
  if (!Callee)
    return false;
  return Callee->getName() == "__kmpc_alloc_shared" ||
         Callee->getName() == "malloc";
};
SmallVector<HostRPCArgInfo> ArgInfos;
bool IsConstantArgInfo = true;
for (Value *Op : CI->args()) {
if (!Op->getType()->isPointerTy()) {
HostRPCArgInfo AI{
ConstantInt::get(Int64Ty,
OMPHostRPCArgType::OMP_HOST_RPC_ARG_SCALAR),
ConstantInt::getNullValue(Int64Ty)};
ArgInfos.push_back(std::move(AI));
continue;
}
Value *SizeVal = nullptr;
OMPHostRPCArgType ArgType = OMP_HOST_RPC_ARG_PTR_COPY_TOFROM;
SmallVector<const Value *> Objects;
getUnderlyingObjects(Op, Objects);
// TODO: Handle phi node
if (Objects.size() != 1)
llvm_unreachable("we can't handle phi node yet");
auto *Obj = Objects.front();
if (CheckIfIdentifierPtr(Obj)) {
ArgType = OMP_HOST_RPC_ARG_SCALAR;
SizeVal = ConstantInt::getNullValue(Int64Ty);
} else if (CheckIfAlloca(Obj)) {
auto *CI = dyn_cast<CallInst>(Obj);
SizeVal = CI->getOperand(0);
if (!isa<Constant>(SizeVal))
IsConstantArgInfo = false;
} else {
if (auto *GV = dyn_cast<GlobalVariable>(Obj)) {
SizeVal = ConstantInt::get(Int64Ty,
DL.getTypeStoreSize(GV->getValueType()));
if (GV->isConstant())
ArgType = OMP_HOST_RPC_ARG_PTR_COPY_TO;
if (GV->isConstant() && GV->hasInitializer()) {
// TODO: If the global variable is constant, we can do some
// optimization.
}
} else {
// TODO: fix that when it occurs
llvm_unreachable("cannot handle unknown type");
}
}
HostRPCArgInfo AI{ConstantInt::get(Int64Ty, ArgType), SizeVal};
ArgInfos.push_back(std::move(AI));
}
Value *ArgInfo = nullptr;
if (!IsConstantArgInfo) {
ArgInfo = Builder.CreateAlloca(
ArgInfoTy, ConstantInt::get(Int64Ty, NumArgs), "arg_info");
for (unsigned I = 0; I < NumArgs; ++I) {
Value *AII = Builder.CreateGEP(
    ArrayType::get(ArgInfoTy, NumArgs), ArgInfo,
    {ConstantInt::getNullValue(Int64Ty), ConstantInt::get(Int64Ty, I)});
// Address the two struct fields (type and size) with struct GEPs so the
// field offsets are computed correctly.
Value *AIIType = Builder.CreateStructGEP(ArgInfoTy, AII, 0);
Value *AIISize = Builder.CreateStructGEP(ArgInfoTy, AII, 1);
Builder.CreateStore(ArgInfos[I].Type, AIIType);
Builder.CreateStore(ArgInfos[I].Size, AIISize);
}
} else {
SmallVector<Constant *> ArgInfoInitVar;
for (auto &AI : ArgInfos) {
auto *CS =
ConstantStruct::get(ArgInfoTy, {AI.Type, cast<Constant>(AI.Size)});
ArgInfoInitVar.push_back(CS);
}
Constant *ArgInfoInit = ConstantArray::get(
ArrayType::get(ArgInfoTy, NumArgs), ArgInfoInitVar);
ArgInfo = new GlobalVariable(
DM, ArrayType::get(ArgInfoTy, NumArgs), /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info");
}
SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
ArgInfo};
for (Value *Op : CI->args())
Args.push_back(Op);
CallInst *NewCall = Builder.CreateCall(DeviceWrapperFn, Args);
CI->replaceAllUsesWith(NewCall);
CI->eraseFromParent();
}
F->eraseFromParent();
return true;
}
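// Net effect on the test module (illustrative IR, not generated output):
//   %call1 = call i32 (ptr, ptr, ...) @fprintf(ptr %call, ptr @.str2, i32 6)
// becomes
//   %call1 = call i32 @__kmpc_host_rpc_wrapper_fprintf_i32(
//                i32 N, ptr @arg_info, ptr %call, ptr @.str2, i32 6)
// where N is the host wrapper's index in HostEntryTable.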
Function *AutoHostRPC::getDeviceWrapperFunction(StringRef WrapperName,
Function *F,
CallSiteInfo &CSI) {
Function *WrapperFn = DM.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
// return_type device_wrapper(int32_t call_no, void *arg_info, ...)
SmallVector<Type *> Params{Int32Ty, Int8PtrTy};
Params.append(CSI.Params);
Type *RetTy = F->getReturnType();
FunctionType *FT = FunctionType::get(RetTy, Params, /*isVarArg*/ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::WeakODRLinkage,
WrapperName, DM);
// Emit the body of the device wrapper
IRBuilder<> Builder(Context);
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
// skip call_no and arg_info.
constexpr const unsigned NumArgSkipped = 2;
Value *Desc = nullptr;
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
Desc = Builder.CreateCall(
Fn,
{WrapperFn->getArg(0),
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
WrapperFn->getArg(1)},
"desc");
}
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
Value *V = convertToInt64Ty(Builder, WrapperFn->getArg(I));
Builder.CreateCall(Fn, {Desc, V, ConstantInt::getNullValue(Int64Ty)});
}
}
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});
if (RetTy->isVoidTy()) {
Builder.CreateRetVoid();
return WrapperFn;
}
Value *RetVal =
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_ret_val], {Desc});
if (RetTy != RetVal->getType())
RetVal = convertFromInt64TyTo(Builder, RetVal, RetTy);
Builder.CreateRet(RetVal);
return WrapperFn;
}
Function *AutoHostRPC::getHostWrapperFunction(StringRef WrapperName,
Function *F, CallSiteInfo &CSI) {
Function *WrapperFn = HM.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
SmallVector<Type *> Params{Int8PtrTy};
FunctionType *FT = FunctionType::get(VoidTy, Params, /* isVarArg */ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage,
WrapperName, HM);
Value *Desc = WrapperFn->getArg(0);
// Emit the body of the host wrapper
IRBuilder<> Builder(Context);
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
SmallVector<Value *> Args;
for (unsigned I = 0; I < CSI.CI->arg_size(); ++I) {
Value *V = Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_arg],
{Desc, ConstantInt::get(Int32Ty, I)});
Args.push_back(convertFromInt64TyTo(Builder, V, CSI.Params[I]));
}
// The host callee that will be called eventually by the host wrapper.
Function *HostCallee = HM.getFunction(F->getName());
if (!HostCallee)
HostCallee = Function::Create(F->getFunctionType(), F->getLinkage(),
F->getName(), HM);
Value *RetVal = Builder.CreateCall(HostCallee, Args);
// A void callee yields no value; report zero back to the device in that case.
RetVal = RetVal->getType()->isVoidTy()
             ? ConstantInt::getNullValue(Int64Ty)
             : convertToInt64Ty(Builder, RetVal);
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_set_ret_val], {Desc, RetVal});
Builder.CreateRetVoid();
return WrapperFn;
}
} // namespace llvm
int main(int argc, char *argv[]) {
InitializeAllTargets();
InitializeAllTargetMCs();
InitializeAllAsmPrinters();
InitializeAllAsmParsers();
SMDiagnostic Err;
std::unique_ptr<Module> DM = parseIRFile("device.ll", Err, Context);
if (!DM) {
  Err.print(argv[0], errs());
  return 1;
}
Module HM("host-rpc.bc", Context);
// get the right target triple
HM.setTargetTriple(Triple::normalize("x86-64"));
AutoHostRPC RPC(*DM, HM);
(void)RPC.run();
DM->dump();
// HM.dump();
return 0;
}

auto-host-rpc/test.c Normal file

@@ -0,0 +1,19 @@
#include <stdio.h>
#pragma omp begin declare target device_type(nohost)
struct ddd {
int num;
int a;
float b;
};
void foo() {
FILE *fp = fopen("main.cpp", "r");
struct ddd d;
fprintf(fp, "%d", 6);
fprintf(fp, "%f%d%s", 6.0f, 1, "hello");
fscanf(fp, "%d", &d.a);
}
#pragma omp end declare target


@@ -258,6 +258,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Globalize to global space for the globalized variables")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")


@@ -3497,6 +3497,10 @@ def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested
HelpText<"Assert no nested parallel regions in the GPU">,
MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
def fopenmp_globalize_to_global_space : Flag<["-"], "fopenmp-globalize-to-global-space">,
HelpText<"Globalize to global space for the globalized variables">,
MarshallingInfoFlag<LangOpts<"OpenMPGlobalizeToGlobalSpace">>;
} // let Group = f_Group
} // let Visibility = [ClangOption, CC1Option, FC1Option]
} // let Flags = [NoArgumentUnused, HelpHidden]


@@ -2530,48 +2530,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
(IPD->getParameterKind() == ImplicitParamKind::ThreadPrivateVar);
}
Address DeclPtr = Address::invalid();
Address AllocaPtr = Address::invalid();
bool DoStore = false;
bool IsScalar = hasScalarEvaluationKind(Ty);
bool UseIndirectDebugAddress = false;
// If we already have a pointer to the argument, reuse the input pointer.
if (Arg.isIndirect()) {
DeclPtr = Arg.getIndirectAddress();
DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
// Indirect argument is in alloca address space, which may be different
// from the default address space.
auto AllocaAS = CGM.getASTAllocaAddressSpace();
auto *V = DeclPtr.getPointer();
AllocaPtr = DeclPtr;
// For truly ABI indirect arguments -- those that are not `byval` -- store
// the address of the argument on the stack to preserve debug information.
ABIArgInfo ArgInfo = CurFnInfo->arguments()[ArgNo - 1].info;
if (ArgInfo.isIndirect())
UseIndirectDebugAddress = !ArgInfo.getIndirectByVal();
if (UseIndirectDebugAddress) {
auto PtrTy = getContext().getPointerType(Ty);
AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy),
D.getName() + ".indirect_addr");
EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy);
}
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
auto DestLangAS =
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
if (SrcLangAS != DestLangAS) {
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
CGM.getDataLayout().getAllocaAddrSpace());
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
DeclPtr =
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
*this, V, SrcLangAS, DestLangAS, T, true),
DeclPtr.isKnownNonNull());
}
auto PushCleanupIfNeeded = [this, Ty, &D](Address DeclPtr) {
// Push a destructor cleanup for this parameter if the ABI requires it.
// Don't push a cleanup in a thunk for a method that will also emit a
// cleanup.
@@ -2587,87 +2546,124 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
EHStack.stable_begin();
}
}
} else {
// Check if the parameter address is controlled by OpenMP runtime.
Address OpenMPLocalAddr =
getLangOpts().OpenMP
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
: Address::invalid();
if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
DeclPtr = OpenMPLocalAddr;
AllocaPtr = DeclPtr;
};
Address DeclPtr = Address::invalid();
Address AllocaPtr = Address::invalid();
Address OpenMPLocalAddr =
getLangOpts().OpenMP
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
: Address::invalid();
bool DoStore = false;
bool IsScalar = hasScalarEvaluationKind(Ty);
bool UseIndirectDebugAddress = false;
if (OpenMPLocalAddr.isValid()) {
DeclPtr = OpenMPLocalAddr;
AllocaPtr = DeclPtr;
LValue Dst = MakeAddrLValue(DeclPtr, Ty);
if (Arg.isIndirect()) {
LValue Src = MakeAddrLValue(Arg.getIndirectAddress(), Ty);
callCStructCopyConstructor(Dst, Src);
PushCleanupIfNeeded(Arg.getIndirectAddress());
} else {
// Otherwise, create a temporary to hold the value.
EmitStoreOfScalar(Arg.getDirectValue(), Dst, /* isInitialization */ true);
}
} else {
// If we already have a pointer to the argument, reuse the input pointer.
if (Arg.isIndirect()) {
DeclPtr = Arg.getIndirectAddress();
DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
// Indirect argument is in alloca address space, which may be different
// from the default address space.
auto AllocaAS = CGM.getASTAllocaAddressSpace();
auto *V = DeclPtr.getPointer();
AllocaPtr = DeclPtr;
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
auto DestLangAS =
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
if (SrcLangAS != DestLangAS) {
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
CGM.getDataLayout().getAllocaAddrSpace());
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
DeclPtr =
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
*this, V, SrcLangAS, DestLangAS, T, true),
DeclPtr.isKnownNonNull());
}
PushCleanupIfNeeded(DeclPtr);
} else {
// Create a temporary to hold the value.
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
D.getName() + ".addr", &AllocaPtr);
DoStore = true;
}
llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
LValue lv = MakeAddrLValue(DeclPtr, Ty);
if (IsScalar) {
  Qualifiers qs = Ty.getQualifiers();
  if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
    // We honor __attribute__((ns_consumed)) for types with lifetime.
    // For __strong, it's handled by just skipping the initial retain;
    // otherwise we have to balance out the initial +1 with an extra
    // cleanup to do the release at the end of the function.
    bool isConsumed = D.hasAttr<NSConsumedAttr>();
    // If a parameter is pseudo-strong then we can omit the implicit retain.
    if (D.isARCPseudoStrong()) {
      assert(lt == Qualifiers::OCL_Strong &&
             "pseudo-strong variable isn't strong?");
      assert(qs.hasConst() && "pseudo-strong variable should be const!");
      lt = Qualifiers::OCL_ExplicitNone;
    }
    // Load objects passed indirectly.
    if (Arg.isIndirect() && !ArgVal)
      ArgVal = Builder.CreateLoad(DeclPtr);
    if (lt == Qualifiers::OCL_Strong) {
      if (!isConsumed) {
        if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
          // use objc_storeStrong(&dest, value) for retaining the
          // object. But first, store a null into 'dest' because
          // objc_storeStrong attempts to release its old value.
          llvm::Value *Null = CGM.EmitNullConstant(D.getType());
          EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
          EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
          DoStore = false;
        } else
          // Don't use objc_retainBlock for block pointers, because we
          // don't want to Block_copy something just because we got it
          // as a parameter.
          ArgVal = EmitARCRetainNonBlock(ArgVal);
      }
    } else {
      // Push the cleanup for a consumed parameter.
      if (isConsumed) {
        ARCPreciseLifetime_t precise =
            (D.hasAttr<ObjCPreciseLifetimeAttr>() ? ARCPreciseLifetime
                                                  : ARCImpreciseLifetime);
        EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(),
                                                 ArgVal, precise);
      }
      if (lt == Qualifiers::OCL_Weak) {
        EmitARCInitWeak(DeclPtr, ArgVal);
        DoStore = false; // The weak init is a store, no need to do two.
      }
    }
    // Enter the cleanup scope.
    EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
  }
}
// Store the initial value into the alloca.
if (DoStore)
  EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
}
setAddrOfLocalVar(&D, DeclPtr);


@@ -1083,10 +1083,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for the variable to be globalized
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
? OMPRTL_malloc
: OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
// FIXME: We should use the variables actual alignment as an argument.
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment,
@@ -1116,12 +1118,14 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
CGM.getContext().getDeclAlign(VD),
AlignmentSource::Decl);
I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));
}
I->getSecond().MappedParams->apply(CGF);
}
bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
const VarDecl *VD) const {
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
@@ -1149,10 +1153,12 @@ CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
// Allocate space for this VLA object to be globalized.
llvm::Value *AllocArgs[] = {Size};
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
? OMPRTL_malloc
: OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
@@ -1178,20 +1184,29 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
// globalized in the prolog (i.e. emitGenericVarsProlog).
for (const auto &AddrSizePair :
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{AddrSizePair.first});
else
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
}
// Deallocate the memory for each globalized value
for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
const auto *VD = cast<VarDecl>(Rec.first);
I->getSecond().MappedParams->restore(CGF);
llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
CGF.getTypeSize(VD->getType())};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
FreeArgs);
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{Rec.second.GlobalizedVal});
else
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
OMPRTL___kmpc_free_shared),
{Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())});
}
}
}
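// In short, with -fopenmp-globalize-to-global-space the globalization calls
// emitted above become (sketch):
//   void *p = malloc(size);   // previously: __kmpc_alloc_shared(size)
//   ...
//   free(p);                  // previously: __kmpc_free_shared(p, size)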


@@ -6502,6 +6502,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fopenmp-offload-mandatory");
if (Args.hasArg(options::OPT_fopenmp_force_usm))
CmdArgs.push_back("-fopenmp-force-usm");
if (Args.hasArg(options::OPT_fopenmp_globalize_to_global_space))
CmdArgs.push_back("-fopenmp-globalize-to-global-space");
break;
default:
// By default, if Clang doesn't know how to generate useful OpenMP code


@@ -0,0 +1,33 @@
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
// expected-no-diagnostics
extern int printf(const char *, ...);
struct S {
int a;
float b;
};
// CHECK: define{{.*}}void @test(%struct.S* noundef byval(%struct.S) align {{[0-9]+}} [[arg:%[0-9a-zA-Z]+]])
// CHECK: [[g:%[0-9a-zA-Z]+]] = call align {{[0-9]+}} i8* @__kmpc_alloc_shared
// CHECK: bitcast i8* [[g]] to %struct.S*
// CHECK: bitcast %struct.S* [[arg]] to i8**
// CHECK: call void [[cc:@__copy_constructor[_0-9a-zA-Z]+]]
// CHECK: void [[cc]]
void test(struct S s) {
#pragma omp parallel for
for (int i = 0; i < s.a; ++i) {
printf("%i : %i : %f\n", i, s.a, s.b);
}
}
void foo() {
#pragma omp target teams num_teams(1)
{
struct S s;
s.a = 7;
s.b = 11;
test(s);
}
}


@@ -137,6 +137,13 @@ static constexpr OptTable::Info InfoTable[] = {
#undef OPTION
};
/// Host RPC module that will be shared to the corresponding pass.
Module *HostModule = nullptr;
/// We only need to generate the host RPC module once.
bool IsHostModuleGenerated = false;
/// Host RPC object file.
StringRef HostRPCObjFile;
class WrapperOptTable : public opt::GenericOptTable {
public:
WrapperOptTable() : opt::GenericOptTable(InfoTable) {}
@@ -613,10 +620,12 @@ std::vector<std::string> getTargetFeatures(ArrayRef<OffloadFile> InputFiles) {
return UnifiedFeatures;
}
template <typename ModuleHook = function_ref<bool(size_t, const Module &)>>
template <typename PreHookTy = function_ref<bool(size_t, const Module &)>,
typename PostHookTy = function_ref<bool(size_t, const Module &)>>
std::unique_ptr<lto::LTO> createLTO(
const ArgList &Args, const std::vector<std::string> &Features,
ModuleHook Hook = [](size_t, const Module &) { return true; }) {
PreHookTy PreHook = [](size_t, const Module &) { return true; },
PostHookTy PostHook = [](size_t, const Module &) { return true; }) {
const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
// We need to remove AMD's target-id from the processor if present.
StringRef Arch = Args.getLastArgValue(OPT_arch_EQ).split(":").first;
@@ -671,7 +680,9 @@ std::unique_ptr<lto::LTO> createLTO(
return true;
};
}
Conf.PostOptModuleHook = Hook;
Conf.PreOptModuleHook = PreHook;
Conf.PostOptModuleHook = PostHook;
Conf.CGFileType = (Triple.isNVPTX() || SaveTemps)
? CodeGenFileType::AssemblyFile
: CodeGenFileType::ObjectFile;
@@ -690,6 +701,58 @@ bool isValidCIdentifier(StringRef S) {
[](char C) { return C == '_' || isAlnum(C); });
}
bool writeHostModule(std::string &FileName) {
if (!HostModule)
return false;
if (HostModule->getFunctionList().empty())
return false;
auto HostTriple = HostModule->getTargetTriple();
FileName =
sys::path::filename(ExecutableName).str() + "-host-rpc-" + HostTriple;
auto TempFileOrErr = createOutputFile(FileName, "bc");
if (!TempFileOrErr)
reportError(TempFileOrErr.takeError());
int FD = -1;
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
reportError(errorCodeToError(EC));
auto Out = std::make_unique<llvm::raw_fd_ostream>(FD, true);
WriteBitcodeToFile(*HostModule, *Out);
return true;
}
std::unique_ptr<lto::LTO> createHostRPCLTO(StringRef HostTriple) {
const llvm::Triple Triple(HostTriple);
lto::Config Conf;
lto::ThinBackend Backend;
Backend =
lto::createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
// TODO: host arch?
// Conf.CPU = Arch.str();
Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple);
// TODO: host features?
// Conf.MAttrs = Features;
Conf.CGOptLevel = *CodeGenOpt::getLevel(3);
Conf.OptLevel = 3;
Conf.UseDefaultPipeline = true;
Conf.DefaultTriple = Triple.getTriple();
LTOError = false;
Conf.DiagHandler = diagnosticHandler;
Conf.PTO.LoopVectorization = Conf.OptLevel > 1;
Conf.PTO.SLPVectorization = Conf.OptLevel > 1;
Conf.CGFileType = CodeGenFileType::ObjectFile;
Conf.HasWholeProgramVisibility = false;
return std::make_unique<lto::LTO>(std::move(Conf), Backend);
}
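// Overall flow (as wired up below): the device LTO run publishes the
// HostModule pointer to the pass pipeline via the pre-opt hook, the collected
// host code is serialized by writeHostModule(), and a second, host-targeted
// LTO run compiles that bitcode into HostRPCObjFile, which is later appended
// to the wrapped outputs.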
Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
SmallVectorImpl<StringRef> &OutputFiles,
const ArgList &Args) {
@@ -775,14 +838,52 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
BitcodeOutput.push_back(*TempFileOrErr);
return false;
};
auto AddHostModuleAddr = [&](size_t, const Module &M) {
if (!HostModule)
return true;
Module &CM = const_cast<Module &>(M);
auto *MD = CM.getOrInsertNamedMetadata("llvm.hostrpc.hostmodule");
MD->clearOperands();
MD->addOperand(MDTuple::get(
CM.getContext(), {ConstantAsMetadata::get(ConstantInt::get(
Type::getInt64Ty(CM.getContext()),
reinterpret_cast<uintptr_t>(HostModule)))}));
return true;
};
// We assume visibility of the whole program if every input file was bitcode.
auto Features = getTargetFeatures(BitcodeInputFiles);
auto LTOBackend = Args.hasArg(OPT_embed_bitcode) ||
Args.hasArg(OPT_builtin_bitcode_EQ) ||
Args.hasArg(OPT_clang_backend)
? createLTO(Args, Features, OutputBitcode)
: createLTO(Args, Features);
? createLTO(Args, Features, AddHostModuleAddr, OutputBitcode)
: createLTO(Args, Features, AddHostModuleAddr);
LLVMContext &Ctx = LTOBackend->getContext();
StringRef HostTriple =
Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple());
std::unique_ptr<Module> HostModulePtr;
if (!IsHostModuleGenerated) {
HostModulePtr = std::make_unique<Module>(
sys::path::filename(ExecutableName).str() + "-host-rpc.bc", Ctx);
HostModule = HostModulePtr.get();
HostModulePtr->setTargetTriple(HostTriple);
std::string Msg;
const Target *T =
TargetRegistry::lookupTarget(HostModule->getTargetTriple(), Msg);
if (!T)
return createStringError(inconvertibleErrorCode(), Msg);
auto Options =
codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple(HostTriple));
StringRef CPU = "";
StringRef Features = "";
std::unique_ptr<TargetMachine> TM(
T->createTargetMachine(HostTriple, CPU, Features, Options, Reloc::PIC_,
HostModule->getCodeModel()));
HostModule->setDataLayout(TM->createDataLayout());
}
// We need to resolve the symbols so the LTO backend knows which symbols need
// to be kept or can be internalized. This is a simplified symbol resolution
@@ -806,6 +907,7 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
size_t Idx = 0;
for (auto &Sym : Symbols) {
lto::SymbolResolution &Res = Resolutions[Idx++];
// We will use this as the prevailing symbol definition in LTO unless
@@ -876,6 +978,57 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
if (Error Err = LTOBackend->run(AddStream))
return Err;
std::string HostModuleTempFile;
bool ValidHostModule = writeHostModule(HostModuleTempFile);
// Reset the HostModule pointer.
HostModulePtr.reset();
HostModule = nullptr;
// TODO: this is really redundant code.
if (ValidHostModule) {
auto HostLTO = createHostRPCLTO(HostTriple);
std::string HostBitCodeFile = HostModuleTempFile + ".bc";
auto BufferOrError = MemoryBuffer::getFile(HostBitCodeFile);
if (!BufferOrError)
reportError(createFileError(HostBitCodeFile, BufferOrError.getError()));
Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr =
llvm::lto::InputFile::create(*BufferOrError.get());
if (!BitcodeFileOrErr)
return BitcodeFileOrErr.takeError();
const auto Symbols = (*BitcodeFileOrErr)->symbols();
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
size_t Idx = 0;
for (auto &Sym : Symbols) {
(void)Sym;
lto::SymbolResolution &Res = Resolutions[Idx++];
Res.ExportDynamic = true;
Res.VisibleToRegularObj = true;
Res.LinkerRedefined = false;
Res.Prevailing = true;
}
if (Error Err = HostLTO->add(std::move(*BitcodeFileOrErr), Resolutions))
return Err;
auto RPCAddStream =
[&](size_t Task,
const Twine &ModuleName) -> std::unique_ptr<CachedFileStream> {
int FD = -1;
auto TempFileOrErr = createOutputFile(
sys::path::filename(ExecutableName) + "-host-rpc-" + HostTriple, "o");
if (!TempFileOrErr)
reportError(TempFileOrErr.takeError());
HostRPCObjFile = *TempFileOrErr;
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
reportError(errorCodeToError(EC));
return std::make_unique<CachedFileStream>(
std::make_unique<llvm::raw_fd_ostream>(FD, true));
};
if (Error Err = HostLTO->run(RPCAddStream))
return Err;
}
if (LTOError)
return createStringError(inconvertibleErrorCode(),
"Errors encountered inside the LTO pipeline.");
@@ -1244,6 +1397,9 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
WrappedOutput.push_back(*OutputOrErr);
}
if (!HostRPCObjFile.empty())
WrappedOutput.push_back(HostRPCObjFile);
return WrappedOutput;
}

View File

@@ -181,6 +181,10 @@ set(TARGET_LIBC_ENTRYPOINTS
# gpu/rpc.h entrypoints
libc.src.gpu.rpc_host_call
libc.src.gpu.rpc_open_port
libc.src.gpu.rpc_send_n
libc.src.gpu.rpc_recv_n
libc.src.gpu.rpc_close_port
)
set(TARGET_LIBM_ENTRYPOINTS


@@ -604,6 +604,7 @@ if(LIBC_TARGET_OS_IS_GPU)
DEPENDS
.llvm_libc_common_h
.llvm-libc-types.rpc_opcodes_t
.llvm-libc-types.rpc_port_t
)
endif()


@@ -11,7 +11,10 @@
#include <__llvm-libc-common.h>
#include <llvm-libc-types/size_t.h>
#include <llvm-libc-types/rpc_opcodes_t.h>
#include <llvm-libc-types/rpc_port_t.h>
%%public_api()


@@ -93,6 +93,7 @@ add_header(socklen_t HDR socklen_t.h)
add_header(struct_sockaddr_un HDR struct_sockaddr_un.h)
add_header(struct_sockaddr HDR struct_sockaddr.h)
add_header(rpc_opcodes_t HDR rpc_opcodes_t.h)
add_header(rpc_port_t HDR rpc_port_t.h)
add_header(ACTION HDR ACTION.h)
add_header(ENTRY HDR ENTRY.h)
add_header(struct_hsearch_data HDR struct_hsearch_data.h)


@@ -0,0 +1,16 @@
//===-- Definition of type rpc_port_t -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef __LLVM_LIBC_TYPES_RPC_PORT_T_H__
#define __LLVM_LIBC_TYPES_RPC_PORT_T_H__
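// An opaque, trivially copyable handle for a client RPC port; the GPU
// implementation files static_assert that its size matches
// rpc::Client::Port.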
typedef struct {
__UINT8_TYPE__ reserved[32];
} rpc_port_t;
#endif // __LLVM_LIBC_TYPES_RPC_PORT_T_H__


@@ -15,6 +15,7 @@ typedef enum : unsigned short {
RPC_TEST_NOOP = 1 << 15,
RPC_TEST_INCREMENT,
RPC_TEST_INTERFACE,
RPC_TEST_EXTERNAL,
RPC_TEST_STREAM,
} rpc_test_opcode_t;


@@ -1,8 +1,11 @@
def RPCPortT : NamedType<"rpc_port_t">;
def RPCPortPtrT : PtrType<RPCPortT>;
def GPUExtensions : StandardSpec<"GPUExtensions"> {
HeaderSpec RPC = HeaderSpec<
"gpu/rpc.h",
[], // Macros
[], // Types
[RPCPortT, SizeTType], // Types
[], // Enumerations
[
FunctionSpec<
@@ -10,6 +13,26 @@ def GPUExtensions : StandardSpec<"GPUExtensions"> {
RetValSpec<VoidType>,
[ArgSpec<VoidPtr>, ArgSpec<VoidPtr>, ArgSpec<SizeTType>]
>,
FunctionSpec<
"rpc_open_port",
RetValSpec<RPCPortT>,
[ArgSpec<UnsignedIntType>]
>,
FunctionSpec<
"rpc_send_n",
RetValSpec<VoidType>,
[ArgSpec<RPCPortPtrT>, ArgSpec<VoidPtr>, ArgSpec<SizeTType>]
>,
FunctionSpec<
"rpc_recv_n",
RetValSpec<VoidType>,
[ArgSpec<RPCPortPtrT>, ArgSpec<VoidPtr>, ArgSpec<SizeTPtr>]
>,
FunctionSpec<
"rpc_close_port",
RetValSpec<VoidType>,
[ArgSpec<RPCPortPtrT>]
>,
]
>;
let Headers = [


@@ -25,12 +25,14 @@ namespace LIBC_NAMESPACE::cpp {
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
#endif
// An implementation of bit_cast that cannot use the compiler builtin requires
// a trivially-constructible To, to avoid UB in the implementation.
template <typename To, typename From>
LIBC_INLINE constexpr cpp::enable_if_t<
    (sizeof(To) == sizeof(From)) &&
#if !LIBC_HAS_BUILTIN(__builtin_bit_cast)
cpp::is_trivially_constructible<To>::value &&
#endif
cpp::is_trivially_copyable<To>::value &&
cpp::is_trivially_copyable<From>::value,
To>


@@ -323,6 +323,7 @@ public:
LIBC_INLINE void send_n(const void *src, uint64_t size);
template <typename A>
LIBC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
LIBC_INLINE void recv_n(void *dst, uint64_t *size);
LIBC_INLINE uint16_t get_opcode() const {
return process.header[index].opcode;
@@ -359,7 +360,9 @@ struct Client {
: process(port_count, buffer) {}
using Port = rpc::Port<false>;
template <uint16_t opcode> LIBC_INLINE Port open();
template <uint16_t opcode> LIBC_INLINE Port open() { return open(opcode); }
LIBC_INLINE Port open(uint16_t opcode);
private:
Process<false> process;
@@ -484,6 +487,14 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
}
}
/// Helper routine to simplify the interface when receiving from the GPU using
/// thread-private pointers to the underlying value, where the destination
/// pointer already provides enough storage to receive the values.
template <bool T> LIBC_INLINE void Port<T>::recv_n(void *dst, uint64_t *size) {
void **dst_ptr = &dst;
recv_n(dst_ptr, size, [=](uint64_t) { return dst; });
}
/// Receives an arbitrarily sized data buffer across the shared channel in
/// multiples of the packet length. The \p alloc function is called with the
/// size of the data so that we can initialize the size of the \p dst buffer.
@@ -522,9 +533,9 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
/// is, there are send operations pending that haven't been serviced on this
/// port. Each port instance uses an associated \p opcode to tell the server
/// what to do. The Client interface provides the appropriate lane size to the
/// port using the platform's returned value.
template <uint16_t opcode>
[[clang::convergent]] LIBC_INLINE Client::Port Client::open() {
/// port using the platform's returned value. It is required that \p opcode is
/// uniform between all the lanes for this to work.
[[clang::convergent]] LIBC_INLINE Client::Port Client::open(uint16_t opcode) {
// Repeatedly perform a naive linear scan for a port that can be opened to
// send data.
for (uint32_t index = gpu::get_cluster_id();; ++index) {


@@ -995,24 +995,13 @@ LIBC_INLINE_VAR constexpr bool is_big_int_v = is_big_int<T>::value;
namespace cpp {
// Specialization of cpp::bit_cast ('bit.h') from T to BigInt.
template <typename To, typename From>
LIBC_INLINE constexpr cpp::enable_if_t<
(sizeof(To) == sizeof(From)) && cpp::is_trivially_copyable<To>::value &&
cpp::is_trivially_copyable<From>::value && is_big_int<To>::value,
To>
bit_cast(const From &from) {
To out;
using Storage = decltype(out.val);
out.val = cpp::bit_cast<Storage>(from);
return out;
}
// Specialization of cpp::bit_cast ('bit.h') from BigInt to T.
template <typename To, size_t Bits>
LIBC_INLINE constexpr cpp::enable_if_t<
sizeof(To) == sizeof(UInt<Bits>) &&
#if !LIBC_HAS_BUILTIN(__builtin_bit_cast)
cpp::is_trivially_constructible<To>::value &&
#endif
cpp::is_trivially_copyable<To>::value &&
cpp::is_trivially_copyable<UInt<Bits>>::value,
To>


@@ -8,3 +8,58 @@ add_entrypoint_object(
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
add_entrypoint_object(
rpc_open_port
SRCS
rpc_open_port.cpp
HDRS
rpc_open_port.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
add_entrypoint_object(
rpc_close_port
SRCS
rpc_close_port.cpp
HDRS
rpc_close_port.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
add_entrypoint_object(
rpc_get_buffer
SRCS
rpc_get_buffer.cpp
HDRS
rpc_get_buffer.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
add_entrypoint_object(
rpc_send_n
SRCS
rpc_send_n.cpp
HDRS
rpc_send_n.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
add_entrypoint_object(
rpc_recv_n
SRCS
rpc_recv_n.cpp
HDRS
rpc_recv_n.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)


@@ -0,0 +1,25 @@
//===---------- GPU implementation of the external RPC port interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/gpu/rpc_close_port.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
LLVM_LIBC_FUNCTION(void, rpc_close_port, (rpc_port_t * handle)) {
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
port.close();
*handle = cpp::bit_cast<rpc_port_t>(port);
}
} // namespace LIBC_NAMESPACE


@@ -0,0 +1,20 @@
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H
#define LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H
#include <gpu/rpc.h>
namespace LIBC_NAMESPACE {
void rpc_close_port(rpc_port_t *handle);
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H


@@ -0,0 +1,26 @@
//===---------- GPU implementation of the external RPC port interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/gpu/rpc_open_port.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
LLVM_LIBC_FUNCTION(rpc_port_t, rpc_open_port, (unsigned opcode)) {
uint32_t uniform = gpu::broadcast_value(gpu::get_lane_mask(), opcode);
rpc::Client::Port port = rpc::client.open(static_cast<uint16_t>(uniform));
return cpp::bit_cast<rpc_port_t>(port);
}
} // namespace LIBC_NAMESPACE


@@ -0,0 +1,20 @@
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H
#define LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H
#include <gpu/rpc.h>
namespace LIBC_NAMESPACE {
rpc_port_t rpc_open_port(unsigned opcode);
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H


@@ -0,0 +1,26 @@
//===---------- GPU implementation of the external RPC port interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/gpu/rpc_recv_n.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
LLVM_LIBC_FUNCTION(void, rpc_recv_n,
(rpc_port_t * handle, void *dst, size_t *size)) {
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
port.recv_n(dst, reinterpret_cast<uint64_t *>(size));
*handle = cpp::bit_cast<rpc_port_t>(port);
}
} // namespace LIBC_NAMESPACE

libc/src/gpu/rpc_recv_n.h Normal file

@@ -0,0 +1,20 @@
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_GPU_RPC_RECV_N_H
#define LLVM_LIBC_SRC_GPU_RPC_RECV_N_H
#include <gpu/rpc.h>
namespace LIBC_NAMESPACE {
void rpc_recv_n(rpc_port_t *handle, void *dst, size_t *size);
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_GPU_RPC_RECV_N_H


@@ -0,0 +1,26 @@
//===---------- GPU implementation of the external RPC port interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/gpu/rpc_send_n.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
LLVM_LIBC_FUNCTION(void, rpc_send_n,
(rpc_port_t * handle, const void *src, size_t size)) {
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
port.send_n(src, size);
*handle = cpp::bit_cast<rpc_port_t>(port);
}
} // namespace LIBC_NAMESPACE

libc/src/gpu/rpc_send_n.h Normal file

@@ -0,0 +1,20 @@
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_GPU_RPC_SEND_N_H
#define LLVM_LIBC_SRC_GPU_RPC_SEND_N_H
#include <gpu/rpc.h>
namespace LIBC_NAMESPACE {
void rpc_send_n(rpc_port_t *handle, const void *src, size_t size);
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_GPU_RPC_SEND_N_H
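
Taken together with rpc_open_port and rpc_close_port, these entry points form a complete C-style round-trip. A minimal device-side sketch of the calling convention (illustrative only; it mirrors the integration test further below, whose RPC_TEST_EXTERNAL opcode is assumed here):

// Hedged sketch of one send/receive round-trip over the exported interface.
uint64_t value = 0;
size_t size = sizeof(uint64_t);
rpc_port_t port = LIBC_NAMESPACE::rpc_open_port(RPC_TEST_EXTERNAL);
LIBC_NAMESPACE::rpc_send_n(&port, &value, size);  // ship the buffer to the host
LIBC_NAMESPACE::rpc_recv_n(&port, &value, &size); // read back the host's reply
LIBC_NAMESPACE::rpc_close_port(&port);            // release the port for reuse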


@@ -44,6 +44,21 @@ add_integration_test(
rpc_interface_test.cpp
)
add_integration_test(
startup_rpc_external_interface_test
SUITE libc-startup-tests
SRCS
rpc_external_interface_test.cpp
DEPENDS
libc.src.gpu.rpc_open_port
libc.src.gpu.rpc_send_n
libc.src.gpu.rpc_recv_n
libc.src.gpu.rpc_close_port
LOADER_ARGS
--threads 32
--blocks 8
)
add_integration_test(
startup_rpc_stream_test
SUITE libc-startup-tests


@@ -0,0 +1,43 @@
//===-- Loader test to check the external RPC interface with the loader ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <gpu/rpc.h>
#include "src/gpu/rpc_close_port.h"
#include "src/gpu/rpc_open_port.h"
#include "src/gpu/rpc_recv_n.h"
#include "src/gpu/rpc_send_n.h"
#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "test/IntegrationTest/test.h"
using namespace LIBC_NAMESPACE;
static void test_interface() {
uint32_t num_additions =
10 + 10 * gpu::get_thread_id() + 10 * gpu::get_block_id();
uint64_t cnt = 0;
for (uint32_t i = 0; i < num_additions; ++i) {
size_t size = sizeof(uint64_t);
rpc_port_t port = LIBC_NAMESPACE::rpc_open_port(RPC_TEST_EXTERNAL);
LIBC_NAMESPACE::rpc_send_n(&port, &cnt, size);
LIBC_NAMESPACE::rpc_recv_n(&port, &cnt, &size);
LIBC_NAMESPACE::rpc_close_port(&port);
ASSERT_TRUE(size == sizeof(uint64_t));
}
ASSERT_TRUE(cnt == num_additions && "Invalid number of increments");
}
TEST_MAIN(int argc, char **argv, char **envp) {
test_interface();
return 0;
}


@@ -11,8 +11,8 @@
#include "utils/gpu/server/llvmlibc_rpc_server.h"
#include "llvm-libc-types/rpc_opcodes_t.h"
#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
#include "llvm-libc-types/rpc_opcodes_t.h"
#include <cstddef>
#include <cstdint>
@@ -222,6 +222,28 @@ inline void register_rpc_callbacks(uint32_t device_id) {
}
},
nullptr);
// Register the external RPC interface test handler.
rpc_register_callback(
device_id, static_cast<rpc_opcode_t>(RPC_TEST_EXTERNAL),
[](rpc_port_t port, void *data) {
uint64_t sizes[lane_size] = {0};
void *dst[lane_size] = {nullptr};
rpc_recv_n(
port, dst, sizes,
[](uint64_t size, void *) -> void * { return new char[size]; },
nullptr);
for (uint64_t i = 0; i < lane_size; ++i) {
if (dst[i])
*reinterpret_cast<uint64_t *>(dst[i]) += 1;
}
rpc_send_n(port, dst, sizes);
for (uint64_t i = 0; i < lane_size; ++i) {
if (dst[i])
delete[] reinterpret_cast<uint8_t *>(dst[i]);
}
},
nullptr);
}
#endif


@@ -25,6 +25,19 @@ enum OMPTgtExecModeFlags : unsigned char {
OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
};
enum OMPTgtHostRPCArgType {
// No need to copy.
OMP_HOST_RPC_ARG_SCALAR = 0,
OMP_HOST_RPC_ARG_PTR = 1,
// Copy to device.
OMP_HOST_RPC_ARG_COPY_TO = OMP_HOST_RPC_ARG_PTR | (1 << 1),
// Copy from device.
OMP_HOST_RPC_ARG_COPY_FROM = OMP_HOST_RPC_ARG_PTR | (1 << 2),
// Copy to and from device.
OMP_HOST_RPC_ARG_COPY_TOFROM =
OMP_HOST_RPC_ARG_COPY_TO | OMP_HOST_RPC_ARG_COPY_FROM,
};
} // end namespace omp
} // end namespace llvm
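
Because both copy flags include the OMP_HOST_RPC_ARG_PTR bit, a consumer can classify an argument with plain mask tests. A small sketch (ArgTy is a hypothetical flag value of this enum):

// Hypothetical decoding of an OMPTgtHostRPCArgType value.
bool IsPtr = ArgTy & OMP_HOST_RPC_ARG_PTR;
bool CopyTo = (ArgTy & OMP_HOST_RPC_ARG_COPY_TO) == OMP_HOST_RPC_ARG_COPY_TO;
bool CopyFrom = (ArgTy & OMP_HOST_RPC_ARG_COPY_FROM) == OMP_HOST_RPC_ARG_COPY_FROM;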


@@ -227,7 +227,9 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
__OMP_RTL(omp_get_thread_num, false, Int32, )
__OMP_RTL(omp_get_bulk_thread_num, false, Int32, )
__OMP_RTL(omp_get_num_threads, false, Int32, )
__OMP_RTL(omp_get_bulk_num_threads, false, Int32, )
__OMP_RTL(omp_get_max_threads, false, Int32, )
__OMP_RTL(omp_in_parallel, false, Int32, )
__OMP_RTL(omp_get_dynamic, false, Int32, )
@@ -490,6 +492,8 @@ __OMP_RTL(__kmpc_reduction_get_fixed_buffer, false, VoidPtr, )
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
__OMP_RTL(malloc, false, VoidPtr, SizeTy)
__OMP_RTL(free, false, Void, VoidPtr)
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
@@ -503,6 +507,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
__OMP_RTL(__kmpc_launch_parallel_51_kernel, false, Void, Int8Ptr, Int32, Int32,
Int32, VoidPtrPtr, Int64)
__OMP_RTL(__last, false, Void, )
#undef __OMP_RTL
@@ -710,6 +717,8 @@ __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_bulk_thread_num, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_bulk_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_dynamic, GetterAttrs, SExt, ParamAttrs())


@@ -60,6 +60,9 @@ struct Config {
bool VerifyEach = false;
bool DisableVerify = false;
/// Use the standard optimization pipeline.
bool UseDefaultPipeline = false;
/// Flag to indicate that the optimizer should not assume builtins are present
/// on the target.
bool Freestanding = false;


@@ -303,6 +303,9 @@ public:
/// by LTO but might not be visible from bitcode symbol table.
static ArrayRef<const char*> getRuntimeLibcallSymbols();
/// Returns the context.
LLVMContext &getContext() { return RegularLTO.Ctx; }
private:
Config Conf;


@@ -0,0 +1,31 @@
//===- Transforms/IPO/HostRPC.h - Automatic host RPC ------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares the HostRPC pass, which rewrites device-side calls to
// functions that are only defined on the host into host RPC invocations.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_IPO_HOSTRPC_H
#define LLVM_TRANSFORMS_IPO_HOSTRPC_H
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/IR/PassManager.h"
namespace llvm {
class HostRPCPass : public PassInfoMixin<HostRPCPass> {
public:
HostRPCPass() : LTOPhase(ThinOrFullLTOPhase::None) {}
HostRPCPass(ThinOrFullLTOPhase LTOPhase) : LTOPhase(LTOPhase) {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
private:
const ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None;
};
} // namespace llvm
#endif // LLVM_TRANSFORMS_IPO_HOSTRPC_H


@@ -0,0 +1,30 @@
//===--------------- CanonicalizeMainFunction.h -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility function to canonicalize main function.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
#define LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
#include "llvm/IR/PassManager.h"
namespace llvm {
/// A pass that canonicalizes main function in a module.
class CanonicalizeMainFunctionPass
: public PassInfoMixin<CanonicalizeMainFunctionPass> {
public:
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H


@@ -330,6 +330,8 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
report_fatal_error(Twine("unable to parse pass pipeline description '") +
Conf.OptPipeline + "': " + toString(std::move(Err)));
}
} else if (Conf.UseDefaultPipeline) {
MPM.addPass(PB.buildPerModuleDefaultPipeline(OL));
} else if (IsThinLTO) {
MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary));
} else {
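
The flag gives LTO clients a way to opt out of the (Thin)LTO pipelines in favor of the standard per-module pipeline. A minimal sketch of a client enabling it (assuming the usual lto::LTO setup; error handling omitted):

lto::Config Conf;
Conf.OptLevel = 2;
// Run buildPerModuleDefaultPipeline instead of the (Thin)LTO pipelines.
Conf.UseDefaultPipeline = true;
lto::LTO Backend(std::move(Conf));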


@@ -141,6 +141,7 @@
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include "llvm/Transforms/IPO/IROutliner.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
@@ -264,6 +265,7 @@
#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/DXILUpgrade.h"
#include "llvm/Transforms/Utils/Debugify.h"


@@ -53,6 +53,7 @@
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include "llvm/Transforms/IPO/IROutliner.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
@@ -126,6 +127,7 @@
#include "llvm/Transforms/Utils/AddDiscriminators.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
@@ -182,9 +184,9 @@ static cl::opt<bool> EnablePostPGOLoopRotation(
"enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden,
cl::desc("Run the loop rotation transformation after PGO instrumentation"));
static cl::opt<bool>
EnableGlobalAnalyses("enable-global-analyses", cl::init(true), cl::Hidden,
cl::desc("Enable inter-procedural analyses"));
static cl::opt<bool>
RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
@@ -306,6 +308,13 @@ cl::opt<bool> EnableMemProfContextDisambiguation(
extern cl::opt<bool> EnableInferAlignmentPass;
} // namespace llvm
static cl::opt<bool> EnableHostRPC("enable-host-rpc", cl::init(false),
cl::Hidden, cl::desc("Enable HostRPC pass"));
static cl::opt<bool> EnableCanonicalizeMainFunction(
"enable-canonicalize-main-function", cl::init(false), cl::Hidden,
cl::desc("Enable CanonicalizeMainFunction pass"));
PipelineTuningOptions::PipelineTuningOptions() {
LoopInterleaving = true;
LoopVectorization = true;
@@ -1091,6 +1100,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
}
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
// Try to perform OpenMP specific optimizations on the module. This is a
// (quick!) no-op if there are no OpenMP runtime calls present in the module.
MPM.addPass(OpenMPOptPass());
@@ -1110,11 +1122,10 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
// and prior to optimizing globals.
// FIXME: This position in the pipeline hasn't been carefully considered in
// years, it should be re-analyzed.
MPM.addPass(IPSCCPPass(IPSCCPOptions(/*AllowFuncSpec=*/
Level != OptimizationLevel::Os &&
Level != OptimizationLevel::Oz &&
!isLTOPreLink(Phase))));
// Attach metadata to indirect call sites indicating the set of functions
// they may target at run-time. This should follow IPSCCP.
@@ -1588,7 +1599,7 @@ PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO,
ModulePassManager
PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
if (Level == OptimizationLevel::O0)
return buildO0DefaultPipeline(Level, /*LTOPreLink*/ true);
ModulePassManager MPM;
@@ -1711,6 +1722,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// in the current module.
MPM.addPass(CrossDSOCFIPass());
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
if (Level == OptimizationLevel::O0) {
// The WPD and LowerTypeTest passes need to run at -O0 to lower type
// metadata and intrinsics.
@@ -1741,6 +1755,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));
if (EnableHostRPC)
MPM.addPass(HostRPCPass(ThinOrFullLTOPhase::FullLTOPostLink));
// Remove unused virtual tables to improve the quality of code generated by
// whole-program devirtualization and bitset lowering.
MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));
@@ -2115,6 +2132,9 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
return MPM;
}


@@ -47,6 +47,7 @@ MODULE_PASS("attributor", AttributorPass())
MODULE_PASS("attributor-light", AttributorLightPass())
MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass())
MODULE_PASS("canonicalize-main-function", CanonicalizeMainFunctionPass())
MODULE_PASS("check-debugify", NewPMCheckDebugifyPass())
MODULE_PASS("constmerge", ConstantMergePass())
MODULE_PASS("coro-cleanup", CoroCleanupPass())
@@ -141,6 +142,7 @@ MODULE_PASS("tsan-module", ModuleThreadSanitizerPass())
MODULE_PASS("verify", VerifierPass())
MODULE_PASS("view-callgraph", CallGraphViewerPass())
MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
MODULE_PASS("host-rpc", HostRPCPass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS
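
With these registrations, the new passes can be requested by name through the usual textual pipeline syntax. A minimal sketch (assuming a fully configured PassBuilder PB with the analysis managers already cross-registered):

ModulePassManager MPM;
if (auto Err = PB.parsePassPipeline(MPM, "canonicalize-main-function,host-rpc"))
report_fatal_error(std::move(Err)); // e.g. a typo in a pass name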


@@ -397,12 +397,25 @@ static bool getPotentialCopiesOfMemoryValue(
dbgs() << "Underlying object is a valid nullptr, giving up.\n";);
return false;
}
if (!isa<AllocaInst>(&Obj) && !isa<GlobalVariable>(&Obj)) {
auto *CB = dyn_cast<CallBase>(&Obj);
bool R = false;
if (CB) {
auto &AAN =
*A.getOrCreateAAFor<AANoAlias>(IRPosition::callsite_returned(*CB),
&QueryingAA, DepClassTy::OPTIONAL);
if (AAN.isValidState())
R = AAN.isAssumedNoAlias();
else
R = isNoAliasCall(&Obj);
if (!R)
R = isAllocationFn(&Obj, TLI);
}
if (!R) {
LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << Obj
<< "\n";);
return false;
}
}
if (auto *GV = dyn_cast<GlobalVariable>(&Obj))
if (!GV->hasLocalLinkage() &&
@@ -2226,7 +2239,8 @@ void Attributor::runTillFixpoint() {
LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: "
<< IterationCounter << "/" << MaxIterations
<< " iterations\n");
<< " iterations. #Changed AAs: " << ChangedAAs.size()
<< "\n");
// Reset abstract arguments not settled in a sound fixpoint by now. This
// happens when we stopped the fixpoint iteration early. Note that only the
@@ -2241,13 +2255,21 @@ void Attributor::runTillFixpoint() {
AbstractState &State = ChangedAA->getState();
if (!State.isAtFixpoint()) {
LLVM_DEBUG(
dbgs() << "\n[Attributor] Invalidate AA " << *ChangedAA
<< " as it is not at fix point after max iterations.\n");
State.indicatePessimisticFixpoint();
NumAttributesTimedOut++;
}
for (auto &DepIt : ChangedAA->Deps) {
LLVM_DEBUG(dbgs() << "\n[Attributor] Invalidate dependent AA "
<< *(DepIt.getPointer()) << " of AA " << *ChangedAA
<< "\n");
ChangedAAs.push_back(cast<AbstractAttribute>(DepIt.getPointer()));
}
ChangedAA->Deps.clear();
}


@@ -1669,6 +1669,8 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
// might change while we iterate through a loop. For now, we give up if
// the PHI is not invariant.
if (isa<PHINode>(Usr)) {
if (!Usr->getType()->isPointerTy())
return false;
// Note the order here, the Usr access might change the map, CurPtr is
// already in it though.
bool IsFirstPHIUser = !OffsetInfoMap.count(Usr);
@@ -1899,6 +1901,31 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
return false;
}
if (auto *II = dyn_cast<ICmpInst>(Usr)) {
auto CheckIfUsedAsPred = [&](const Use &U, bool &Follow) {
const auto *UU = U.getUser();
if (isa<SelectInst>(UU))
return true;
if (isa<BranchInst>(UU))
return true;
LLVM_DEBUG(dbgs() << "[AAPointerInfo] ICmpInst user not handled " << *UU
<< "\n");
return false;
};
if (!A.checkForAllUses(CheckIfUsedAsPred, *this, *II,
/* CheckBBLivenessOnly */ true,
DepClassTy::OPTIONAL,
/* IgnoreDroppableUses */ true)) {
LLVM_DEBUG(
dbgs() << "[AAPointerInfo] Check for all uses failed for ICmpInst "
<< *II << "\n");
return false;
}
return true;
}
LLVM_DEBUG(dbgs() << "[AAPointerInfo] User not handled " << *Usr << "\n");
return false;
};
@@ -2040,6 +2067,7 @@ struct AAPointerInfoCallSiteReturned final : AAPointerInfoFloating {
};
} // namespace
/// -----------------------NoUnwind Function Attribute--------------------------
namespace {
@@ -11899,10 +11927,35 @@ struct AAUnderlyingObjectsImpl
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override {}
// TODO: This is a temporary solution for terminals.
bool checkIfTerminals(Attributor &A, Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
Function *Scope = CI->getFunction();
const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*Scope);
LibFunc TLIFn;
if (TLI && TLI->getLibFunc(*CI, TLIFn)) {
if (TLIFn == LibFunc::LibFunc_malloc ||
TLIFn == LibFunc::LibFunc___kmpc_alloc_shared)
return true;
}
return false;
}
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
auto &Ptr = getAssociatedValue();
if (checkIfTerminals(A, &Ptr)) {
bool Changed = false;
Changed |= InterAssumedUnderlyingObjects.insert(&Ptr);
Changed |= IntraAssumedUnderlyingObjects.insert(&Ptr);
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
}
auto DoUpdate = [&](SmallSetVector<Value *, 8> &UnderlyingObjects,
AA::ValueScope Scope) {
bool UsedAssumedInformation = false;
@@ -11918,6 +11971,10 @@ struct AAUnderlyingObjectsImpl
for (unsigned I = 0; I < Values.size(); ++I) {
auto &VAC = Values[I];
auto *Obj = VAC.getValue();
if (checkIfTerminals(A, Obj)) {
Changed |= UnderlyingObjects.insert(Obj);
continue;
}
Value *UO = getUnderlyingObject(Obj);
if (UO && UO != VAC.getValue() && SeenObjects.insert(UO).second) {
const auto *OtherAA = A.getAAFor<AAUnderlyingObjects>(
@@ -11948,6 +12005,16 @@ struct AAUnderlyingObjectsImpl
continue;
}
if (auto *LI = dyn_cast<LoadInst>(Obj)) {
LLVM_DEBUG({
dbgs() << "[AAUnderlyingObjects] for CtxI ";
getCtxI()->print(dbgs());
dbgs() << " at position " << getIRPosition() << " has LoadInst ";
LI->print(dbgs());
dbgs() << '\n';
});
}
Changed |= UnderlyingObjects.insert(Obj);
}
@@ -11958,6 +12025,8 @@ struct AAUnderlyingObjectsImpl
Changed |= DoUpdate(IntraAssumedUnderlyingObjects, AA::Intraprocedural);
Changed |= DoUpdate(InterAssumedUnderlyingObjects, AA::Interprocedural);
LLVM_DEBUG(dumpState(dbgs()));
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
}
@@ -11996,6 +12065,19 @@ private:
return Changed;
}
void dumpState(raw_ostream &O) {
O << "Underlying objects:\nintra procedureal:\n";
for (auto *Obj : IntraAssumedUnderlyingObjects) {
Obj->print(O);
O << '\n';
}
O << "inter procedureal:\n";
for (auto *Obj : InterAssumedUnderlyingObjects) {
Obj->print(O);
O << '\n';
}
}
/// All the underlying objects collected so far via intra procedural scope.
SmallSetVector<Value *, 8> IntraAssumedUnderlyingObjects;
/// All the underlying objects collected so far via inter procedural scope.


@@ -20,6 +20,7 @@ add_llvm_component_library(LLVMipo
GlobalDCE.cpp
GlobalOpt.cpp
GlobalSplit.cpp
HostRPC.cpp
HotColdSplitting.cpp
IPO.cpp
IROutliner.cpp


@@ -0,0 +1,890 @@
//===- Transforms/IPO/HostRPC.cpp - Automatic host RPC ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the HostRPC pass. For every device call to a function
// that is only defined on the host, it emits a device wrapper that marshals
// the arguments through the host RPC runtime and a host wrapper that
// unmarshals them and performs the actual call.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include <cstdint>
#define DEBUG_TYPE "host-rpc"
using namespace llvm;
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
static cl::opt<bool>
UseDummyHostModule("host-rpc-use-dummy-host-module", cl::init(false),
cl::Hidden,
cl::desc("Use dummy host module if there no host module "
"attached to the device module"));
namespace {
enum class HostRPCRuntimeFunction {
#define __OMPRTL_HOST_RPC(_ENUM) OMPRTL_##_ENUM
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper),
__OMPRTL_HOST_RPC(__last),
#undef __OMPRTL_HOST_RPC
};
#define __OMPRTL_HOST_RPC(_ENUM) \
auto OMPRTL_##_ENUM = HostRPCRuntimeFunction::OMPRTL_##_ENUM;
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper)
#undef __OMPRTL_HOST_RPC
// TODO: Remove those functions that are implemented in the device runtime.
static constexpr const char *InternalPrefix[] = {
"__kmp", "llvm.", "nvm.",
"omp_", "vprintf", "malloc",
"free", "__keep_alive", "__llvm_omp_vprintf"};
bool isInternalFunction(Function &F) {
auto Name = F.getName();
for (auto *P : InternalPrefix)
if (Name.starts_with(P))
return true;
return false;
}
std::string typeToString(Type *T) {
if (T->is16bitFPTy())
return "f16";
if (T->isFloatTy())
return "f32";
if (T->isDoubleTy())
return "f64";
if (T->isPointerTy())
return "ptr";
if (T->isStructTy())
return std::string(T->getStructName());
if (T->isIntegerTy())
return "i" + std::to_string(T->getIntegerBitWidth());
LLVM_DEBUG(dbgs() << "[HostRPC] unknown type " << *T
<< " for typeToString.\n";);
llvm_unreachable("unknown type");
}
class HostRPC {
/// LLVM context instance
LLVMContext &Context;
/// Device module.
Module &M;
/// Host module
Module &HM;
/// Data layout of the device module.
DataLayout DL;
IRBuilder<> Builder;
/// External functions we are operating on.
SmallSetVector<Function *, 8> FunctionWorkList;
/// Attributor instance.
Attributor &A;
// Types
Type *Int8PtrTy;
Type *VoidTy;
Type *Int32Ty;
Type *Int64Ty;
StructType *ArgInfoTy;
// Values
Constant *NullPtr;
Constant *NullInt64;
struct CallSiteInfo {
CallInst *CI = nullptr;
SmallVector<Type *> Params;
};
struct HostRPCArgInfo {
Value *BasePtr = nullptr;
Constant *Type = nullptr;
Value *Size = nullptr;
};
/// Table of host wrapper functions, indexed by the call number passed to the
/// device wrapper.
SmallVector<Function *> HostEntryTable;
EnumeratedArray<Function *, HostRPCRuntimeFunction,
HostRPCRuntimeFunction::OMPRTL___last>
RFIs;
SmallVector<std::pair<CallInst *, CallInst *>> CallInstMap;
Constant *getConstantInt64(uint64_t Val) {
return ConstantInt::get(Int64Ty, Val);
}
static std::string getWrapperFunctionName(Function *F, CallSiteInfo &CSI) {
std::string Name = "__kmpc_host_rpc_wrapper_" + std::string(F->getName());
if (!F->isVarArg())
return Name;
for (unsigned I = F->getFunctionType()->getNumParams();
I < CSI.Params.size(); ++I) {
Name.push_back('_');
Name.append(typeToString(CSI.Params[I]));
}
return Name;
}
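// Illustrative example (hypothetical call site): for a variadic call such as
//   fprintf(stream, "%f %d", x, n)   with x : double, n : int
// the fixed parameters are skipped and the variadic tail {double, i32} is
// mangled in, yielding the wrapper name
//   __kmpc_host_rpc_wrapper_fprintf_f64_i32
// so every distinct variadic signature gets its own device/host wrapper pair.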
void registerAAs();
Value *convertToInt64Ty(Value *V);
Value *convertFromInt64TyTo(Value *V, Type *TargetTy);
Constant *convertToInt64Ty(Constant *C);
Constant *convertFromInt64TyTo(Constant *C, Type *T);
// int device_wrapper(call_no, arg_info, ...) {
// void *desc = __kmpc_host_rpc_get_desc(call_no, num_args, arg_info);
// __kmpc_host_rpc_add_arg(desc, arg1, sizeof(arg1));
// __kmpc_host_rpc_add_arg(desc, arg2, sizeof(arg2));
// ...
// int r = (int)__kmpc_host_rpc_send_and_wait(desc);
// return r;
// }
Function *getDeviceWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
// void host_wrapper(desc) {
// int arg1 = (int)__kmpc_host_rpc_get_arg(desc, 0);
// float arg2 = (float)__kmpc_host_rpc_get_arg(desc, 1);
// char *arg3 = (char *)__kmpc_host_rpc_get_arg(desc, 2);
// ...
// int r = actual_call(arg1, arg2, arg3, ...);
// __kmpc_host_rpc_set_ret_val(desc, (int64_t)r);
// }
Function *getHostWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
bool rewriteWithHostRPC(Function *F);
void emitHostWrapperInvoker();
bool recollectInformation();
public:
HostRPC(Module &DeviceModule, Module &HostModule, Attributor &A)
: Context(DeviceModule.getContext()), M(DeviceModule), HM(HostModule),
DL(M.getDataLayout()), Builder(Context), A(A) {
assert(&M.getContext() == &HM.getContext() &&
"device and host modules have different context");
Int8PtrTy = PointerType::getUnqual(Context);
VoidTy = Type::getVoidTy(Context);
Int32Ty = Type::getInt32Ty(Context);
Int64Ty = Type::getInt64Ty(Context);
NullPtr = ConstantInt::getNullValue(Int8PtrTy);
NullInt64 = ConstantInt::getNullValue(Int64Ty);
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...) \
{ \
SmallVector<Type *> Params{__VA_ARGS__}; \
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
Function *F = (MOD).getFunction(#_ENUM); \
if (!F) \
F = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage, \
#_ENUM, (MOD)); \
RFIs[OMPRTL_##_ENUM] = F; \
}
__OMP_RTL(__kmpc_host_rpc_get_desc, M, false, Int8PtrTy, Int32Ty, Int32Ty,
Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_add_arg, M, false, VoidTy, Int8PtrTy, Int64Ty,
Int32Ty)
__OMP_RTL(__kmpc_host_rpc_send_and_wait, M, false, Int64Ty, Int8PtrTy)
__OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
__OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
Int64Ty)
__OMP_RTL(__kmpc_host_rpc_invoke_host_wrapper, HM, false, VoidTy, Int32Ty,
Int8PtrTy)
#undef __OMP_RTL
ArgInfoTy = StructType::create({Int64Ty, Int64Ty, Int64Ty, Int8PtrTy},
"struct.arg_info_t");
}
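// C-level view of the struct.arg_info_t records created above. Each pointer
// argument of a rewritten call site carries a singly linked list of these
// nodes, one per underlying object (see rewriteWithHostRPC below):
//   struct arg_info_t { int64_t base; int64_t type; int64_t size; void *next; };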
bool run();
};
Value *HostRPC::convertToInt64Ty(Value *V) {
if (auto *C = dyn_cast<Constant>(V))
return convertToInt64Ty(C);
Type *T = V->getType();
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreatePtrToInt(V, Int64Ty);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
if (T->isFloatingPointTy()) {
V = Builder.CreateBitCast(
V, Type::getIntNTy(V->getContext(), T->getScalarSizeInBits()));
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
}
llvm_unreachable("unknown cast to int64_t");
}
Value *HostRPC::convertFromInt64TyTo(Value *V, Type *T) {
if (auto *C = dyn_cast<Constant>(V))
return convertFromInt64TyTo(C, T);
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreateIntToPtr(V, T);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, T, /* isSigned */ true);
if (T->isFloatingPointTy()) {
V = Builder.CreateIntCast(
V, Type::getIntNTy(V->getContext(), T->getScalarSizeInBits()),
/* isSigned */ true);
return Builder.CreateBitCast(V, T);
}
llvm_unreachable("unknown cast from int64_t");
}
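// Marshaling example: a float argument travels through the i64 slot as
//   bitcast float -> i32, then sign-extend i32 -> i64 (convertToInt64Ty)
// and is restored on the other end as
//   truncate i64 -> i32, then bitcast i32 -> float (convertFromInt64TyTo).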
Constant *HostRPC::convertToInt64Ty(Constant *C) {
Type *T = C->getType();
if (T == Int64Ty)
return C;
if (T->isPointerTy())
return ConstantExpr::getPtrToInt(C, Int64Ty);
if (T->isIntegerTy())
llvm_unreachable("I don't know how to fixe this");
//return ConstantExpr::getIntegerCast(C, Int64Ty, /* isSigned */ true);
if (T->isFloatingPointTy()) {
// TODO: FIXEME getIntegerCast is hard to implement with new version of ConstExpr
//C = ConstantExpr::getBitCast(
// C, Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()));
//return ConstantExpr::getIntegerCast(C, Int64Ty, /* isSigned */ true);
llvm_unreachable("unsuported cast from float to int64_t");
}
llvm_unreachable("unknown cast to int64_t");
}
Constant *HostRPC::convertFromInt64TyTo(Constant *C, Type *T) {
if (T == Int64Ty)
return C;
if (T->isPointerTy())
return ConstantExpr::getIntToPtr(C, T);
if (T->isIntegerTy())
llvm_unreachable("I don't know how to fixe this");
//return ConstantExpr::getIntegerCast(C, T, /* isSigned */ true);
if (T->isFloatingPointTy()) {
// TODO: FIXEME getIntegerCast is hard to implement with new version of ConstExpr
//C = ConstantExpr::getIntegerCast(
// C, Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()),
// /* isSigned */ true);
//return ConstantExpr::getBitCast(C, T);
llvm_unreachable("unsuported cast from int64_t to float");
}
llvm_unreachable("unknown cast from int64_t");
}
void HostRPC::registerAAs() {
for (auto *F : FunctionWorkList)
for (User *U : F->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
for (unsigned I = 0; I < CI->arg_size(); ++I) {
Value *Operand = CI->getArgOperand(I);
if (!Operand->getType()->isPointerTy())
continue;
A.getOrCreateAAFor<AAUnderlyingObjects>(
IRPosition::callsite_argument(*CI, I),
/* QueryingAA */ nullptr, DepClassTy::NONE);
}
}
}
bool HostRPC::recollectInformation() {
FunctionWorkList.clear();
for (Function &F : M) {
// If the function is already defined, it definitely does not require RPC.
if (!F.isDeclaration())
continue;
// If it is an internal function, skip it as well.
if (isInternalFunction(F))
continue;
// If there is no use of the function, skip it.
if (F.use_empty())
continue;
FunctionWorkList.insert(&F);
}
return !FunctionWorkList.empty();
}
bool HostRPC::run() {
bool Changed = false;
if (!recollectInformation())
return Changed;
Changed = true;
// We add a couple of assumptions to those RPC functions so that the AAs do
// not bail out because the implementations of those functions are unknown.
for (Function &F : M) {
if (!F.isDeclaration())
continue;
F.addFnAttr(Attribute::NoRecurse);
for (auto &Arg : F.args())
if (Arg.getType()->isPointerTy())
Arg.addAttr(Attribute::NoCapture);
if (!F.isVarArg())
continue;
for (User *U : F.users()) {
auto *CB = dyn_cast<CallBase>(U);
if (!CB)
continue;
for (unsigned I = F.getFunctionType()->getNumParams(); I < CB->arg_size();
++I) {
Value *Arg = CB->getArgOperand(I);
if (Arg->getType()->isPointerTy())
CB->addParamAttr(I, Attribute::NoCapture);
}
}
}
LLVM_DEBUG(M.dump());
registerAAs();
ChangeStatus Status = A.run();
if (!recollectInformation())
return Status == ChangeStatus::CHANGED;
for (Function *F : FunctionWorkList)
Changed |= rewriteWithHostRPC(F);
if (!Changed)
return Changed;
for (auto Itr = CallInstMap.rbegin(); Itr != CallInstMap.rend(); ++Itr) {
auto *CI = Itr->first;
auto *NewCI = Itr->second;
CI->replaceAllUsesWith(NewCI);
CI->eraseFromParent();
}
for (Function *F : FunctionWorkList)
if (F->user_empty())
F->eraseFromParent();
emitHostWrapperInvoker();
return Changed;
}
bool HostRPC::rewriteWithHostRPC(Function *F) {
SmallVector<CallInst *> WorkList;
for (User *U : F->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
WorkList.push_back(CI);
}
if (WorkList.empty())
return false;
for (CallInst *CI : WorkList) {
CallSiteInfo CSI;
CSI.CI = CI;
unsigned NumArgs = CI->arg_size();
for (unsigned I = 0; I < NumArgs; ++I)
CSI.Params.push_back(CI->getArgOperand(I)->getType());
std::string WrapperName = getWrapperFunctionName(F, CSI);
Function *DeviceWrapperFn = getDeviceWrapperFunction(WrapperName, F, CSI);
Function *HostWrapperFn = getHostWrapperFunction(WrapperName, F, CSI);
int32_t WrapperNumber = -1;
for (unsigned I = 0; I < HostEntryTable.size(); ++I) {
if (HostEntryTable[I] == HostWrapperFn) {
WrapperNumber = I;
break;
}
}
if (WrapperNumber == -1) {
WrapperNumber = HostEntryTable.size();
HostEntryTable.push_back(HostWrapperFn);
}
auto CheckIfIdentifierPtr = [this](const Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
Function *Callee = CI->getCalledFunction();
if (this->FunctionWorkList.count(Callee))
return true;
return Callee->getName().starts_with("__kmpc_host_rpc_wrapper_");
};
auto CheckIfDynAlloc = [](Value *V) -> CallInst * {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return nullptr;
Function *Callee = CI->getCalledFunction();
auto Name = Callee->getName();
if (Name == "malloc" || Name == "__kmpc_alloc_shared")
return CI;
return nullptr;
};
auto CheckIfStdIO = [](Value *V) -> GlobalVariable * {
auto *LI = dyn_cast<LoadInst>(V);
if (!LI)
return nullptr;
auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
if (!GV)
return nullptr;
auto Name = GV->getName();
if (Name == "stdout" || Name == "stderr" || Name == "stdin")
return GV;
return nullptr;
};
auto CheckIfGlobalVariable = [](Value *V) {
if (auto *GV = dyn_cast<GlobalVariable>(V))
return GV;
if (auto *LI = dyn_cast<LoadInst>(V))
if (auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()))
return GV;
return static_cast<GlobalVariable *>(nullptr);
};
auto CheckIfNullPtr = [](Value *V) {
if (!V->getType()->isPointerTy())
return false;
return V == ConstantInt::getNullValue(V->getType());
};
auto HandleDirectUse = [&](Value *Ptr, HostRPCArgInfo &AI,
bool IsPointer = false) {
AI.BasePtr = Ptr;
AI.Type = getConstantInt64(IsPointer ? ArgType::OMP_HOST_RPC_ARG_PTR
: ArgType::OMP_HOST_RPC_ARG_SCALAR);
AI.Size = NullInt64;
};
SmallVector<SmallVector<HostRPCArgInfo>> ArgInfo;
bool IsConstantArgInfo = true;
for (unsigned I = 0; I < CI->arg_size(); ++I) {
ArgInfo.emplace_back();
auto &AII = ArgInfo.back();
Value *Operand = CI->getArgOperand(I);
// Check if scalar type.
if (!Operand->getType()->isPointerTy()) {
AII.emplace_back();
HandleDirectUse(Operand, AII.back());
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
continue;
}
if (CheckIfNullPtr(Operand))
continue;
auto Pred = [&](Value &Obj) {
if (CheckIfNullPtr(&Obj))
return true;
bool IsConstantArgument = false;
if (!F->isVarArg() &&
F->hasParamAttribute(I, Attribute::AttrKind::ReadOnly))
IsConstantArgument = true;
HostRPCArgInfo AI;
if (auto *IO = CheckIfStdIO(&Obj)) {
HandleDirectUse(IO, AI, /* IsPointer */ true);
} else if (CheckIfIdentifierPtr(&Obj)) {
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
HandleDirectUse(Operand, AI, /* IsPointer */ true);
} else if (auto *GV = CheckIfGlobalVariable(&Obj)) {
AI.BasePtr = GV;
AI.Size = getConstantInt64(DL.getTypeStoreSize(GV->getValueType()));
AI.Type =
getConstantInt64(GV->isConstant() || IsConstantArgument
? ArgType::OMP_HOST_RPC_ARG_COPY_TO
: ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM);
} else if (CheckIfDynAlloc(&Obj)) {
// We will handle this case at runtime so here we don't do anything.
return true;
} else if (isa<AllocaInst>(&Obj)) {
llvm_unreachable("alloca instruction needs to be handled!");
} else {
LLVM_DEBUG({
dbgs() << "[HostRPC] warning: call site " << *CI << ", operand "
<< *Operand << ", underlying object " << Obj
<< " cannot be handled.\n";
});
return true;
}
AII.push_back(std::move(AI));
return true;
};
auto &AAUO = *A.getOrCreateAAFor<AAUnderlyingObjects>(
IRPosition::callsite_argument(*CI, I), nullptr, DepClassTy::NONE);
if (!AAUO.forallUnderlyingObjects(Pred))
llvm_unreachable("internal error");
}
// Reset the insert point to the call site.
Builder.SetInsertPoint(CI);
Value *ArgInfoVal = nullptr;
if (!IsConstantArgInfo) {
ArgInfoVal = Builder.CreateAlloca(Int8PtrTy, getConstantInt64(NumArgs),
"arg_info");
for (unsigned I = 0; I < NumArgs; ++I) {
auto &AII = ArgInfo[I];
Value *Next = NullPtr;
for (auto &AI : AII) {
Value *AIV = Builder.CreateAlloca(ArgInfoTy);
Value *AIIArg =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(0)});
Builder.Insert(AIIArg);
Builder.CreateStore(convertToInt64Ty(AI.BasePtr), AIIArg);
Value *AIIType =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(1)});
Builder.Insert(AIIType);
Builder.CreateStore(AI.Type, AIIType);
Value *AIISize =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(2)});
Builder.Insert(AIISize);
Builder.CreateStore(AI.Size, AIISize);
Value *AIINext =
GetElementPtrInst::Create(Int8PtrTy, AIV, {getConstantInt64(3)});
Builder.Insert(AIINext);
Builder.CreateStore(Next, AIINext);
Next = AIV;
}
Value *AIIV = GetElementPtrInst::Create(Int8PtrTy, ArgInfoVal,
{getConstantInt64(I)});
Builder.Insert(AIIV);
Builder.CreateStore(Next, AIIV);
}
} else {
SmallVector<Constant *> ArgInfoInitVar;
for (auto &AII : ArgInfo) {
Constant *Last = NullPtr;
for (auto &AI : AII) {
auto *Arg = cast<Constant>(AI.BasePtr);
auto *CS =
ConstantStruct::get(ArgInfoTy, {convertToInt64Ty(Arg), AI.Type,
cast<Constant>(AI.Size), Last});
auto *GV = new GlobalVariable(
M, ArgInfoTy, /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, CS);
Last = GV;
}
ArgInfoInitVar.push_back(Last);
}
Constant *ArgInfoInit = ConstantArray::get(
ArrayType::get(Int8PtrTy, NumArgs), ArgInfoInitVar);
ArgInfoVal = new GlobalVariable(
M, ArrayType::get(Int8PtrTy, NumArgs), /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info");
}
SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
ArgInfoVal};
for (Value *Operand : CI->args())
Args.push_back(Operand);
CallInst *NewCall = Builder.CreateCall(DeviceWrapperFn, Args);
CallInstMap.emplace_back(CI, NewCall);
}
return true;
}
Function *HostRPC::getDeviceWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI) {
Function *WrapperFn = M.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
// return_type device_wrapper(int32_t call_no, void *arg_info, ...)
SmallVector<Type *> Params{Int32Ty, Int8PtrTy};
Params.append(CSI.Params);
Type *RetTy = F->getReturnType();
FunctionType *FT = FunctionType::get(RetTy, Params, /*isVarArg*/ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::InternalLinkage,
WrapperName, M);
// Emit the body of the device wrapper
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
// skip call_no and arg_info.
constexpr const unsigned NumArgSkipped = 2;
Value *Desc = nullptr;
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
Desc = Builder.CreateCall(
Fn,
{WrapperFn->getArg(0),
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
WrapperFn->getArg(1)},
"desc");
}
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
Value *V = convertToInt64Ty(WrapperFn->getArg(I));
Builder.CreateCall(
Fn, {Desc, V, ConstantInt::get(Int32Ty, I - NumArgSkipped)});
}
}
Value *RetVal =
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});
if (RetTy->isVoidTy()) {
Builder.CreateRetVoid();
return WrapperFn;
}
if (RetTy != RetVal->getType())
RetVal = convertFromInt64TyTo(RetVal, RetTy);
Builder.CreateRet(RetVal);
return WrapperFn;
}
Function *HostRPC::getHostWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI) {
Function *WrapperFn = HM.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
SmallVector<Type *> Params{Int8PtrTy};
FunctionType *FT = FunctionType::get(VoidTy, Params, /* isVarArg */ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage,
WrapperName, HM);
Value *Desc = WrapperFn->getArg(0);
// Emit the body of the host wrapper
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
SmallVector<Value *> Args;
for (unsigned I = 0; I < CSI.CI->arg_size(); ++I) {
Value *V = Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_arg],
{Desc, ConstantInt::get(Int32Ty, I)});
Args.push_back(convertFromInt64TyTo(V, CSI.Params[I]));
}
// The host callee that will be called eventually by the host wrapper.
Function *HostCallee = HM.getFunction(F->getName());
if (!HostCallee)
HostCallee = Function::Create(F->getFunctionType(), F->getLinkage(),
F->getName(), HM);
Value *RetVal = Builder.CreateCall(HostCallee, Args);
if (!RetVal->getType()->isVoidTy()) {
RetVal = convertToInt64Ty(RetVal);
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_set_ret_val],
{Desc, RetVal});
}
Builder.CreateRetVoid();
return WrapperFn;
}
void HostRPC::emitHostWrapperInvoker() {
IRBuilder<> Builder(Context);
unsigned NumEntries = HostEntryTable.size();
Function *F = RFIs[OMPRTL___kmpc_host_rpc_invoke_host_wrapper];
F->setDLLStorageClass(
GlobalValue::DLLStorageClassTypes::DLLExportStorageClass);
Value *CallNo = F->getArg(0);
Value *Desc = F->getArg(1);
SmallVector<BasicBlock *> SwitchBBs;
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", F);
BasicBlock *ReturnBB = BasicBlock::Create(Context, "return", F);
// Emit code for the return bb.
Builder.SetInsertPoint(ReturnBB);
Builder.CreateRetVoid();
// Create BB for each host entry and emit function call.
for (unsigned I = 0; I < NumEntries; ++I) {
BasicBlock *BB = BasicBlock::Create(Context, "invoke.bb", F, ReturnBB);
SwitchBBs.push_back(BB);
Builder.SetInsertPoint(BB);
Builder.CreateCall(HostEntryTable[I], {Desc});
Builder.CreateBr(ReturnBB);
}
// Emit code for the entry BB.
Builder.SetInsertPoint(EntryBB);
SwitchInst *Switch = Builder.CreateSwitch(CallNo, ReturnBB, NumEntries);
for (unsigned I = 0; I < NumEntries; ++I)
Switch->addCase(ConstantInt::get(cast<IntegerType>(Int32Ty), I),
SwitchBBs[I]);
}
Module *getHostModule(Module &M) {
auto *MD = M.getNamedMetadata("llvm.hostrpc.hostmodule");
if (!MD || MD->getNumOperands() == 0)
return nullptr;
auto *Node = MD->getOperand(0);
assert(Node->getNumOperands() == 1 && "invliad named metadata");
auto *CAM = dyn_cast<ConstantAsMetadata>(Node->getOperand(0));
if (!CAM)
return nullptr;
auto *CI = cast<ConstantInt>(CAM->getValue());
Module *Mod = reinterpret_cast<Module *>(CI->getZExtValue());
M.eraseNamedMetadata(MD);
return Mod;
}
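// For illustration only: a producer-side counterpart to getHostModule that a
// driver could use to hand the in-memory host module to this pass. The name
// attachHostModule is hypothetical; only the metadata shape matters.
static void attachHostModule(Module &DeviceM, Module &HostM) {
LLVMContext &Ctx = DeviceM.getContext();
auto *MD = DeviceM.getOrInsertNamedMetadata("llvm.hostrpc.hostmodule");
auto *Addr = ConstantInt::get(Type::getInt64Ty(Ctx),
reinterpret_cast<uint64_t>(&HostM));
MD->addOperand(MDNode::get(Ctx, {ConstantAsMetadata::get(Addr)}));
}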
} // namespace
PreservedAnalyses HostRPCPass::run(Module &M, ModuleAnalysisManager &AM) {
std::unique_ptr<Module> DummyHostModule;
Module *HostModule = nullptr;
if (UseDummyHostModule) {
DummyHostModule =
std::make_unique<Module>("dummy-host-rpc.bc", M.getContext());
HostModule = DummyHostModule.get();
} else {
HostModule = getHostModule(M);
}
if (!HostModule)
return PreservedAnalyses::all();
bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink;
// The pass only runs when it is invoked directly (with the dummy host
// module) or when it is invoked at link time.
if (!UseDummyHostModule && !PostLink)
return PreservedAnalyses::all();
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
AnalysisGetter AG(FAM);
CallGraphUpdater CGUpdater;
BumpPtrAllocator Allocator;
AttributorConfig AC(CGUpdater);
AC.DefaultInitializeLiveInternals = false;
AC.RewriteSignatures = false;
AC.PassName = DEBUG_TYPE;
AC.MaxFixpointIterations = 1024;
InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
SetVector<Function *> Functions;
Attributor A(Functions, InfoCache, AC);
HostRPC RPC(M, *HostModule, A);
bool Changed = RPC.run();
LLVM_DEBUG({
if (Changed && UseDummyHostModule) {
M.dump();
HostModule->dump();
}
});
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}


@@ -56,6 +56,7 @@
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <optional>
@@ -138,6 +139,15 @@ static cl::opt<bool>
cl::desc("Enables more verbose remarks."), cl::Hidden,
cl::init(false));
static cl::opt<bool>
EnableParallel51Split("openmp-opt-enable-parallel-51-split",
cl::desc("Enable the kernel split at parallel_51."),
cl::Hidden, cl::init(false));
static cl::opt<bool> EnableParallel51SplitMultiTeams(
"openmp-opt-enable-parallel-51-split-multi-teams",
cl::desc("Enable multi-teams support for the parallel 51 kernel."),
cl::Hidden, cl::init(false));
static cl::opt<unsigned>
SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
cl::desc("Maximal number of attributor iterations."),
@@ -917,6 +927,28 @@ private:
}
};
bool canUseMultiTeam(Function *OutlinedFn, OMPInformationCache &OMPInfoCache) {
if (!EnableParallel51SplitMultiTeams)
return false;
return true;
}
void collectReachingKernels(Function *F,
SmallVector<Kernel> &ReachingKernels) {
if (omp::isOpenMPKernel(*F)) {
if (F->hasFnAttribute("omp_parallel_51_kernel"))
ReachingKernels.push_back(F);
return;
}
for (User *U : F->users()) {
auto *I = dyn_cast<Instruction>(U);
if (!I)
continue;
collectReachingKernels(I->getFunction(), ReachingKernels);
}
}
struct OpenMPOpt {
using OptimizationRemarkGetter =
@@ -945,6 +977,11 @@ struct OpenMPOpt {
<< " functions\n");
if (IsModulePass) {
if (EnableParallel51Split)
Changed |= splitKernels();
OMPInfoCache.recollectUses();
Changed |= runAttributor(IsModulePass);
// Recollect uses, in case Attributor deleted any.
@@ -1047,6 +1084,308 @@ struct OpenMPOpt {
}
private:
/// Create a new kernel for each function call to __kmpc_parallel_51.
bool splitKernels() {
bool Changed = false;
OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
if (!Parallel51RFI.Declaration)
return Changed;
SmallVector<CallInst *> WorkItems;
for (User *U : Parallel51RFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
Function *F = CI->getFunction();
// If the parent function is already from kernel split, skip it.
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
WorkItems.push_back(CI);
}
if (WorkItems.empty())
return Changed;
Changed = true;
auto CreateNewKernel = [](LLVMContext &Ctx, Module &M,
OpenMPIRBuilder &IRBuilder, Function *Parallel51,
const CallInst *CI) {
SmallVector<Type *, 4> ArgTypes;
// int32_t gtid
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// int32_t if_expr
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// int32_t num_threads
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// void **args
ArgTypes.push_back(PointerType::getUnqual(Ctx));
// int64_t nargs
ArgTypes.push_back(Type::getInt64Ty(Ctx));
FunctionType *FT =
FunctionType::get(Type::getVoidTy(Ctx), ArgTypes, false);
std::string KernelName = "__omp_offloading_parallel_51_from_";
KernelName += CI->getFunction()->getName();
// Sanitize the kernel name
{
size_t DotPos = KernelName.find('.');
while (DotPos != std::string::npos) {
KernelName[DotPos] = '_';
DotPos = KernelName.find('.');
}
}
Function *K =
Function::Create(FT, GlobalValue::WeakODRLinkage, KernelName, &M);
K->setVisibility(GlobalValue::ProtectedVisibility);
// exec mode global variable
GlobalVariable *ModeGV = new GlobalVariable(
M, Type::getInt8Ty(Ctx), /*isConstant=*/true,
GlobalValue::WeakAnyLinkage,
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD),
K->getName() + "_exec_mode");
appendToCompilerUsed(M, {ModeGV});
// Attach "kernel" metadata to the new kernel.
NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
Metadata *MDVals[] = {ConstantAsMetadata::get(K),
MDString::get(Ctx, "kernel"),
ConstantAsMetadata::get(ConstantInt::get(
llvm::Type::getInt32Ty(Ctx), 1))};
MD->addOperand(MDNode::get(Ctx, MDVals));
// Set kernel attributes.
K->setAttributes(
AttributeList::get(Ctx, AttributeList::FunctionIndex,
CI->getFunction()->getAttributes().getFnAttrs()));
K->removeFnAttr("omp_target_thread_limit");
K->removeFnAttr("omp_target_num_teams");
K->addFnAttr("omp_parallel_51_kernel");
K->addFnAttr("kernel");
K->getArg(3)->addAttr(Attribute::NoAlias);
auto *EntryBB = BasicBlock::Create(Ctx, "entry", K);
OpenMPIRBuilder::LocationDescription TargetInitLoc(
{EntryBB, EntryBB->end()});
auto IP = IRBuilder.createTargetInit(TargetInitLoc, /* IsSPMD */ true);
BasicBlock *UserCodeBB = IP.getBlock();
auto UseDirectIfPossible = [](Value *LHS, Value *RHS) -> Value * {
if (isa<Constant>(LHS))
return LHS;
return RHS;
};
Value *Ident = CI->getOperand(0);
Value *TId = UseDirectIfPossible(CI->getOperand(1), K->getArg(0));
Value *IfExpr = UseDirectIfPossible(CI->getOperand(2), K->getArg(1));
Value *NumThreads = UseDirectIfPossible(CI->getOperand(3), K->getArg(2));
Value *ProcBind = CI->getOperand(4);
Value *Fn = CI->getOperand(5);
Value *WrapperFn = CI->getOperand(6);
Value *Args = K->getArg(3);
Value *NArgs = UseDirectIfPossible(CI->getOperand(8), K->getArg(4));
(void)CallInst::Create(Parallel51,
{Ident, TId, IfExpr, NumThreads, ProcBind, Fn,
WrapperFn, Args, NArgs},
"", UserCodeBB);
OpenMPIRBuilder::LocationDescription TargetDeInitLoc(
{UserCodeBB, UserCodeBB->end()});
IRBuilder.createTargetDeinit(TargetDeInitLoc, /* IsSPMD */ true);
IRBuilder.Builder.CreateRetVoid();
return K;
};
auto EnableMultiTeam = [](Function *OutlinedFn) {
SmallVector<CallInst *> WorkItems;
for (BasicBlock &BB : *OutlinedFn)
for (Instruction &I : BB) {
CallInst *C = dyn_cast<CallInst>(&I);
if (!C)
continue;
Function *Callee = C->getCalledFunction();
auto CalleeName = Callee->getName();
if (CalleeName == "__kmpc_for_static_init_4" ||
CalleeName == "__kmpc_for_static_init_4u" ||
CalleeName == "__kmpc_for_static_init_8" ||
CalleeName == "__kmpc_for_static_init_8u")
WorkItems.push_back(C);
}
for (CallInst *C : WorkItems) {
constexpr const unsigned SchedTypeArgNum = 2;
Value *SchedTypeVal = C->getOperand(SchedTypeArgNum);
ConstantInt *SchedTypeCI = cast<ConstantInt>(SchedTypeVal);
int32_t SchedType = SchedTypeCI->getSExtValue();
Value *NewSchedType = SchedTypeVal;
if (SchedType == /*kmp_sched_static_chunk*/ 33)
NewSchedType = ConstantInt::get(SchedTypeCI->getType(), 100);
else if (SchedType == /*kmp_sched_static_nochunk*/ 34)
NewSchedType = ConstantInt::get(SchedTypeCI->getType(), 101);
C->setOperand(SchedTypeArgNum, NewSchedType);
}
};
auto &Ctx = M.getContext();
for (CallInst *CI : WorkItems) {
Function *K = CreateNewKernel(Ctx, M, OMPInfoCache.OMPBuilder,
Parallel51RFI.Declaration, CI);
constexpr const unsigned OutlinedFnArgNum = 5;
Function *OutlinedFn =
dyn_cast<Function>(CI->getOperand(OutlinedFnArgNum));
assert(OutlinedFn && "arg fn is not a function");
if (canUseMultiTeam(OutlinedFn, OMPInfoCache))
EnableMultiTeam(OutlinedFn);
auto &Ctx = M.getContext();
IRBuilder<> Builder(CI);
FunctionCallee Callee =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_launch_parallel_51_kernel);
Constant *KernelName = ConstantDataArray::getString(Ctx, K->getName());
GlobalVariable *KernelNameVar =
new GlobalVariable(M, KernelName->getType(), /*isConstant=*/true,
GlobalValue::WeakAnyLinkage, KernelName);
Value *Args[6] = {KernelNameVar, CI->getArgOperand(1),
CI->getArgOperand(2), CI->getArgOperand(3),
CI->getArgOperand(7), CI->getArgOperand(8)};
CallInst *C = Builder.CreateCall(Callee, Args);
OMPInfoCache.setCallingConvention(Callee, C);
CI->eraseFromParent();
}
// Check all the use of __kmpc_parallel_51 again. If it is just used in
// parallel kernel, we can mark the main kernel as SPMD mode as well.
bool OnlyUsedByParallelKernel = true;
for (User *U : Parallel51RFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
Function *F = CI->getFunction();
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
OnlyUsedByParallelKernel = false;
break;
}
if (OnlyUsedByParallelKernel) {
for (auto K : SCC) {
if (!omp::isOpenMPKernel(*K))
continue;
Attribute Attr = K->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
GlobalVariable *ExecMode =
M.getGlobalVariable((K->getName() + "_exec_mode").str());
assert(ExecMode && "kernel without exec mode");
assert(ExecMode->getInitializer() &&
"ExecMode doesn't have initializer!");
assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
"ExecMode is not an integer!");
const int8_t ExecModeVal =
cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
// Kernel is already in SPMD mode, skip.
if (ExecModeVal & OMP_TGT_EXEC_MODE_SPMD)
continue;
auto *NewExecModeC =
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD);
ExecMode->setInitializer(NewExecModeC);
OMPInformationCache::RuntimeFunctionInfo &TargetInitRFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
OMPInformationCache::RuntimeFunctionInfo &TargetDeInitRFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
CallInst *InitCI = nullptr;
CallInst *DeInitCI = nullptr;
for (User *U : TargetInitRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (CI->getFunction() == K) {
InitCI = CI;
break;
}
}
for (User *U : TargetDeInitRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (CI->getFunction() == K) {
DeInitCI = CI;
break;
}
}
assert(InitCI && DeInitCI && "kernel without init and deinit");
InitCI->setArgOperand(1, NewExecModeC);
InitCI->setArgOperand(2,
ConstantInt::getNullValue(Type::getInt1Ty(Ctx)));
DeInitCI->setArgOperand(1, NewExecModeC);
}
}
// Check the use of omp_get_thread_num and omp_get_num_threads.
OMPInformationCache::RuntimeFunctionInfo &ThreadNumRFI =
OMPInfoCache.RFIs[OMPRTL_omp_get_thread_num];
OMPInformationCache::RuntimeFunctionInfo &NumThreadsRFI =
OMPInfoCache.RFIs[OMPRTL_omp_get_num_threads];
auto CheckIfOnlyUsedByParallelKernel = [&](CallInst *CI) {
SmallVector<Kernel> ReachingKernels;
collectReachingKernels(CI->getFunction(),
ReachingKernels);
return ReachingKernels.size() < 2;
};
SmallVector<CallInst *> WorkList;
for (User *U : ThreadNumRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (!CheckIfOnlyUsedByParallelKernel(CI))
continue;
WorkList.push_back(CI);
}
for (User *U : NumThreadsRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (!CheckIfOnlyUsedByParallelKernel(CI))
continue;
WorkList.push_back(CI);
}
for (CallInst *CI : WorkList) {
Function *Callee = nullptr;
if (CI->getCalledFunction() == ThreadNumRFI.Declaration)
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_thread_num].Declaration;
if (CI->getCalledFunction() == NumThreadsRFI.Declaration)
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_num_threads].Declaration;
assert(Callee && "unknown callee");
auto &Builder = OMPInfoCache.OMPBuilder.Builder;
Builder.SetInsertPoint(CI);
Value *C = Builder.CreateCall(Callee);
CI->replaceAllUsesWith(C);
CI->eraseFromParent();
}
return Changed;
}
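// Illustrative effect (hypothetical generic-mode input): a call
//   __kmpc_parallel_51(ident, tid, if_expr, num_threads, bind, fn, wrapper_fn,
//                      args, nargs)
// inside a target region is replaced by
//   __kmpc_launch_parallel_51_kernel(name, tid, if_expr, num_threads, args,
//                                    nargs)
// where `name` refers to the newly created SPMD kernel
// __omp_offloading_parallel_51_from_<parent>, which re-issues the original
// __kmpc_parallel_51 call between SPMD target init and deinit.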
/// Merge parallel regions when it is safe.
bool mergeParallelRegions() {
const unsigned CallbackCalleeOperand = 2;


@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMTransformUtils
CallGraphUpdater.cpp
CanonicalizeAliases.cpp
CanonicalizeFreezeInLoops.cpp
CanonicalizeMainFunction.cpp
CloneFunction.cpp
CloneModule.cpp
CodeExtractor.cpp


@@ -0,0 +1,103 @@
//===-------------- CanonicalizeMainFunction.cpp ----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility function to canonicalize main function.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "canonicalize-main-function"
static cl::opt<std::string>
MainFunctionName("canonical-main-function-name",
cl::desc("New main function name"),
cl::value_desc("main function name"));
bool rewriteMainFunction(Function &F) {
if (F.arg_size() == 2 && F.getReturnType()->isIntegerTy(32))
return false;
auto &Ctx = F.getContext();
auto &DL = F.getParent()->getDataLayout();
auto *Int32Ty = IntegerType::getInt32Ty(Ctx);
auto *PtrTy = PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace());
FunctionType *NewFnTy =
FunctionType::get(Int32Ty, {Int32Ty, PtrTy}, /* isVarArg */ false);
Function *NewFn =
Function::Create(NewFnTy, F.getLinkage(), F.getAddressSpace(), "");
F.getParent()->getFunctionList().insert(F.getIterator(), NewFn);
NewFn->takeName(&F);
NewFn->copyAttributesFrom(&F);
NewFn->setSubprogram(F.getSubprogram());
F.setSubprogram(nullptr);
NewFn->splice(NewFn->begin(), &F);
if (!F.getReturnType()->isIntegerTy(32)) {
SmallVector<ReturnInst *> WorkList;
for (BasicBlock &BB : *NewFn)
for (Instruction &I : BB) {
auto *RI = dyn_cast<ReturnInst>(&I);
if (!RI)
continue;
        assert(RI->getReturnValue() == nullptr &&
               "a void main function must not have a return value");
WorkList.push_back(RI);
}
for (auto *RI : WorkList) {
(void)ReturnInst::Create(Ctx, ConstantInt::getNullValue(Int32Ty), RI);
RI->eraseFromParent();
}
}
if (F.arg_size() == NewFn->arg_size())
for (unsigned I = 0; I < NewFn->arg_size(); ++I) {
Argument *OldArg = F.getArg(I);
Argument *NewArg = NewFn->getArg(I);
NewArg->takeName(OldArg);
OldArg->replaceAllUsesWith(NewArg);
}
return true;
}
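// A sketch of the effect (illustrative only; the pass operates on LLVM IR):
//
//   Input:  void main() { body }              -- non-canonical entry point
//   Output: int main(int argc, char *argv[]) {
//             body                            -- spliced over unchanged
//             return 0;                       -- each `ret void` becomes `ret i32 0`
//           }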
PreservedAnalyses CanonicalizeMainFunctionPass::run(Module &M,
ModuleAnalysisManager &AM) {
Function *MainFunc = nullptr;
for (Function &F : M)
if (F.getName() == "main") {
assert(MainFunc == nullptr && "more than one main function");
MainFunc = &F;
}
if (MainFunc == nullptr)
return PreservedAnalyses::all();
bool Changed = false;
if (!MainFunctionName.empty() && MainFunc->getName() != MainFunctionName) {
MainFunc->setName(MainFunctionName);
Changed = true;
}
if (rewriteMainFunction(*MainFunc)) {
MainFunc->eraseFromParent();
Changed = true;
}
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

View File

@@ -89,6 +89,7 @@ set(include_files
${include_directory}/Interface.h
${include_directory}/LibC.h
${include_directory}/Mapping.h
${include_directory}/Memory.h
${include_directory}/State.h
${include_directory}/Synchronization.h
${include_directory}/Types.h
@@ -99,6 +100,7 @@ set(src_files
${source_directory}/Allocator.cpp
${source_directory}/Configuration.cpp
${source_directory}/Debug.cpp
${source_directory}/HostRPC.cpp
${source_directory}/Kernel.cpp
${source_directory}/LibC.cpp
${source_directory}/Mapping.cpp
@@ -112,6 +114,14 @@ set(src_files
${source_directory}/Workshare.cpp
)
if (LIBOMPTARGET_DEVICE_BUILTIN_ALLOCATOR)
list(APPEND src_files ${source_directory}/BuiltinAllocator.cpp)
elseif (LIBOMPTARGET_GENERIC_ALLOCATOR)
list(APPEND src_files ${source_directory}/GenericAllocator.cpp)
else()
list(APPEND src_files ${source_directory}/TeamAllocator.cpp)
endif()
# We disable the slp vectorizer during the runtime optimization to avoid
# vectorized accesses to the shared state. Generally, those are "good" but
# the optimizer pipeline (esp. Attributor) does not fully support vectorized
@@ -138,6 +148,14 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
)
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_DEBUG)
list(APPEND bc_flags "-DHOSTRPC_DEBUG")
endif()
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_PROFILING)
list(APPEND bc_flags "-DHOSTRPC_PROFILING")
endif()
# first create an object target
add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
function(compileDeviceRTLLibrary target_cpu target_name target_triple)

View File

@@ -39,6 +39,12 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
__builtin_trap(); \
__builtin_unreachable();
#define assert(expr)                                                           \
  do {                                                                         \
    if (!(expr))                                                               \
      __assert_fail(#expr, __FILE__, __LINE__, __PRETTY_FUNCTION__);           \
  } while (0)
///}
#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);

View File

@@ -360,6 +360,22 @@ int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal);
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
///}
/// Host RPC
///
///{
void *__kmpc_host_rpc_get_desc(int32_t CallNo, int32_t NumArgs, void *ArgInfo);
void __kmpc_host_rpc_add_arg(void *Desc, int64_t Arg, int32_t ArgNum);
int64_t __kmpc_host_rpc_send_and_wait(void *Desc);
///}
/// Launch parallel kernel
///
///{
void __kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
int32_t if_expr, int32_t num_threads,
void **args, int64_t nargs);
///}
}
#endif
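// A minimal usage sketch of the host RPC entry points above. CALLID_fopen,
// Path, Mode, and ArgInfo are hypothetical; real call sites are emitted by the
// compiler together with argument-info records describing each pointer:
//
//   void *D = __kmpc_host_rpc_get_desc(CALLID_fopen, /*NumArgs=*/2, ArgInfo);
//   __kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(Path), /*ArgNum=*/0);
//   __kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(Mode), /*ArgNum=*/1);
//   FILE *F = reinterpret_cast<FILE *>(__kmpc_host_rpc_send_and_wait(D));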

View File

@@ -14,12 +14,116 @@
#include "Types.h"
struct FILE;
extern FILE *stdin;
extern FILE *stdout;
extern FILE *stderr;
#ifndef _ASM_GENERIC_ERRNO_BASE_H
#define _ASM_GENERIC_ERRNO_BASE_H
#define EPERM 1 /* Operation not permitted */
#define ENOENT 2 /* No such file or directory */
#define ESRCH 3 /* No such process */
#define EINTR 4 /* Interrupted system call */
#define EIO 5 /* I/O error */
#define ENXIO 6 /* No such device or address */
#define E2BIG 7 /* Argument list too long */
#define ENOEXEC 8 /* Exec format error */
#define EBADF 9 /* Bad file number */
#define ECHILD 10 /* No child processes */
#define EAGAIN 11 /* Try again */
#define ENOMEM 12 /* Out of memory */
#define EACCES 13 /* Permission denied */
#define EFAULT 14 /* Bad address */
#define ENOTBLK 15 /* Block device required */
#define EBUSY 16 /* Device or resource busy */
#define EEXIST 17 /* File exists */
#define EXDEV 18 /* Cross-device link */
#define ENODEV 19 /* No such device */
#define ENOTDIR 20 /* Not a directory */
#define EISDIR 21 /* Is a directory */
#define EINVAL 22 /* Invalid argument */
#define ENFILE 23 /* File table overflow */
#define EMFILE 24 /* Too many open files */
#define ENOTTY 25 /* Not a typewriter */
#define ETXTBSY 26 /* Text file busy */
#define EFBIG 27 /* File too large */
#define ENOSPC 28 /* No space left on device */
#define ESPIPE 29 /* Illegal seek */
#define EROFS 30 /* Read-only file system */
#define EMLINK 31 /* Too many links */
#define EPIPE 32 /* Broken pipe */
#define EDOM 33 /* Math argument out of domain of func */
#define ERANGE 34 /* Math result not representable */
#endif
#define errno (*__errno_location())
extern "C" {
int memcmp(const void *lhs, const void *rhs, size_t count);
void *memset(void *dst, int C, size_t count);
int printf(const char *format, ...);
long strtol(const char *str, char **str_end, int base);
int strcmp(const char *lhs, const char *rhs);
void *calloc(size_t num, size_t size);
int strcasecmp(const char *string1, const char *string2);
void exit(int exit_code);
size_t strlen(const char *str);
int atoi(const char *str);
char *strcpy(char *dest, const char *src);
int stat(const char *path, struct stat *buf);
int *__errno_location();
char *strcat(char *dest, const char *src);
void perror(const char *s);
int strncmp(const char *lhs, const char *rhs, size_t count);
char *strncpy(char *dest, const char *src, size_t count);
char *strchr(const char *str, int ch);
char *strtok(char *str, const char *delim);
const unsigned short **__ctype_b_loc(void);
void *realloc(void *ptr, size_t new_size);
void qsort(void *const pbase, size_t total_elems, size_t size,
int (*comp)(const void *, const void *));
int gettimeofday(struct timeval *tv, struct timezone *tz);
char *__xpg_basename(const char *path);
void srand(unsigned seed);
int rand();
int abs(int n);
void *memcpy(void *dest, const void *src, size_t count);
double atof(const char *str);
double strtod(const char *str, char **ptr);
}
#endif

View File

@@ -0,0 +1,39 @@
//===--- Memory.h - OpenMP device runtime memory allocator -------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_MEMORY_H
#define OMPTARGET_MEMORY_H
#include "Types.h"
extern "C" {
__attribute__((leaf)) void *malloc(size_t Size);
__attribute__((leaf)) void free(void *Ptr);
}
namespace ompx {
namespace memory {
struct MemoryAllocationInfo {
void *BasePtr = nullptr;
size_t Size = 0;
bool isValid() const { return BasePtr; }
};
/// Get the memory allocation information if pointer \p P is in the range of
/// one of the buffers allocated by malloc.
MemoryAllocationInfo getMemoryAllocationInfo(void *P);
} // namespace memory
} // namespace ompx
#endif
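// Usage sketch (hypothetical caller): classify an arbitrary device pointer P.
//
//   ompx::memory::MemoryAllocationInfo MAI =
//       ompx::memory::getMemoryAllocationInfo(P);
//   if (MAI.isValid()) {
//     // P points into [MAI.BasePtr, MAI.BasePtr + MAI.Size).
//   }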

View File

@@ -36,6 +36,9 @@ enum MemScopeTy {
uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
MemScopeTy MemScope = MemScopeTy::all);
///
/// Atomically add \p V to \p *Addr with system (device-wide and host-visible)
/// memory scope.
int32_t addSys(int32_t *Addr, int32_t V);
/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
/// result is stored in \p *Addr;
/// {
@@ -135,6 +138,60 @@ void system(atomic::OrderingTy Ordering);
} // namespace fence
namespace atomic {
/// Atomically load \p Addr with \p Ordering semantics.
uint32_t load(uint32_t *Addr, int Ordering);
/// Atomically load \p Addr with \p Ordering semantics.
uint64_t load(uint64_t *Addr, int Ordering);
/// Atomically store \p V to \p Addr with \p Ordering semantics.
void store(uint32_t *Addr, uint32_t V, int Ordering);
void store(uint64_t *Addr, uint64_t V, int Ordering);
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
uint32_t add(uint32_t *Addr, uint32_t V, int Ordering);
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);
} // namespace atomic
namespace mutex {
class TicketLock {
uint64_t NowServing = 0;
uint64_t NextTicket = 0;
public:
TicketLock() = default;
TicketLock(const TicketLock &) = delete;
TicketLock(TicketLock &&) = delete;
void lock();
void unlock();
};
template <typename T> class LockGuard {
T &Lock;
public:
explicit LockGuard(T &L) : Lock(L) { Lock.lock(); }
~LockGuard() { Lock.unlock(); }
};
} // namespace mutex
} // namespace ompx
#endif
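// Usage sketch for the ticket lock above; lock order is enforced by the
// guard's constructor and destructor:
//
//   ompx::mutex::TicketLock L;
//   {
//     ompx::mutex::LockGuard<ompx::mutex::TicketLock> G(L); // lock()
//     // ... exclusive section ...
//   }                                                       // unlock()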

View File

@@ -36,6 +36,7 @@ using size_t = decltype(sizeof(char));
// TODO: Properly implement this
using intptr_t = int64_t;
using uintptr_t = uint64_t;
using ptrdiff_t = intptr_t;
static_assert(sizeof(int8_t) == 1, "type size mismatch");
static_assert(sizeof(uint8_t) == 1, "type size mismatch");
@@ -83,6 +84,11 @@ enum kmp_sched_t {
kmp_sched_distr_static_nochunk = 92,
kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
  // The following two are for direct GPU compilation, where there is no outer
  // loop whose iterations we can distribute among teams but we still need to
  // spread the workload.
kmp_sched_distr_parallel_static_noloop_chunk = 100,
kmp_sched_distr_parallel_static_noloop_nochunk = 101,
kmp_sched_default = kmp_sched_static_nochunk,
kmp_sched_unordered_first = kmp_sched_static_chunk,
kmp_sched_unordered_last = kmp_sched_auto,

View File

@@ -16,6 +16,9 @@
#pragma omp begin declare target device_type(nohost)
extern "C" double omp_get_wtime();
extern "C" int printf(const char *, ...);
namespace ompx {
namespace utils {
@@ -37,8 +40,8 @@ template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
}
/// Advance \p Ptr by \p Bytes bytes.
template <typename Ty1, typename Ty2> inline Ty1 advance(Ty1 Ptr, Ty2 Bytes) {
  return reinterpret_cast<Ty1>(reinterpret_cast<char *>(Ptr) + Bytes);
}
/// Return the first bit set in \p V.
@@ -82,12 +85,39 @@ template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
return *((DstTy *)(&V));
}
template <typename Ty = char>
ptrdiff_t getPtrDiff(const void *End, const void *Begin) {
return reinterpret_cast<const Ty *>(End) -
reinterpret_cast<const Ty *>(Begin);
}
inline bool isInRange(void *Ptr, void *BasePtr, int64_t Size) {
  ptrdiff_t Diff = getPtrDiff(Ptr, BasePtr);
  return Diff >= 0 && Diff < Size;
}
inline intptr_t ptrtoint(void *Ptr) { return reinterpret_cast<intptr_t>(Ptr); }
template <typename T> T min(T a, T b) { return a < b ? a : b; }
/// A pointer variable that has by design an `undef` value. Use with care.
[[clang::loader_uninitialized]] static void *const UndefPtr;
#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
class SimpleProfiler {
const char *HeadLine = nullptr;
double Start;
public:
SimpleProfiler(const char *HL) : HeadLine(HL), Start(omp_get_wtime()) {}
~SimpleProfiler() {
double End = omp_get_wtime();
printf("%s --> %lf s.\n", HeadLine, End - Start);
}
};
} // namespace utils
} // namespace ompx
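// Usage sketch: time a region with SimpleProfiler. The headline string is
// arbitrary; the elapsed time is printed when the object goes out of scope.
//
//   {
//     ompx::utils::SimpleProfiler P("alloc path");
//     // ... timed work ...
//   } // prints "alloc path --> <seconds> s."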

View File

@@ -0,0 +1,40 @@
//===------- BuiltinAllocator.cpp - Generic GPU memory allocator -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Memory.h"
#include "Utils.h"
using namespace ompx;
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) { return {}; }
void init() {}
} // namespace memory
} // namespace ompx
extern "C" {
void *realloc(void *ptr, size_t new_size) {
void *NewPtr = malloc(new_size);
if (!NewPtr)
return nullptr;
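  // Note: the builtin allocator does not track allocation sizes, so the copy
  // below uses new_size and may read past the end of the old buffer when the
  // allocation grows.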
__builtin_memcpy(NewPtr, ptr, new_size);
free(ptr);
return NewPtr;
}
}
#pragma omp end declare target

View File

@@ -0,0 +1,17 @@
target_sources(omptarget.devicertl PRIVATE
Configuration.cpp
Debug.cpp
HostRPC.cpp
GenericAllocator.cpp
Kernel.cpp
LibC.cpp
Mapping.cpp
Misc.cpp
Parallelism.cpp
Reduction.cpp
State.cpp
Synchronization.cpp
Tasking.cpp
Utils.cpp
WarpAllocator.cpp
Workshare.cpp)

View File

@@ -0,0 +1,310 @@
//===------- GenericAllocator.cpp - Generic GPU memory allocator -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Utils.h"
using namespace ompx;
char *CONSTANT(omptarget_device_heap_buffer)
__attribute__((used, retain, weak, visibility("protected")));
size_t CONSTANT(omptarget_device_heap_size)
__attribute__((used, retain, weak, visibility("protected")));
namespace {
size_t HeapCurPos = 0;
mutex::TicketLock HeapLock;
mutex::TicketLock AllocationListLock;
mutex::TicketLock FreeListLock;
constexpr const size_t Alignment = 16;
using intptr_t = int64_t;
struct SimpleLinkListNode {
template <typename T> friend struct SimpleLinkList;
protected:
SimpleLinkListNode *Prev = nullptr;
SimpleLinkListNode *Next = nullptr;
bool operator>(const SimpleLinkListNode &RHS) const {
const auto This = reinterpret_cast<int64_t>(this);
const auto RHSThis = reinterpret_cast<int64_t>(&RHS);
return This > RHSThis;
}
SimpleLinkListNode() = default;
SimpleLinkListNode(const SimpleLinkListNode &) = delete;
SimpleLinkListNode(SimpleLinkListNode &&) = delete;
};
struct AllocationMetadata final : SimpleLinkListNode {
public:
size_t getUserSize() const;
void *getUserAddr() const {
return const_cast<void *>(reinterpret_cast<const void *>(this + 1));
}
size_t getSize() const { return Size; }
void setSize(size_t V) { Size = V; }
bool isInRange(void *Ptr) {
return utils::isInRange(Ptr, this + 1, getUserSize());
}
static AllocationMetadata *getFromUserAddr(void *P) {
return getFromAddr(P) - 1;
}
static AllocationMetadata *getFromAddr(void *P) {
return reinterpret_cast<AllocationMetadata *>(P);
}
private:
size_t Size = 0;
int64_t Reserved = 0;
};
constexpr const size_t AllocationMetadataSize = sizeof(AllocationMetadata);
static_assert(AllocationMetadataSize == 32,
"expect the metadata size to be 32");
size_t AllocationMetadata::getUserSize() const {
return Size - AllocationMetadataSize;
}
template <typename T> struct SimpleLinkList {
struct iterator {
friend SimpleLinkList;
T *operator->() { return reinterpret_cast<T *>(Node); }
T &operator*() { return reinterpret_cast<T &>(*Node); }
iterator operator++() {
Node = Node->Next;
return *this;
}
bool operator==(const iterator &RHS) const { return Node == RHS.Node; }
bool operator!=(const iterator &RHS) const { return !(*this == RHS); }
iterator next() const {
iterator Itr;
Itr.Node = Node->Next;
return Itr;
}
private:
SimpleLinkListNode *Node = nullptr;
};
iterator begin() {
iterator Itr;
Itr.Node = Head.Next;
return Itr;
}
iterator end() { return {}; }
void insert(SimpleLinkListNode *Node) { insertImpl(&Head, Node); }
void remove(iterator Itr) {
SimpleLinkListNode *Node = Itr.Node;
remove(Node);
}
void remove(SimpleLinkListNode *Node) { removeImpl(Node); }
bool empty() const { return Head.Next == nullptr; }
private:
static void insertImpl(SimpleLinkListNode *Current,
SimpleLinkListNode *Node) {
SimpleLinkListNode *OldNext = Current->Next;
Node->Prev = Current;
Node->Next = OldNext;
if (OldNext)
OldNext->Prev = Node;
Current->Next = Node;
}
static void removeImpl(SimpleLinkListNode *Node) {
SimpleLinkListNode *Prev = Node->Prev;
SimpleLinkListNode *Next = Node->Next;
Prev->Next = Next;
if (Next)
Next->Prev = Prev;
Node->Prev = nullptr;
Node->Next = nullptr;
}
/// Check if the node is in the list.
bool checkExist(SimpleLinkListNode *Node) const {
SimpleLinkListNode *P = Head.Next;
while (P) {
if (P == Node)
return true;
P = P->Next;
}
return false;
}
static bool checkSanity(SimpleLinkListNode *Current,
SimpleLinkListNode *Next) {
return Current->Next == Next && (!Next || Next->Prev == Current);
}
static bool checkSanity(SimpleLinkListNode *Prev, SimpleLinkListNode *Current,
SimpleLinkListNode *Next) {
return Prev->Next == Current && Current->Next == Next &&
Current->Prev == Prev && (!Next || Next->Prev == Current);
}
static bool checkDangling(SimpleLinkListNode *Node) {
return Node->Next == nullptr && Node->Prev == nullptr;
}
SimpleLinkListNode Head;
};
SimpleLinkList<AllocationMetadata> AllocationList;
SimpleLinkList<AllocationMetadata> FreeList;
constexpr const int SplitRatio = 5;
#ifndef LIBOMPTARGET_MEMORY_PROFILING
struct MemoryProfiler {
MemoryProfiler(const char *S) {}
~MemoryProfiler() {}
};
#else
struct MemoryProfiler : utils::SimpleProfiler {
  MemoryProfiler(const char *S) : utils::SimpleProfiler(S) {}
  // The base class destructor runs automatically and prints the timing.
};
#endif
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
mutex::LockGuard ALG(AllocationListLock);
for (auto Itr = AllocationList.begin(); Itr != AllocationList.end(); ++Itr) {
AllocationMetadata *MD = &(*Itr);
if (MD->isInRange(P))
return {MD->getUserAddr(), MD->getUserSize()};
}
return {};
}
void init() {}
} // namespace memory
} // namespace ompx
extern "C" {
void *memset(void *dest, int ch, size_t count);
void *malloc(size_t Size) {
MemoryProfiler Profiler(__FUNCTION__);
Size = utils::align_up(Size + AllocationMetadataSize, Alignment);
AllocationMetadata *MD = nullptr;
{
mutex::LockGuard FLG(FreeListLock);
auto Itr = FreeList.begin();
for (; Itr != FreeList.end(); ++Itr) {
if (Itr->getSize() >= Size)
break;
}
bool Found = Itr != FreeList.end() && (Itr->getSize() / Size < SplitRatio);
if (Found) {
MD = &(*Itr);
FreeList.remove(Itr);
}
}
if (MD) {
mutex::LockGuard ALG(AllocationListLock);
AllocationList.insert(MD);
return MD->getUserAddr();
}
{
mutex::LockGuard LG(HeapLock);
if (Size + HeapCurPos < omptarget_device_heap_size) {
void *R = omptarget_device_heap_buffer + HeapCurPos;
(void)atomic::add(&HeapCurPos, Size, atomic::acq_rel);
MD = AllocationMetadata::getFromAddr(R);
}
}
if (MD) {
// We need to reset the head in case of any dirty data.
memset(MD, 0, AllocationMetadataSize);
MD->setSize(Size);
mutex::LockGuard ALG(AllocationListLock);
AllocationList.insert(MD);
return MD->getUserAddr();
}
printf("out of heap memory! size=%lu, cur=%lu.\n", Size, HeapCurPos);
printf("%s:%d\n", __FILE__, __LINE__);
__builtin_trap();
}
void free(void *P) {
MemoryProfiler Profiler(__FUNCTION__);
if (!P)
return;
auto *MD = AllocationMetadata::getFromUserAddr(P);
{
mutex::LockGuard ALG(AllocationListLock);
AllocationList.remove(MD);
}
{
mutex::LockGuard FLG(FreeListLock);
FreeList.insert(MD);
}
}
void *realloc(void *ptr, size_t new_size) {
MemoryProfiler Profiler(__FUNCTION__);
void *NewPtr = malloc(new_size);
if (!NewPtr)
return nullptr;
auto *OldMD = AllocationMetadata::getFromUserAddr(ptr);
assert(ptr == OldMD->getUserAddr());
__builtin_memcpy(NewPtr, ptr, utils::min(OldMD->getUserSize(), new_size));
free(ptr);
return NewPtr;
}
void __kmpc_target_init_allocator() {}
}
#pragma omp end declare target
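// A worked example of the reuse policy in malloc above (sizes include the
// 32-byte metadata header and 16-byte alignment; values are illustrative):
//
//   void *A = malloc(1000); // padded to 1040 bytes, carved from the heap
//   free(A);                // the 1040-byte block moves to the free list
//   void *B = malloc(16);   // padded to 48; 1040/48 >= SplitRatio(5), so the
//                           // free block is skipped and B is bump-allocated
//   void *C = malloc(900);  // padded to 944; 1040/944 < 5, A's block is reused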

View File

@@ -0,0 +1,420 @@
//===------- HostRPC.cpp - Implementation of host RPC ------------- C++ ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "HostRPC.h"
#include "Debug.h"
#include "LibC.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#ifdef HOSTRPC_DEBUG
#define DEBUG_PREFIX "host-rpc-device"
#define DP(FMT, ...) \
{ printf("%s --> " FMT, DEBUG_PREFIX, __VA_ARGS__); }
#else
#define DP(FMT, ...)
#endif
using namespace ompx;
using namespace hostrpc;
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
Descriptor *omptarget_hostrpc_descriptor
__attribute__((used, retain, weak, visibility("protected")));
int32_t *omptarget_hostrpc_futex
__attribute__((used, retain, weak, visibility("protected")));
char *omptarget_hostrpc_memory_buffer
__attribute__((used, retain, weak, visibility("protected")));
size_t omptarget_hostrpc_memory_buffer_size
__attribute__((used, retain, weak, visibility("protected")));
#ifdef HOSTRPC_PROFILING
int32_t HostRPCId;
double GetDescStart;
double GetDescEnd;
double AddArgStart;
double AddArgEnd;
double IssueAndWaitStart;
double IssueAndWaitEnd;
double CopyBackStart;
double CopyBackEnd;
#endif
namespace {
size_t HostRPCMemoryBufferCurrentPosition = 0;
constexpr const size_t Alignment = 16;
// FIXME: For now we only allow one thread requesting host RPC.
mutex::TicketLock HostRPCLock;
void *HostRPCMemAlloc(size_t Size) {
Size = utils::align_up(Size, Alignment);
if (Size + HostRPCMemoryBufferCurrentPosition <
omptarget_hostrpc_memory_buffer_size) {
void *R =
omptarget_hostrpc_memory_buffer + HostRPCMemoryBufferCurrentPosition;
atomic::add(&HostRPCMemoryBufferCurrentPosition, Size, atomic::acq_rel);
return R;
}
printf("%s:%d\n", __FILE__, __LINE__);
__builtin_trap();
return nullptr;
}
// For now we just reset the buffer.
void HostRPCMemReset() { HostRPCMemoryBufferCurrentPosition = 0; }
static_assert(sizeof(intptr_t) == sizeof(int64_t), "pointer size mismatch");
struct HostRPCArgInfo {
void *BasePtr;
int64_t Type;
int64_t Size;
HostRPCArgInfo *Next;
};
struct HostRPCPointerMapEntry {
void *BasePtr;
void *MappedBasePtr;
int64_t Size;
int64_t Kind;
};
void *getMappedPointer(Descriptor *D, void *BasePtr, int64_t Size,
int64_t Offset, int64_t Kind) {
assert(D->ArgMap && "ArgMap should not be nullptr");
HostRPCPointerMapEntry *MapTable =
reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
int I = 0;
for (; I < D->NumArgs && MapTable[I].BasePtr; ++I)
if (MapTable[I].BasePtr == BasePtr)
return utils::advance(MapTable[I].MappedBasePtr, Offset);
MapTable[I].BasePtr = BasePtr;
MapTable[I].MappedBasePtr = HostRPCMemAlloc(Size);
MapTable[I].Size = Size;
MapTable[I].Kind = Kind;
if (Kind & ArgType::OMP_HOST_RPC_ARG_COPY_TO) {
__builtin_memcpy(MapTable[I].MappedBasePtr, BasePtr, Size);
DP("getMappedPointer: copy %ld bytes memory from %p to %p.\n", Size,
BasePtr, MapTable[I].MappedBasePtr);
}
return utils::advance(MapTable[I].MappedBasePtr, Offset);
}
void copybackIfNeeded(Descriptor *D) {
if (!D->ArgMap)
return;
auto *MapTable = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
for (int I = 0; I < D->NumArgs && MapTable[I].BasePtr; ++I)
if (MapTable[I].Kind & ArgType::OMP_HOST_RPC_ARG_COPY_FROM) {
__builtin_memcpy(MapTable[I].BasePtr, MapTable[I].MappedBasePtr,
MapTable[I].Size);
DP("copybackIfNeeded: copy %ld bytes memory from %p to %p.\n",
MapTable[I].Size, MapTable[I].MappedBasePtr, MapTable[I].BasePtr);
}
}
} // namespace
extern "C" {
__attribute__((noinline, used)) void *
__kmpc_host_rpc_get_desc(int32_t CallId, int32_t NumArgs, void *ArgInfo) {
assert(omptarget_hostrpc_descriptor && omptarget_hostrpc_futex &&
"no host rpc pointer");
DP("device: stdin=%p, stdout=%p, stderr=%p\n", stdin, stdout, stderr);
DP("get desc for request (id=%d), NumArgs=%d, ArgInfo=%p.\n", CallId, NumArgs,
ArgInfo);
#ifdef HOSTRPC_DEBUG
{
void **AIs = reinterpret_cast<void **>(ArgInfo);
for (int I = 0; I < NumArgs; ++I)
DP("ArgInfo[%d]=%p.\n", I, AIs[I]);
}
#endif
HostRPCLock.lock();
#ifdef HOSTRPC_PROFILING
HostRPCId = CallId;
GetDescStart = omp_get_wtime();
#endif
// TODO: change it after we support a queue-like data structure.
Descriptor *D = omptarget_hostrpc_descriptor;
D->Id = CallId;
D->ArgInfo = reinterpret_cast<void **>(ArgInfo);
D->NumArgs = NumArgs;
D->Status = EXEC_STAT_CREATED;
D->ReturnValue = 0;
D->Args =
reinterpret_cast<Argument *>(HostRPCMemAlloc(sizeof(Argument) * NumArgs));
D->ArgMap = HostRPCMemAlloc(sizeof(HostRPCPointerMapEntry) * NumArgs);
  assert((!NumArgs || (D->Args && D->ArgMap)) && "out of host rpc memory!");
// Reset the map table.
auto *ArgMap = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
for (int I = 0; I < NumArgs; ++I)
ArgMap[I].BasePtr = nullptr;
#ifdef HOSTRPC_PROFILING
GetDescEnd = omp_get_wtime();
AddArgStart = omp_get_wtime();
#endif
return D;
}
__attribute__((noinline, used)) void
__kmpc_host_rpc_add_arg(void *Desc, int64_t ArgVal, int32_t ArgNum) {
auto *D = reinterpret_cast<Descriptor *>(Desc);
assert(ArgNum < D->NumArgs && "out-of-range arguments");
Argument &ArgInDesc = D->Args[ArgNum];
DP("add arg (no=%d), arg=%lx to request (id=%d).\n", ArgNum, ArgVal, D->Id);
// This early branch can rule out nullptr and zero scalar value because it
// doesn't matter whether it is a pointer or scalar value.
if (ArgVal == 0) {
ArgInDesc.Value = 0;
ArgInDesc.ArgType = Type::ARG_LITERAL;
DP("arg (no=%d) is null, done.\n", ArgNum);
return;
}
void *ArgPtr = reinterpret_cast<void *>(ArgVal);
if (ArgPtr == stdin || ArgPtr == stdout || ArgPtr == stderr) {
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is stdin/stdout/stderr, done.\n", ArgNum);
return;
}
const auto *AI = reinterpret_cast<HostRPCArgInfo *>(D->ArgInfo[ArgNum]);
DP("try to find arg (no=%d) from args AI=%p\n", ArgNum, AI);
if (AI) {
// Let's first check if Arg is a scalar.
if (AI->Type == ArgType::OMP_HOST_RPC_ARG_SCALAR) {
assert(AI->BasePtr == ArgPtr && "invalid scalar argument info");
assert(AI->Next == nullptr && "invalid scalar argument info");
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_LITERAL;
DP("arg (no=%d) is scalar, done.\n", ArgNum);
return;
}
// Then let's see if it is a literal pointer that we don't need copy.
auto *P = AI;
while (P) {
if (P->Type == ArgType::OMP_HOST_RPC_ARG_PTR && P->BasePtr == ArgPtr) {
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is literal pointer, done.\n", ArgNum);
return;
}
P = P->Next;
}
// Next we check if it is within the range of any buffer described in
// argument info.
P = AI;
while (P) {
if ((P->Type & ArgType::OMP_HOST_RPC_ARG_PTR) && P->Size) {
if (utils::isInRange(ArgPtr, P->BasePtr, P->Size)) {
auto Size = P->Size;
auto Offset = utils::getPtrDiff(ArgPtr, P->BasePtr);
ArgInDesc.Value = utils::ptrtoint(
getMappedPointer(D, P->BasePtr, Size, Offset, P->Type));
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("found a match for arg (no=%d). done.\n", ArgNum);
return;
}
}
P = P->Next;
}
}
// Now we can't find a match from argument info, then we assume it is from
// dynamic allocation.
memory::MemoryAllocationInfo MAI = memory::getMemoryAllocationInfo(ArgPtr);
if (MAI.isValid()) {
auto Size = MAI.Size;
auto Offset = utils::getPtrDiff(ArgPtr, MAI.BasePtr);
ArgInDesc.Value = utils::ptrtoint(
getMappedPointer(D, MAI.BasePtr, Size, Offset,
/* Kind */ ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM));
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is from malloc. done.\n", ArgNum);
return;
}
printf("request (id=%d) arg (no=%d, val=%p) is unknown. send it to host "
"directly.\n",
D->Id, ArgNum, ArgPtr);
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
}
__attribute__((noinline, used)) int64_t
__kmpc_host_rpc_send_and_wait(void *Desc) {
auto *D = reinterpret_cast<Descriptor *>(Desc);
int32_t Id = D->Id;
#ifdef HOSTRPC_PROFILING
AddArgEnd = omp_get_wtime();
IssueAndWaitStart = omp_get_wtime();
#endif
atomic::add(omptarget_hostrpc_futex, 1U, atomic::acq_rel);
// A system fence is required to make sure futex on the host is also
// updated if USM is supported.
fence::system(atomic::seq_cst);
DP("sent request (id=%d) to host. waiting for finish.\n", Id);
unsigned NS = 8;
while (atomic::addSys(omptarget_hostrpc_futex, 0)) {
asm volatile("nanosleep.u32 %0;" : : "r"(NS));
// if (NS < 64)
// NS *= 2;
// fence::system(atomic::seq_cst);
}
#ifdef HOSTRPC_PROFILING
IssueAndWaitEnd = omp_get_wtime();
#endif
DP("finish waiting for request (id=%d).\n", Id);
int64_t Ret = D->ReturnValue;
  assert((!D->NumArgs || D->ArgMap) && "arg map should not be nullptr");
#ifdef HOSTRPC_PROFILING
CopyBackStart = omp_get_wtime();
#endif
if (D->ArgMap) {
DP("copy memory back for request (id=%d).\n", Id);
copybackIfNeeded(D);
DP("finish copy memory back for request (id=%d).\n", Id);
}
#ifdef HOSTRPC_PROFILING
CopyBackEnd = omp_get_wtime();
#endif
HostRPCMemReset();
// We can unlock now as we already get all temporary part.
// TODO: If we have a queue, we don't need this step.
HostRPCLock.unlock();
DP("request (id=%d) is done with return code=%lx.\n", Id, Ret);
#ifdef HOSTRPC_PROFILING
printf("[host-rpc-profiling-device] id=%d, init=%lf, add_arg=%lf, wait=%lf, "
"copy=%lf.\n",
HostRPCId, GetDescEnd - GetDescStart, AddArgEnd - AddArgStart,
IssueAndWaitEnd - IssueAndWaitStart, CopyBackEnd - CopyBackStart);
#endif
return Ret;
}
__attribute__((noinline, used)) void
__kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
int32_t if_expr, int32_t num_threads,
void **args, int64_t nargs) {
constexpr const int64_t NumArgs = 6;
HostRPCArgInfo ArgInfoArray[NumArgs];
void *ArgInfo[NumArgs];
for (unsigned I = 0; I < NumArgs; ++I) {
ArgInfoArray[I].BasePtr = 0;
ArgInfoArray[I].Type = ArgType::OMP_HOST_RPC_ARG_SCALAR;
ArgInfoArray[I].Size = 0;
ArgInfoArray[I].Next = nullptr;
ArgInfo[I] = &ArgInfoArray[I];
}
auto *D = (Descriptor *)__kmpc_host_rpc_get_desc(0, NumArgs, (void *)ArgInfo);
// Set up arg info struct.
ArgInfoArray[0].BasePtr = const_cast<char *>(name);
ArgInfoArray[0].Type = ArgType::OMP_HOST_RPC_ARG_COPY_TO;
ArgInfoArray[0].Size = strlen(name) + 1;
ArgInfoArray[1].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(gtid));
ArgInfoArray[2].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(if_expr));
ArgInfoArray[3].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(num_threads));
ArgInfoArray[5].BasePtr = reinterpret_cast<void *>(nargs);
// We need to treat args in a little bit different way because nargs might be
// on the stack.
ArgInfoArray[4].Size = sizeof(void *) * nargs;
void *Args = nullptr;
if (nargs) {
Args = HostRPCMemAlloc(ArgInfoArray[4].Size);
__builtin_memcpy(Args, args, ArgInfoArray[4].Size);
}
ArgInfoArray[4].BasePtr = Args;
D->Id = CALLID___kmpc_launch_parallel_51_kernel;
__kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(name), 0);
__kmpc_host_rpc_add_arg(D, gtid, 1);
__kmpc_host_rpc_add_arg(D, if_expr, 2);
__kmpc_host_rpc_add_arg(D, num_threads, 3);
__kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(Args), 4);
__kmpc_host_rpc_add_arg(D, nargs, 5);
(void)__kmpc_host_rpc_send_and_wait(D);
}
}
#pragma omp end declare target
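// For context, a heavily simplified sketch of the host-side service loop this
// protocol assumes (hypothetical; the real loop lives in the libomptarget
// plugin and all names below are illustrative):
//
//   while (Running) {
//     if (__atomic_load_n(FutexHostPtr, __ATOMIC_ACQUIRE) > 0) {
//       Descriptor *D = DescHostPtr;           // shared via USM/pinned memory
//       D->ReturnValue = dispatch(D->Id, D->Args, D->NumArgs);
//       __atomic_fetch_sub(FutexHostPtr, 1, __ATOMIC_ACQ_REL); // wakes device
//     }
//   }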

View File

@@ -16,6 +16,7 @@
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"

View File

@@ -7,6 +7,12 @@
//===----------------------------------------------------------------------===//
#include "LibC.h"
#include "Debug.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Utils.h"
using namespace ompx;
#pragma omp begin declare target device_type(nohost)
@@ -32,8 +38,324 @@ int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
return -1;
}
} // namespace impl
#pragma omp end declare variant
// Dummy variable for stdio.
int StdInDummyVar;
int StdOutDummyVar;
int StdErrDummyVar;
struct FILE;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdin =
(FILE *)&StdInDummyVar;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdout =
(FILE *)&StdOutDummyVar;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stderr =
(FILE *)&StdErrDummyVar;
typedef int (*__compar_fn_t)(const void *, const void *);
typedef int (*__compar_d_fn_t)(const void *, const void *, void *);
namespace {
const int32_t ToLowerMapTable[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 'a', 'b',
'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 91, 92, 93, 94, 95, 96,
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 123, 124, 125, 126,
127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const int32_t *ToLowerPTable = ToLowerMapTable + 128;
#define X(x) (((x) / 256 | (x)*256) % 65536)
static const unsigned short BLocTable[] = {
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x320), X(0x220), X(0x220),
X(0x220), X(0x220), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x160),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8),
X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x8d5), X(0x8d5), X(0x8d5),
X(0x8d5), X(0x8d5), X(0x8d5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x200), 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
};
#undef X
static const unsigned short *BLocTablePTable = BLocTable + 128;
int ErrNo __attribute__((used, retain));
static unsigned long int RandomNext = 1;
#define SWAP(a, b, size) \
do { \
size_t __size = (size); \
char *__a = (a), *__b = (b); \
do { \
char __tmp = *__a; \
*__a++ = *__b; \
*__b++ = __tmp; \
} while (--__size > 0); \
} while (0)
/* Discontinue quicksort algorithm when partition gets below this size.
This particular magic number was chosen to work best on a Sun 4/260. */
#define MAX_THRESH 4
/* Stack node declarations used to store unfulfilled partition obligations. */
typedef struct {
char *lo;
char *hi;
} stack_node;
/* The next 4 #defines implement a very fast in-line stack abstraction. */
/* The stack needs log (total_elements) entries (we could even subtract
log(MAX_THRESH)). Since total_elements has type size_t, we get as
upper bound for log (total_elements):
bits per byte (CHAR_BIT) * sizeof(size_t). */
#define CHAR_BIT 8
#define STACK_SIZE (CHAR_BIT * sizeof(size_t))
#define PUSH(low, high) ((void)((top->lo = (low)), (top->hi = (high)), ++top))
#define POP(low, high) ((void)(--top, (low = top->lo), (high = top->hi)))
#define STACK_NOT_EMPTY (stack < top)
/* Order size using quicksort. This implementation incorporates
four optimizations discussed in Sedgewick:
1. Non-recursive, using an explicit stack of pointer that store the
next array partition to sort. To save time, this maximum amount
of space required to store an array of SIZE_MAX is allocated on the
stack. Assuming a 32-bit (64 bit) integer for size_t, this needs
only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes).
Pretty cheap, actually.
2. Chose the pivot element using a median-of-three decision tree.
This reduces the probability of selecting a bad pivot value and
eliminates certain extraneous comparisons.
3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
insertion sort to order the MAX_THRESH items within each partition.
This is a big win, since insertion sort is faster for small, mostly
sorted array segments.
4. The larger of the two sub-partitions is always pushed onto the
stack first, with the algorithm then concentrating on the
smaller partition. This *guarantees* no more than log (total_elems)
stack size is needed (actually O(1) in this case)! */
void _quicksort(void *const pbase, size_t total_elems, size_t size,
__compar_d_fn_t cmp, void *arg) {
char *base_ptr = (char *)pbase;
const size_t max_thresh = MAX_THRESH * size;
if (total_elems == 0)
/* Avoid lossage with unsigned arithmetic below. */
return;
if (total_elems > MAX_THRESH) {
char *lo = base_ptr;
char *hi = &lo[size * (total_elems - 1)];
stack_node stack[STACK_SIZE];
stack_node *top = stack;
PUSH(nullptr, nullptr);
while (STACK_NOT_EMPTY) {
char *left_ptr;
char *right_ptr;
/* Select median value from among LO, MID, and HI. Rearrange
LO and HI so the three values are sorted. This lowers the
probability of picking a pathological pivot value and
skips a comparison for both the LEFT_PTR and RIGHT_PTR in
the while loops. */
char *mid = lo + size * ((hi - lo) / size >> 1);
if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
SWAP(mid, lo, size);
if ((*cmp)((void *)hi, (void *)mid, arg) < 0)
SWAP(mid, hi, size);
else
goto jump_over;
if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
SWAP(mid, lo, size);
jump_over:;
left_ptr = lo + size;
right_ptr = hi - size;
/* Here's the famous ``collapse the walls'' section of quicksort.
Gotta like those tight inner loops! They are the main reason
that this algorithm runs much faster than others. */
do {
while ((*cmp)((void *)left_ptr, (void *)mid, arg) < 0)
left_ptr += size;
while ((*cmp)((void *)mid, (void *)right_ptr, arg) < 0)
right_ptr -= size;
if (left_ptr < right_ptr) {
SWAP(left_ptr, right_ptr, size);
if (mid == left_ptr)
mid = right_ptr;
else if (mid == right_ptr)
mid = left_ptr;
left_ptr += size;
right_ptr -= size;
} else if (left_ptr == right_ptr) {
left_ptr += size;
right_ptr -= size;
break;
}
} while (left_ptr <= right_ptr);
/* Set up pointers for next iteration. First determine whether
left and right partitions are below the threshold size. If so,
ignore one or both. Otherwise, push the larger partition's
bounds on the stack and continue sorting the smaller one. */
if ((size_t)(right_ptr - lo) <= max_thresh) {
if ((size_t)(hi - left_ptr) <= max_thresh)
/* Ignore both small partitions. */
POP(lo, hi);
else
/* Ignore small left partition. */
lo = left_ptr;
} else if ((size_t)(hi - left_ptr) <= max_thresh)
/* Ignore small right partition. */
hi = right_ptr;
else if ((right_ptr - lo) > (hi - left_ptr)) {
/* Push larger left partition indices. */
PUSH(lo, right_ptr);
lo = left_ptr;
} else {
/* Push larger right partition indices. */
PUSH(left_ptr, hi);
hi = right_ptr;
}
}
}
/* Once the BASE_PTR array is partially sorted by quicksort the rest
is completely sorted using insertion sort, since this is efficient
for partitions below MAX_THRESH size. BASE_PTR points to the beginning
of the array to sort, and END_PTR points at the very last element in
the array (*not* one beyond it!). */
#define min(x, y) ((x) < (y) ? (x) : (y))
{
char *const end_ptr = &base_ptr[size * (total_elems - 1)];
char *tmp_ptr = base_ptr;
char *thresh = min(end_ptr, base_ptr + max_thresh);
char *run_ptr;
/* Find smallest element in first threshold and place it at the
array's beginning. This is the smallest array element,
and the operation speeds up insertion sort's inner loop. */
for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
if ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
tmp_ptr = run_ptr;
if (tmp_ptr != base_ptr)
SWAP(tmp_ptr, base_ptr, size);
/* Insertion sort, running from left-hand-side up to right-hand-side. */
run_ptr = base_ptr + size;
while ((run_ptr += size) <= end_ptr) {
tmp_ptr = run_ptr - size;
while ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
tmp_ptr -= size;
tmp_ptr += size;
if (tmp_ptr != run_ptr) {
char *trav;
trav = run_ptr + size;
while (--trav >= run_ptr) {
char c = *trav;
char *hi, *lo;
for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
*hi = *lo;
*hi = c;
}
}
}
}
}
} // namespace
extern "C" {
int memcmp(const void *lhs, const void *rhs, size_t count) {
@@ -47,16 +369,419 @@ int memcmp(const void *lhs, const void *rhs, size_t count) {
return 0;
}
/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
return impl::omp_vprintf(Format, Arguments, Size);
}
// -----------------------------------------------------------------------------
#ifndef ULONG_MAX
#define ULONG_MAX ((unsigned long)(~0L)) /* 0xFFFFFFFF */
#endif
#ifndef LONG_MAX
#define LONG_MAX ((long)(ULONG_MAX >> 1)) /* 0x7FFFFFFF */
#endif
#ifndef LONG_MIN
#define LONG_MIN ((long)(~LONG_MAX)) /* 0x80000000 */
#endif
#ifndef ISDIGIT
#define ISDIGIT(C) ((C) >= '0' && (C) <= '9')
#endif
#ifndef ISSPACE
#define ISSPACE(C) ((C) == ' ')
#endif
#ifndef ISALPHA
#define ISALPHA(C) (((C) >= 'a' && (C) <= 'z') || ((C) >= 'A' && (C) <= 'Z'))
#endif
#ifndef ISUPPER
#define ISUPPER(C) ((C) >= 'A' && (C) <= 'Z')
#endif
long strtol(const char *nptr, char **endptr, int base) {
const char *s = nptr;
unsigned long acc;
int c;
unsigned long cutoff;
int neg = 0, any, cutlim;
/*
* Skip white space and pick up leading +/- sign if any.
* If base is 0, allow 0x for hex and 0 for octal, else
* assume decimal; if base is already 16, allow 0x.
*/
do {
c = *s++;
} while (ISSPACE(c));
if (c == '-') {
neg = 1;
c = *s++;
} else if (c == '+')
c = *s++;
if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X')) {
c = s[1];
s += 2;
base = 16;
}
if (base == 0)
base = c == '0' ? 8 : 10;
/*
* Compute the cutoff value between legal numbers and illegal
* numbers. That is the largest legal value, divided by the
* base. An input number that is greater than this value, if
* followed by a legal input character, is too big. One that
* is equal to this value may be valid or not; the limit
* between valid and invalid numbers is then based on the last
* digit. For instance, if the range for longs is
* [-2147483648..2147483647] and the input base is 10,
* cutoff will be set to 214748364 and cutlim to either
* 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
* a value > 214748364, or equal but the next digit is > 7 (or 8),
* the number is too big, and we will return a range error.
*
* Set any if any `digits' consumed; make it negative to indicate
* overflow.
*/
cutoff = neg ? -(unsigned long)LONG_MIN : LONG_MAX;
cutlim = cutoff % (unsigned long)base;
cutoff /= (unsigned long)base;
for (acc = 0, any = 0;; c = *s++) {
if (ISDIGIT(c))
c -= '0';
else if (ISALPHA(c))
c -= ISUPPER(c) ? 'A' - 10 : 'a' - 10;
else
break;
if (c >= base)
break;
if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
any = -1;
else {
any = 1;
acc *= base;
acc += c;
}
}
if (any < 0) {
acc = neg ? LONG_MIN : LONG_MAX;
errno = ERANGE;
} else if (neg)
acc = -acc;
if (endptr != 0)
*endptr = const_cast<char *>(any ? s - 1 : nptr);
return (acc);
}
int strcmp(const char *lhs, const char *rhs) {
  while (*lhs != '\0' && *rhs != '\0') {
    if (*lhs != *rhs)
      return *lhs - *rhs;
    ++lhs;
    ++rhs;
  }
  return *lhs - *rhs;
}
void *calloc(size_t num, size_t size) {
  size_t bytes = num * size;
  char *p = (char *)malloc(bytes);
  if (!p)
    return p;
  memset(p, 0, bytes);
  return p;
}
void exit(int exit_code) { asm volatile("exit;"); }
size_t strlen(const char *str) {
  size_t r = 0;
  while (*str != '\0') {
    ++r;
    ++str;
  }
  return r;
}
char *strcpy(char *dest, const char *src) {
char *pd = dest;
const char *ps = src;
while (*ps != '\0')
*(pd++) = *(ps++);
*pd = '\0';
return dest;
}
int *__errno_location() { return &ErrNo; }
char *strcat(char *dest, const char *src) {
char *pd = dest;
const char *ps = src;
while (*pd != '\0')
++pd;
while (*ps != '\0')
*(pd++) = *(ps++);
*pd = '\0';
return dest;
}
void perror(const char *s) { printf("%s", s); }
int strncmp(const char *lhs, const char *rhs, size_t count) {
  size_t c = 0;
  while (c < count && *lhs != '\0' && *rhs != '\0') {
    if (*lhs != *rhs)
      return *lhs - *rhs;
    ++lhs;
    ++rhs;
    ++c;
  }
  if (c == count)
    return 0;
  return *lhs - *rhs;
}
char *strncpy(char *dest, const char *src, size_t count) {
char *pd = dest;
const char *ps = src;
size_t c = 0;
while (*ps != '\0' && c < count) {
*(pd++) = *(ps++);
++c;
}
if (c < count)
*pd = '\0';
return dest;
}
char *strchr(const char *s, int c) {
do {
if (*s == c)
return const_cast<char *>(s);
} while (*s++);
return nullptr;
}
char *strtok(char *str, const char *delim) {
static char *s = nullptr;
char *tok;
if (str == nullptr) {
if (s == nullptr)
return nullptr;
} else
s = str;
for (size_t i; (*s != '\0'); s++) {
for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
;
if (delim[i] == '\0')
break;
}
if (*s == '\0')
return s = nullptr;
tok = s++;
for (size_t i; (*s != '\0'); s++) {
for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
;
if (delim[i] != '\0')
break;
}
if (*s != '\0') {
*s = '\0';
s++;
}
return tok;
}
void srand(unsigned seed) { RandomNext = seed; }
int rand() {
RandomNext = RandomNext * 1103515245 + 12345;
return static_cast<unsigned int>((RandomNext / 65536)) % 2147483647;
}
int abs(int n) { return n > 0 ? n : -n; }
void *memcpy(void *dest, const void *src, size_t count) {
__builtin_memcpy(dest, src, count);
return dest;
}
unsigned long strtoul(const char *str, char **str_end, int base) {
  // Note: only base-10 input is handled for now.
  unsigned long res = 0;
  while (*str == ' ')
    ++str;
  while (*str >= '0' && *str <= '9') {
    res = res * 10 + (unsigned long)(*str - '0');
    ++str;
  }
  if (str_end)
    *str_end = const_cast<char *>(str);
  return res;
}
double atof(const char *s) { return strtod(s, nullptr); }
int atoi(const char *str) { return (int)strtol(str, nullptr, 10); }
double strtod(const char *str, char **ptr) {
  // Parse the value in place; atof above simply forwards here.
  const char *p = str;
  char *End;
  if (ptr == (char **)0)
    ptr = &End;
  while (ISSPACE(*p))
    ++p;
  double Sign = 1.0;
  if (*p == '+' || *p == '-') {
    if (*p == '-')
      Sign = -1.0;
    ++p;
  }
  /* INF or INFINITY. */
  if ((p[0] == 'i' || p[0] == 'I') && (p[1] == 'n' || p[1] == 'N') &&
      (p[2] == 'f' || p[2] == 'F')) {
    if ((p[3] == 'i' || p[3] == 'I') && (p[4] == 'n' || p[4] == 'N') &&
        (p[5] == 'i' || p[5] == 'I') && (p[6] == 't' || p[6] == 'T') &&
        (p[7] == 'y' || p[7] == 'Y'))
      *ptr = const_cast<char *>(p + 8);
    else
      *ptr = const_cast<char *>(p + 3);
    return Sign * __builtin_inf();
  }
  /* NAN or NAN(foo). */
  if ((p[0] == 'n' || p[0] == 'N') && (p[1] == 'a' || p[1] == 'A') &&
      (p[2] == 'n' || p[2] == 'N')) {
    p += 3;
    if (*p == '(') {
      ++p;
      while (*p != '\0' && *p != ')')
        ++p;
      if (*p == ')')
        ++p;
    }
    *ptr = const_cast<char *>(p);
    return __builtin_nan("");
  }
  /* Digits, with 0 or 1 periods, then an optional exponent. */
  if (ISDIGIT(*p) || *p == '.') {
    double Value = 0.0;
    while (ISDIGIT(*p)) {
      Value = Value * 10.0 + (*p - '0');
      ++p;
    }
    if (*p == '.') {
      ++p;
      double Scale = 0.1;
      while (ISDIGIT(*p)) {
        Value += (*p - '0') * Scale;
        Scale *= 0.1;
        ++p;
      }
    }
    /* Exponent. */
    if (*p == 'e' || *p == 'E') {
      int i = 1;
      int ExpSign = 1;
      if (p[i] == '+' || p[i] == '-') {
        if (p[i] == '-')
          ExpSign = -1;
        ++i;
      }
      if (ISDIGIT(p[i])) {
        int Exp = 0;
        while (ISDIGIT(p[i])) {
          Exp = Exp * 10 + (p[i] - '0');
          ++i;
        }
        p += i;
        while (Exp-- > 0)
          Value = ExpSign > 0 ? Value * 10.0 : Value / 10.0;
      }
    }
    *ptr = const_cast<char *>(p);
    return Sign * Value;
  }
  /* Didn't find any digits. Doesn't look like a number. */
  *ptr = const_cast<char *>(str);
  return 0.0;
}
void *memset(void *dest, int ch, size_t count) {
auto *P = reinterpret_cast<unsigned char *>(dest);
for (size_t I = 0; I < count; ++I, ++P)
*P = (unsigned char)ch;
return dest;
}
int tolower(int ch) {
if (ch >= 'A' && ch <= 'Z')
return ch + 32;
return ch;
}
char *strstr(const char *s1, const char *s2) {
const size_t len = strlen(s2);
while (*s1) {
if (!memcmp(s1, s2, len))
return const_cast<char *>(s1);
++s1;
}
return (0);
}
char *__xpg_basename(const char *path) { return const_cast<char *>(path); }
const unsigned short **__ctype_b_loc(void) { return &BLocTablePTable; }
const int32_t **__ctype_tolower_loc(void) { return &ToLowerPTable; }
void qsort(void *b, size_t n, size_t s, __compar_fn_t cmp) {
return _quicksort(b, n, s, (__compar_d_fn_t)cmp, nullptr);
}
int strcasecmp(const char *s1, const char *s2) {
const unsigned char *p1 = (const unsigned char *)s1;
const unsigned char *p2 = (const unsigned char *)s2;
int result;
if (p1 == p2)
return 0;
while ((result = tolower(*p1) - tolower(*p2++)) == 0)
if (*p1++ == '\0')
break;
return result;
}
}
#pragma omp end declare target

View File

@@ -282,9 +282,15 @@ uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
  // In generic mode, the last warp is reserved for the main thread, so remove
  // it (or its active part) from the count.
if (IsSPMD)
return BlockSize;
uint32_t WarpSize = impl::getWarpSize();
bool IsFullLastWarp = !(BlockSize % WarpSize);
if (OMP_LIKELY(IsFullLastWarp))
return BlockSize - WarpSize;
return BlockSize - BlockSize % WarpSize;
}
uint32_t mapping::getMaxTeamThreads() {
return mapping::getMaxTeamThreads(mapping::isSPMDMode());
}
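// Worked examples for the computation above (values are illustrative):
//   Generic, BlockSize = 128, WarpSize = 32: the last warp is full, so
//     getMaxTeamThreads() == 128 - 32 == 96.
//   Generic, BlockSize = 100, WarpSize = 32: 100 % 32 == 4 threads occupy the
//     partial last warp, so getMaxTeamThreads() == 100 - 4 == 96.
//   SPMD: the full block, 128 (or 100) threads, is available.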

View File

@@ -47,6 +47,9 @@ using namespace ompx;
namespace {
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
if (NumThreadsClause != -1)
return NumThreadsClause;
uint32_t NThreadsICV =
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
uint32_t NumThreads = mapping::getMaxTeamThreads();

View File

@@ -17,6 +17,7 @@
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
@@ -83,6 +84,7 @@ struct SharedMemorySmartStackTy {
void *push(uint64_t Bytes);
  /// Deallocate the last allocation made by the encountering thread and
  /// pointed to by \p Ptr from the stack. Each thread can call this function.
void pop(void *Ptr, uint32_t Bytes);
@@ -274,7 +276,8 @@ void state::enterDataEnvironment(IdentTy *Ident) {
uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
uint32_t Bytes =
        sizeof(ThreadStates[0]) *
        (mapping::getNumberOfThreadsInBlock() + (mapping::isSPMDMode() ? 0 : 1));
void *ThreadStatesPtr =
memory::allocGlobal(Bytes, "Thread state array allocation");
memset(ThreadStatesPtr, 0, Bytes);
@@ -371,6 +374,15 @@ int omp_get_thread_num(void) {
return omp_get_ancestor_thread_num(omp_get_level());
}
int omp_get_bulk_thread_num() {
ASSERT(mapping::isSPMDMode(), "Not SPMD");
int BId = mapping::getBlockIdInKernel();
int BSize = mapping::getMaxTeamThreads();
int TId = mapping::getThreadIdInBlock();
int Id = BId * BSize + TId;
return returnValIfLevelIsActive(omp_get_level(), Id, 0);
}
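// Example (illustrative): with 4 blocks of 256 usable threads in SPMD mode,
// the thread with BId == 2 and TId == 17 gets bulk thread number
// 2 * 256 + 17 == 529, and omp_get_bulk_num_threads() returns 4 * 256 == 1024.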
int omp_get_team_size(int Level) {
return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
}
@@ -379,6 +391,11 @@ int omp_get_num_threads(void) {
return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
}
int omp_get_bulk_num_threads(void) {
ASSERT(mapping::isSPMDMode(), "Not SPMD");
return mapping::getNumberOfThreadsInKernel();
}
int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

View File

@@ -122,6 +122,8 @@ uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
}
///}
int32_t atomicAddSys(int32_t *Address, int32_t Val);

// Forward declarations, to be defined for AMDGCN and NVPTX.
uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
atomic::MemScopeTy MemScope);
@@ -336,6 +338,10 @@ uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
return __nvvm_atom_inc_gen_ui(Address, Val);
}
int32_t atomicAddSys(int32_t *Address, int32_t Val) {
return __nvvm_atom_sys_add_gen_i(Address, Val);
}
void namedBarrierInit() {}
void namedBarrier() {
@@ -509,6 +515,22 @@ void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
int32_t atomic::addSys(int32_t *Addr, int32_t Val) {
return impl::atomicAddSys(Addr, Val);
}
void mutex::TicketLock::lock() {
uint64_t MyTicket = atomic::add(&NextTicket, 1, atomic::seq_cst);
while (atomic::load(&NowServing, atomic::aquire) != MyTicket)
;
fence::kernel(atomic::aquire);
}
void mutex::TicketLock::unlock() {
fence::kernel(atomic::release);
atomic::add(&NowServing, 1, atomic::seq_cst);
}
extern "C" {
void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}

View File

@@ -0,0 +1,304 @@
//===------- WarpAllocator.cpp - Warp memory allocator ------- C++ -*-========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
using namespace ompx;
char *CONSTANT(omptarget_device_heap_buffer)
__attribute__((used, retain, weak, visibility("protected")));
size_t CONSTANT(omptarget_device_heap_size)
__attribute__((used, retain, weak, visibility("protected")));
namespace {
constexpr const size_t Alignment = 16;
constexpr const size_t FirstThreadRatio = 40;
constexpr const size_t SplitThreshold = Alignment * 4;
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator;
class WarpAllocatorEntry {
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> friend struct WarpAllocator;
/// If Size is less than 0, the entry is allocated (in use).
int64_t Size = 0;
/// PrevSize is always greater than or equal to 0. When it is 0, this is the
/// first entry of the buffer.
int64_t PrevSize = 0;
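// Size and PrevSize act as boundary tags: getNext() advances by this entry's
// size and getPrev() walks back by the previous entry's size, so the heap
// forms an implicit doubly-linked list with no explicit pointers.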
public:
bool isFirst() const { return !PrevSize; }
size_t getSize() const { return abs(Size); }
void setSize(size_t V) { Size = V; }
void setPrevSize(WarpAllocatorEntry *Prev) {
PrevSize = Prev ? Prev->getSize() : 0;
}
size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }
// Note: isUsed cannot be defined as !isUnused (or vice versa) because a Size
// of 0 means the entry is uninitialized.
bool isUsed() const { return Size < 0; }
bool isUnused() const { return Size > 0; }
void setUsed() {
assert(isUnused() && "the entry is in use");
Size *= -1;
}
void setUnused() {
assert(isUsed() && "the entry is not in use");
Size *= -1;
}
char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }
WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }
static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }
static WarpAllocatorEntry *fromPtr(void *Ptr) {
return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
}
};
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
void init() {
if (mapping::isSPMDMode() &&
(mapping::getThreadIdInBlock() || mapping::getBlockIdInKernel()))
return;
size_t HeapSize = omptarget_device_heap_size;
FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
size_t OtherThreadHeapSize =
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
size_t TeamHeapSize = FirstThreadHeapSize / TEAM_SIZE;
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
FirstTeamSize = TeamHeapSize;
char *LastLimit = omptarget_device_heap_buffer;
for (int I = 0; I < WARP_SIZE; ++I) {
for (int J = 0; J < TEAM_SIZE; ++J) {
Entries[I][J] = nullptr;
Limits[I][J] = LastLimit + TeamHeapSize * (J + 1);
}
LastLimit += I ? OtherThreadHeapSize : FirstThreadHeapSize;
Limits[I][TEAM_SIZE - 1] = LastLimit;
TeamHeapSize = OtherThreadHeapSize / TEAM_SIZE;
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
}
}
void *allocate(size_t Size) {
int32_t TeamSlot = getTeamSlot();
int32_t TIdInWarp = mapping::getThreadIdInWarp();
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
// Error out early if the requested size is larger than the entire block.
if (Size > getBlockSize(TIdInWarp, TeamSlot))
return nullptr;
WarpAllocatorEntry *E = nullptr;
{
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
auto *LastEntry = Entries[TIdInWarp][TeamSlot];
auto *NewWatermark = (LastEntry ? LastEntry->getEndPtr()
: getBlockBegin(TIdInWarp, TeamSlot)) +
Size;
if (NewWatermark >= Limits[TIdInWarp][TeamSlot]) {
E = findMemorySlow(Size, TIdInWarp, TeamSlot);
} else {
E = LastEntry ? LastEntry->getNext()
: WarpAllocatorEntry::fromPtr(
getBlockBegin(TIdInWarp, TeamSlot));
E->setSize(Size);
E->setPrevSize(LastEntry);
Entries[TIdInWarp][TeamSlot] = E;
}
if (!E)
return nullptr;
E->setUsed();
}
return E->getUserPtr();
}
void deallocate(void *Ptr) {
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
auto TeamSlot = getTeamSlot();
auto TIdInWarp = mapping::getThreadIdInWarp();
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
E->setUnused();
// If this is the newest entry, pop it and any trailing unused entries.
if (E == Entries[TIdInWarp][TeamSlot]) {
do {
E = E->getPrev();
} while (!E->isFirst() && !E->isUsed());
Entries[TIdInWarp][TeamSlot] = E;
}
}
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
if (!utils::isInRange(P, omptarget_device_heap_buffer,
omptarget_device_heap_size))
return {};
auto TeamSlot = getTeamSlot();
auto TIdInWarp = mapping::getThreadIdInWarp();
for (int I = TIdInWarp; I < TIdInWarp + WARP_SIZE; ++I) {
int TId = I % WARP_SIZE;
for (int J = TeamSlot; J < TeamSlot + TEAM_SIZE; ++J) {
int SId = J % TEAM_SIZE;
if (P < getBlockBegin(TId, SId) || P >= getBlockEnd(TId, SId))
continue;
mutex::LockGuard LG(Locks[I][SId]);
WarpAllocatorEntry *E = Entries[I][SId];
if (!E)
return {};
if (E->getEndPtr() <= P)
return {};
do {
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
if (!E->isUsed())
return {};
return {E->getUserPtr(), E->getUserSize()};
}
E = E->getPrev();
} while (!E->isFirst());
}
}
return {};
}
private:
char *getBlockBegin(int32_t TIdInWarp, int32_t TeamSlot) const {
if (TeamSlot)
return Limits[TIdInWarp][TeamSlot - 1];
if (TIdInWarp)
return Limits[TIdInWarp - 1][TEAM_SIZE - 1];
return omptarget_device_heap_buffer;
}
char *getBlockEnd(int32_t TIdInWarp, int32_t TeamSlot) const {
return Limits[TIdInWarp][TeamSlot];
}
size_t getBlockSize(int32_t TIdInWarp, int32_t TeamSlot) const {
return getBlockEnd(TIdInWarp, TeamSlot) -
getBlockBegin(TIdInWarp, TeamSlot);
}
static int32_t getTeamSlot() { return mapping::getBlockIdInKernel() % TEAM_SIZE; }
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp,
int32_t TeamSlot) {
char *Ptr = getBlockBegin(TIdInWarp, TeamSlot);
char *Limit = getBlockEnd(TIdInWarp, TeamSlot);
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
do {
if (!E->isUsed() && E->getSize() >= Size)
break;
E = E->getNext();
if (reinterpret_cast<char *>(E) + Size > Limit)
return nullptr;
} while (1);
size_t OldSize = E->getSize();
if (OldSize - Size >= SplitThreshold) {
auto *OldNext = E->getNext();
E->setSize(Size);
auto *LeftOverE = E->getNext();
LeftOverE->setPrevSize(E);
LeftOverE->setSize(OldSize - Size);
OldNext->setPrevSize(LeftOverE);
}
return E;
}
WarpAllocatorEntry *Entries[WARP_SIZE][TEAM_SIZE];
char *Limits[WARP_SIZE][TEAM_SIZE];
mutex::TicketLock Locks[WARP_SIZE][TEAM_SIZE];
size_t FirstThreadHeapSize;
size_t FirstTeamSize;
};
WarpAllocator<32, 16> Allocator;
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
return Allocator.getMemoryAllocationInfo(P);
}
} // namespace memory
} // namespace ompx
extern "C" {
void *malloc(size_t Size) {
if (!Size)
return nullptr;
void *P = Allocator.allocate(Size);
assert(P && "allocator out of memory");
assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
"misaligned address");
return P;
}
void free(void *P) {
if (!P)
return;
Allocator.deallocate(P);
}
void *realloc(void *ptr, size_t new_size) {
  void *NewPtr = malloc(new_size);
  if (!NewPtr)
    return nullptr;
  // realloc(nullptr, n) must behave like malloc(n); only copy from and free
  // a valid old block.
  if (ptr) {
    WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
    __builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
    free(ptr);
  }
  return NewPtr;
}
void __kmpc_target_init_allocator() { Allocator.init(); }
}
#pragma omp end declare target
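The heap layout above is a static two-level partition: lane 0 of the warp receives FirstThreadRatio percent of the heap (it typically allocates the most), the remaining lanes split the rest evenly, and each lane's region is subdivided into TEAM_SIZE team slots so blocks mapped to different slots never contend for a lock. A standalone sketch of the same partition arithmetic, runnable on the host (the constants mirror the ones above; the heap size is an arbitrary example):

#include <cstddef>
#include <cstdio>

constexpr size_t Alignment = 16;
constexpr size_t FirstThreadRatio = 40; // percent of the heap for lane 0
constexpr size_t WarpSize = 32;
constexpr size_t TeamSize = 16;

size_t alignDown(size_t V, size_t A) { return V & ~(A - 1); }

int main() {
  size_t HeapSize = 64 * 1024 * 1024; // e.g. a 64 MiB device heap
  size_t FirstLane = alignDown(HeapSize * FirstThreadRatio / 100, Alignment);
  size_t OtherLane =
      alignDown((HeapSize - FirstLane) / (WarpSize - 1), Alignment);
  std::printf("lane 0: %zu bytes, %zu per team slot\n", FirstLane,
              alignDown(FirstLane / TeamSize, Alignment));
  std::printf("lanes 1-31: %zu bytes each, %zu per team slot\n", OtherLane,
              alignDown(OtherLane / TeamSize, Alignment));
}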

View File

@@ -0,0 +1,275 @@
//===------- WarpAllocator.cpp - Warp memory allocator ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A device-side allocator that statically partitions the device heap across
// warp lanes.
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
using namespace ompx;
char *CONSTANT(omptarget_device_heap_buffer)
__attribute__((used, retain, weak, visibility("protected")));
size_t CONSTANT(omptarget_device_heap_size)
__attribute__((used, retain, weak, visibility("protected")));
namespace {
constexpr const size_t Alignment = 16;
constexpr const size_t FirstThreadRatio = 40;
constexpr const size_t SplitThreshold = Alignment * 16;
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
template <int32_t WARP_SIZE> struct WarpAllocator;
class WarpAllocatorEntry {
template <int32_t WARP_SIZE> friend struct WarpAllocator;
/// If Size is less than 0, the entry is allocated (in use).
int64_t Size = 0;
/// PrevSize is always greater than or equal to 0. When it is 0, this is the
/// first entry of the buffer.
int64_t PrevSize = 0;
public:
bool isFirst() const { return !PrevSize; }
size_t getSize() const { return abs(Size); }
void setSize(size_t V) { Size = V; }
void setPrevSize(WarpAllocatorEntry *Prev) {
PrevSize = Prev ? Prev->getSize() : 0;
}
size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }
// Note: isUsed cannot be defined as !isUnused (or vice versa) because a Size
// of 0 means the entry is uninitialized.
bool isUsed() const { return Size < 0; }
bool isUnused() const { return Size > 0; }
void setUsed() {
assert(isUnused() && "the entry is in use");
Size *= -1;
}
void setUnused() {
assert(isUsed() && "the entry is not in use");
Size *= -1;
}
char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }
WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }
static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }
static WarpAllocatorEntry *fromPtr(void *Ptr) {
return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
}
};
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
template <int32_t WARP_SIZE> struct WarpAllocator {
void init() {
if (mapping::isSPMDMode() &&
(mapping::getThreadIdInBlock() || mapping::getBlockId()))
return;
size_t HeapSize = omptarget_device_heap_size;
size_t FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
size_t OtherThreadHeapSize =
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
for (int I = 0; I < WARP_SIZE; ++I) {
Entries[I] = nullptr;
size_t PrivateOffset = OtherThreadHeapSize * I + FirstThreadHeapSize;
Limits[I] = omptarget_device_heap_buffer + PrivateOffset;
}
}
void *allocate(size_t Size) {
int32_t TIdInWarp = mapping::getThreadIdInWarp();
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
// Error out early if the requested size is larger than the entire block.
if (Size > getBlockSize(TIdInWarp))
return nullptr;
WarpAllocatorEntry *E = nullptr;
{
mutex::LockGuard LG(Locks[TIdInWarp]);
auto *LastEntry = Entries[TIdInWarp];
auto *NewWatermark =
(LastEntry ? LastEntry->getEndPtr() : getBlockBegin(TIdInWarp)) +
Size;
if (NewWatermark >= Limits[TIdInWarp]) {
E = findMemorySlow(Size, TIdInWarp);
} else {
E = LastEntry ? LastEntry->getNext()
: WarpAllocatorEntry::fromPtr(getBlockBegin(TIdInWarp));
E->setSize(Size);
E->setPrevSize(LastEntry);
Entries[TIdInWarp] = E;
}
}
if (!E)
return nullptr;
assert(E->isUnused() && "entry was not properly initialized as unused");
E->setUsed();
return E->getUserPtr();
}
void deallocate(void *Ptr) {
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
auto TIdInWarp = mapping::getThreadIdInWarp();
mutex::LockGuard LG(Locks[TIdInWarp]);
E->setUnused();
// If this is the newest entry, pop it and any trailing unused entries.
if (E == Entries[TIdInWarp]) {
do {
E = E->getPrev();
} while (!E->isFirst() && !E->isUsed());
Entries[TIdInWarp] = E;
}
}
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
if (!utils::isInRange(P, omptarget_device_heap_buffer,
omptarget_device_heap_size))
return {};
auto TIdInWarp = mapping::getThreadIdInWarp();
for (int I = TIdInWarp; I < TIdInWarp + mapping::getWarpSize(); ++I) {
int TId = I % mapping::getWarpSize();
if (P < getBlockBegin(TId) || P >= getBlockEnd(TId))
continue;
mutex::LockGuard LG(Locks[I]);
WarpAllocatorEntry *E = Entries[I];
if (!E)
return {};
if (E->getEndPtr() <= P)
return {};
do {
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
if (!E->isUsed())
return {};
return {E->getUserPtr(), E->getUserSize()};
}
E = E->getPrev();
} while (!E->isFirst());
}
return {};
}
private:
char *getBlockBegin(int32_t TIdInWarp) const {
return TIdInWarp ? Limits[TIdInWarp - 1] : omptarget_device_heap_buffer;
}
char *getBlockEnd(int32_t TIdInWarp) const { return Limits[TIdInWarp]; }
size_t getBlockSize(int32_t TIdInWarp) const {
return getBlockEnd(TIdInWarp) - getBlockBegin(TIdInWarp);
}
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp) {
char *Ptr = getBlockBegin(TIdInWarp);
char *Limit = getBlockEnd(TIdInWarp);
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
do {
if (!E->isUsed() && E->getSize() >= Size)
break;
E = E->getNext();
if (reinterpret_cast<char *>(E) + Size > Limit)
return nullptr;
} while (1);
size_t OldSize = E->getSize();
if (OldSize - Size >= SplitThreshold) {
auto *OldNext = E->getNext();
E->setSize(Size);
auto *LeftOverE = E->getNext();
LeftOverE->setPrevSize(E);
LeftOverE->setSize(OldSize - Size);
OldNext->setPrevSize(LeftOverE);
}
return E;
}
WarpAllocatorEntry *Entries[WARP_SIZE];
char *Limits[WARP_SIZE];
mutex::TicketLock Locks[WARP_SIZE];
};
WarpAllocator<32> Allocator;
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
return Allocator.getMemoryAllocationInfo(P);
}
} // namespace memory
} // namespace ompx
extern "C" {
void *malloc(size_t Size) {
if (!Size)
return nullptr;
void *P = Allocator.allocate(Size);
assert(P && "allocator out of memory");
assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
"misaligned address");
return P;
}
void free(void *P) {
if (!P)
return;
Allocator.deallocate(P);
}
void *realloc(void *ptr, size_t new_size) {
  void *NewPtr = malloc(new_size);
  if (!NewPtr)
    return nullptr;
  // realloc(nullptr, n) must behave like malloc(n); only copy from and free
  // a valid old block.
  if (ptr) {
    WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
    __builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
    free(ptr);
  }
  return NewPtr;
}
void __kmpc_target_init_allocator() { Allocator.init(); }
}
#pragma omp end declare target
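This second variant drops the team dimension and gives each of the 32 warp lanes one flat partition. Both variants back the device-side malloc/free defined above; with the direct-compilation driver, user code simply calls the C heap functions from device code. A hedged sketch (illustrative only, not part of the patch):

#include <cstdlib>
#include <cstring>

// Ordinary heap usage; compiled for the device, malloc/free resolve to the
// WarpAllocator and results are 16-byte aligned per the asserts above.
int deviceScratch(int N) {
  int *Buf = static_cast<int *>(malloc(N * sizeof(int)));
  if (!Buf)
    return -1;
  memset(Buf, 0, N * sizeof(int));
  Buf[0] = N;
  int Ret = Buf[0];
  free(Buf); // newest allocation: the bump pointer pops straight back
  return Ret;
}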

View File

@@ -111,6 +111,38 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
stride = loopSize; // make sure we only do 1 chunk per warp
}
static void ForStaticNoLoopNoChunk(int &last, T &lb, T &ub, ST &stride,
ST &chunk, T entityId,
T numberOfEntities) {
T loopSize = ub - lb + 1;
chunk = loopSize / numberOfEntities;
// We have more entities than iterations.
if (chunk == 0) {
chunk = 1;
lb = lb + entityId * chunk;
T inputUb = ub;
ub = lb + chunk - 1;
last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
return;
}
T leftOver = loopSize - chunk * numberOfEntities;
if (entityId < leftOver) {
chunk++;
lb = lb + entityId * chunk;
} else {
lb = lb + entityId * chunk + leftOver;
}
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
}
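// Worked example for ForStaticNoLoopNoChunk: lb=0, ub=9 (10 iterations) and
// 4 entities gives chunk = 10/4 = 2 with leftOver = 2, so entities 0 and 1
// get 3 iterations ([0,2], [3,5]), entities 2 and 3 get 2 ([6,7], [8,9]),
// and last is set only for entity 3.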
////////////////////////////////////////////////////////////////////////////////
// Support for Static Init
@@ -184,6 +216,19 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_parallel_static_noloop_chunk: {
ForStaticChunk(lastiter, lb, ub, stride, chunk,
numberOfActiveOMPThreads * omp_get_team_num() + gtid,
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_parallel_static_noloop_nochunk: {
ForStaticNoLoopNoChunk(lastiter, lb, ub, stride, chunk,
numberOfActiveOMPThreads * omp_get_team_num() +
gtid,
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
default: {
// ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,

View File

@@ -11,8 +11,62 @@ _ZN4ompx*
IsSPMDMode
malloc
free
memcmp
printf
__assert_fail
__ctype_b_loc
__ctype_tolower_loc
__xpg_basename
abs
atof
atoi
calloc
clock
clock_gettime
exit
fclose
feof
fflush
fgets
fopen
fprintf
fputs
fread
free
fscanf
fseek
ftell
fwrite
getc
gettimeofday
gmtime
malloc
memcmp
memcpy
memset
pclose
perror
popen
printf
qsort
rand
realloc
rewind
sprintf
srand
sscanf
stat
strcat
strchr
strcmp
strcpy
strftime
strlen
strncmp
strncpy
strstr
strtod
strtok
strtol
strtoul
time
tolower
strcasecmp

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=False)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=True)
if __name__ == "__main__":
main()
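Installed under names such as clang-gpu and clang-gpu++ (the install names are an assumption; only the two entry scripts are shown), these wrappers take an ordinary compiler command line, e.g. clang-gpu -v --offload-arch=sm_75 test.c, and forward it to the driver in driver/wrapper.py below with the GPU loader object linked in.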

View File

@@ -0,0 +1,40 @@
//===------- Main.c - Direct compilation program start point ------ C -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <string.h>
extern int __user_main(int, char *[]);
extern void __kmpc_target_init_allocator(void);
#ifdef SINGLE_THREAD_EXECUTION
#define THREAD_LIMIT 1
#else
#define THREAD_LIMIT 1024
#endif
int main(int argc, char *argv[]) {
#pragma omp target enter data map(to: argv[:argc])
for (int I = 0; I < argc; ++I) {
#pragma omp target enter data map(to: argv[I][:strlen(argv[I])])
}
int Ret = 0;
#pragma omp target enter data map(to: Ret)
#pragma omp target teams num_teams(1) thread_limit(1)
{ __kmpc_target_init_allocator(); }
#pragma omp target teams num_teams(1) thread_limit(THREAD_LIMIT)
{
Ret = __user_main(argc, argv);
}
#pragma omp target exit data map(from: Ret)
return Ret;
}
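Main.c is the loader linked into every non -c compilation: it maps argc/argv onto the device, initializes the warp allocator from a single thread, then runs the renamed user entry point in a one-team region and copies the exit code back. Under the driver's -canonical-main-function-name=__user_main rewrite an ordinary program needs no source changes; a hedged sketch of such a program (nothing here is part of the patch):

#include <stdio.h>

// An unmodified user program: the pass renames main to __user_main and
// UserWrapper.h places the file in a declare-target region, so this body
// executes on the GPU with printf serviced over host RPC.
int main(int argc, char *argv[]) {
  printf("hello from the device, argc=%d\n", argc);
  return 0;
}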

View File

@@ -0,0 +1,14 @@
//===------- UserWrapper.h - User code wrapper --------------- C++ --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
#define OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
#pragma omp begin declare target device_type(nohost)
#endif

View File

@@ -0,0 +1,141 @@
import argparse
import os
import pathlib
import subprocess
import sys
import tempfile
cwd = os.path.dirname(os.path.realpath(__file__))
source_file_suffix = [".c", ".cpp", ".cu", ".hip", ".cc", ".cxx"]
def print_version():
cp = subprocess.run(["clang", "--version"])
if cp.returncode != 0:
sys.exit(cp.returncode)
def compile_loader(loader_name, targets, verbose, dry_run):
cmd = [
"clang",
"-c",
"-fopenmp",
"-foffload-lto",
"-fopenmp-offload-mandatory",
"-o",
loader_name,
]
if targets:
for arch in targets:
cmd.append("--offload-arch={}".format(arch))
else:
cmd.append("--offload-arch=native")
cmd.append(os.path.join(cwd, "Main.c"))
    if verbose or dry_run:
        print(" ".join(cmd), file=sys.stderr)
    if dry_run:
        return
cp = subprocess.run(cmd)
if cp.returncode != 0:
sys.exit(cp.returncode)
def invoke_clang(is_cpp, args, targets, verbose, dry_run):
cmd = [
"clang++" if is_cpp else "clang",
"-fopenmp",
"-foffload-lto",
"-fopenmp-offload-mandatory",
"-fopenmp-globalize-to-global-space",
"-include",
os.path.join(cwd, "UserWrapper.h"),
"--save-temps",
"-rdynamic",
"-mllvm",
"-enable-host-rpc",
"-mllvm",
"-openmp-opt-disable-state-machine-rewrite",
"-mllvm",
"-enable-canonicalize-main-function",
"-mllvm",
"-canonical-main-function-name=__user_main",
]
if targets:
for arch in targets:
cmd.append("--offload-arch={}".format(arch))
else:
cmd.append("--offload-arch=native")
cmd += args
    if verbose or dry_run:
        print(" ".join(cmd), file=sys.stderr)
    if dry_run:
        return
cp = subprocess.run(cmd)
if cp.returncode != 0:
sys.exit(cp.returncode)
def run(is_cpp=False):
parser = argparse.ArgumentParser(
prog="clang-gpu", description="clang LLVM GPU compiler"
)
parser.add_argument(
"-c",
action="store_true",
help="Only run preprocess, compile, and assemble steps",
)
parser.add_argument(
"-v", action="store_true", help="Show commands to run and use verbose output"
)
parser.add_argument(
"--version", action="store_true", help="Print version information"
)
parser.add_argument(
"-###",
action="store_true",
help="Print (but do not run) the commands to run for this compilation",
)
parser.add_argument(
"--offload-arch",
type=str,
help=(
"Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g."
" sm_35). If 'native' is used the compiler will detect locally installed"
" architectures. For HIP offloading, the device architecture can be"
" followed by target ID features delimited by a colon (e.g."
" gfx908:xnack+:sramecc-). May be specified more than once."
),
nargs="*",
)
args, fwd_args = parser.parse_known_args()
if args.version:
print_version()
return
if args.c:
fwd_args.append("-c")
if args.v:
fwd_args.append("-v")
dry_run = vars(args)["###"]
loader_name = None
temp_files = []
if not args.c:
tf = tempfile.NamedTemporaryFile()
loader_name = "{}.o".format(tf.name)
compile_loader(loader_name, args.offload_arch, args.v, dry_run)
fwd_args.append(loader_name)
temp_files.append(loader_name)
invoke_clang(is_cpp, fwd_args, args.offload_arch, args.v, dry_run)
for f in temp_files:
if os.path.isfile(f):
os.unlink(f)

View File

@@ -0,0 +1,67 @@
//===------- HostRPC.h - Host RPC ---------------------------- C++ --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
#define OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
#ifdef OMPTARGET_DEVICE_RUNTIME
#include "Types.h"
#else
#include <cstdint>
#endif
namespace hostrpc {
/// RPC call identifier. Note: negative values only; non-negative values are
/// reserved for compiler-generated functions.
enum CallId {
CALLID___kmpc_launch_parallel_51_kernel = -1,
CALLID_invalid = -2147483647 - 1, // INT32_MIN; -2147483648 is not an int literal
};
/// Execution status.
enum ExecutionStatus {
EXEC_STAT_CREATED = 0,
EXEC_STAT_DONE = 1,
};
enum Type {
ARG_LITERAL = 0,
ARG_POINTER = 1,
};
struct Argument {
intptr_t Value;
int64_t ArgType;
};
struct Descriptor {
// The following members are used by both host and device.
int32_t Id;
struct Argument *Args;
int64_t NumArgs;
volatile int64_t Status;
volatile int64_t ReturnValue;
// The following members are only used by the device.
void **ArgInfo;
void *ArgMap;
};
/// A wrapper of Descriptor that is only used between plugins and
/// libomptarget. It additionally carries the three standard I/O streams.
struct DescriptorWrapper {
Descriptor D;
void *StdIn = nullptr;
void *StdOut = nullptr;
void *StdErr = nullptr;
};
} // namespace hostrpc
#endif
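The Descriptor acts as the shared mailbox of the protocol: the device fills in Id, Args, and NumArgs, sets Status, and waits; the host services the call, writes ReturnValue, and marks the descriptor EXEC_STAT_DONE. A host-side sketch of servicing one descriptor (dispatchCall is hypothetical; only the struct and enums come from this header):

#include <cstdint>
// #include "HostRPC.h" // the header above provides hostrpc::Descriptor

// Hypothetical host-side dispatcher keyed on the CallId.
int64_t dispatchCall(int32_t Id, hostrpc::Argument *Args, int64_t NumArgs);

// Consume one descriptor the device marked ready and publish the result;
// the volatile Status/ReturnValue fields are what the device polls.
void serviceDescriptor(hostrpc::Descriptor &D) {
  if (D.Status != hostrpc::EXEC_STAT_CREATED)
    return; // nothing pending
  D.ReturnValue = dispatchCall(D.Id, D.Args, D.NumArgs);
  D.Status = hostrpc::EXEC_STAT_DONE;
}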

View File

@@ -424,6 +424,10 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
void *VAddr, bool IsRecord, bool SaveOutput,
uint64_t &ReqPtrArgOffset);
// Host RPC support
int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNo);
void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal);
#ifdef __cplusplus
}
#endif

View File

@@ -69,7 +69,8 @@ VERS1.0 {
__tgt_interop_init;
__tgt_interop_use;
__tgt_interop_destroy;
ompt_libomptarget_connect;
__kmpc_host_rpc_get_arg;
__kmpc_host_rpc_set_ret_val;
local:
*;
};

View File

@@ -16,6 +16,8 @@
#include "PluginManager.h"
#include "private.h"
#include "HostRPC.h"
#include "Shared/EnvironmentVar.h"
#include "Shared/Profile.h"
@@ -504,3 +506,33 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
delete AsyncInfo;
*AsyncHandle = nullptr;
}
// Host RPC support functions.
EXTERN int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNum) {
auto *W = reinterpret_cast<hostrpc::DescriptorWrapper *>(Wrapper);
auto &SD = W->D;
assert(ArgNum < SD.NumArgs && "out-of-range argument");
int64_t ArgVal = SD.Args[ArgNum].Value;
void *ArgPtr = reinterpret_cast<void *>(ArgVal);
DP("[host-rpc] get argno=%d arg=%lx...\n", ArgNum, ArgVal);
if (W->StdIn && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
ArgPtr == W->StdIn)
return (int64_t)stdin;
if (W->StdOut && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
ArgPtr == W->StdOut)
return (int64_t)stdout;
if (W->StdErr && SD.Args[ArgNum].ArgType == hostrpc::ARG_POINTER &&
ArgPtr == W->StdErr)
return (int64_t)stderr;
return ArgVal;
}
EXTERN void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal) {
auto &SD = reinterpret_cast<hostrpc::DescriptorWrapper *>(Wrapper)->D;
SD.ReturnValue = RetVal;
}
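These two entry points are what compiler-generated host stubs use to unmarshal a device RPC: arguments are fetched by index, with the device-side stdio handles translated back to the host's stdin/stdout/stderr, and the result is posted through __kmpc_host_rpc_set_ret_val. A sketch of the stub the compiler might emit for a device puts call (the stub's name and shape are assumptions; only the two __kmpc_* functions are from this patch):

#include <cstdint>
#include <cstdio>

extern "C" int64_t __kmpc_host_rpc_get_arg(void *Wrapper, int32_t ArgNo);
extern "C" void __kmpc_host_rpc_set_ret_val(void *Wrapper, int64_t RetVal);

// Hypothetical compiler-emitted host stub for a device-side puts() call.
extern "C" void __host_rpc_stub_puts(void *Wrapper) {
  const char *Str =
      reinterpret_cast<const char *>(__kmpc_host_rpc_get_arg(Wrapper, 0));
  __kmpc_host_rpc_set_ret_val(Wrapper, std::puts(Str));
}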