Compare commits

...

28 Commits

Author SHA1 Message Date
Nicolas Marie
7a714f7795 Split runtime libs install into multiple directories 2024-08-09 10:12:57 -07:00
Nicolas Marie
6ee95b22e5 Fix segmentation fault from Attributor Attributes 2024-08-09 10:09:56 -07:00
Nicolas Marie
762b333c3a Fix issue where, when emitting an OMP dynamic allocation instead of pre-allocated memory, we tried to attach debug information to an invalid uninitialized address 2024-08-09 10:01:37 -07:00
Nicolas Marie
13beaff03f Add MPI header include directory to clang-mpi-gpu & add the ability to set up GPU MPI ranks & threads from the env variables MPI_RANKS & MPI_THREADS 2024-08-09 09:58:59 -07:00
Nicolas Marie
c7399b2039 Modify RTL to be able to use multi-grid syncs 2024-08-09 09:51:58 -07:00
Nicolas Marie
1ce8887a1a Temporarily remove the new operator that conflicts with the std C++ header when using host headers with host RPCs 2024-08-09 09:49:59 -07:00
Nicolas Marie
de0a400cdb Make host RPC debug information cleaner 2024-08-09 09:47:22 -07:00
Nicolas Marie
dc0bac14f1 Continue MPI implementation to run LULESH 2024-08-09 09:45:59 -07:00
Nicolas Marie
c8b441d7c4 Fix allocations and namespace 2024-07-03 15:25:03 -07:00
Nicolas Marie
6b947a0cb3 Remove leftover debug output 2024-07-03 08:58:32 -07:00
Nicolas Marie
bb9875d4b4 Continue implementation of MPI P2P 2024-07-02 17:29:14 -07:00
Nicolas Marie
0ae02a120c First try at MPI P2P communications 2024-07-02 09:40:45 -07:00
Nicolas Marie
ad1e11e0d9 Fix GPU-first RPCs to work when given a pointer
- Fix GPUFirst memory allocator to work with the new offload plugin.
- Fix TeamAllocator to not ignore the first allocation.
2024-06-18 16:42:25 -07:00
Nicolas Marie
3b1aae9380 Use GPUFirst with libc RPC
- Add missing headers in rpc.h.def
- Add an opcode in libc RPC to handle GPU-first host function calls
- Fix pointer casting
- Fix generated function to account for the AMD address space
- Remove duplicate libc FILE declarations
- Remove global variable to allow asynchronous RPC calls
2024-06-18 08:42:58 -07:00
Nicolas Marie
c59cbdebfd Fix wrapper to use -O1 (-O0 is not supported) 2024-06-17 17:46:29 -07:00
Nicolas Marie
87cc6ecc5f Fix libc RPC test 2024-06-17 17:40:54 -07:00
Nicolas Marie
8aa3a4431a Fix Attributor nullptr 2024-06-17 17:39:25 -07:00
Nicolas Marie
1582f964a4 Fix DeviceRTL atomics:
- Remove unsupported & unused operations
- Add scope awareness to NVIDIA atomicInc
2024-06-17 17:17:40 -07:00
Nicolas Marie
ef5d9941c3 Temporary fix for the main function canonicalization error. 2024-06-17 17:13:43 -07:00
Nicolas Marie
b4d5dec977 Fix: remove call to the removed LIBC_HAS_BUILTIN macro 2024-06-17 17:10:41 -07:00
Nicolas Marie
eae9159c1d Stop using outdated LTO pipeline for HostRPC 2024-06-17 17:09:15 -07:00
Nicolas Marie
b553ad7fd7 Replace getRawPointer with emitRawPointer 2024-06-17 17:03:07 -07:00
Nicolas Marie
3b6c5536c4 Remove unused CMakeLists.txt 2024-06-17 16:03:17 -07:00
Nicolas Marie
8980abe311 Remove outdated AutoRPC Folder 2024-06-17 14:57:29 -07:00
Joseph Huber
bf407f0829 [libc] Export the RPC interface from libc
Summary:
This patch adds new extensions that allow us to export the RPC interface
from `libc` to other programs. This should allow external users of the
GPU `libc` to interface with the RPC client (which more or less behaves
like syscalls in this context). This is done by wrapping the interface
into a C-style function call.

Obviously, this approach is far less safe than the carefully crafted C++
interface. For example, we now expose the internal packet buffer, and it
is theoretically possible to open a single port with conflicting opcodes
and break the whole interface. So, extra care will be needed when
interacting with this. However, the usage is very similar, as shown by
the new test.

This somewhat stretches the concept of `libc` just doing `libc` things,
but I think this is important enough to justify it. It is difficult to
split this out, as we do not want to have the situation where we have
multiple RPC clients running at one time, so for now it makes sense to
just leave it in `libc`.
2024-05-23 09:36:28 -07:00
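For context, a minimal sketch (not part of the commit message) of how device code might call the exported C-style interface, using the rpc_open_port / rpc_send_n / rpc_recv_n / rpc_close_port entrypoints that this changeset adds; it mirrors the rpc_external_interface_test.cpp integration test further down:

#include <gpu/rpc.h>
#include <stdint.h>

// Round-trip a counter through the host: send it, let the registered host
// handler increment it, then receive the updated value back.
// RPC_TEST_EXTERNAL comes from test_rpc_opcodes_t.h (assumed in scope here).
void increment_on_host(uint64_t *cnt) {
  size_t size = sizeof(uint64_t);
  rpc_port_t port = rpc_open_port(RPC_TEST_EXTERNAL); // opcode picks the handler
  rpc_send_n(&port, cnt, size);                       // ship the value out
  rpc_recv_n(&port, cnt, &size);                      // read the result back
  rpc_close_port(&port);                              // release the port
}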
Nicolas Marie
2a4a8ef91e Fix compilation issues after rebase of llvm-test-suite-gpu 2024-05-20 14:52:00 -07:00
Nicolas Marie
f1001ba68d Revert "[LTO] Remove Config.UseDefaultPipeline (#82587)"
This reverts commit ec24094b56.
We do need Config.UseDefaultPipeline.
2024-05-20 14:44:46 -07:00
Shilei Tian
1ef92ff738 [OpenMP] Add the initial support for direct gpu compilation
Rebase from llvm-test-suite-gpu, fix rebase conflicts in:

clang/include/clang/Driver/Options.td
clang/lib/CodeGen/CGDecl.cpp
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/lib/Driver/ToolChains/Clang.cpp
clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
llvm/lib/Passes/PassRegistry.def
llvm/lib/Transforms/IPO/AttributorAttributes.cpp
llvm/lib/Transforms/IPO/OpenMPOpt.cpp
openmp/libomptarget/DeviceRTL/src/Mapping.cpp
openmp/libomptarget/DeviceRTL/src/State.cpp
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
openmp/libomptarget/DeviceRTL/src/Utils.cpp
openmp/libomptarget/DeviceRTL/src/exports
openmp/libomptarget/include/omptarget.h
openmp/libomptarget/src/exports
openmp/libomptarget/src/interface.cpp
2024-05-20 14:44:29 -07:00
91 changed files with 6561 additions and 216 deletions


@@ -267,6 +267,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Globalize to global space for the globalized variables")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")


@@ -3561,6 +3561,10 @@ def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested
HelpText<"Assert no nested parallel regions in the GPU">,
MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
def fopenmp_globalize_to_global_space : Flag<["-"], "fopenmp-globalize-to-global-space">,
HelpText<"Globalize to global space for the globalized variables">,
MarshallingInfoFlag<LangOpts<"OpenMPGlobalizeToGlobalSpace">>;
} // let Group = f_Group
} // let Visibility = [ClangOption, CC1Option, FC1Option]
} // let Flags = [NoArgumentUnused, HelpHidden]


@@ -1475,6 +1475,7 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
bool NRVO = getLangOpts().ElideConstructors && D.isNRVOVariable();
bool VarAllocated = false; // omp dynamic allocation
if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
address = OpenMPLocalAddr;
AllocaAddr = OpenMPLocalAddr;
@@ -1608,7 +1609,6 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
// by the definition of the VLA. Since this is an escaped declaration, in
// OpenMP we have to use a call to __kmpc_alloc_shared(). The matching
// deallocation call to __kmpc_free_shared() is emitted later.
bool VarAllocated = false;
if (getLangOpts().OpenMPIsTargetDevice) {
auto &RT = CGM.getOpenMPRuntime();
if (RT.isDelayedVariableLengthDecl(*this, &D)) {
@@ -1668,7 +1668,7 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
emission.AllocaAddr = AllocaAddr;
// Emit debug info for local var declaration.
if (EmitDebugInfo && HaveInsertPoint()) {
if (EmitDebugInfo && HaveInsertPoint() && !VarAllocated) {
Address DebugAddr = address;
bool UsePointerValue = NRVO && ReturnValuePointer.isValid();
DI->setLocation(D.getLocation());
@@ -2577,48 +2577,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
(IPD->getParameterKind() == ImplicitParamKind::ThreadPrivateVar);
}
Address DeclPtr = Address::invalid();
RawAddress AllocaPtr = Address::invalid();
bool DoStore = false;
bool IsScalar = hasScalarEvaluationKind(Ty);
bool UseIndirectDebugAddress = false;
// If we already have a pointer to the argument, reuse the input pointer.
if (Arg.isIndirect()) {
DeclPtr = Arg.getIndirectAddress();
DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
// Indirect argument is in alloca address space, which may be different
// from the default address space.
auto AllocaAS = CGM.getASTAllocaAddressSpace();
auto *V = DeclPtr.emitRawPointer(*this);
AllocaPtr = RawAddress(V, DeclPtr.getElementType(), DeclPtr.getAlignment());
// For truly ABI indirect arguments -- those that are not `byval` -- store
// the address of the argument on the stack to preserve debug information.
ABIArgInfo ArgInfo = CurFnInfo->arguments()[ArgNo - 1].info;
if (ArgInfo.isIndirect())
UseIndirectDebugAddress = !ArgInfo.getIndirectByVal();
if (UseIndirectDebugAddress) {
auto PtrTy = getContext().getPointerType(Ty);
AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy),
D.getName() + ".indirect_addr");
EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy);
}
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
auto DestLangAS =
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
if (SrcLangAS != DestLangAS) {
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
CGM.getDataLayout().getAllocaAddrSpace());
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
DeclPtr =
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
*this, V, SrcLangAS, DestLangAS, T, true),
DeclPtr.isKnownNonNull());
}
auto PushCleanupIfNeeded = [this, Ty, &D](Address DeclPtr) {
// Push a destructor cleanup for this parameter if the ABI requires it.
// Don't push a cleanup in a thunk for a method that will also emit a
// cleanup.
@@ -2634,87 +2593,124 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
EHStack.stable_begin();
}
}
} else {
// Check if the parameter address is controlled by OpenMP runtime.
Address OpenMPLocalAddr =
getLangOpts().OpenMP
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
: Address::invalid();
if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
DeclPtr = OpenMPLocalAddr;
AllocaPtr = DeclPtr;
};
Address DeclPtr = Address::invalid();
RawAddress AllocaPtr = Address::invalid();
Address OpenMPLocalAddr =
getLangOpts().OpenMP
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
: Address::invalid();
bool DoStore = false;
bool IsScalar = hasScalarEvaluationKind(Ty);
bool UseIndirectDebugAddress = false;
if (OpenMPLocalAddr.isValid()) {
DeclPtr = OpenMPLocalAddr;
AllocaPtr = DeclPtr;
LValue Dst = MakeAddrLValue(DeclPtr, Ty);
if (Arg.isIndirect()) {
LValue Src = MakeAddrLValue(Arg.getIndirectAddress(), Ty);
callCStructCopyConstructor(Dst, Src);
PushCleanupIfNeeded(Arg.getIndirectAddress());
} else {
// Otherwise, create a temporary to hold the value.
EmitStoreOfScalar(Arg.getDirectValue(), Dst, /* isInitialization */ true);
}
} else {
// If we already have a pointer to the argument, reuse the input pointer.
if (Arg.isIndirect()) {
DeclPtr = Arg.getIndirectAddress();
DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty));
// Indirect argument is in alloca address space, which may be different
// from the default address space.
auto AllocaAS = CGM.getASTAllocaAddressSpace();
auto *V = DeclPtr.emitRawPointer(*this);
AllocaPtr = RawAddress(V, DeclPtr.getElementType(), DeclPtr.getAlignment());
auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS;
auto DestLangAS =
getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default;
if (SrcLangAS != DestLangAS) {
assert(getContext().getTargetAddressSpace(SrcLangAS) ==
CGM.getDataLayout().getAllocaAddrSpace());
auto DestAS = getContext().getTargetAddressSpace(DestLangAS);
auto *T = llvm::PointerType::get(getLLVMContext(), DestAS);
DeclPtr =
DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast(
*this, V, SrcLangAS, DestLangAS, T, true),
DeclPtr.isKnownNonNull());
}
PushCleanupIfNeeded(DeclPtr);
} else {
// Create a temporary to hold the value.
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
D.getName() + ".addr", &AllocaPtr);
DoStore = true;
}
DoStore = true;
}
llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr);
LValue lv = MakeAddrLValue(DeclPtr, Ty);
if (IsScalar) {
Qualifiers qs = Ty.getQualifiers();
if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
// We honor __attribute__((ns_consumed)) for types with lifetime.
// For __strong, it's handled by just skipping the initial retain;
// otherwise we have to balance out the initial +1 with an extra
// cleanup to do the release at the end of the function.
bool isConsumed = D.hasAttr<NSConsumedAttr>();
LValue lv = MakeAddrLValue(DeclPtr, Ty);
if (IsScalar) {
Qualifiers qs = Ty.getQualifiers();
if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) {
// We honor __attribute__((ns_consumed)) for types with lifetime.
// For __strong, it's handled by just skipping the initial retain;
// otherwise we have to balance out the initial +1 with an extra
// cleanup to do the release at the end of the function.
bool isConsumed = D.hasAttr<NSConsumedAttr>();
// If a parameter is pseudo-strong then we can omit the implicit retain.
if (D.isARCPseudoStrong()) {
assert(lt == Qualifiers::OCL_Strong &&
"pseudo-strong variable isn't strong?");
assert(qs.hasConst() && "pseudo-strong variable should be const!");
lt = Qualifiers::OCL_ExplicitNone;
}
// If a parameter is pseudo-strong then we can omit the implicit retain.
if (D.isARCPseudoStrong()) {
assert(lt == Qualifiers::OCL_Strong &&
"pseudo-strong variable isn't strong?");
assert(qs.hasConst() && "pseudo-strong variable should be const!");
lt = Qualifiers::OCL_ExplicitNone;
}
// Load objects passed indirectly.
if (Arg.isIndirect() && !ArgVal)
ArgVal = Builder.CreateLoad(DeclPtr);
// Load objects passed indirectly.
if (Arg.isIndirect() && !ArgVal)
ArgVal = Builder.CreateLoad(DeclPtr);
if (lt == Qualifiers::OCL_Strong) {
if (!isConsumed) {
if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
// use objc_storeStrong(&dest, value) for retaining the
// object. But first, store a null into 'dest' because
// objc_storeStrong attempts to release its old value.
llvm::Value *Null = CGM.EmitNullConstant(D.getType());
EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
DoStore = false;
if (lt == Qualifiers::OCL_Strong) {
if (!isConsumed) {
if (CGM.getCodeGenOpts().OptimizationLevel == 0) {
// use objc_storeStrong(&dest, value) for retaining the
// object. But first, store a null into 'dest' because
// objc_storeStrong attempts to release its old value.
llvm::Value *Null = CGM.EmitNullConstant(D.getType());
EmitStoreOfScalar(Null, lv, /* isInitialization */ true);
EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true);
DoStore = false;
} else
// Don't use objc_retainBlock for block pointers, because we
// don't want to Block_copy something just because we got it
// as a parameter.
ArgVal = EmitARCRetainNonBlock(ArgVal);
}
} else {
// Push the cleanup for a consumed parameter.
if (isConsumed) {
ARCPreciseLifetime_t precise =
(D.hasAttr<ObjCPreciseLifetimeAttr>() ? ARCPreciseLifetime
: ARCImpreciseLifetime);
EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(),
ArgVal, precise);
}
if (lt == Qualifiers::OCL_Weak) {
EmitARCInitWeak(DeclPtr, ArgVal);
DoStore = false; // The weak init is a store, no need to do two.
}
else
// Don't use objc_retainBlock for block pointers, because we
// don't want to Block_copy something just because we got it
// as a parameter.
ArgVal = EmitARCRetainNonBlock(ArgVal);
}
} else {
// Push the cleanup for a consumed parameter.
if (isConsumed) {
ARCPreciseLifetime_t precise = (D.hasAttr<ObjCPreciseLifetimeAttr>()
? ARCPreciseLifetime : ARCImpreciseLifetime);
EHStack.pushCleanup<ConsumeARCParameter>(getARCCleanupKind(), ArgVal,
precise);
}
if (lt == Qualifiers::OCL_Weak) {
EmitARCInitWeak(DeclPtr, ArgVal);
DoStore = false; // The weak init is a store, no need to do two.
}
// Enter the cleanup scope.
EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
}
// Enter the cleanup scope.
EmitAutoVarWithLifetime(*this, D, DeclPtr, lt);
}
}
// Store the initial value into the alloca.
if (DoStore)
EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
// Store the initial value into the alloca.
if (DoStore)
EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true);
}
setAddrOfLocalVar(&D, DeclPtr);


@@ -1088,10 +1088,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for the variable to be globalized
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
? OMPRTL_malloc
: OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
// FIXME: We should use the variables actual alignment as an argument.
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment,
@@ -1122,12 +1124,14 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
CGM.getContext().getDeclAlign(VD),
AlignmentSource::Decl);
I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));
}
I->getSecond().MappedParams->apply(CGF);
}
bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
const VarDecl *VD) const {
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
@@ -1155,10 +1159,12 @@ CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
// Allocate space for this VLA object to be globalized.
llvm::Value *AllocArgs[] = {Size};
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
? OMPRTL_malloc
: OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
@@ -1184,20 +1190,29 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
// globalized in the prolog (i.e. emitGenericVarsProlog).
for (const auto &AddrSizePair :
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{AddrSizePair.first});
else
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
}
// Deallocate the memory for each globalized value
for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
const auto *VD = cast<VarDecl>(Rec.first);
I->getSecond().MappedParams->restore(CGF);
llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
CGF.getTypeSize(VD->getType())};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
FreeArgs);
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{Rec.second.GlobalizedVal});
else
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
OMPRTL___kmpc_free_shared),
{Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())});
}
}
}
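To make the new -fopenmp-globalize-to-global-space behavior concrete, a hedged sketch (illustrative only, not from the patch) of the kind of code it affects: an escaped local in a target region is normally globalized through __kmpc_alloc_shared / __kmpc_free_shared, and with the flag those calls are replaced by plain malloc / free as shown in the hunks above.

void escape(int *p); // opaque use that forces globalization

void kernel() {
#pragma omp target teams
  {
    int x = 0;   // escapes into the parallel region, so it is globalized
#pragma omp parallel
    escape(&x);  // with the flag, storage comes from malloc and is released
                 // with free instead of __kmpc_alloc_shared/__kmpc_free_shared
  }
}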


@@ -6641,6 +6641,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fopenmp-offload-mandatory");
if (Args.hasArg(options::OPT_fopenmp_force_usm))
CmdArgs.push_back("-fopenmp-force-usm");
if (Args.hasArg(options::OPT_fopenmp_globalize_to_global_space))
CmdArgs.push_back("-fopenmp-globalize-to-global-space");
break;
default:
// By default, if Clang doesn't know how to generate useful OpenMP code


@@ -83,6 +83,8 @@ inline void *operator new(__SIZE_TYPE__ size) {
inline void *operator new[](__SIZE_TYPE__ size) { return ::operator new(size); }
//inline void *operator new(__SIZE_TYPE__ size, void *ptr) { return ptr; }
inline void operator delete(void *ptr)OPENMP_NOEXCEPT { ::free(ptr); }
inline void operator delete[](void *ptr) OPENMP_NOEXCEPT {


@@ -0,0 +1,33 @@
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -no-opaque-pointers -verify -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
// expected-no-diagnostics
extern int printf(const char *, ...);
struct S {
int a;
float b;
};
// CHECK: define{{.*}}void @test(%struct.S* noundef byval(%struct.S) align {{[0-9]+}} [[arg:%[0-9a-zA-Z]+]])
// CHECK: [[g:%[0-9a-zA-Z]+]] = call align {{[0-9]+}} i8* @__kmpc_alloc_shared
// CHECK: bitcast i8* [[g]] to %struct.S*
// CHECK: bitcast %struct.S* [[arg]] to i8**
// CHECK: call void [[cc:@__copy_constructor[_0-9a-zA-Z]+]]
// CHECK: void [[cc]]
void test(struct S s) {
#pragma omp parallel for
for (int i = 0; i < s.a; ++i) {
printf("%i : %i : %f\n", i, s.a, s.b);
}
}
void foo() {
#pragma omp target teams num_teams(1)
{
struct S s;
s.a = 7;
s.b = 11;
test(s);
}
}


@@ -137,6 +137,13 @@ static constexpr OptTable::Info InfoTable[] = {
#undef OPTION
};
/// Host RPC module that will be shared to the corresponding pass.
Module *HostModule = nullptr;
/// We only need to generate the host RPC module once.
bool IsHostModuleGenerated = false;
/// Host RPC object file.
StringRef HostRPCObjFile;
class WrapperOptTable : public opt::GenericOptTable {
public:
WrapperOptTable() : opt::GenericOptTable(InfoTable) {}
@@ -615,10 +622,12 @@ std::vector<std::string> getTargetFeatures(ArrayRef<OffloadFile> InputFiles) {
return UnifiedFeatures;
}
template <typename ModuleHook = function_ref<bool(size_t, const Module &)>>
template <typename PreHookTy = function_ref<bool(size_t, const Module &)>,
typename PostHookTy = function_ref<bool(size_t, const Module &)>>
std::unique_ptr<lto::LTO> createLTO(
const ArgList &Args, const std::vector<std::string> &Features,
ModuleHook Hook = [](size_t, const Module &) { return true; }) {
PreHookTy PreHook = [](size_t, const Module &) { return true; },
PostHookTy PostHook = [](size_t, const Module &) { return true; }) {
const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
// We need to remove AMD's target-id from the processor if present.
StringRef Arch = Args.getLastArgValue(OPT_arch_EQ).split(":").first;
@@ -673,7 +682,9 @@ std::unique_ptr<lto::LTO> createLTO(
return true;
};
}
Conf.PostOptModuleHook = Hook;
Conf.PreOptModuleHook = PreHook;
Conf.PostOptModuleHook = PostHook;
Conf.CGFileType = (Triple.isNVPTX() || SaveTemps)
? CodeGenFileType::AssemblyFile
: CodeGenFileType::ObjectFile;
@@ -692,6 +703,57 @@ bool isValidCIdentifier(StringRef S) {
[](char C) { return C == '_' || isAlnum(C); });
}
bool writeHostModule(std::string &FileName) {
if (!HostModule)
return false;
if (HostModule->getFunctionList().empty())
return false;
auto HostTriple = HostModule->getTargetTriple();
FileName =
sys::path::filename(ExecutableName).str() + "-host-rpc-" + HostTriple;
auto TempFileOrErr = createOutputFile(FileName, "bc");
if (!TempFileOrErr)
reportError(TempFileOrErr.takeError());
int FD = -1;
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
reportError(errorCodeToError(EC));
auto Out = std::make_unique<llvm::raw_fd_ostream>(FD, true);
WriteBitcodeToFile(*HostModule, *Out);
return true;
}
std::unique_ptr<lto::LTO> createHostRPCLTO(StringRef HostTriple) {
const llvm::Triple Triple(HostTriple);
lto::Config Conf;
lto::ThinBackend Backend;
Backend =
lto::createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
// TODO: host arch?
// Conf.CPU = Arch.str();
Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple);
// TODO: host features?
// Conf.MAttrs = Features;
Conf.CGOptLevel = *CodeGenOpt::getLevel(3);
Conf.OptLevel = 3;
Conf.DefaultTriple = Triple.getTriple();
LTOError = false;
Conf.DiagHandler = diagnosticHandler;
Conf.PTO.LoopVectorization = Conf.OptLevel > 1;
Conf.PTO.SLPVectorization = Conf.OptLevel > 1;
Conf.CGFileType = CodeGenFileType::ObjectFile;
Conf.HasWholeProgramVisibility = false;
return std::make_unique<lto::LTO>(std::move(Conf), Backend);
}
Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
SmallVectorImpl<StringRef> &OutputFiles,
const ArgList &Args) {
@@ -777,14 +839,52 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
BitcodeOutput.push_back(*TempFileOrErr);
return false;
};
auto AddHostModuleAddr = [&](size_t, const Module &M) {
if (!HostModule)
return true;
Module &CM = const_cast<Module &>(M);
auto *MD = CM.getOrInsertNamedMetadata("llvm.hostrpc.hostmodule");
MD->clearOperands();
MD->addOperand(MDTuple::get(
CM.getContext(), {ConstantAsMetadata::get(ConstantInt::get(
Type::getInt64Ty(CM.getContext()),
reinterpret_cast<uintptr_t>(HostModule)))}));
return true;
};
// We assume visibility of the whole program if every input file was bitcode.
auto Features = getTargetFeatures(BitcodeInputFiles);
auto LTOBackend = Args.hasArg(OPT_embed_bitcode) ||
Args.hasArg(OPT_builtin_bitcode_EQ) ||
Args.hasArg(OPT_clang_backend)
? createLTO(Args, Features, OutputBitcode)
: createLTO(Args, Features);
? createLTO(Args, Features, AddHostModuleAddr, OutputBitcode)
: createLTO(Args, Features, AddHostModuleAddr);
LLVMContext &Ctx = LTOBackend->getContext();
StringRef HostTriple =
Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple());
std::unique_ptr<Module> HostModulePtr;
if (!IsHostModuleGenerated) {
HostModulePtr = std::make_unique<Module>(
sys::path::filename(ExecutableName).str() + "-host-rpc.bc", Ctx);
HostModule = HostModulePtr.get();
HostModulePtr->setTargetTriple(HostTriple);
std::string Msg;
const Target *T =
TargetRegistry::lookupTarget(HostModule->getTargetTriple(), Msg);
if (!T)
return createStringError(inconvertibleErrorCode(), Msg);
auto Options =
codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple(HostTriple));
StringRef CPU = "";
StringRef Features = "";
std::unique_ptr<TargetMachine> TM(
T->createTargetMachine(HostTriple, CPU, Features, Options, Reloc::PIC_,
HostModule->getCodeModel()));
HostModule->setDataLayout(TM->createDataLayout());
}
// We need to resolve the symbols so the LTO backend knows which symbols need
// to be kept or can be internalized. This is a simplified symbol resolution
@@ -808,6 +908,7 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
size_t Idx = 0;
for (auto &Sym : Symbols) {
lto::SymbolResolution &Res = Resolutions[Idx++];
// We will use this as the prevailing symbol definition in LTO unless
@@ -878,6 +979,57 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
if (Error Err = LTOBackend->run(AddStream))
return Err;
std::string HostModuleTempFile;
bool ValidHostModule = writeHostModule(HostModuleTempFile);
// Reset the HostModule pointer.
HostModulePtr.reset();
HostModule = nullptr;
// TODO: this is really redundant code.
if (ValidHostModule) {
auto HostLTO = createHostRPCLTO(HostTriple);
std::string HostBitCodeFile = HostModuleTempFile + ".bc";
auto BufferOrError = MemoryBuffer::getFile(HostBitCodeFile);
if (!BufferOrError)
reportError(createFileError(HostBitCodeFile, BufferOrError.getError()));
Expected<std::unique_ptr<lto::InputFile>> BitcodeFileOrErr =
llvm::lto::InputFile::create(*BufferOrError.get());
if (!BitcodeFileOrErr)
return BitcodeFileOrErr.takeError();
const auto Symbols = (*BitcodeFileOrErr)->symbols();
SmallVector<lto::SymbolResolution, 16> Resolutions(Symbols.size());
size_t Idx = 0;
for (auto &Sym : Symbols) {
(void)Sym;
lto::SymbolResolution &Res = Resolutions[Idx++];
Res.ExportDynamic = true;
Res.VisibleToRegularObj = true;
Res.LinkerRedefined = false;
Res.Prevailing = true;
}
if (Error Err = HostLTO->add(std::move(*BitcodeFileOrErr), Resolutions))
return Err;
auto RPCAddStream =
[&](size_t Task,
const Twine &ModuleName) -> std::unique_ptr<CachedFileStream> {
int FD = -1;
auto TempFileOrErr = createOutputFile(
sys::path::filename(ExecutableName) + "-host-rpc-" + HostTriple, "o");
if (!TempFileOrErr)
reportError(TempFileOrErr.takeError());
HostRPCObjFile = *TempFileOrErr;
if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD))
reportError(errorCodeToError(EC));
return std::make_unique<CachedFileStream>(
std::make_unique<llvm::raw_fd_ostream>(FD, true));
};
if (Error Err = HostLTO->run(RPCAddStream))
return Err;
}
if (LTOError)
return createStringError(inconvertibleErrorCode(),
"Errors encountered inside the LTO pipeline.");
@@ -1285,6 +1437,9 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
WrappedOutput.push_back(*OutputOrErr);
}
if (!HostRPCObjFile.empty())
WrappedOutput.push_back(HostRPCObjFile);
return WrappedOutput;
}

View File

@@ -248,12 +248,12 @@ else()
endif()
if(LIBC_TARGET_TRIPLE)
set(LIBC_INSTALL_LIBRARY_DIR lib${LLVM_LIBDIR_SUFFIX}/${LIBC_TARGET_TRIPLE})
set(LIBC_INSTALL_LIBRARY_DIR "lib${LLVM_LIBDIR_SUFFIX}/${LIBC_TARGET_TRIPLE}/libc")
elseif(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR)
set(LIBC_INSTALL_LIBRARY_DIR
lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE})
"lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE}/libc")
else()
set(LIBC_INSTALL_LIBRARY_DIR lib${LLVM_LIBDIR_SUFFIX})
set(LIBC_INSTALL_LIBRARY_DIR "lib${LLVM_LIBDIR_SUFFIX}")
endif()
if(LIBC_TARGET_OS_IS_GPU)


@@ -212,6 +212,10 @@ set(TARGET_LIBC_ENTRYPOINTS
# gpu/rpc.h entrypoints
libc.src.gpu.rpc_host_call
libc.src.gpu.rpc_fprintf
libc.src.gpu.rpc_open_port
libc.src.gpu.rpc_send_n
libc.src.gpu.rpc_recv_n
libc.src.gpu.rpc_close_port
)
set(TARGET_LIBM_ENTRYPOINTS


@@ -636,6 +636,7 @@ if(LIBC_TARGET_OS_IS_GPU)
DEPENDS
.llvm_libc_common_h
.llvm-libc-types.rpc_opcodes_t
.llvm-libc-types.rpc_port_t
)
endif()


@@ -11,7 +11,13 @@
#include "__llvm-libc-common.h"
#include "llvm-libc-types/rpc_opcodes_t.h"
#include <llvm-libc-types/size_t.h>
#include <llvm-libc-types/rpc_opcodes_t.h>
#include <llvm-libc-types/rpc_port_t.h>
#include <stddef.h>
#include <stdio.h>
%%public_api()


@@ -114,6 +114,7 @@ add_header(socklen_t HDR socklen_t.h)
add_header(struct_sockaddr_un HDR struct_sockaddr_un.h)
add_header(struct_sockaddr HDR struct_sockaddr.h)
add_header(rpc_opcodes_t HDR rpc_opcodes_t.h)
add_header(rpc_port_t HDR rpc_port_t.h)
add_header(ACTION HDR ACTION.h)
add_header(ENTRY HDR ENTRY.h)
add_header(struct_hsearch_data HDR struct_hsearch_data.h)


@@ -34,6 +34,7 @@ typedef enum {
RPC_PRINTF_TO_STDOUT,
RPC_PRINTF_TO_STDERR,
RPC_PRINTF_TO_STREAM,
RPC_GPUFIRST,
RPC_LAST = 0xFFFF,
} rpc_opcode_t;


@@ -0,0 +1,16 @@
//===-- Definition of type rpc_port_t -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef __LLVM_LIBC_TYPES_RPC_PORT_T_H__
#define __LLVM_LIBC_TYPES_RPC_PORT_T_H__
typedef struct {
__UINT8_TYPE__ reserved[32];
} rpc_port_t;
#endif // __LLVM_LIBC_TYPES_RPC_PORT_T_H__
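The 32 reserved bytes make rpc_port_t an opaque, fixed-size stand-in for the C++ rpc::Client::Port: the implementation files later in this changeset round-trip the handle through cpp::bit_cast behind a static_assert that the sizes match. A minimal sketch of that pattern (hypothetical helper, same shape as rpc_send_n.cpp below):

// Operate on the C++ port hiding behind the opaque C handle.
static void with_port(rpc_port_t *handle, const void *src, size_t size) {
  static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port),
                "ABI mismatch");
  rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
  port.send_n(src, size);                     // use the real port type
  *handle = cpp::bit_cast<rpc_port_t>(port);  // persist the updated state
}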


@@ -15,6 +15,7 @@ typedef enum : unsigned short {
RPC_TEST_NOOP = 1 << 15,
RPC_TEST_INCREMENT,
RPC_TEST_INTERFACE,
RPC_TEST_EXTERNAL,
RPC_TEST_STREAM,
} rpc_test_opcode_t;


@@ -79,7 +79,7 @@ install(
if(LIBC_TARGET_OS_IS_GPU)
set(gpu_install_dir lib${LLVM_LIBDIR_SUFFIX})
if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR)
set(gpu_install_dir lib${LLVM_LIBDIR_SUFFIX}/${LLVM_HOST_TRIPLE})
set(gpu_install_dir "lib${LLVM_LIBDIR_SUFFIX}/${LLVM_HOST_TRIPLE}/libcgpu")
endif()
install(
TARGETS ${added_gpu_archive_targets}


@@ -1,8 +1,11 @@
def RPCPortT : NamedType<"rpc_port_t">;
def RPCPortPtrT : PtrType<RPCPortT>;
def GPUExtensions : StandardSpec<"GPUExtensions"> {
HeaderSpec RPC = HeaderSpec<
"gpu/rpc.h",
[], // Macros
[], // Types
[RPCPortT, SizeTType], // Types
[], // Enumerations
[
FunctionSpec<
@@ -18,6 +21,26 @@ def GPUExtensions : StandardSpec<"GPUExtensions"> {
ArgSpec<VoidPtr>,
ArgSpec<SizeTType>]
>,
FunctionSpec<
"rpc_open_port",
RetValSpec<RPCPortT>,
[ArgSpec<UnsignedIntType>]
>,
FunctionSpec<
"rpc_send_n",
RetValSpec<VoidType>,
[ArgSpec<RPCPortPtrT>, ArgSpec<VoidPtr>, ArgSpec<SizeTType>]
>,
FunctionSpec<
"rpc_recv_n",
RetValSpec<VoidType>,
[ArgSpec<RPCPortPtrT>, ArgSpec<VoidPtr>, ArgSpec<SizeTPtr>]
>,
FunctionSpec<
"rpc_close_port",
RetValSpec<VoidType>,
[ArgSpec<RPCPortPtrT>]
>,
]
>;
let Headers = [


@@ -24,12 +24,14 @@ namespace LIBC_NAMESPACE::cpp {
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
#endif
// This implementation of bit_cast requires trivially-constructible To, to avoid
// UB in the implementation.
template <typename To, typename From>
LIBC_INLINE constexpr cpp::enable_if_t<
(sizeof(To) == sizeof(From)) &&
// An implementation of bit_cast that cannot use the compiler builtin requires
// a trivially-constructible To, to avoid UB in the implementation.
#if !__has_builtin(__builtin_bit_cast)
cpp::is_trivially_constructible<To>::value &&
#endif
cpp::is_trivially_copyable<To>::value &&
cpp::is_trivially_copyable<From>::value,
To>


@@ -320,6 +320,7 @@ public:
LIBC_INLINE void send_n(const void *src, uint64_t size);
template <typename A>
LIBC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
LIBC_INLINE void recv_n(void *dst, uint64_t *size);
LIBC_INLINE uint16_t get_opcode() const {
return process.header[index].opcode;
@@ -359,7 +360,9 @@ struct Client {
: process(port_count, buffer) {}
using Port = rpc::Port<false>;
template <uint16_t opcode> LIBC_INLINE Port open();
template <uint16_t opcode> LIBC_INLINE Port open() { return open(opcode); }
LIBC_INLINE Port open(uint16_t opcode);
private:
Process<false> process;
@@ -484,6 +487,14 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
}
}
/// Helper routine to simplify the interface when receiving from the GPU using
/// thread private pointers to the underlying value, where the destination
/// pointer already provides enough storage to receive the values.
template <bool T> LIBC_INLINE void Port<T>::recv_n(void *dst, uint64_t *size) {
void **dst_ptr = &dst;
recv_n(dst_ptr, size, [=](uint64_t) { return dst; });
}
/// Receives an arbitrarily sized data buffer across the shared channel in
/// multiples of the packet length. The \p alloc function is called with the
/// size of the data so that we can initialize the size of the \p dst buffer.
@@ -522,9 +533,9 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
/// is, there are send operations pending that haven't been serviced on this
/// port. Each port instance uses an associated \p opcode to tell the server
/// what to do. The Client interface provides the appropriate lane size to the
/// port using the platform's returned value.
template <uint16_t opcode>
[[clang::convergent]] LIBC_INLINE Client::Port Client::open() {
/// port using the platform's returned value. It is required that \p opcode is
/// uniform between all the lanes for this to work.
[[clang::convergent]] LIBC_INLINE Client::Port Client::open(uint16_t opcode) {
// Repeatedly perform a naive linear scan for a port that can be opened to
// send data.
for (uint32_t index = gpu::get_cluster_id();; ++index) {


@@ -1073,24 +1073,13 @@ using make_integral_or_big_int_signed_t =
namespace cpp {
// Specialization of cpp::bit_cast ('bit.h') from T to BigInt.
template <typename To, typename From>
LIBC_INLINE constexpr cpp::enable_if_t<
(sizeof(To) == sizeof(From)) && cpp::is_trivially_copyable<To>::value &&
cpp::is_trivially_copyable<From>::value && is_big_int<To>::value,
To>
bit_cast(const From &from) {
To out;
using Storage = decltype(out.val);
out.val = cpp::bit_cast<Storage>(from);
return out;
}
// Specialization of cpp::bit_cast ('bit.h') from BigInt to T.
template <typename To, size_t Bits>
LIBC_INLINE constexpr cpp::enable_if_t<
sizeof(To) == sizeof(UInt<Bits>) &&
#if !__has_builtin(__builtin_bit_cast)
cpp::is_trivially_constructible<To>::value &&
#endif
cpp::is_trivially_copyable<To>::value &&
cpp::is_trivially_copyable<UInt<Bits>>::value,
To>


@@ -17,6 +17,59 @@ add_entrypoint_object(
rpc_fprintf.h
DEPENDS
libc.src.stdio.gpu.gpu_file
)
add_entrypoint_object(
rpc_open_port
SRCS
rpc_open_port.cpp
HDRS
rpc_open_port.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
add_entrypoint_object(
rpc_close_port
SRCS
rpc_close_port.cpp
HDRS
rpc_close_port.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
add_entrypoint_object(
rpc_get_buffer
SRCS
rpc_get_buffer.cpp
HDRS
rpc_get_buffer.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
add_entrypoint_object(
rpc_send_n
SRCS
rpc_send_n.cpp
HDRS
rpc_send_n.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
add_entrypoint_object(
rpc_recv_n
SRCS
rpc_recv_n.cpp
HDRS
rpc_recv_n.h
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)


@@ -0,0 +1,25 @@
//===---------- GPU implementation of the external RPC port interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/gpu/rpc_close_port.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
LLVM_LIBC_FUNCTION(void, rpc_close_port, (rpc_port_t * handle)) {
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
port.close();
*handle = cpp::bit_cast<rpc_port_t>(port);
}
} // namespace LIBC_NAMESPACE


@@ -0,0 +1,20 @@
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H
#define LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H
#include <gpu/rpc.h>
namespace LIBC_NAMESPACE {
void rpc_close_port(rpc_port_t *handle);
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_GPU_RPC_CLOSE_PORT_H


@@ -0,0 +1,26 @@
//===---------- GPU implementation of the external RPC port interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/gpu/rpc_open_port.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
LLVM_LIBC_FUNCTION(rpc_port_t, rpc_open_port, (unsigned opcode)) {
uint32_t uniform = gpu::broadcast_value(gpu::get_lane_mask(), opcode);
rpc::Client::Port port = rpc::client.open(static_cast<uint16_t>(uniform));
return cpp::bit_cast<rpc_port_t>(port);
}
} // namespace LIBC_NAMESPACE


@@ -0,0 +1,20 @@
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H
#define LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H
#include <gpu/rpc.h>
namespace LIBC_NAMESPACE {
rpc_port_t rpc_open_port(unsigned opcode);
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_GPU_RPC_OPEN_PORT_H


@@ -0,0 +1,26 @@
//===---------- GPU implementation of the external RPC port interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/gpu/rpc_recv_n.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
LLVM_LIBC_FUNCTION(void, rpc_recv_n,
(rpc_port_t * handle, void *dst, size_t *size)) {
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
port.recv_n(dst, reinterpret_cast<uint64_t *>(size));
*handle = cpp::bit_cast<rpc_port_t>(port);
}
} // namespace LIBC_NAMESPACE

libc/src/gpu/rpc_recv_n.h

@@ -0,0 +1,20 @@
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_GPU_RPC_RECV_H
#define LLVM_LIBC_SRC_GPU_RPC_RECV_H
#include <gpu/rpc.h>
namespace LIBC_NAMESPACE {
void rpc_recv_n(rpc_port_t *handle, void *dst, size_t *size);
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_GPU_RPC_RECV_H


@@ -0,0 +1,26 @@
//===---------- GPU implementation of the external RPC port interface -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/gpu/rpc_send_n.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "src/__support/common.h"
namespace LIBC_NAMESPACE {
static_assert(sizeof(rpc_port_t) == sizeof(rpc::Client::Port), "ABI mismatch");
LLVM_LIBC_FUNCTION(void, rpc_send_n,
(rpc_port_t * handle, const void *src, size_t size)) {
rpc::Client::Port port = cpp::bit_cast<rpc::Client::Port>(*handle);
port.send_n(src, size);
*handle = cpp::bit_cast<rpc_port_t>(port);
}
} // namespace LIBC_NAMESPACE

libc/src/gpu/rpc_send_n.h

@@ -0,0 +1,20 @@
//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_GPU_RPC_SEND_H
#define LLVM_LIBC_SRC_GPU_RPC_SEND_H
#include <gpu/rpc.h>
namespace LIBC_NAMESPACE {
void rpc_send_n(rpc_port_t *handle, const void *src, size_t size);
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_GPU_RPC_SEND_H


@@ -44,6 +44,21 @@ add_integration_test(
rpc_interface_test.cpp
)
add_integration_test(
startup_rpc_external_interface_test
SUITE libc-startup-tests
SRCS
rpc_external_interface_test.cpp
DEPENDS
libc.src.gpu.rpc_open_port
libc.src.gpu.rpc_send_n
libc.src.gpu.rpc_recv_n
libc.src.gpu.rpc_close_port
LOADER_ARGS
--threads 32
--blocks 8
)
add_integration_test(
startup_rpc_stream_test
SUITE libc-startup-tests


@@ -0,0 +1,43 @@
//===-- Loader test to check the external RPC interface with the loader ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <gpu/rpc.h>
#include "src/gpu/rpc_close_port.h"
#include "src/gpu/rpc_open_port.h"
#include "src/gpu/rpc_recv_n.h"
#include "src/gpu/rpc_send_n.h"
#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "test/IntegrationTest/test.h"
using namespace LIBC_NAMESPACE;
static void test_interface() {
uint32_t num_additions =
10 + 10 * gpu::get_thread_id() + 10 * gpu::get_block_id();
uint64_t cnt = 0;
for (uint32_t i = 0; i < num_additions; ++i) {
size_t size = sizeof(uint64_t);
rpc_port_t port = LIBC_NAMESPACE::rpc_open_port(RPC_TEST_EXTERNAL);
LIBC_NAMESPACE::rpc_send_n(&port, &cnt, size);
LIBC_NAMESPACE::rpc_recv_n(&port, &cnt, &size);
LIBC_NAMESPACE::rpc_close_port(&port);
ASSERT_TRUE(size == sizeof(uint64_t));
}
ASSERT_TRUE(cnt == num_additions && "Invalid number of increments");
}
TEST_MAIN(int argc, char **argv, char **envp) {
test_interface();
return 0;
}


@@ -11,8 +11,8 @@
#include "utils/gpu/server/llvmlibc_rpc_server.h"
#include "llvm-libc-types/rpc_opcodes_t.h"
#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
#include "llvm-libc-types/rpc_opcodes_t.h"
#include <cstddef>
#include <cstdint>
@@ -222,6 +222,28 @@ inline void register_rpc_callbacks(rpc_device_t device) {
}
},
nullptr);
// Register the external interface test handler.
rpc_register_callback(
device, static_cast<rpc_opcode_t>(RPC_TEST_EXTERNAL),
[](rpc_port_t port, void *data) {
uint64_t sizes[lane_size] = {0};
void *dst[lane_size] = {nullptr};
rpc_recv_n(
port, dst, sizes,
[](uint64_t size, void *) -> void * { return new char[size]; },
nullptr);
for (uint64_t i = 0; i < lane_size; ++i) {
if (dst[i])
*reinterpret_cast<uint64_t *>(dst[i]) += 1;
}
rpc_send_n(port, dst, sizes);
for (uint64_t i = 0; i < lane_size; ++i) {
if (dst[i])
delete[] reinterpret_cast<uint8_t *>(dst[i]);
}
},
nullptr);
}
#endif


@@ -25,6 +25,19 @@ enum OMPTgtExecModeFlags : unsigned char {
OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD
};
enum OMPTgtHostRPCArgType {
// No need to copy.
OMP_HOST_RPC_ARG_SCALAR = 0,
OMP_HOST_RPC_ARG_PTR = 1,
// Copy to device.
OMP_HOST_RPC_ARG_COPY_TO = OMP_HOST_RPC_ARG_PTR | (1 << 1),
// Copy from device.
OMP_HOST_RPC_ARG_COPY_FROM = OMP_HOST_RPC_ARG_PTR | (1 << 2),
// Copy to and from device.
OMP_HOST_RPC_ARG_COPY_TOFROM =
OMP_HOST_RPC_ARG_COPY_TO | OMP_HOST_RPC_ARG_COPY_FROM,
};
} // end namespace omp
} // end namespace llvm
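A small illustration of how these flags compose (hypothetical helpers, assuming the enum above is in scope): every OMP_HOST_RPC_ARG_COPY_* value carries the OMP_HOST_RPC_ARG_PTR bit, and OMP_HOST_RPC_ARG_COPY_TOFROM satisfies both predicates.

static bool needsCopyTo(unsigned ArgType) {
  return (ArgType & llvm::omp::OMP_HOST_RPC_ARG_COPY_TO) ==
         llvm::omp::OMP_HOST_RPC_ARG_COPY_TO;
}
static bool needsCopyFrom(unsigned ArgType) {
  return (ArgType & llvm::omp::OMP_HOST_RPC_ARG_COPY_FROM) ==
         llvm::omp::OMP_HOST_RPC_ARG_COPY_FROM;
}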


@@ -227,7 +227,9 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
__OMP_RTL(omp_get_thread_num, false, Int32, )
__OMP_RTL(omp_get_bulk_thread_num, false, Int32, )
__OMP_RTL(omp_get_num_threads, false, Int32, )
__OMP_RTL(omp_get_bulk_num_threads, false, Int32, )
__OMP_RTL(omp_get_max_threads, false, Int32, )
__OMP_RTL(omp_in_parallel, false, Int32, )
__OMP_RTL(omp_get_dynamic, false, Int32, )
@@ -489,6 +491,8 @@ __OMP_RTL(__kmpc_reduction_get_fixed_buffer, false, VoidPtr, )
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
__OMP_RTL(malloc, false, VoidPtr, SizeTy)
__OMP_RTL(free, false, Void, VoidPtr)
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
@@ -502,6 +506,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
__OMP_RTL(__kmpc_launch_parallel_51_kernel, false, Void, Int8Ptr, Int32, Int32,
Int32, VoidPtrPtr, Int64)
__OMP_RTL(__last, false, Void, )
#undef __OMP_RTL
@@ -709,6 +716,8 @@ __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_bulk_thread_num, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_bulk_num_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs())
__OMP_RTL_ATTRS(omp_get_dynamic, GetterAttrs, SExt, ParamAttrs())


@@ -60,6 +60,9 @@ struct Config {
bool VerifyEach = false;
bool DisableVerify = false;
/// Use the standard optimization pipeline.
bool UseDefaultPipeline = false;
/// Flag to indicate that the optimizer should not assume builtins are present
/// on the target.
bool Freestanding = false;


@@ -303,6 +303,9 @@ public:
/// by LTO but might not be visible from bitcode symbol table.
static ArrayRef<const char*> getRuntimeLibcallSymbols();
/// Returns the context.
LLVMContext &getContext() { return RegularLTO.Ctx; }
private:
Config Conf;


@@ -0,0 +1,31 @@
//===- Transform/IPO/HostRPC.h - Code of automatic host rpc -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_IPO_HOSTRPC_H
#define LLVM_TRANSFORMS_IPO_HOSTRPC_H
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/IR/PassManager.h"
namespace llvm {
class HostRPCPass : public PassInfoMixin<HostRPCPass> {
public:
HostRPCPass() : LTOPhase(ThinOrFullLTOPhase::None) {}
HostRPCPass(ThinOrFullLTOPhase LTOPhase) : LTOPhase(LTOPhase) {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
private:
const ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None;
};
} // namespace llvm
#endif // LLVM_TRANSFORMS_IPO_HOSTRPC_H


@@ -0,0 +1,30 @@
//===--------------- CanonicalizeMainFunction.h -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility function to canonicalize main function.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
#define LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
#include "llvm/IR/PassManager.h"
namespace llvm {
/// A pass that canonicalizes main function in a module.
class CanonicalizeMainFunctionPass
: public PassInfoMixin<CanonicalizeMainFunctionPass> {
public:
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H


@@ -330,6 +330,8 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
report_fatal_error(Twine("unable to parse pass pipeline description '") +
Conf.OptPipeline + "': " + toString(std::move(Err)));
}
} else if (Conf.UseDefaultPipeline) {
MPM.addPass(PB.buildPerModuleDefaultPipeline(OL));
} else if (IsThinLTO) {
MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary));
} else {


@@ -143,6 +143,7 @@
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include "llvm/Transforms/IPO/IROutliner.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
@@ -267,6 +268,7 @@
#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/DXILUpgrade.h"
#include "llvm/Transforms/Utils/Debugify.h"


@@ -53,6 +53,7 @@
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include "llvm/Transforms/IPO/IROutliner.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
@@ -127,6 +128,7 @@
#include "llvm/Transforms/Utils/AddDiscriminators.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
@@ -183,9 +185,9 @@ static cl::opt<bool> EnablePostPGOLoopRotation(
"enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden,
cl::desc("Run the loop rotation transformation after PGO instrumentation"));
static cl::opt<bool> EnableGlobalAnalyses(
"enable-global-analyses", cl::init(true), cl::Hidden,
cl::desc("Enable inter-procedural analyses"));
static cl::opt<bool>
EnableGlobalAnalyses("enable-global-analyses", cl::init(true), cl::Hidden,
cl::desc("Enable inter-procedural analyses"));
static cl::opt<bool>
RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
@@ -305,6 +307,13 @@ extern cl::opt<bool> EnableMemProfContextDisambiguation;
extern cl::opt<bool> EnableInferAlignmentPass;
} // namespace llvm
static cl::opt<bool> EnableHostRPC("enable-host-rpc", cl::init(false),
cl::Hidden, cl::desc("Enable HostRPC pass"));
static cl::opt<bool> EnableCanonicalizeMainFunction(
"enable-canonicalize-main-function", cl::init(false), cl::Hidden,
cl::desc("Enable CanonicalizeMainFunction pass"));
PipelineTuningOptions::PipelineTuningOptions() {
LoopInterleaving = true;
LoopVectorization = true;
@@ -1101,6 +1110,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
}
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
// Try to perform OpenMP specific optimizations on the module. This is a
// (quick!) no-op if there are no OpenMP runtime calls present in the module.
MPM.addPass(OpenMPOptPass());
@@ -1120,11 +1132,10 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
// and prior to optimizing globals.
// FIXME: This position in the pipeline hasn't been carefully considered in
// years, it should be re-analyzed.
MPM.addPass(IPSCCPPass(
IPSCCPOptions(/*AllowFuncSpec=*/
Level != OptimizationLevel::Os &&
Level != OptimizationLevel::Oz &&
!isLTOPreLink(Phase))));
MPM.addPass(IPSCCPPass(IPSCCPOptions(/*AllowFuncSpec=*/
Level != OptimizationLevel::Os &&
Level != OptimizationLevel::Oz &&
!isLTOPreLink(Phase))));
// Attach metadata to indirect call sites indicating the set of functions
// they may target at run-time. This should follow IPSCCP.
@@ -1616,7 +1627,7 @@ PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO,
ModulePassManager
PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
if (Level == OptimizationLevel::O0)
return buildO0DefaultPipeline(Level, /*LTOPreLink*/true);
return buildO0DefaultPipeline(Level, /*LTOPreLink*/ true);
ModulePassManager MPM;
@@ -1739,6 +1750,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// in the current module.
MPM.addPass(CrossDSOCFIPass());
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
if (Level == OptimizationLevel::O0) {
// The WPD and LowerTypeTest passes need to run at -O0 to lower type
// metadata and intrinsics.
@@ -1769,6 +1783,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));
if (EnableHostRPC)
MPM.addPass(HostRPCPass(ThinOrFullLTOPhase::FullLTOPostLink));
// Remove unused virtual tables to improve the quality of code generated by
// whole-program devirtualization and bitset lowering.
MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));
@@ -2143,6 +2160,9 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
return MPM;
}

View File

@@ -47,6 +47,7 @@ MODULE_PASS("attributor", AttributorPass())
MODULE_PASS("attributor-light", AttributorLightPass())
MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass())
MODULE_PASS("canonicalize-main-function", CanonicalizeMainFunctionPass())
MODULE_PASS("check-debugify", NewPMCheckDebugifyPass())
MODULE_PASS("constmerge", ConstantMergePass())
MODULE_PASS("coro-cleanup", CoroCleanupPass())
@@ -142,6 +143,7 @@ MODULE_PASS("tsan-module", ModuleThreadSanitizerPass())
MODULE_PASS("verify", VerifierPass())
MODULE_PASS("view-callgraph", CallGraphViewerPass())
MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
MODULE_PASS("host-rpc", HostRPCPass())
#undef MODULE_PASS
#ifndef MODULE_PASS_WITH_PARAMS

View File

@@ -397,12 +397,27 @@ static bool getPotentialCopiesOfMemoryValue(
dbgs() << "Underlying object is a valid nullptr, giving up.\n";);
return false;
}
if (!isa<AllocaInst>(&Obj) && !isa<GlobalVariable>(&Obj)) {
auto *CB = dyn_cast<CallBase>(&Obj);
bool R = false;
if (CB) {
const AANoAlias *AAN =
A.getOrCreateAAFor<AANoAlias>(IRPosition::callsite_returned(*CB),
&QueryingAA, DepClassTy::OPTIONAL);
if (!AAN)
return false;
if (AAN->isValidState())
R = AAN->isAssumedNoAlias();
else
R = isNoAliasCall(&Obj);
if (!R)
R = isAllocationFn(&Obj, TLI);
}
if (!R) {
LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << Obj
<< "\n";);
return false;
}
}
if (auto *GV = dyn_cast<GlobalVariable>(&Obj))
if (!GV->hasLocalLinkage() &&
@@ -2226,7 +2241,8 @@ void Attributor::runTillFixpoint() {
LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: "
<< IterationCounter << "/" << MaxIterations
<< " iterations\n");
<< " iterations. #Changed AAs: " << ChangedAAs.size()
<< "\n");
// Reset abstract arguments not settled in a sound fixpoint by now. This
// happens when we stopped the fixpoint iteration early. Note that only the
@@ -2241,13 +2257,21 @@ void Attributor::runTillFixpoint() {
AbstractState &State = ChangedAA->getState();
if (!State.isAtFixpoint()) {
LLVM_DEBUG(
dbgs() << "\n[Attributor] Invalidate AA " << *ChangedAA
<< " as it is not at fix point after max iterations.\n");
State.indicatePessimisticFixpoint();
NumAttributesTimedOut++;
}
for (auto &DepIt : ChangedAA->Deps) {
LLVM_DEBUG(dbgs() << "\n[Attributor] Invalidate dependent AA "
<< *(DepIt.getPointer()) << " of AA " << *ChangedAA
<< "\n");
ChangedAAs.push_back(cast<AbstractAttribute>(DepIt.getPointer()));
}
ChangedAA->Deps.clear();
}

View File

@@ -1669,6 +1669,8 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
// might change while we iterate through a loop. For now, we give up if
// the PHI is not invariant.
if (isa<PHINode>(Usr)) {
if (!Usr->getType()->isPointerTy())
return false;
// Note the order here, the Usr access might change the map, CurPtr is
// already in it though.
bool IsFirstPHIUser = !OffsetInfoMap.count(Usr);
@@ -1899,6 +1901,31 @@ ChangeStatus AAPointerInfoFloating::updateImpl(Attributor &A) {
return false;
}
if (auto *II = dyn_cast<ICmpInst>(Usr)) {
auto CheckIfUsedAsPred = [&](const Use &U, bool &Follow) {
const auto *UU = U.getUser();
if (isa<SelectInst>(UU))
return true;
if (isa<BranchInst>(UU))
return true;
LLVM_DEBUG(dbgs() << "[AAPointerInfo] ICmpInst user not handled " << *UU
<< "\n");
return false;
};
if (!A.checkForAllUses(CheckIfUsedAsPred, *this, *II,
/* CheckBBLivenessOnly */ true,
DepClassTy::OPTIONAL,
/* IgnoreDroppableUses */ true)) {
LLVM_DEBUG(
dbgs() << "[AAPointerInfo] Check for all uses failed for ICmpInst "
<< *II << "\n");
return false;
}
return true;
}
LLVM_DEBUG(dbgs() << "[AAPointerInfo] User not handled " << *Usr << "\n");
return false;
};
@@ -2040,6 +2067,7 @@ struct AAPointerInfoCallSiteReturned final : AAPointerInfoFloating {
};
} // namespace
/// -----------------------NoUnwind Function Attribute--------------------------
namespace {
@@ -11906,10 +11934,35 @@ struct AAUnderlyingObjectsImpl
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override {}
// TODO: This is a temporary solution for terminals.
bool checkIfTerminals(Attributor &A, Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
Function *Scope = CI->getFunction();
const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*Scope);
LibFunc TLIFn;
if (TLI && TLI->getLibFunc(*CI, TLIFn)) {
if (TLIFn == LibFunc::LibFunc_malloc ||
TLIFn == LibFunc::LibFunc___kmpc_alloc_shared)
return true;
}
return false;
}
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
auto &Ptr = getAssociatedValue();
if (checkIfTerminals(A, &Ptr)) {
bool Changed = false;
Changed |= InterAssumedUnderlyingObjects.insert(&Ptr);
Changed |= IntraAssumedUnderlyingObjects.insert(&Ptr);
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
}
auto DoUpdate = [&](SmallSetVector<Value *, 8> &UnderlyingObjects,
AA::ValueScope Scope) {
bool UsedAssumedInformation = false;
@@ -11925,6 +11978,10 @@ struct AAUnderlyingObjectsImpl
for (unsigned I = 0; I < Values.size(); ++I) {
auto &VAC = Values[I];
auto *Obj = VAC.getValue();
if (checkIfTerminals(A, Obj)) {
Changed |= UnderlyingObjects.insert(Obj);
continue;
}
Value *UO = getUnderlyingObject(Obj);
if (UO && UO != VAC.getValue() && SeenObjects.insert(UO).second) {
const auto *OtherAA = A.getAAFor<AAUnderlyingObjects>(
@@ -11955,6 +12012,16 @@ struct AAUnderlyingObjectsImpl
continue;
}
if (auto *LI = dyn_cast<LoadInst>(Obj)) {
LLVM_DEBUG({
dbgs() << "[AAUnderlyingObjects] for CtxI ";
getCtxI()->print(dbgs());
dbgs() << " at position " << getIRPosition() << " has LoadInst ";
LI->print(dbgs());
dbgs() << '\n';
});
}
Changed |= UnderlyingObjects.insert(Obj);
}
@@ -11965,6 +12032,8 @@ struct AAUnderlyingObjectsImpl
Changed |= DoUpdate(IntraAssumedUnderlyingObjects, AA::Intraprocedural);
Changed |= DoUpdate(InterAssumedUnderlyingObjects, AA::Interprocedural);
LLVM_DEBUG(dumpState(dbgs()));
return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
}
@@ -12003,6 +12072,19 @@ private:
return Changed;
}
void dumpState(raw_ostream &O) {
O << "Underlying objects:\nintra procedureal:\n";
for (auto *Obj : IntraAssumedUnderlyingObjects) {
Obj->print(O);
O << '\n';
}
O << "inter procedureal:\n";
for (auto *Obj : InterAssumedUnderlyingObjects) {
Obj->print(O);
O << '\n';
}
}
/// All the underlying objects collected so far via intra procedural scope.
SmallSetVector<Value *, 8> IntraAssumedUnderlyingObjects;
/// All the underlying objects collected so far via inter procedural scope.

View File

@@ -20,6 +20,7 @@ add_llvm_component_library(LLVMipo
GlobalDCE.cpp
GlobalOpt.cpp
GlobalSplit.cpp
HostRPC.cpp
HotColdSplitting.cpp
IPO.cpp
IROutliner.cpp

View File

@@ -0,0 +1,970 @@
//===- HostRPC.cpp - Automatic host RPC for device code ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass rewrites device-side calls to functions that are only available
// on the host into remote procedure calls serviced by the host runtime.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/HostRPC.h"
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include <cstdint>
#define DEBUG_TYPE "host-rpc"
using namespace llvm;
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
static cl::opt<bool>
UseDummyHostModule("host-rpc-use-dummy-host-module", cl::init(false),
cl::Hidden,
cl::desc("Use dummy host module if there no host module "
"attached to the device module"));
namespace {
enum class HostRPCRuntimeFunction {
#define __OMPRTL_HOST_RPC(_ENUM) OMPRTL_##_ENUM
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val),
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper),
__OMPRTL_HOST_RPC(__last),
#undef __OMPRTL_HOST_RPC
};
#define __OMPRTL_HOST_RPC(_ENUM) \
auto OMPRTL_##_ENUM = HostRPCRuntimeFunction::OMPRTL_##_ENUM;
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_desc)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_add_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_get_arg)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_send_and_wait)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_set_ret_val)
__OMPRTL_HOST_RPC(__kmpc_host_rpc_invoke_host_wrapper)
#undef __OMPRTL_HOST_RPC
// TODO: Remove the functions that are already implemented in the device runtime.
static constexpr const char *InternalPrefix[] = {
"__kmp", "llvm.", "nvm.",
"omp_", "vprintf", "malloc",
"free", "__keep_alive", "__llvm_omp_vprintf",
"rpc_", "MPI_", "fprintf", "sprintf"
};
bool isInternalFunction(Function &F) {
auto Name = F.getName();
for (auto *P : InternalPrefix)
if (Name.starts_with(P))
return true;
return false;
}
std::string typeToString(Type *T) {
if (T->is16bitFPTy())
return "f16";
if (T->isFloatTy())
return "f32";
if (T->isDoubleTy())
return "f64";
if (T->isPointerTy())
return "ptr";
if (T->isStructTy())
return std::string(T->getStructName());
if (T->isIntegerTy())
return "i" + std::to_string(T->getIntegerBitWidth());
LLVM_DEBUG(dbgs() << "[HostRPC] unknown type " << *T
<< " for typeToString.\n";);
llvm_unreachable("unknown type");
}
class HostRPC {
/// LLVM context instance
LLVMContext &Context;
/// Device module.
Module &M;
/// Host module
Module &HM;
/// Data layout of the device module.
DataLayout DL;
IRBuilder<> Builder;
/// External functions we are operating on.
SmallSetVector<Function *, 8> FunctionWorkList;
/// Attributor instance.
Attributor &A;
// Types
Type *Int8PtrTy;
Type *VoidTy;
Type *Int32Ty;
Type *Int64Ty;
StructType *ArgInfoTy;
// Values
Constant *NullPtr;
Constant *NullInt64;
struct CallSiteInfo {
CallInst *CI = nullptr;
SmallVector<Type *> Params;
};
struct HostRPCArgInfo {
Value *BasePtr = nullptr;
Constant *Type = nullptr;
Value *Size = nullptr;
};
/// Host wrapper functions, indexed by RPC call number.
SmallVector<Function *> HostEntryTable;
EnumeratedArray<Function *, HostRPCRuntimeFunction,
HostRPCRuntimeFunction::OMPRTL___last> RFIs;
SmallVector<std::pair<CallInst *, CallInst *>> CallInstMap;
Constant *getConstantInt64(uint64_t Val) {
return ConstantInt::get(Int64Ty, Val);
}
static std::string getWrapperFunctionName(Function *F, CallSiteInfo &CSI) {
std::string Name = "__kmpc_host_rpc_wrapper_" + std::string(F->getName());
if (!F->isVarArg())
return Name;
for (unsigned I = F->getFunctionType()->getNumParams();
I < CSI.Params.size(); ++I) {
Name.push_back('_');
Name.append(typeToString(CSI.Params[I]));
}
return Name;
}
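// Illustrative example (not from the source): for a variadic callee such as
//   int printf(const char *fmt, ...);
// called as printf("%d %f", 42, 3.14), only the arguments beyond the fixed
// parameters are mangled into the name, yielding
//   __kmpc_host_rpc_wrapper_printf_i32_f64
// (printf itself is filtered out by isInternalFunction above; it is shown
// here purely for illustration).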
void registerAAs();
Value *convertToInt64Ty(Value *V);
Value *convertFromInt64TyTo(Value *V, Type *TargetTy);
Constant *convertToInt64Ty(Constant *C);
Constant *convertFromInt64TyTo(Constant *C, Type *T);
// int device_wrapper(call_no, arg_info, ...) {
// void *desc = __kmpc_host_rpc_get_desc(call_no, num_args, arg_info);
// __kmpc_host_rpc_add_arg(desc, arg1, sizeof(arg1));
// __kmpc_host_rpc_add_arg(desc, arg2, sizeof(arg2));
// ...
// int r = (int)__kmpc_host_rpc_send_and_wait(desc);
// return r;
// }
Function *getDeviceWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
// void host_wrapper(desc) {
// int arg1 = (int)__kmpc_host_rpc_get_arg(desc, 0);
// float arg2 = (float)__kmpc_host_rpc_get_arg(desc, 1);
// char *arg3 = (char *)__kmpc_host_rpc_get_arg(desc, 2);
// ...
// int r = actual_call(arg1, arg2, arg3, ...);
// __kmpc_host_rpc_set_ret_val(desc, (int64_t)r);
// }
Function *getHostWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI);
bool rewriteWithHostRPC(Function *F);
void emitHostWrapperInvoker();
bool recollectInformation();
public:
HostRPC(Module &DeviceModule, Module &HostModule, Attributor &A)
: Context(DeviceModule.getContext()), M(DeviceModule), HM(HostModule),
DL(M.getDataLayout()), Builder(Context), A(A) {
assert(&M.getContext() == &HM.getContext() &&
"device and host modules have different context");
Int8PtrTy = PointerType::getUnqual(Context);
VoidTy = Type::getVoidTy(Context);
Int32Ty = Type::getInt32Ty(Context);
Int64Ty = Type::getInt64Ty(Context);
NullPtr = ConstantInt::getNullValue(Int8PtrTy);
NullInt64 = ConstantInt::getNullValue(Int64Ty);
#define __OMP_RTL(_ENUM, MOD, VARARG, RETTY, ...) \
{ \
SmallVector<Type *> Params{__VA_ARGS__}; \
Function *F = (MOD).getFunction(#_ENUM); \
if (!F) { \
FunctionType *FT = FunctionType::get(RETTY, Params, VARARG); \
F = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage, \
#_ENUM, (MOD)); \
} \
RFIs[OMPRTL_##_ENUM] = F; \
}
// Device functions:
// Get a descriptor for the host function we are about to call.
__OMP_RTL(__kmpc_host_rpc_get_desc, M, false, Int8PtrTy, Int32Ty, Int32Ty,
Int8PtrTy)
// Add one argument and its information to the descriptor.
__OMP_RTL(__kmpc_host_rpc_add_arg, M, false, VoidTy, Int8PtrTy, Int64Ty,
Int32Ty)
// Send the descriptor to the host and wait for completion.
__OMP_RTL(__kmpc_host_rpc_send_and_wait, M, false, Int64Ty, Int8PtrTy)
// host functions:
// get arguments (mirror of add arg)
__OMP_RTL(__kmpc_host_rpc_get_arg, HM, false, Int64Ty, Int8PtrTy, Int32Ty)
// Send the return value back to the device.
__OMP_RTL(__kmpc_host_rpc_set_ret_val, HM, false, VoidTy, Int8PtrTy,
Int64Ty)
// Invoke the function on the host
__OMP_RTL(__kmpc_host_rpc_invoke_host_wrapper, HM, false, VoidTy, Int32Ty,
Int8PtrTy)
#undef __OMP_RTL
ArgInfoTy = StructType::create({Int64Ty, Int64Ty, Int64Ty, Int8PtrTy},
"struct.arg_info_t");
}
bool run();
};
Value *HostRPC::convertToInt64Ty(Value *V) {
if (auto *C = dyn_cast<Constant>(V))
return convertToInt64Ty(C);
Type *T = V->getType();
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreatePtrToInt(V, Int64Ty);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
if (T->isFloatingPointTy()) {
V = Builder.CreateBitCast(
V, Type::getIntNTy(V->getContext(), T->getScalarSizeInBits()));
return Builder.CreateIntCast(V, Int64Ty, /* isSigned */ true);
}
llvm_unreachable("unknown cast to int64_t");
}
Value *HostRPC::convertFromInt64TyTo(Value *V, Type *T) {
if (auto *C = dyn_cast<Constant>(V))
return convertFromInt64TyTo(C, T);
if (T == Int64Ty)
return V;
if (T->isPointerTy())
return Builder.CreateIntToPtr(V, T);
if (T->isIntegerTy())
return Builder.CreateIntCast(V, T, /* isSigned */ true);
if (T->isFloatingPointTy()) {
V = Builder.CreateIntCast(
V, Type::getIntNTy(V->getContext(), T->getScalarSizeInBits()),
/* isSigned */ true);
return Builder.CreateBitCast(V, T);
}
LLVM_DEBUG(dbgs() << "[HostRPC] unknown type " << *T
<< " for typeFromint64_t.\n";);
llvm_unreachable("unknown cast from int64_t");
}
Constant *HostRPC::convertToInt64Ty(Constant *C) {
Type *T = C->getType();
if (T == Int64Ty)
return C;
if (T->isPointerTy())
return ConstantExpr::getPtrToInt(C, Int64Ty);
if (T->isIntegerTy()) {
return ConstantFoldIntegerCast(C, Int64Ty, true, DL);
}
if (T->isFloatingPointTy()) {
// cast to an int of the same size
C = ConstantExpr::getBitCast(C,
Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()));
// set the int of size 64
return ConstantFoldIntegerCast(C, Int64Ty, true, DL);
}
llvm_unreachable("unknown cast to int64_t");
}
Constant *HostRPC::convertFromInt64TyTo(Constant *C, Type *T) {
assert(C->getType() == Int64Ty);
if (T == Int64Ty)
return C;
if (T->isPointerTy())
return ConstantExpr::getIntToPtr(C, T);
if (T->isIntegerTy()) {
return ConstantFoldIntegerCast(C, T, true, DL);
}
if (T->isFloatingPointTy()) {
// change size to T size
C = ConstantFoldIntegerCast(C,
Type::getIntNTy(C->getContext(), T->getScalarSizeInBits()), true, DL);
// from int to float
return ConstantExpr::getBitCast(C, T);
}
llvm_unreachable("unknown cast from int64_t");
}
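// Example (illustrative): a constant double 1.0 is bitcast to the 64-bit
// integer 0x3FF0000000000000 on its way into the i64-based RPC protocol;
// convertFromInt64TyTo applies the inverse conversion on the other side.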
void HostRPC::registerAAs() {
for (auto *F : FunctionWorkList)
for (User *U : F->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
for (unsigned I = 0; I < CI->arg_size(); ++I) {
Value *Operand = CI->getArgOperand(I);
if (!Operand->getType()->isPointerTy())
continue;
A.getOrCreateAAFor<AAUnderlyingObjects>(
IRPosition::callsite_argument(*CI, I),
/* QueryingAA */ nullptr, DepClassTy::NONE);
}
}
}
bool HostRPC::recollectInformation() {
FunctionWorkList.clear();
for (Function &F : M) {
// If the function is already defined, it definitely does not require RPC.
if (!F.isDeclaration())
continue;
// If it is an internal function, skip it as well.
if (isInternalFunction(F))
continue;
// If there is no use of the function, skip it.
if (F.use_empty())
continue;
LLVM_DEBUG({
dbgs() << "[HostRPC] RPCing function: " << F.getName() << "\n"
<< F << "\n";
});
FunctionWorkList.insert(&F);
}
return !FunctionWorkList.empty();
}
bool HostRPC::run() {
bool Changed = false;
LLVM_DEBUG(dbgs() << "[HostRPC] Running Pass\n");
if (!recollectInformation())
return Changed;
Changed = true;
LLVM_DEBUG(dbgs() << "[HostRPC] Reading Function to relocate:\n");
// We add a couple of assumptions to those RPC functions such that AAs will
// not error out because of unknown implementation of those functions.
for (Function &F : M) {
if (!F.isDeclaration())
continue;
F.addFnAttr(Attribute::NoRecurse);
for (auto &Arg : F.args())
if (Arg.getType()->isPointerTy())
Arg.addAttr(Attribute::NoCapture);
if (!F.isVarArg())
continue;
for (User *U : F.users()) {
auto *CB = dyn_cast<CallBase>(U);
if (!CB)
continue;
for (unsigned I = F.getFunctionType()->getNumParams(); I < CB->arg_size();
++I) {
Value *Arg = CB->getArgOperand(I);
if (Arg->getType()->isPointerTy())
CB->addParamAttr(I, Attribute::NoCapture);
}
}
}
//LLVM_DEBUG(M.dump());
registerAAs();
ChangeStatus Status = A.run();
if (!recollectInformation())
return Status == ChangeStatus::CHANGED;
LLVM_DEBUG(dbgs() << "[HostRPC] Rewrite Functions:\n");
for (Function *F : FunctionWorkList)
Changed |= rewriteWithHostRPC(F);
if (!Changed)
return Changed;
LLVM_DEBUG(dbgs() << "[HostRPC] Replace Function Call:\n");
// Replace all calls to the original functions with calls to the RPC
// wrappers that replaced them.
for (auto Itr = CallInstMap.rbegin(); Itr != CallInstMap.rend(); ++Itr) {
auto *CI = Itr->first;
auto *NewCI = Itr->second;
CI->replaceAllUsesWith(NewCI);
CI->eraseFromParent();
}
LLVM_DEBUG(dbgs() << "[HostRPC] Erase Call to Non existing Functions:\n");
// erase all trace of the function in the Module
for (Function *F : FunctionWorkList)
if (F->user_empty())
F->eraseFromParent();
LLVM_DEBUG(dbgs() << "[HostRPC] Emit Host Wrapper:\n");
emitHostWrapperInvoker();
LLVM_DEBUG(dbgs() << "[HostRPC] Done\n");
return Changed;
}
bool HostRPC::rewriteWithHostRPC(Function *F) {
LLVM_DEBUG({
dbgs() << "[HostRPC] Rewriting Function: " << *F << "\n";
});
SmallVector<CallInst *> WorkList;
for (User *U : F->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
WorkList.push_back(CI);
}
if (WorkList.empty())
return false;
for (CallInst *CI : WorkList) {
CallSiteInfo CSI;
CSI.CI = CI;
unsigned NumArgs = CI->arg_size();
for (unsigned I = 0; I < NumArgs; ++I)
CSI.Params.push_back(CI->getArgOperand(I)->getType());
std::string WrapperName = getWrapperFunctionName(F, CSI);
Function *DeviceWrapperFn = getDeviceWrapperFunction(WrapperName, F, CSI);
Function *HostWrapperFn = getHostWrapperFunction(WrapperName, F, CSI);
int32_t WrapperNumber = -1;
for (unsigned I = 0; I < HostEntryTable.size(); ++I) {
if (HostEntryTable[I] == HostWrapperFn) {
WrapperNumber = I;
break;
}
}
if (WrapperNumber == -1) {
WrapperNumber = HostEntryTable.size();
HostEntryTable.push_back(HostWrapperFn);
}
auto CheckIfIdentifierPtr = [this](const Value *V) {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return false;
Function *Callee = CI->getCalledFunction();
// Guard against indirect calls, which have no statically known callee.
if (!Callee)
return false;
if (this->FunctionWorkList.count(Callee))
return true;
return Callee->getName().starts_with("__kmpc_host_rpc_wrapper_");
};
auto CheckIfDynAlloc = [](Value *V) -> CallInst * {
auto *CI = dyn_cast<CallInst>(V);
if (!CI)
return nullptr;
Function *Callee = CI->getCalledFunction();
// Indirect calls have no statically known callee.
if (!Callee)
return nullptr;
auto Name = Callee->getName();
if (Name == "malloc" || Name == "__kmpc_alloc_shared")
return CI;
return nullptr;
};
auto CheckIfStdIO = [](Value *V) -> GlobalVariable * {
auto *LI = dyn_cast<LoadInst>(V);
if (!LI)
return nullptr;
auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
if (!GV)
return nullptr;
auto Name = GV->getName();
if (Name == "stdout" || Name == "stderr" || Name == "stdin")
return GV;
return nullptr;
};
auto CheckIfGlobalVariable = [](Value *V) {
if (auto *GV = dyn_cast<GlobalVariable>(V))
return GV;
if (auto *LI = dyn_cast<LoadInst>(V))
if (auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()))
return GV;
return static_cast<GlobalVariable *>(nullptr);
};
auto CheckIfNullPtr = [](Value *V) {
if (!V->getType()->isPointerTy())
return false;
return V == ConstantInt::getNullValue(V->getType());
};
auto HandleDirectUse = [&](Value *Ptr, HostRPCArgInfo &AI,
bool IsPointer = false) {
AI.BasePtr = Ptr;
AI.Type = getConstantInt64(IsPointer ? ArgType::OMP_HOST_RPC_ARG_PTR
: ArgType::OMP_HOST_RPC_ARG_SCALAR);
AI.Size = NullInt64;
};
SmallVector<SmallVector<HostRPCArgInfo>> ArgInfo;
bool IsConstantArgInfo = true;
for (unsigned I = 0; I < CI->arg_size(); ++I) {
ArgInfo.emplace_back();
auto &AII = ArgInfo.back();
Value *Operand = CI->getArgOperand(I);
//LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Argument: " << I << ": " << *Operand << "\n"; });
// Check if scalar type.
if (!Operand->getType()->isPointerTy()) {
AII.emplace_back();
HandleDirectUse(Operand, AII.back());
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
//LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Constant: " << *Operand << "\n"; });
continue;
}
if (CheckIfNullPtr(Operand)){
//LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Null Ptr: " << *Operand << "\n"; });
continue;
}
auto Pred = [&](Value &Obj) {
if (CheckIfNullPtr(&Obj))
return true;
bool IsConstantArgument = false;
if (!F->isVarArg() &&
F->hasParamAttribute(I, Attribute::AttrKind::ReadOnly))
IsConstantArgument = true;
HostRPCArgInfo AI;
if (auto *IO = CheckIfStdIO(&Obj)) {
HandleDirectUse(IO, AI, /* IsPointer */ true);
} else if (CheckIfIdentifierPtr(&Obj)) {
IsConstantArgInfo = IsConstantArgInfo && isa<Constant>(Operand);
HandleDirectUse(Operand, AI, /* IsPointer */ true);
} else if (auto *GV = CheckIfGlobalVariable(&Obj)) {
AI.BasePtr = GV;
AI.Size = getConstantInt64(DL.getTypeStoreSize(GV->getValueType()));
AI.Type =
getConstantInt64(GV->isConstant() || IsConstantArgument
? ArgType::OMP_HOST_RPC_ARG_COPY_TO
: ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM);
} else if (CheckIfDynAlloc(&Obj)) {
// We will handle this case at runtime so here we don't do anything.
//LLVM_DEBUG({dbgs() << "[HostRPC] [argparse]: Dynamic Alloc: " << *Operand << "\n"; });
return true;
} else if (isa<AllocaInst>(&Obj)) {
llvm_unreachable("alloca instruction needs to be handled!");
} else {
LLVM_DEBUG({
dbgs() << "[HostRPC] warning: call site " << *CI << ", operand "
<< *Operand << ", underlying object " << Obj
<< " cannot be handled.\n";
});
return true;
}
AII.push_back(std::move(AI));
return true;
};
//LLVM_DEBUG({
// dbgs() << "[HostRPC] function rewrite:\n"
// << "Function: " << *F << "\n"
// << "Call site: " << *CI << "\n "
// << "Operand: " << *Operand << "\n";
//});
// TODO: Replace with plain LLVM utilities so the Attributor is not required here.
assert(!IRPosition::callsite_argument(*CI, I)
.getAnchorScope()->hasFnAttribute(Attribute::OptimizeNone)
&& "[HostRPC]: Optimize None is not supported");
const llvm::AAUnderlyingObjects* AAUO =
A.getOrCreateAAFor<AAUnderlyingObjects>(
IRPosition::callsite_argument(*CI, I));
//LLVM_DEBUG({dbgs() << "[HostRPC] AAUO:" << AAUO << "\n";});
if (!AAUO->forallUnderlyingObjects(Pred))
llvm_unreachable("internal error");
}
// Reset the insert point to the call site.
Builder.SetInsertPoint(CI);
Value *ArgInfoVal = nullptr;
if (!IsConstantArgInfo) {
ArgInfoVal = Builder.CreateAlloca(Int8PtrTy, getConstantInt64(NumArgs),
"arg_info");
for (unsigned I = 0; I < NumArgs; ++I) {
auto &AII = ArgInfo[I];
Value *Next = NullPtr;
for (auto &AI : AII) {
Value *AIV = Builder.CreateAlloca(ArgInfoTy);
Value *AIIArg =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(0)});
Builder.Insert(AIIArg);
Builder.CreateStore(convertToInt64Ty(AI.BasePtr), AIIArg);
Value *AIIType =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(1)});
Builder.Insert(AIIType);
Builder.CreateStore(AI.Type, AIIType);
Value *AIISize =
GetElementPtrInst::Create(Int64Ty, AIV, {getConstantInt64(2)});
Builder.Insert(AIISize);
Builder.CreateStore(AI.Size, AIISize);
Value *AIINext =
GetElementPtrInst::Create(Int8PtrTy, AIV, {getConstantInt64(3)});
Builder.Insert(AIINext);
Builder.CreateStore(Next, AIINext);
Next = AIV;
}
Value *AIIV = GetElementPtrInst::Create(Int8PtrTy, ArgInfoVal,
{getConstantInt64(I)});
Builder.Insert(AIIV);
Builder.CreateStore(Next, AIIV);
}
} else {
SmallVector<Constant *> ArgInfoInitVar;
for (auto &AII : ArgInfo) {
Constant *Last = NullPtr;
for (auto &AI : AII) {
auto *Arg = cast<Constant>(AI.BasePtr);
auto *CS =
ConstantStruct::get(ArgInfoTy, {convertToInt64Ty(Arg), AI.Type,
cast<Constant>(AI.Size), Last});
auto *GV = new GlobalVariable(
M, ArgInfoTy, /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, CS, "",
nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 0);
// Force address space 0 on AMD GPUs instead of
// address space 1, which is the default for globals.
Last = GV;
}
LLVM_DEBUG({
dbgs() << "[HostRPC] ArgInfoInitVar:" << *Last << "\n";
});
ArgInfoInitVar.push_back(Last);
}
Constant *ArgInfoInit = ConstantArray::get(
ArrayType::get(Int8PtrTy, NumArgs), ArgInfoInitVar);
ArgInfoVal = new GlobalVariable(
M, ArrayType::get(Int8PtrTy, NumArgs), /* isConstant */ true,
GlobalValue::LinkageTypes::InternalLinkage, ArgInfoInit, "arg_info",
nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 0);
}
SmallVector<Value *> Args{ConstantInt::get(Int32Ty, WrapperNumber),
ArgInfoVal};
for (Value *Operand : CI->args())
Args.push_back(Operand);
CallInst *NewCall = Builder.CreateCall(DeviceWrapperFn, Args);
CallInstMap.emplace_back(CI, NewCall);
}
return true;
}
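// Net effect at a call site (illustrative; `foo` is hypothetical): the device
// call
//   r = foo(x, buf);
// is rewritten into
//   r = __kmpc_host_rpc_wrapper_foo(/*call_no=*/N, arg_info, x, buf);
// where arg_info, built above either as a constant global or on the stack,
// tells the runtime how to mirror the memory behind buf to the host and back.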
Function *HostRPC::getDeviceWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI) {
Function *WrapperFn = M.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
// return_type device_wrapper(int32_t call_no, void *arg_info, ...)
SmallVector<Type *> Params{Int32Ty, Int8PtrTy};
Params.append(CSI.Params);
Type *RetTy = F->getReturnType();
FunctionType *FT = FunctionType::get(RetTy, Params, /*isVarArg*/ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::InternalLinkage,
WrapperName, M);
// Emit the body of the device wrapper
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
// skip call_no and arg_info.
constexpr const unsigned NumArgSkipped = 2;
Value *Desc = nullptr;
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_get_desc];
//LLVM_DEBUG({dbgs() << "[HostRPC] Building: rpc get desc: " << Fn->getName() << "\n"; });
Desc = Builder.CreateCall(
Fn,
{
WrapperFn->getArg(0),
ConstantInt::get(Int32Ty, WrapperFn->arg_size() - NumArgSkipped),
WrapperFn->getArg(1)
},
"desc"
);
}
{
Function *Fn = RFIs[OMPRTL___kmpc_host_rpc_add_arg];
//LLVM_DEBUG({dbgs() << "[HostRPC] Building: rpc add arg\n"; });
for (unsigned I = NumArgSkipped; I < WrapperFn->arg_size(); ++I) {
Value *V = convertToInt64Ty(WrapperFn->getArg(I));
Builder.CreateCall(
Fn, {Desc, V, ConstantInt::get(Int32Ty, I - NumArgSkipped)});
}
}
//LLVM_DEBUG({dbgs() << "[HostRPC] Building: rpc send and wait\n"; });
Value *RetVal =
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_send_and_wait], {Desc});
if (RetTy->isVoidTy()) {
Builder.CreateRetVoid();
return WrapperFn;
}
if (RetTy != RetVal->getType())
RetVal = convertFromInt64TyTo(RetVal, RetTy);
Builder.CreateRet(RetVal);
//LLVM_DEBUG({dbgs() << "[HostRPC] Device Wrapper Function:\n" << *WrapperFn; });
return WrapperFn;
}
Function *HostRPC::getHostWrapperFunction(StringRef WrapperName, Function *F,
CallSiteInfo &CSI) {
Function *WrapperFn = HM.getFunction(WrapperName);
if (WrapperFn)
return WrapperFn;
SmallVector<Type *> Params{Int8PtrTy};
FunctionType *FT = FunctionType::get(VoidTy, Params, /* isVarArg */ false);
WrapperFn = Function::Create(FT, GlobalValue::LinkageTypes::ExternalLinkage,
WrapperName, HM);
Value *Desc = WrapperFn->getArg(0);
// Emit the body of the host wrapper
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", WrapperFn);
Builder.SetInsertPoint(EntryBB);
SmallVector<Value *> Args;
for (unsigned I = 0; I < CSI.CI->arg_size(); ++I) {
Value *V = Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_get_arg],
{Desc, ConstantInt::get(Int32Ty, I)});
Args.push_back(convertFromInt64TyTo(V, CSI.Params[I]));
}
// The host callee that will be called eventually by the host wrapper.
Function *HostCallee = HM.getFunction(F->getName());
if (!HostCallee)
HostCallee = Function::Create(F->getFunctionType(), F->getLinkage(),
F->getName(), HM);
Value *RetVal = Builder.CreateCall(HostCallee, Args);
if (!RetVal->getType()->isVoidTy()) {
RetVal = convertToInt64Ty(RetVal);
Builder.CreateCall(RFIs[OMPRTL___kmpc_host_rpc_set_ret_val],
{Desc, RetVal});
}
Builder.CreateRetVoid();
return WrapperFn;
}
void HostRPC::emitHostWrapperInvoker() {
IRBuilder<> Builder(Context);
unsigned NumEntries = HostEntryTable.size();
Function *F = RFIs[OMPRTL___kmpc_host_rpc_invoke_host_wrapper];
F->setDLLStorageClass(
GlobalValue::DLLStorageClassTypes::DLLExportStorageClass);
Value *CallNo = F->getArg(0);
Value *Desc = F->getArg(1);
SmallVector<BasicBlock *> SwitchBBs;
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", F);
BasicBlock *ReturnBB = BasicBlock::Create(Context, "return", F);
// Emit code for the return bb.
Builder.SetInsertPoint(ReturnBB);
Builder.CreateRetVoid();
// Create BB for each host entry and emit function call.
for (unsigned I = 0; I < NumEntries; ++I) {
BasicBlock *BB = BasicBlock::Create(Context, "invoke.bb", F, ReturnBB);
SwitchBBs.push_back(BB);
Builder.SetInsertPoint(BB);
Builder.CreateCall(HostEntryTable[I], {Desc});
Builder.CreateBr(ReturnBB);
}
// Emit code for the entry BB.
Builder.SetInsertPoint(EntryBB);
SwitchInst *Switch = Builder.CreateSwitch(CallNo, ReturnBB, NumEntries);
for (unsigned I = 0; I < NumEntries; ++I)
Switch->addCase(ConstantInt::get(cast<IntegerType>(Int32Ty), I),
SwitchBBs[I]);
}
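// The emitted invoker is, in effect (illustrative C rendering):
//
//   void __kmpc_host_rpc_invoke_host_wrapper(int32_t call_no, void *desc) {
//     switch (call_no) {
//     case 0: __kmpc_host_rpc_wrapper_<entry0>(desc); break;
//     case 1: __kmpc_host_rpc_wrapper_<entry1>(desc); break;
//     ...
//     default: return;
//     }
//   }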
Module *getHostModule(Module &M) {
auto *MD = M.getNamedMetadata("llvm.hostrpc.hostmodule");
if (!MD || MD->getNumOperands() == 0)
return nullptr;
auto *Node = MD->getOperand(0);
assert(Node->getNumOperands() == 1 && "invliad named metadata");
auto *CAM = dyn_cast<ConstantAsMetadata>(Node->getOperand(0));
if (!CAM)
return nullptr;
auto *CI = cast<ConstantInt>(CAM->getValue());
Module *Mod = reinterpret_cast<Module *>(CI->getZExtValue());
M.eraseNamedMetadata(MD);
return Mod;
}
} // namespace
PreservedAnalyses HostRPCPass::run(Module &M, ModuleAnalysisManager &AM) {
std::unique_ptr<Module> DummyHostModule;
Module *HostModule = nullptr;
if (UseDummyHostModule) {
DummyHostModule =
std::make_unique<Module>("dummy-host-rpc.bc", M.getContext());
HostModule = DummyHostModule.get();
} else {
HostModule = getHostModule(M);
}
if (!HostModule)
return PreservedAnalyses::all();
bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
// The pass only runs when it is invoked directly (with the dummy host
// module) or at link time.
if (!UseDummyHostModule && !PostLink)
return PreservedAnalyses::all();
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
AnalysisGetter AG(FAM);
CallGraphUpdater CGUpdater;
BumpPtrAllocator Allocator;
AttributorConfig AC(CGUpdater);
AC.DefaultInitializeLiveInternals = false;
AC.RewriteSignatures = false;
AC.PassName = DEBUG_TYPE;
AC.MaxFixpointIterations = 1024;
InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
SetVector<Function *> Functions;
Attributor A(Functions, InfoCache, AC);
HostRPC RPC(M, *HostModule, A);
bool Changed = RPC.run();
LLVM_DEBUG({
if (Changed && UseDummyHostModule) {
M.dump();
HostModule->dump();
}
});
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
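A minimal way to exercise the pass in isolation (a sketch, assuming an opt
built from this branch) is the dummy-host-module mode declared above:

  opt -passes=host-rpc -host-rpc-use-dummy-host-module device.bc -o out.bc

"host-rpc" is the name registered in PassRegistry.def; the flag substitutes an
empty host module so the pass can run outside of the LTO link step.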

View File

@@ -56,6 +56,7 @@
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <optional>
@@ -138,6 +139,15 @@ static cl::opt<bool>
cl::desc("Enables more verbose remarks."), cl::Hidden,
cl::init(false));
static cl::opt<bool>
EnableParallel51Split("openmp-opt-enable-parallel-51-split",
cl::desc("Enable the kernel split at parallel_51."),
cl::Hidden, cl::init(false));
static cl::opt<bool> EnableParallel51SplitMultiTeams(
"openmp-opt-enable-parallel-51-split-multi-teams",
cl::desc("Enable multi-teams support for the parallel 51 kernel."),
cl::Hidden, cl::init(false));
static cl::opt<unsigned>
SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
cl::desc("Maximal number of attributor iterations."),
@@ -917,6 +927,28 @@ private:
}
};
bool canUseMultiTeam(Function *OutlinedFn, OMPInformationCache &OMPInfoCache) {
if (!EnableParallel51SplitMultiTeams)
return false;
return true;
}
void collectReachingKernels(Function *F,
SmallVector<Kernel> &ReachingKernels) {
if (omp::isOpenMPKernel(*F)) {
if (F->hasFnAttribute("omp_parallel_51_kernel"))
ReachingKernels.push_back(F);
return;
}
for (User *U : F->users()) {
auto *I = dyn_cast<Instruction>(U);
if (!I)
continue;
collectReachingKernels(I->getFunction(), ReachingKernels);
}
}
struct OpenMPOpt {
using OptimizationRemarkGetter =
@@ -945,6 +977,11 @@ struct OpenMPOpt {
<< " functions\n");
if (IsModulePass) {
if (EnableParallel51Split)
Changed |= splitKernels();
OMPInfoCache.recollectUses();
Changed |= runAttributor(IsModulePass);
// Recollect uses, in case Attributor deleted any.
@@ -1047,6 +1084,308 @@ struct OpenMPOpt {
}
private:
/// Create a new kernel for each function call to __kmpc_parallel_51.
bool splitKernels() {
bool Changed = false;
OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
if (!Parallel51RFI.Declaration)
return Changed;
SmallVector<CallInst *> WorkItems;
for (User *U : Parallel51RFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
Function *F = CI->getFunction();
// If the parent function is already from kernel split, skip it.
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
WorkItems.push_back(CI);
}
if (WorkItems.empty())
return Changed;
Changed = true;
auto CreateNewKernel = [](LLVMContext &Ctx, Module &M,
OpenMPIRBuilder &IRBuilder, Function *Parallel51,
const CallInst *CI) {
SmallVector<Type *, 4> ArgTypes;
// int32_t gtid
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// int32_t if_expr
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// int32_t num_threads
ArgTypes.push_back(Type::getInt32Ty(Ctx));
// void **args
ArgTypes.push_back(PointerType::getUnqual(Ctx));
// int64_t nargs
ArgTypes.push_back(Type::getInt64Ty(Ctx));
FunctionType *FT =
FunctionType::get(Type::getVoidTy(Ctx), ArgTypes, false);
std::string KernelName = "__omp_offloading_parallel_51_from_";
KernelName += CI->getFunction()->getName();
// Sanitize the kernel name
{
size_t DotPos = KernelName.find('.');
while (DotPos != std::string::npos) {
KernelName[DotPos] = '_';
DotPos = KernelName.find('.');
}
}
Function *K =
Function::Create(FT, GlobalValue::WeakODRLinkage, KernelName, &M);
K->setVisibility(GlobalValue::ProtectedVisibility);
// exec mode global variable
GlobalVariable *ModeGV = new GlobalVariable(
M, Type::getInt8Ty(Ctx), /*isConstant=*/true,
GlobalValue::WeakAnyLinkage,
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD),
K->getName() + "_exec_mode");
appendToCompilerUsed(M, {ModeGV});
// Attach "kernel" metadata to the new kernel.
NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
Metadata *MDVals[] = {ConstantAsMetadata::get(K),
MDString::get(Ctx, "kernel"),
ConstantAsMetadata::get(ConstantInt::get(
llvm::Type::getInt32Ty(Ctx), 1))};
MD->addOperand(MDNode::get(Ctx, MDVals));
// Set kernel attributes.
K->setAttributes(
AttributeList::get(Ctx, AttributeList::FunctionIndex,
CI->getFunction()->getAttributes().getFnAttrs()));
K->removeFnAttr("omp_target_thread_limit");
K->removeFnAttr("omp_target_num_teams");
K->addFnAttr("omp_parallel_51_kernel");
K->addFnAttr("kernel");
K->getArg(3)->addAttr(Attribute::NoAlias);
auto *EntryBB = BasicBlock::Create(Ctx, "entry", K);
OpenMPIRBuilder::LocationDescription TargetInitLoc(
{EntryBB, EntryBB->end()});
auto IP = IRBuilder.createTargetInit(TargetInitLoc, /* IsSPMD */ true);
BasicBlock *UserCodeBB = IP.getBlock();
auto UseDirectIfPossible = [](Value *LHS, Value *RHS) -> Value * {
if (isa<Constant>(LHS))
return LHS;
return RHS;
};
Value *Ident = CI->getOperand(0);
Value *TId = UseDirectIfPossible(CI->getOperand(1), K->getArg(0));
Value *IfExpr = UseDirectIfPossible(CI->getOperand(2), K->getArg(1));
Value *NumThreads = UseDirectIfPossible(CI->getOperand(3), K->getArg(2));
Value *ProcBind = CI->getOperand(4);
Value *Fn = CI->getOperand(5);
Value *WrapperFn = CI->getOperand(6);
Value *Args = K->getArg(3);
Value *NArgs = UseDirectIfPossible(CI->getOperand(8), K->getArg(4));
(void)CallInst::Create(Parallel51,
{Ident, TId, IfExpr, NumThreads, ProcBind, Fn,
WrapperFn, Args, NArgs},
"", UserCodeBB);
OpenMPIRBuilder::LocationDescription TargetDeInitLoc(
{UserCodeBB, UserCodeBB->end()});
IRBuilder.createTargetDeinit(TargetDeInitLoc, /* IsSPMD */ true);
IRBuilder.Builder.CreateRetVoid();
return K;
};
auto EnableMultiTeam = [](Function *OutlinedFn) {
SmallVector<CallInst *> WorkItems;
for (BasicBlock &BB : *OutlinedFn)
for (Instruction &I : BB) {
CallInst *C = dyn_cast<CallInst>(&I);
if (!C)
continue;
Function *Callee = C->getCalledFunction();
// Skip indirect calls, which have no statically known callee.
if (!Callee)
continue;
auto CalleeName = Callee->getName();
if (CalleeName == "__kmpc_for_static_init_4" ||
CalleeName == "__kmpc_for_static_init_4u" ||
CalleeName == "__kmpc_for_static_init_8" ||
CalleeName == "__kmpc_for_static_init_8u")
WorkItems.push_back(C);
}
for (CallInst *C : WorkItems) {
constexpr const unsigned SchedTypeArgNum = 2;
Value *SchedTypeVal = C->getOperand(SchedTypeArgNum);
ConstantInt *SchedTypeCI = cast<ConstantInt>(SchedTypeVal);
int32_t SchedType = SchedTypeCI->getSExtValue();
Value *NewSchedType = SchedTypeVal;
if (SchedType == /*kmp_sched_static_chunk*/ 33)
NewSchedType = ConstantInt::get(SchedTypeCI->getType(), 100);
else if (SchedType == /*kmp_sched_static*/ 34)
NewSchedType = ConstantInt::get(SchedTypeCI->getType(), 101);
C->setOperand(SchedTypeArgNum, NewSchedType);
}
};
auto &Ctx = M.getContext();
for (CallInst *CI : WorkItems) {
Function *K = CreateNewKernel(Ctx, M, OMPInfoCache.OMPBuilder,
Parallel51RFI.Declaration, CI);
constexpr const unsigned OutlinedFnArgNum = 5;
Function *OutlinedFn =
dyn_cast<Function>(CI->getOperand(OutlinedFnArgNum));
assert(OutlinedFn && "arg fn is not a function");
if (canUseMultiTeam(OutlinedFn, OMPInfoCache))
EnableMultiTeam(OutlinedFn);
IRBuilder<> Builder(CI);
FunctionCallee Callee =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_launch_parallel_51_kernel);
Constant *KernelName = ConstantDataArray::getString(Ctx, K->getName());
GlobalVariable *KernelNameVar =
new GlobalVariable(M, KernelName->getType(), /*isConstant=*/true,
GlobalValue::WeakAnyLinkage, KernelName);
Value *Args[6] = {KernelNameVar, CI->getArgOperand(1),
CI->getArgOperand(2), CI->getArgOperand(3),
CI->getArgOperand(7), CI->getArgOperand(8)};
CallInst *C = Builder.CreateCall(Callee, Args);
OMPInfoCache.setCallingConvention(Callee, C);
CI->eraseFromParent();
}
// Check all the uses of __kmpc_parallel_51 again. If it is only used in
// parallel kernels, we can mark the main kernel as SPMD mode as well.
bool OnlyUsedByParallelKernel = true;
for (User *U : Parallel51RFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
Function *F = CI->getFunction();
Attribute Attr = F->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
OnlyUsedByParallelKernel = false;
break;
}
if (OnlyUsedByParallelKernel) {
for (auto K : SCC) {
if (!omp::isOpenMPKernel(*K))
continue;
Attribute Attr = K->getFnAttribute("omp_parallel_51_kernel");
if (Attr.isValid())
continue;
GlobalVariable *ExecMode =
M.getGlobalVariable((K->getName() + "_exec_mode").str());
assert(ExecMode && "kernel without exec mode");
assert(ExecMode->getInitializer() &&
"ExecMode doesn't have initializer!");
assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
"ExecMode is not an integer!");
const int8_t ExecModeVal =
cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
// Kernel is already in SPMD mode, skip.
if (ExecModeVal & OMP_TGT_EXEC_MODE_SPMD)
continue;
auto *NewExecModeC =
ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_SPMD);
ExecMode->setInitializer(NewExecModeC);
OMPInformationCache::RuntimeFunctionInfo &TargetInitRFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
OMPInformationCache::RuntimeFunctionInfo &TargetDeInitRFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
CallInst *InitCI = nullptr;
CallInst *DeInitCI = nullptr;
for (User *U : TargetInitRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (CI->getFunction() == K) {
InitCI = CI;
break;
}
}
for (User *U : TargetDeInitRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (CI->getFunction() == K) {
DeInitCI = CI;
break;
}
}
assert(InitCI && DeInitCI && "kernel without init and deinit");
InitCI->setArgOperand(1, NewExecModeC);
InitCI->setArgOperand(2,
ConstantInt::getNullValue(Type::getInt1Ty(Ctx)));
DeInitCI->setArgOperand(1, NewExecModeC);
}
}
// Check the use of omp_get_thread_num and omp_get_num_threads.
OMPInformationCache::RuntimeFunctionInfo &ThreadNumRFI =
OMPInfoCache.RFIs[OMPRTL_omp_get_thread_num];
OMPInformationCache::RuntimeFunctionInfo &NumThreadsRFI =
OMPInfoCache.RFIs[OMPRTL_omp_get_num_threads];
auto CheckIfOnlyUsedByParallelKernel = [&](CallInst *CI) {
SmallVector<Kernel> ReachingKernels;
collectReachingKernels(CI->getFunction(),
ReachingKernels);
return ReachingKernels.size() < 2;
};
SmallVector<CallInst *> WorkList;
for (User *U : ThreadNumRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (!CheckIfOnlyUsedByParallelKernel(CI))
continue;
WorkList.push_back(CI);
}
for (User *U : NumThreadsRFI.Declaration->users()) {
auto *CI = dyn_cast<CallInst>(U);
if (!CI)
continue;
if (!CheckIfOnlyUsedByParallelKernel(CI))
continue;
WorkList.push_back(CI);
}
for (CallInst *CI : WorkList) {
Function *Callee = nullptr;
if (CI->getCalledFunction() == ThreadNumRFI.Declaration)
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_thread_num].Declaration;
if (CI->getCalledFunction() == NumThreadsRFI.Declaration)
Callee = OMPInfoCache.RFIs[OMPRTL_omp_get_bulk_num_threads].Declaration;
assert(Callee && "unknown callee");
auto &Builder = OMPInfoCache.OMPBuilder.Builder;
Builder.SetInsertPoint(CI);
Value *C = Builder.CreateCall(Callee);
CI->replaceAllUsesWith(C);
CI->eraseFromParent();
}
return Changed;
}
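// Net effect of the split (illustrative, names abbreviated): a call
//   __kmpc_parallel_51(ident, tid, if_expr, nthreads, proc_bind, fn,
//                      wrapper_fn, args, nargs)
// inside a generic-mode kernel is replaced with
//   __kmpc_launch_parallel_51_kernel(
//       "__omp_offloading_parallel_51_from_<caller>", tid, if_expr,
//       nthreads, args, nargs);
// while the new SPMD kernel created above re-issues the original
// __kmpc_parallel_51 call between target init and deinit.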
/// Merge parallel regions when it is safe.
bool mergeParallelRegions() {
const unsigned CallbackCalleeOperand = 2;

View File

@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMTransformUtils
CallGraphUpdater.cpp
CanonicalizeAliases.cpp
CanonicalizeFreezeInLoops.cpp
CanonicalizeMainFunction.cpp
CloneFunction.cpp
CloneModule.cpp
CodeExtractor.cpp

View File

@@ -0,0 +1,105 @@
//===-------------- CanonicalizeMainFunction.cpp ----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility pass to canonicalize the main function.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "canonicalize-main-function"
static cl::opt<std::string>
MainFunctionName("canonical-main-function-name",
cl::desc("New main function name"),
cl::value_desc("main function name"));
bool rewriteMainFunction(Function &F) {
if (F.arg_size() == 2 && F.getReturnType()->isIntegerTy(32))
return false;
auto &Ctx = F.getContext();
auto &DL = F.getParent()->getDataLayout();
auto *Int32Ty = IntegerType::getInt32Ty(Ctx);
auto *PtrTy = PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace());
FunctionType *NewFnTy =
FunctionType::get(Int32Ty, {Int32Ty, PtrTy}, /* isVarArg */ false);
Function *NewFn =
Function::Create(NewFnTy, F.getLinkage(), F.getAddressSpace(), "");
F.getParent()->getFunctionList().insert(F.getIterator(), NewFn);
NewFn->takeName(&F);
NewFn->copyAttributesFrom(&F);
NewFn->setSubprogram(F.getSubprogram());
F.setSubprogram(nullptr);
NewFn->splice(NewFn->begin(), &F);
if (!F.getReturnType()->isIntegerTy(32)) {
SmallVector<ReturnInst *> WorkList;
for (BasicBlock &BB : *NewFn)
for (Instruction &I : BB) {
auto *RI = dyn_cast<ReturnInst>(&I);
if (!RI)
continue;
assert(RI->getReturnValue() == nullptr &&
"return value of a void main function is not nullptr");
WorkList.push_back(RI);
}
for (auto *RI : WorkList) {
(void)ReturnInst::Create(Ctx, ConstantInt::getNullValue(Int32Ty), RI);
RI->eraseFromParent();
}
}
if (F.arg_size() == NewFn->arg_size())
for (unsigned I = 0; I < NewFn->arg_size(); ++I) {
Argument *OldArg = F.getArg(I);
Argument *NewArg = NewFn->getArg(I);
NewArg->takeName(OldArg);
OldArg->replaceAllUsesWith(NewArg);
}
return true;
}
PreservedAnalyses CanonicalizeMainFunctionPass::run(Module &M,
ModuleAnalysisManager &AM) {
Function *MainFunc = nullptr;
for (Function &F : M)
if (F.getName() == "main") {
assert(MainFunc == nullptr && "more than one main function");
MainFunc = &F;
}
if (MainFunc == nullptr)
return PreservedAnalyses::all();
bool Changed = false;
if (!MainFunctionName.empty() && MainFunc->getName() != MainFunctionName) {
MainFunc->setName(MainFunctionName);
Changed = true;
}
assert(MainFunc->arg_size() == 2 &&
MainFunc->getReturnType()->isIntegerTy(32) &&
"[CanonicalizeMainFunction] wrong user main function type; expected: "
"int main(int argc, char **argv)");
// if (rewriteMainFunction(*MainFunc)) {
// MainFunc->eraseFromParent();
// Changed = true;
// }
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
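As a quick illustration (assuming an opt built from this branch; the target
name __user_main is hypothetical), the pass can be invoked directly to rename
the entry point:

  opt -passes=canonicalize-main-function \
      -canonical-main-function-name=__user_main in.bc -o out.bc

"canonicalize-main-function" is the name registered in PassRegistry.def; the
module must already define int main(int argc, char **argv), otherwise the
assert above fires.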

View File

@@ -28,10 +28,10 @@ if(OPENMP_STANDALONE_BUILD)
else()
# When building in tree we install the runtime according to the LLVM settings.
if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT APPLE)
set(OFFLOAD_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE}/offload" CACHE STRING
"Path where built offload libraries should be installed.")
else()
set(OFFLOAD_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}" CACHE STRING
set(OFFLOAD_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}/offload" CACHE STRING
"Path where built offload libraries should be installed.")
endif()
endif()
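# For example (triple value illustrative), with a per-target runtime dir on an
# x86_64-unknown-linux-gnu host the offload libraries now install into
#   lib/x86_64-unknown-linux-gnu/offload/
# and otherwise into lib/offload/.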

View File

@@ -77,16 +77,19 @@ set(include_files
${include_directory}/Interface.h
${include_directory}/LibC.h
${include_directory}/Mapping.h
${include_directory}/Memory.h
${include_directory}/State.h
${include_directory}/Synchronization.h
${include_directory}/Types.h
${include_directory}/Utils.h
${include_directory}/Mpi.h
)
set(src_files
${source_directory}/Allocator.cpp
${source_directory}/Configuration.cpp
${source_directory}/Debug.cpp
${source_directory}/HostRPC.cpp
${source_directory}/Kernel.cpp
${source_directory}/LibC.cpp
${source_directory}/Mapping.cpp
@@ -98,8 +101,22 @@ set(src_files
${source_directory}/Tasking.cpp
${source_directory}/Utils.cpp
${source_directory}/Workshare.cpp
${source_directory}/Mpi.cpp
)
# WarpAllocator.cpp is missing from this list
if (LIBOMPTARGET_DEVICE_BUILTIN_ALLOCATOR)
# Use the built-in DeviceRTL allocator instead of the GPUFirst one.
# It does not support RPC calls whose arguments point into GPU memory,
# because the allocation information is not saved.
list(APPEND src_files ${source_directory}/BuiltinAllocator.cpp)
elseif (LIBOMPTARGET_GENERIC_ALLOCATOR)
list(APPEND src_files ${source_directory}/GenericAllocator.cpp)
else()
list(APPEND src_files ${source_directory}/TeamAllocator.cpp)
endif()
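# Configure-time selection (illustrative):
#   cmake ... -DLIBOMPTARGET_DEVICE_BUILTIN_ALLOCATOR=ON  # built-in allocator
#   cmake ... -DLIBOMPTARGET_GENERIC_ALLOCATOR=ON         # generic allocator
# With neither variable set, TeamAllocator.cpp is the default.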
# We disable the slp vectorizer during the runtime optimization to avoid
# vectorized accesses to the shared state. Generally, those are "good" but
# the optimizer pipeline (esp. Attributor) does not fully support vectorized
@@ -129,8 +146,18 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
-I${include_directory}
-I${devicertl_base_directory}/../include
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
-I${CMAKE_SOURCE_DIR}/../libc/include
)
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_DEBUG)
list(APPEND bc_flags "-DHOSTRPC_DEBUG")
endif()
if (LIBOMPTARGET_DEVICERTL_HOSTRPC_PROFILING)
list(APPEND bc_flags "-DHOSTRPC_PROFILING")
endif()
# first create an object target
add_library(omptarget.devicertl.all_objs OBJECT IMPORTED)
function(compileDeviceRTLLibrary target_cpu target_name target_triple)
@@ -298,6 +325,7 @@ set_target_properties(omptarget.devicertl PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY "${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}"
LINKER_LANGUAGE CXX
)
target_link_libraries(omptarget.devicertl PRIVATE omptarget.devicertl.all_objs)
install(TARGETS omptarget.devicertl ARCHIVE DESTINATION ${OFFLOAD_INSTALL_LIBDIR})

View File

@@ -39,6 +39,12 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
__builtin_trap(); \
__builtin_unreachable();
#define assert(expr) \
{ \
if (!(expr)) \
__assert_fail(#expr, __FILE__, __LINE__, __PRETTY_FUNCTION__); \
}
///}
#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);

View File

@@ -360,6 +360,22 @@ int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal);
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
///}
/// Host RPC
///
///{
void *__kmpc_host_rpc_get_desc(int32_t CallNo, int32_t NumArgs, void *ArgInfo);
void __kmpc_host_rpc_add_arg(void *Desc, int64_t Arg, int32_t ArgNum);
int64_t __kmpc_host_rpc_send_and_wait(void *Desc);
///}
/// Launch parallel kernel
///
///{
void __kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
int32_t if_expr, int32_t num_threads,
void **args, int64_t nargs);
///}
}
#endif

View File

@@ -14,12 +14,112 @@
#include "Types.h"
#ifndef _ASM_GENERIC_ERRNO_BASE_H
#define _ASM_GENERIC_ERRNO_BASE_H
#define EPERM 1 /* Operation not permitted */
#define ENOENT 2 /* No such file or directory */
#define ESRCH 3 /* No such process */
#define EINTR 4 /* Interrupted system call */
#define EIO 5 /* I/O error */
#define ENXIO 6 /* No such device or address */
#define E2BIG 7 /* Argument list too long */
#define ENOEXEC 8 /* Exec format error */
#define EBADF 9 /* Bad file number */
#define ECHILD 10 /* No child processes */
#define EAGAIN 11 /* Try again */
#define ENOMEM 12 /* Out of memory */
#define EACCES 13 /* Permission denied */
#define EFAULT 14 /* Bad address */
#define ENOTBLK 15 /* Block device required */
#define EBUSY 16 /* Device or resource busy */
#define EEXIST 17 /* File exists */
#define EXDEV 18 /* Cross-device link */
#define ENODEV 19 /* No such device */
#define ENOTDIR 20 /* Not a directory */
#define EISDIR 21 /* Is a directory */
#define EINVAL 22 /* Invalid argument */
#define ENFILE 23 /* File table overflow */
#define EMFILE 24 /* Too many open files */
#define ENOTTY 25 /* Not a typewriter */
#define ETXTBSY 26 /* Text file busy */
#define EFBIG 27 /* File too large */
#define ENOSPC 28 /* No space left on device */
#define ESPIPE 29 /* Illegal seek */
#define EROFS 30 /* Read-only file system */
#define EMLINK 31 /* Too many links */
#define EPIPE 32 /* Broken pipe */
#define EDOM 33 /* Math argument out of domain of func */
#define ERANGE 34 /* Math result not representable */
#endif
#define errno (*__errno_location())
extern "C" {
int memcmp(const void *lhs, const void *rhs, size_t count);
void *memset(void *dst, int C, size_t count);
int printf(const char *format, ...);
long strtol(const char *str, char **str_end, int base);
int strcmp(const char *lhs, const char *rhs);
void *calloc(size_t num, size_t size);
int strcasecmp(const char *string1, const char *string2);
void exit(int exit_code);
size_t strlen(const char *str);
int atoi(const char *str);
char *strcpy(char *dest, const char *src);
int stat(const char *path, struct stat *buf);
int *__errno_location();
char *strcat(char *dest, const char *src);
void perror(const char *s);
int strncmp(const char *lhs, const char *rhs, size_t count);
char *strncpy(char *dest, const char *src, size_t count);
char *strchr(const char *str, int ch);
char *strtok(char *str, const char *delim);
const unsigned short **__ctype_b_loc(void);
void *realloc(void *ptr, size_t new_size);
void qsort(void *const pbase, size_t total_elems, size_t size,
int (*comp)(const void *, const void *));
int gettimeofday(struct timeval *tv, struct timezone *tz);
char *__xpg_basename(const char *path);
void srand(unsigned seed);
int rand();
int abs(int n);
void *memcpy(void *dest, const void *src, size_t count);
double atof(const char *str);
double strtod(const char *str, char **ptr);
}
#endif

View File

@@ -0,0 +1,39 @@
//===--- Memory.h - OpenMP device runtime memory allocator ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Declarations for the device runtime memory allocator and a helper to
// query allocation metadata for a pointer.
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_MEMORY_H
#define OMPTARGET_MEMORY_H
#include "Types.h"
extern "C" {
__attribute__((leaf)) void *malloc(size_t Size);
__attribute__((leaf)) void free(void *Ptr);
}
namespace ompx {
namespace memory {
struct MemoryAllocationInfo {
void *BasePtr = nullptr;
size_t Size = 0;
bool isValid() const { return BasePtr; }
};
/// Get the memory allocation information if the pointer \p P lies within one
/// of the buffers allocated by \p malloc.
MemoryAllocationInfo getMemoryAllocationInfo(void *P);
} // namespace memory
} // namespace ompx
#endif
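As a usage illustration, here is a minimal sketch of a bounds check built on this query; the helper and its caller are hypothetical, and only `getMemoryAllocationInfo` / `MemoryAllocationInfo` come from the header above (with `getPtrDiff` from Utils.h):

#include "Memory.h"
#include "Utils.h"

// Hypothetical helper: clamp a requested copy size to the bounds of the
// dynamic allocation that backs Ptr, when the allocator knows about it.
static size_t clampToAllocation(void *Ptr, size_t Requested) {
  ompx::memory::MemoryAllocationInfo MAI =
      ompx::memory::getMemoryAllocationInfo(Ptr);
  if (!MAI.isValid())
    return Requested; // Unknown pointer; nothing to check against.
  size_t Avail = MAI.Size - (size_t)ompx::utils::getPtrDiff(Ptr, MAI.BasePtr);
  return Requested < Avail ? Requested : Avail;
}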

View File

@@ -0,0 +1,158 @@
#ifndef MPI_H
#define MPI_H
#ifdef __cplusplus
extern "C" {
#endif
//struct MPI_Comm_s;
//struct MPI_Comm_s {
// int id; // id = 0 -> MPI_COMM_WORLD (the only supported)
// uint32_t size;
// uint32_t barrier_counter;
// uint32_t barrier_generation_counter;
// uint32_t *nodes_infos;
//};
typedef struct MPI_Comm_s *MPI_Comm;
MPI_Comm MPI_COMM_WORLD;
typedef enum MPI_Datatype_e {
MPI_CHAR,
MPI_SHORT,
MPI_INT,
MPI_LONG,
MPI_LONG_LONG_INT,
MPI_LONG_LONG,
MPI_SIGNED_CHAR,
MPI_UNSIGNED_CHAR,
MPI_UNSIGNED_SHORT,
MPI_UNSIGNED,
MPI_UNSIGNED_LONG,
MPI_UNSIGNED_LONG_LONG,
MPI_FLOAT,
MPI_DOUBLE,
MPI_LONG_DOUBLE,
MPI_WCHAR,
MPI_C_BOOL,
MPI_INT8_T,
MPI_INT16_T,
MPI_INT32_T,
MPI_INT64_T,
MPI_UINT8_T,
MPI_UINT16_T,
MPI_UINT32_T,
MPI_UINT64_T,
MPI_C_COMPLEX,
MPI_C_FLOAT_COMPLEX,
MPI_C_DOUBLE_COMPLEX,
MPI_C_LONG_DOUBLE_COMPLEX,
MPI_BYTE,
//MPI_PACKED,
//MPI_AINT,
//MPI_OFFSET,
//MPI_COUNT,
MPI_CXX_BOOL
//MPI_CXX_FLOAT_COMPLEX,
//MPI_CXX_DOUBLE_COMPLEX,
//MPI_CXX_LONG_DOUBLE_COMPLEX,
} MPI_Datatype;
typedef enum MPI_Intent_e {
MPI_THREAD_SINGLE,
MPI_THREAD_FUNNELED,
MPI_THREAD_SERIALIZED,
MPI_THREAD_MULTIPLE
} MPI_Intent;
typedef void MPI_User_function(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype);
typedef struct MPI_Op_s {
MPI_User_function *func;
} MPI_Op;
typedef struct MPI_Status_s {
int MPI_SOURCE;
int MPI_TAG;
int MPI_ERROR;
} MPI_Status;
typedef struct MPI_Request_s *MPI_Request;
//const MPI_Status MPI_STATUS_IGNORE;
//const MPI_Status MPI_STATUSES_IGNORE;
// global variables
const int MPI_ANY_SOURCE = -1;
const int MPI_ANY_TAG = -1;
const int MPI_SUCCESS = 0;
MPI_Status MPI_STATUS_IGNORE;
MPI_Status MPI_STATUSES_IGNORE;
MPI_Op MPI_MIN;
MPI_Op MPI_MAX;
MPI_Request MPI_REQUEST_NULL = 0;
// Common functions
int MPI_Init(int *argc, char ***argv);
int MPI_Init_thread(int *argc, char ***argv, int required, int *provided);
int MPI_Finalize(void);
int MPI_Abort(MPI_Comm comm, int errorcode);
int MPI_Barrier(MPI_Comm comm);
int MPI_Comm_rank(MPI_Comm comm, int *rank);
int MPI_Comm_size(MPI_Comm comm, int *size);
int MPI_Type_size(MPI_Datatype datatype, int *size);
// Blocking Comm
int MPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm);
//int MPI_Rsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm);
int MPI_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm);
int MPI_Bsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm);
int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status);
// Non Blocking Comm
int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
//int MPI_Irsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
int MPI_Issend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
int MPI_Ibsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request);
// Non Blocking Wait
int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status);
int MPI_Wait(MPI_Request *request, MPI_Status *status);
int MPI_Testall(int count, MPI_Request array_of_requests[], int *flag, MPI_Status array_of_statuses[]);
int MPI_Waitall(int count, MPI_Request array_of_requests[], MPI_Status array_of_statuses[]);
int MPI_Testany(int count, MPI_Request array_of_requests[], int *index, int *flag, MPI_Status *status);
int MPI_Waitany(int count, MPI_Request array_of_requests[], int *index, MPI_Status *status);
int MPI_Testsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[]);
int MPI_Waitsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[]);
// Persistent Communications setup
int MPI_Send_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
//int MPI_Rsend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
int MPI_Ssend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
int MPI_Bsend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request);
int MPI_Recv_init(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request);
// Persistent Communications start and end
int MPI_Start(MPI_Request *request);
int MPI_Startall(int count, MPI_Request array_of_requests[]);
int MPI_Request_free(MPI_Request *request);
// Collective Operation
int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm);
int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm);
// Timing
double MPI_Wtime(void);
#ifdef __cplusplus
}
#endif
#endif // MPI_H
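For orientation, a minimal sketch of how this device-side API is meant to be driven; the `ring` kernel is hypothetical and assumes each OpenMP team acts as one MPI rank (as the `ranks` map in the commented-out `MPI_Comm_s` suggests). Note that in this header `MPI_STATUS_IGNORE` is an object, so its address is what gets passed:

#include "mpi.h"

// Hypothetical ring exchange: every rank sends its id to the next rank.
void ring(void) {
  int Rank = 0, Size = 0;
  MPI_Init(nullptr, nullptr);
  MPI_Comm_rank(MPI_COMM_WORLD, &Rank);
  MPI_Comm_size(MPI_COMM_WORLD, &Size);
  int SendVal = Rank, RecvVal = -1;
  MPI_Request Req;
  MPI_Irecv(&RecvVal, 1, MPI_INT, (Rank + Size - 1) % Size, /*tag=*/0,
            MPI_COMM_WORLD, &Req);
  MPI_Send(&SendVal, 1, MPI_INT, (Rank + 1) % Size, /*tag=*/0,
           MPI_COMM_WORLD);
  MPI_Wait(&Req, &MPI_STATUS_IGNORE);
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
}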

View File

@@ -36,6 +36,9 @@ enum MemScopeTy {
uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
MemScopeTy MemScope = MemScopeTy::all);
/// Atomically add \p V to \p *Addr with system scope.
int32_t addSys(int32_t *Addr, int32_t V);
/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
/// result is stored in \p *Addr;
/// {
@@ -135,6 +138,58 @@ void system(atomic::OrderingTy Ordering);
} // namespace fence
namespace atomic {
/// Atomically load \p Addr with \p Ordering semantics.
uint32_t load(uint32_t *Addr, int Ordering);
/// Atomically load \p Addr with \p Ordering semantics.
uint64_t load(uint64_t *Addr, int Ordering);
/// Atomically store \p V to \p Addr with \p Ordering semantics.
void store(uint32_t *Addr, uint32_t V, int Ordering);
void store(uint64_t *Addr, uint64_t V, int Ordering);
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
uint32_t add(uint32_t *Addr, uint32_t V, int Ordering);
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);
} // namespace atomic
namespace mutex {
class TicketLock {
uint64_t NowServing = 0;
uint64_t NextTicket = 0;
public:
TicketLock() = default;
TicketLock(const TicketLock &) = delete;
void lock();
void unlock();
};
template <typename T> class LockGuard {
T &Lock;
public:
explicit LockGuard(T &L) : Lock(L) { Lock.lock(); }
~LockGuard() { Lock.unlock(); }
};
} // namespace mutex
} // namespace ompx
#endif
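A minimal sketch of the ticket lock and guard in combination (the counter and the function are hypothetical):

#include "Synchronization.h"

namespace {
ompx::mutex::TicketLock CounterLock;
uint64_t Counter = 0;
} // namespace

// Each caller takes the next ticket and is served in FIFO order, so the
// increment below is serialized across all device threads that call this.
void bumpCounter() {
  ompx::mutex::LockGuard<ompx::mutex::TicketLock> Guard(CounterLock);
  ++Counter;
}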

View File

@@ -36,6 +36,7 @@ using size_t = decltype(sizeof(char));
// TODO: Properly implement this
using intptr_t = int64_t;
using uintptr_t = uint64_t;
using ptrdiff_t = intptr_t;
static_assert(sizeof(int8_t) == 1, "type size mismatch");
static_assert(sizeof(uint8_t) == 1, "type size mismatch");
@@ -83,6 +84,11 @@ enum kmp_sched_t {
kmp_sched_distr_static_nochunk = 92,
kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
// The following two are for direct GPU compilation where we don't have outer
// loop to distribute workload among teams but we still need to.
kmp_sched_distr_parallel_static_noloop_chunk = 100,
kmp_sched_distr_parallel_static_noloop_nochunk = 101,
kmp_sched_default = kmp_sched_static_nochunk,
kmp_sched_unordered_first = kmp_sched_static_chunk,
kmp_sched_unordered_last = kmp_sched_auto,

View File

@@ -16,6 +16,9 @@
#pragma omp begin declare target device_type(nohost)
extern "C" double omp_get_wtime();
extern "C" int printf(const char *, ...);
namespace ompx {
namespace utils {
@@ -37,8 +40,8 @@ template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
}
/// Advance \p Ptr by \p Bytes bytes.
template <typename Ty1, typename Ty2> inline Ty1 advance(Ty1 Ptr, Ty2 Bytes) {
  return reinterpret_cast<Ty1>(reinterpret_cast<char *>(Ptr) + Bytes);
}
/// Return the first bit set in \p V.
@@ -82,12 +85,39 @@ template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
return *((DstTy *)(&V));
}
template <typename Ty = char>
ptrdiff_t getPtrDiff(const void *End, const void *Begin) {
return reinterpret_cast<const Ty *>(End) -
reinterpret_cast<const Ty *>(Begin);
}
inline bool isInRange(void *Ptr, void *BasePtr, int64_t Offset) {
ptrdiff_t Diff = getPtrDiff(Ptr, BasePtr);
return Diff >= 0 && Diff < Offset;
}
inline intptr_t ptrtoint(void *Ptr) { return reinterpret_cast<intptr_t>(Ptr); }
template <typename T> T min(T a, T b) { return a < b ? a : b; }
/// A pointer variable that has by design an `undef` value. Use with care.
[[clang::loader_uninitialized]] static void *const UndefPtr;
#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
class SimpleProfiler {
const char *HeadLine = nullptr;
double Start;
public:
SimpleProfiler(const char *HL) : HeadLine(HL), Start(omp_get_wtime()) {}
~SimpleProfiler() {
double End = omp_get_wtime();
printf("%s --> %lf s.\n", HeadLine, End - Start);
}
};
} // namespace utils
} // namespace ompx
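A small sketch of the scope-based profiler above in use (the function is hypothetical):

#include "Utils.h"

void expensiveStep() {
  // Prints "expensiveStep --> <elapsed> s." when the scope ends.
  ompx::utils::SimpleProfiler P("expensiveStep");
  for (volatile int I = 0; I < (1 << 20); ++I)
    ;
}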

View File

@@ -0,0 +1,44 @@
//===------- BuiltinAllocator.cpp - Generic GPU memory allocator -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a trivial allocator on top of the target's builtin
// malloc/free.
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Memory.h"
#include "Utils.h"
using namespace ompx;
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) { return {}; }
void init() {}
} // namespace memory
} // namespace ompx
extern "C" {
void *realloc(void *ptr, size_t new_size) {
  if (!ptr)
    return malloc(new_size);
  void *NewPtr = malloc(new_size);
  if (!NewPtr)
    return nullptr;
  // The builtin allocator keeps no per-allocation size, so the copy cannot
  // be clamped to the old size; copying new_size bytes over-reads when the
  // allocation grows.
  __builtin_memcpy(NewPtr, ptr, new_size);
  free(ptr);
  return NewPtr;
}
}
extern "C" {
void __kmpc_target_init_allocator() { return; }
}
#pragma omp end declare target

View File

@@ -0,0 +1,312 @@
//===------- GenericAllocator.cpp - Generic GPU memory allocator -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a free-list based memory allocator for the OpenMP
// device runtime.
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Utils.h"
using namespace ompx;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
__omp_rtl_device_memory_pool_tracker;
// TODO: implement Device Debug Allocation Tracker
namespace {
size_t HeapCurPos = 0;
mutex::TicketLock HeapLock;
mutex::TicketLock AllocationListLock;
mutex::TicketLock FreeListLock;
constexpr const size_t Alignment = 16;
using intptr_t = int64_t;
struct SimpleLinkListNode {
template <typename T> friend struct SimpleLinkList;
protected:
SimpleLinkListNode *Prev = nullptr;
SimpleLinkListNode *Next = nullptr;
bool operator>(const SimpleLinkListNode &RHS) const {
const auto This = reinterpret_cast<int64_t>(this);
const auto RHSThis = reinterpret_cast<int64_t>(&RHS);
return This > RHSThis;
}
SimpleLinkListNode() = default;
SimpleLinkListNode(const SimpleLinkListNode &) = delete;
SimpleLinkListNode(SimpleLinkListNode &&) = delete;
};
struct AllocationMetadata final : SimpleLinkListNode {
public:
size_t getUserSize() const;
void *getUserAddr() const {
return const_cast<void *>(reinterpret_cast<const void *>(this + 1));
}
size_t getSize() const { return Size; }
void setSize(size_t V) { Size = V; }
bool isInRange(void *Ptr) {
return utils::isInRange(Ptr, this + 1, getUserSize());
}
static AllocationMetadata *getFromUserAddr(void *P) {
return getFromAddr(P) - 1;
}
static AllocationMetadata *getFromAddr(void *P) {
return reinterpret_cast<AllocationMetadata *>(P);
}
private:
size_t Size = 0;
int64_t Reserved = 0;
};
constexpr const size_t AllocationMetadataSize = sizeof(AllocationMetadata);
static_assert(AllocationMetadataSize == 32,
"expect the metadata size to be 32");
size_t AllocationMetadata::getUserSize() const {
return Size - AllocationMetadataSize;
}
template <typename T> struct SimpleLinkList {
struct iterator {
friend SimpleLinkList;
T *operator->() { return reinterpret_cast<T *>(Node); }
T &operator*() { return reinterpret_cast<T &>(*Node); }
iterator operator++() {
Node = Node->Next;
return *this;
}
bool operator==(const iterator &RHS) const { return Node == RHS.Node; }
bool operator!=(const iterator &RHS) const { return !(*this == RHS); }
iterator next() const {
iterator Itr;
Itr.Node = Node->Next;
return Itr;
}
private:
SimpleLinkListNode *Node = nullptr;
};
iterator begin() {
iterator Itr;
Itr.Node = Head.Next;
return Itr;
}
iterator end() { return {}; }
void insert(SimpleLinkListNode *Node) { insertImpl(&Head, Node); }
void remove(iterator Itr) {
SimpleLinkListNode *Node = Itr.Node;
remove(Node);
}
void remove(SimpleLinkListNode *Node) { removeImpl(Node); }
bool empty() const { return Head.Next == nullptr; }
private:
static void insertImpl(SimpleLinkListNode *Current,
SimpleLinkListNode *Node) {
SimpleLinkListNode *OldNext = Current->Next;
Node->Prev = Current;
Node->Next = OldNext;
if (OldNext)
OldNext->Prev = Node;
Current->Next = Node;
}
static void removeImpl(SimpleLinkListNode *Node) {
SimpleLinkListNode *Prev = Node->Prev;
SimpleLinkListNode *Next = Node->Next;
Prev->Next = Next;
if (Next)
Next->Prev = Prev;
Node->Prev = nullptr;
Node->Next = nullptr;
}
/// Check if the node is in the list.
bool checkExist(SimpleLinkListNode *Node) const {
SimpleLinkListNode *P = Head.Next;
while (P) {
if (P == Node)
return true;
P = P->Next;
}
return false;
}
static bool checkSanity(SimpleLinkListNode *Current,
SimpleLinkListNode *Next) {
return Current->Next == Next && (!Next || Next->Prev == Current);
}
static bool checkSanity(SimpleLinkListNode *Prev, SimpleLinkListNode *Current,
SimpleLinkListNode *Next) {
return Prev->Next == Current && Current->Next == Next &&
Current->Prev == Prev && (!Next || Next->Prev == Current);
}
static bool checkDangling(SimpleLinkListNode *Node) {
return Node->Next == nullptr && Node->Prev == nullptr;
}
SimpleLinkListNode Head;
};
SimpleLinkList<AllocationMetadata> AllocationList;
SimpleLinkList<AllocationMetadata> FreeList;
constexpr const int SplitRatio = 5;
#ifndef LIBOMPTARGET_MEMORY_PROFILING
struct MemoryProfiler {
MemoryProfiler(const char *S) {}
~MemoryProfiler() {}
};
#else
struct MemoryProfiler : utils::SimpleProfiler {
  MemoryProfiler(const char *S) : utils::SimpleProfiler(S) {}
};
#endif
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
mutex::LockGuard ALG(AllocationListLock);
for (auto Itr = AllocationList.begin(); Itr != AllocationList.end(); ++Itr) {
AllocationMetadata *MD = &(*Itr);
if (MD->isInRange(P))
return {MD->getUserAddr(), MD->getUserSize()};
}
return {};
}
void init() {}
} // namespace memory
} // namespace ompx
extern "C" {
void *memset(void *dest, int ch, size_t count);
void *malloc(size_t Size) {
MemoryProfiler Profiler(__FUNCTION__);
Size = utils::align_up(Size + AllocationMetadataSize, Alignment);
AllocationMetadata *MD = nullptr;
{
mutex::LockGuard FLG(FreeListLock);
auto Itr = FreeList.begin();
for (; Itr != FreeList.end(); ++Itr) {
if (Itr->getSize() >= Size)
break;
}
    // Reuse a free block only if it is big enough but not oversized, i.e.,
    // less than SplitRatio times the requested size.
    bool Found = Itr != FreeList.end() && (Itr->getSize() / Size < SplitRatio);
if (Found) {
MD = &(*Itr);
FreeList.remove(Itr);
}
}
if (MD) {
mutex::LockGuard ALG(AllocationListLock);
AllocationList.insert(MD);
return MD->getUserAddr();
}
{
mutex::LockGuard LG(HeapLock);
if (Size + HeapCurPos < __omp_rtl_device_memory_pool.Size) {
void *R = reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr) + HeapCurPos;
(void)atomic::add(&HeapCurPos, Size, atomic::acq_rel);
MD = AllocationMetadata::getFromAddr(R);
}
}
if (MD) {
// We need to reset the head in case of any dirty data.
memset(MD, 0, AllocationMetadataSize);
MD->setSize(Size);
mutex::LockGuard ALG(AllocationListLock);
AllocationList.insert(MD);
return MD->getUserAddr();
}
printf("out of heap memory! size=%lu, cur=%lu.\n", Size, HeapCurPos);
printf("%s:%d\n", __FILE__, __LINE__);
__builtin_trap();
}
void free(void *P) {
MemoryProfiler Profiler(__FUNCTION__);
if (!P)
return;
auto *MD = AllocationMetadata::getFromUserAddr(P);
{
mutex::LockGuard ALG(AllocationListLock);
AllocationList.remove(MD);
}
{
mutex::LockGuard FLG(FreeListLock);
FreeList.insert(MD);
}
}
void *realloc(void *ptr, size_t new_size) {
  MemoryProfiler Profiler(__FUNCTION__);
  if (!ptr)
    return malloc(new_size);
  void *NewPtr = malloc(new_size);
if (!NewPtr)
return nullptr;
auto *OldMD = AllocationMetadata::getFromUserAddr(ptr);
assert(ptr == OldMD->getUserAddr());
__builtin_memcpy(NewPtr, ptr, utils::min(OldMD->getUserSize(), new_size));
free(ptr);
return NewPtr;
}
void __kmpc_target_init_allocator() {}
}
#pragma omp end declare target
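To make the bookkeeping concrete, a worked example of the arithmetic above; the numbers follow from Alignment == 16 and the 32-byte AllocationMetadata header:

// malloc(40): Size = align_up(40 + 32, 16) = 80 bytes carved from the pool;
// the user pointer is MD + 1 (32 bytes past the block start), and
// getUserSize() == 80 - 32 == 48, slightly more than requested.
// On free, the 80-byte block enters the free list and is reused for a later
// rounded request S only when S <= 80 and 80 / S < SplitRatio (== 5), which
// bounds the slack wasted by reusing an oversized block.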

View File

@@ -0,0 +1,428 @@
//===------- HostRPC.cpp - Implementation of host RPC ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "HostRPC.h"
#include "Debug.h"
#include "LibC.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm-libc-types/rpc_opcodes_t.h"
#include "llvm-libc-types/rpc_port_t.h"
#ifdef HOSTRPC_DEBUG
#define DEBUG_PREFIX "host-rpc-device"
#define DP(FMT, ...) \
{ printf("%s --> " FMT, DEBUG_PREFIX, __VA_ARGS__); }
#else
#define DP(FMT, ...)
#endif
using namespace ompx;
using namespace hostrpc;
using ArgType = llvm::omp::OMPTgtHostRPCArgType;
#ifdef HOSTRPC_PROFILING
int32_t HostRPCId;
double GetDescStart;
double GetDescEnd;
double AddArgStart;
double AddArgEnd;
double IssueAndWaitStart;
double IssueAndWaitEnd;
double CopyBackStart;
double CopyBackEnd;
#endif
// libc rpc functions forward declare:
// TODO: replace when a proper header exposing device functions is created
extern "C" {
rpc_port_t rpc_open_port(rpc_opcode_t);
void rpc_send_n(rpc_port_t *handle, const void *src, size_t size);
void rpc_recv_n(rpc_port_t *handle, void *dst, size_t *size);
void rpc_close_port(rpc_port_t *handle);
}
namespace {
size_t HostRPCMemoryBufferCurrentPosition = 0;
constexpr const size_t Alignment = 16;
static_assert(sizeof(intptr_t) == sizeof(int64_t), "pointer size not match");
struct HostRPCArgInfo {
void *BasePtr;
int64_t Type;
int64_t Size;
HostRPCArgInfo *Next;
};
struct HostRPCPointerMapEntry {
void *BasePtr;
void *MappedBasePtr;
int64_t Size;
int64_t Kind;
};
void *getMappedPointer(Descriptor *D, void *BasePtr, int64_t Size,
int64_t Offset, int64_t Kind) {
assert(D->ArgMap && "ArgMap should not be nullptr");
HostRPCPointerMapEntry *MapTable =
reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
int I = 0;
for (; I < D->NumArgs && MapTable[I].BasePtr; ++I)
if (MapTable[I].BasePtr == BasePtr)
return utils::advance(MapTable[I].MappedBasePtr, Offset);
MapTable[I].BasePtr = BasePtr;
MapTable[I].MappedBasePtr = malloc(Size);
MapTable[I].Size = Size;
MapTable[I].Kind = Kind;
if (Kind & ArgType::OMP_HOST_RPC_ARG_COPY_TO) {
__builtin_memcpy(MapTable[I].MappedBasePtr, BasePtr, Size);
DP("getMappedPointer: copy %ld bytes memory from %p to %p.\n", Size,
BasePtr, MapTable[I].MappedBasePtr);
}
return utils::advance(MapTable[I].MappedBasePtr, Offset);
}
void copybackIfNeeded(Descriptor *D) {
if (!D->ArgMap)
return;
auto *MapTable = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
for (int I = 0; I < D->NumArgs && MapTable[I].BasePtr; ++I)
if (MapTable[I].Kind & ArgType::OMP_HOST_RPC_ARG_COPY_FROM) {
__builtin_memcpy(MapTable[I].BasePtr, MapTable[I].MappedBasePtr,
MapTable[I].Size);
DP("copybackIfNeeded: copy %ld bytes memory from %p to %p.\n",
MapTable[I].Size, MapTable[I].MappedBasePtr, MapTable[I].BasePtr);
}
}
} // namespace
extern "C" {
__attribute__((noinline, used)) void *
__kmpc_host_rpc_get_desc(int32_t CallId, int32_t NumArgs, void *ArgInfo) {
DP("get desc for request (id=%d), NumArgs=%d, ArgInfo=%p.\n", CallId, NumArgs,
ArgInfo);
#ifdef HOSTRPC_DEBUG
{
void **AIs = reinterpret_cast<void **>(ArgInfo);
for (int I = 0; I < NumArgs; ++I)
DP("ArgInfo[%d]=%p.\n", I, AIs[I]);
}
#endif
#ifdef HOSTRPC_PROFILING
HostRPCId = CallId;
GetDescStart = omp_get_wtime();
#endif
Descriptor *D = (Descriptor *) malloc(sizeof(Descriptor));
D->Id = CallId;
D->ArgInfo = reinterpret_cast<void **>(ArgInfo);
D->NumArgs = NumArgs;
D->Status = EXEC_STAT_CREATED;
D->ReturnValue = 0;
D->Args =
reinterpret_cast<Argument *>(malloc(sizeof(Argument) * NumArgs));
D->ArgMap = malloc(sizeof(HostRPCPointerMapEntry) * NumArgs);
assert(!NumArgs || (D->Args && D->ArgMap) && "out of host rpc memory!");
// Reset the map table.
auto *ArgMap = reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
for (int I = 0; I < NumArgs; ++I)
ArgMap[I].BasePtr = nullptr;
#ifdef HOSTRPC_PROFILING
GetDescEnd = omp_get_wtime();
AddArgStart = omp_get_wtime();
#endif
return D;
}
__attribute__((noinline, used)) void
__kmpc_host_rpc_add_arg(void *Desc, int64_t ArgVal, int32_t ArgNum) {
auto *D = reinterpret_cast<Descriptor *>(Desc);
assert(ArgNum < D->NumArgs && "out-of-range arguments");
Argument &ArgInDesc = D->Args[ArgNum];
DP("add arg (no=%d), arg=%lx to request (id=%d).\n", ArgNum, ArgVal, D->Id);
// This early branch can rule out nullptr and zero scalar value because it
// doesn't matter whether it is a pointer or scalar value.
if (ArgVal == 0) {
ArgInDesc.Value = 0;
ArgInDesc.ArgType = Type::ARG_LITERAL;
DP("arg (no=%d) is null, done.\n", ArgNum);
return;
}
void *ArgPtr = reinterpret_cast<void *>(ArgVal);
const auto *AI = reinterpret_cast<HostRPCArgInfo *>(D->ArgInfo[ArgNum]);
DP("try to find arg (no=%d) from args AI=%p\n", ArgNum, AI);
if (AI) {
// Let's first check if Arg is a scalar.
if (AI->Type == ArgType::OMP_HOST_RPC_ARG_SCALAR) {
assert(AI->BasePtr == ArgPtr && "invalid scalar argument info");
assert(AI->Next == nullptr && "invalid scalar argument info");
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_LITERAL;
DP("arg (no=%d) is scalar, done.\n", ArgNum);
return;
}
// Then let's see if it is a literal pointer that we don't need copy.
auto *P = AI;
while (P) {
if (P->Type == ArgType::OMP_HOST_RPC_ARG_PTR && P->BasePtr == ArgPtr) {
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is literal pointer, done.\n", ArgNum);
return;
}
P = P->Next;
}
// Next we check if it is within the range of any buffer described in
// argument info.
P = AI;
while (P) {
if ((P->Type & ArgType::OMP_HOST_RPC_ARG_PTR) && P->Size) {
if (utils::isInRange(ArgPtr, P->BasePtr, P->Size)) {
auto Size = P->Size;
auto Offset = utils::getPtrDiff(ArgPtr, P->BasePtr);
ArgInDesc.Value = utils::ptrtoint(
getMappedPointer(D, P->BasePtr, Size, Offset, P->Type));
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("found a match for arg (no=%d). done.\n", ArgNum);
return;
}
}
P = P->Next;
}
}
// Now we can't find a match from argument info, then we assume it is from
// dynamic allocation.
memory::MemoryAllocationInfo MAI = memory::getMemoryAllocationInfo(ArgPtr);
if (MAI.isValid()) {
auto Size = MAI.Size;
auto Offset = utils::getPtrDiff(ArgPtr, MAI.BasePtr);
ArgInDesc.Value = utils::ptrtoint(
getMappedPointer(D, MAI.BasePtr, Size, Offset,
/* Kind */ ArgType::OMP_HOST_RPC_ARG_COPY_TOFROM));
ArgInDesc.ArgType = Type::ARG_POINTER;
DP("arg (no=%d) is from malloc. done.\n", ArgNum);
return;
}
printf("request (id=%d) arg (no=%d, val=%p) is unknown. send it to host "
"directly.\n",
D->Id, ArgNum, ArgPtr);
ArgInDesc.Value = ArgVal;
ArgInDesc.ArgType = Type::ARG_POINTER;
}
__attribute__((noinline, used)) int64_t
__kmpc_host_rpc_send_and_wait(void *Desc) {
Descriptor *D = reinterpret_cast<Descriptor *>(Desc);
int32_t Id = D->Id;
#ifdef HOSTRPC_PROFILING
AddArgEnd = omp_get_wtime();
IssueAndWaitStart = omp_get_wtime();
#endif
  // Protocol: ship the descriptor, then the argument array; the host runs
  // the call and streams both back with the results filled in.
rpc_port_t port = rpc_open_port(RPC_GPUFIRST);
Argument *Args = D->Args;
rpc_send_n(&port, D, sizeof(Descriptor));
rpc_send_n(&port, Args, sizeof(Argument) * D->NumArgs);
  // The host executes the requested function between the sends above and the
  // receives below. The returned sizes are not used; the buffers are
  // preallocated.
  uint64_t size_recv = 0;
  rpc_recv_n(&port, D, &size_recv);
  rpc_recv_n(&port, Args, &size_recv);
  // Receiving into D may have clobbered the device-side Args pointer with the
  // value echoed back by the host; restore it.
  D->Args = Args;
  (void)size_recv;
rpc_close_port(&port);
#ifdef HOSTRPC_PROFILING
IssueAndWaitEnd = omp_get_wtime();
#endif
DP("finish waiting for request (id=%d).\n", Id);
int64_t Ret = D->ReturnValue;
  assert((!D->NumArgs || D->ArgMap) && "arg map should not be nullptr");
#ifdef HOSTRPC_PROFILING
CopyBackStart = omp_get_wtime();
#endif
if (D->ArgMap) {
DP("copy memory back for request (id=%d).\n", Id);
copybackIfNeeded(D);
DP("finish copy memory back for request (id=%d).\n", Id);
}
#ifdef HOSTRPC_PROFILING
CopyBackEnd = omp_get_wtime();
#endif
// free memory allocated for the call
  HostRPCPointerMapEntry *MapTable =
      reinterpret_cast<HostRPCPointerMapEntry *>(D->ArgMap);
  for (int I = 0; I < D->NumArgs && MapTable[I].BasePtr; ++I)
    free(MapTable[I].MappedBasePtr);
free(D->Args);
free(D->ArgMap);
free(D);
DP("request (id=%d) is done with return code=%lx.\n", Id, Ret);
#ifdef HOSTRPC_PROFILING
printf("[host-rpc-profiling-device] id=%d, init=%lf, add_arg=%lf, wait=%lf, "
"copy=%lf.\n",
HostRPCId, GetDescEnd - GetDescStart, AddArgEnd - AddArgStart,
IssueAndWaitEnd - IssueAndWaitStart, CopyBackEnd - CopyBackStart);
#endif
return Ret;
}
__attribute__((noinline, used)) void
__kmpc_launch_parallel_51_kernel(const char *name, int32_t gtid,
int32_t if_expr, int32_t num_threads,
void **args, int64_t nargs) {
constexpr const int64_t NumArgs = 6;
HostRPCArgInfo ArgInfoArray[NumArgs];
void *ArgInfo[NumArgs];
for (unsigned I = 0; I < NumArgs; ++I) {
ArgInfoArray[I].BasePtr = 0;
ArgInfoArray[I].Type = ArgType::OMP_HOST_RPC_ARG_SCALAR;
ArgInfoArray[I].Size = 0;
ArgInfoArray[I].Next = nullptr;
ArgInfo[I] = &ArgInfoArray[I];
}
auto *D = (Descriptor *)__kmpc_host_rpc_get_desc(0, NumArgs, (void *)ArgInfo);
// Set up arg info struct.
ArgInfoArray[0].BasePtr = const_cast<char *>(name);
ArgInfoArray[0].Type = ArgType::OMP_HOST_RPC_ARG_COPY_TO;
ArgInfoArray[0].Size = strlen(name) + 1;
ArgInfoArray[1].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(gtid));
ArgInfoArray[2].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(if_expr));
ArgInfoArray[3].BasePtr =
reinterpret_cast<void *>(static_cast<int64_t>(num_threads));
ArgInfoArray[5].BasePtr = reinterpret_cast<void *>(nargs);
// We need to treat args in a little bit different way because nargs might be
// on the stack.
ArgInfoArray[4].Size = sizeof(void *) * nargs;
void *Args = nullptr;
if (nargs) {
Args = malloc(ArgInfoArray[4].Size);
__builtin_memcpy(Args, args, ArgInfoArray[4].Size);
}
ArgInfoArray[4].BasePtr = Args;
D->Id = CALLID___kmpc_launch_parallel_51_kernel;
__kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(name), 0);
__kmpc_host_rpc_add_arg(D, gtid, 1);
__kmpc_host_rpc_add_arg(D, if_expr, 2);
__kmpc_host_rpc_add_arg(D, num_threads, 3);
__kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(Args), 4);
__kmpc_host_rpc_add_arg(D, nargs, 5);
(void)__kmpc_host_rpc_send_and_wait(D);
}
}
#pragma omp end declare target
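For illustration, a hand-written sketch of the sequence the compiler-generated wrappers are expected to follow; `host_puts` and `CALLID_host_puts` are hypothetical, and the single argument is a buffer copied to the host:

int64_t host_puts(const char *Str) {
  // One argument: the string, NUL terminator included, copied host-ward.
  HostRPCArgInfo AI{const_cast<char *>(Str),
                    ArgType::OMP_HOST_RPC_ARG_COPY_TO,
                    static_cast<int64_t>(strlen(Str)) + 1, nullptr};
  void *ArgInfo[1] = {&AI};
  auto *D = (Descriptor *)__kmpc_host_rpc_get_desc(CALLID_host_puts,
                                                   /*NumArgs=*/1, ArgInfo);
  __kmpc_host_rpc_add_arg(D, reinterpret_cast<int64_t>(Str), 0);
  return __kmpc_host_rpc_send_and_wait(D);
}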

View File

@@ -16,6 +16,7 @@
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"

View File

@@ -7,6 +7,12 @@
//===----------------------------------------------------------------------===//
#include "LibC.h"
#include "Debug.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Utils.h"
using namespace ompx;
#pragma omp begin declare target device_type(nohost)
@@ -47,6 +53,321 @@ int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
#endif
#pragma omp end declare variant
// Dummy variables backing the stdio stream handles.
int StdInDummyVar;
int StdOutDummyVar;
int StdErrDummyVar;
struct FILE;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdin =
(FILE *)&StdInDummyVar;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stdout =
(FILE *)&StdOutDummyVar;
__attribute__((used, retain, weak, visibility("protected"))) FILE *stderr =
(FILE *)&StdErrDummyVar;
typedef int (*__compar_fn_t)(const void *, const void *);
typedef int (*__compar_d_fn_t)(const void *, const void *, void *);
namespace {
const int32_t ToLowerMapTable[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 'a', 'b',
'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 91, 92, 93, 94, 95, 96,
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 123, 124, 125, 126,
127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
};
const int32_t *ToLowerPTable = ToLowerMapTable + 128;
// X swaps the two bytes of each 16-bit ctype entry; the constants below are
// written byte-swapped.
#define X(x) (((x) / 256 | (x)*256) % 65536)
static const unsigned short BLocTable[] = {
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x320), X(0x220), X(0x220),
X(0x220), X(0x220), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200),
X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x200), X(0x160),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8),
X(0x8d8), X(0x8d8), X(0x8d8), X(0x8d8), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x8d5), X(0x8d5), X(0x8d5),
X(0x8d5), X(0x8d5), X(0x8d5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5), X(0x8c5),
X(0x8c5), X(0x8c5), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0), X(0x4c0),
X(0x4c0), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6), X(0x8d6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6),
X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x8c6), X(0x4c0),
X(0x4c0), X(0x4c0), X(0x4c0), X(0x200), 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
};
#undef X
static const unsigned short *BLocTablePTable = BLocTable + 128;
int ErrNo __attribute__((used, retain));
static unsigned long int RandomNext = 1;
#define SWAP(a, b, size) \
do { \
size_t __size = (size); \
char *__a = (a), *__b = (b); \
do { \
char __tmp = *__a; \
*__a++ = *__b; \
*__b++ = __tmp; \
} while (--__size > 0); \
} while (0)
/* Discontinue quicksort algorithm when partition gets below this size.
This particular magic number was chosen to work best on a Sun 4/260. */
#define MAX_THRESH 4
/* Stack node declarations used to store unfulfilled partition obligations. */
typedef struct {
char *lo;
char *hi;
} stack_node;
/* The next 4 #defines implement a very fast in-line stack abstraction. */
/* The stack needs log (total_elements) entries (we could even subtract
log(MAX_THRESH)). Since total_elements has type size_t, we get as
upper bound for log (total_elements):
bits per byte (CHAR_BIT) * sizeof(size_t). */
#define CHAR_BIT 8
#define STACK_SIZE (CHAR_BIT * sizeof(size_t))
#define PUSH(low, high) ((void)((top->lo = (low)), (top->hi = (high)), ++top))
#define POP(low, high) ((void)(--top, (low = top->lo), (high = top->hi)))
#define STACK_NOT_EMPTY (stack < top)
/* Order size using quicksort. This implementation incorporates
four optimizations discussed in Sedgewick:
1. Non-recursive, using an explicit stack of pointer that store the
next array partition to sort. To save time, this maximum amount
of space required to store an array of SIZE_MAX is allocated on the
stack. Assuming a 32-bit (64 bit) integer for size_t, this needs
only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes).
Pretty cheap, actually.
2. Chose the pivot element using a median-of-three decision tree.
This reduces the probability of selecting a bad pivot value and
eliminates certain extraneous comparisons.
3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
insertion sort to order the MAX_THRESH items within each partition.
This is a big win, since insertion sort is faster for small, mostly
sorted array segments.
4. The larger of the two sub-partitions is always pushed onto the
stack first, with the algorithm then concentrating on the
smaller partition. This *guarantees* no more than log (total_elems)
stack size is needed (actually O(1) in this case)! */
void _quicksort(void *const pbase, size_t total_elems, size_t size,
__compar_d_fn_t cmp, void *arg) {
char *base_ptr = (char *)pbase;
const size_t max_thresh = MAX_THRESH * size;
if (total_elems == 0)
/* Avoid lossage with unsigned arithmetic below. */
return;
if (total_elems > MAX_THRESH) {
char *lo = base_ptr;
char *hi = &lo[size * (total_elems - 1)];
stack_node stack[STACK_SIZE];
stack_node *top = stack;
PUSH(nullptr, nullptr);
while (STACK_NOT_EMPTY) {
char *left_ptr;
char *right_ptr;
/* Select median value from among LO, MID, and HI. Rearrange
LO and HI so the three values are sorted. This lowers the
probability of picking a pathological pivot value and
skips a comparison for both the LEFT_PTR and RIGHT_PTR in
the while loops. */
char *mid = lo + size * ((hi - lo) / size >> 1);
if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
SWAP(mid, lo, size);
if ((*cmp)((void *)hi, (void *)mid, arg) < 0)
SWAP(mid, hi, size);
else
goto jump_over;
if ((*cmp)((void *)mid, (void *)lo, arg) < 0)
SWAP(mid, lo, size);
jump_over:;
left_ptr = lo + size;
right_ptr = hi - size;
/* Here's the famous ``collapse the walls'' section of quicksort.
Gotta like those tight inner loops! They are the main reason
that this algorithm runs much faster than others. */
do {
while ((*cmp)((void *)left_ptr, (void *)mid, arg) < 0)
left_ptr += size;
while ((*cmp)((void *)mid, (void *)right_ptr, arg) < 0)
right_ptr -= size;
if (left_ptr < right_ptr) {
SWAP(left_ptr, right_ptr, size);
if (mid == left_ptr)
mid = right_ptr;
else if (mid == right_ptr)
mid = left_ptr;
left_ptr += size;
right_ptr -= size;
} else if (left_ptr == right_ptr) {
left_ptr += size;
right_ptr -= size;
break;
}
} while (left_ptr <= right_ptr);
/* Set up pointers for next iteration. First determine whether
left and right partitions are below the threshold size. If so,
ignore one or both. Otherwise, push the larger partition's
bounds on the stack and continue sorting the smaller one. */
if ((size_t)(right_ptr - lo) <= max_thresh) {
if ((size_t)(hi - left_ptr) <= max_thresh)
/* Ignore both small partitions. */
POP(lo, hi);
else
/* Ignore small left partition. */
lo = left_ptr;
} else if ((size_t)(hi - left_ptr) <= max_thresh)
/* Ignore small right partition. */
hi = right_ptr;
else if ((right_ptr - lo) > (hi - left_ptr)) {
/* Push larger left partition indices. */
PUSH(lo, right_ptr);
lo = left_ptr;
} else {
/* Push larger right partition indices. */
PUSH(left_ptr, hi);
hi = right_ptr;
}
}
}
/* Once the BASE_PTR array is partially sorted by quicksort the rest
is completely sorted using insertion sort, since this is efficient
for partitions below MAX_THRESH size. BASE_PTR points to the beginning
of the array to sort, and END_PTR points at the very last element in
the array (*not* one beyond it!). */
#define min(x, y) ((x) < (y) ? (x) : (y))
{
char *const end_ptr = &base_ptr[size * (total_elems - 1)];
char *tmp_ptr = base_ptr;
char *thresh = min(end_ptr, base_ptr + max_thresh);
char *run_ptr;
/* Find smallest element in first threshold and place it at the
array's beginning. This is the smallest array element,
and the operation speeds up insertion sort's inner loop. */
for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
if ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
tmp_ptr = run_ptr;
if (tmp_ptr != base_ptr)
SWAP(tmp_ptr, base_ptr, size);
/* Insertion sort, running from left-hand-side up to right-hand-side. */
run_ptr = base_ptr + size;
while ((run_ptr += size) <= end_ptr) {
tmp_ptr = run_ptr - size;
while ((*cmp)((void *)run_ptr, (void *)tmp_ptr, arg) < 0)
tmp_ptr -= size;
tmp_ptr += size;
if (tmp_ptr != run_ptr) {
char *trav;
trav = run_ptr + size;
while (--trav >= run_ptr) {
char c = *trav;
char *hi, *lo;
for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
*hi = *lo;
*hi = c;
}
}
}
}
}
} // namespace
extern "C" {
int memcmp(const void *lhs, const void *rhs, size_t count) {
@@ -60,16 +381,419 @@ int memcmp(const void *lhs, const void *rhs, size_t count) {
return 0;
}
void memset(void *dst, int C, size_t count) {
auto *dstc = reinterpret_cast<char *>(dst);
for (size_t I = 0; I < count; ++I)
dstc[I] = C;
}
/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf
int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
return impl::omp_vprintf(Format, Arguments, Size);
}
// -----------------------------------------------------------------------------
#ifndef ULONG_MAX
#define ULONG_MAX ((unsigned long)(~0L)) /* 0xFFFFFFFF */
#endif
#ifndef LONG_MAX
#define LONG_MAX ((long)(ULONG_MAX >> 1)) /* 0x7FFFFFFF */
#endif
#ifndef LONG_MIN
#define LONG_MIN ((long)(~LONG_MAX)) /* 0x80000000 */
#endif
#ifndef ISDIGIT
#define ISDIGIT(C) ((C) >= '0' && (C) <= '9')
#endif
#ifndef ISSPACE
#define ISSPACE(C)                                                             \
  ((C) == ' ' || (C) == '\t' || (C) == '\n' || (C) == '\v' || (C) == '\f' ||   \
   (C) == '\r')
#endif
#ifndef ISALPHA
#define ISALPHA(C) (((C) >= 'a' && (C) <= 'z') || ((C) >= 'A' && (C) <= 'Z'))
#endif
#ifndef ISUPPER
#define ISUPPER(C) ((C) >= 'A' && (C) <= 'Z')
#endif
long strtol(const char *nptr, char **endptr, int base) {
const char *s = nptr;
unsigned long acc;
int c;
unsigned long cutoff;
int neg = 0, any, cutlim;
/*
* Skip white space and pick up leading +/- sign if any.
* If base is 0, allow 0x for hex and 0 for octal, else
* assume decimal; if base is already 16, allow 0x.
*/
do {
c = *s++;
} while (ISSPACE(c));
if (c == '-') {
neg = 1;
c = *s++;
} else if (c == '+')
c = *s++;
if ((base == 0 || base == 16) && c == '0' && (*s == 'x' || *s == 'X')) {
c = s[1];
s += 2;
base = 16;
}
if (base == 0)
base = c == '0' ? 8 : 10;
/*
* Compute the cutoff value between legal numbers and illegal
* numbers. That is the largest legal value, divided by the
* base. An input number that is greater than this value, if
* followed by a legal input character, is too big. One that
* is equal to this value may be valid or not; the limit
* between valid and invalid numbers is then based on the last
* digit. For instance, if the range for longs is
* [-2147483648..2147483647] and the input base is 10,
* cutoff will be set to 214748364 and cutlim to either
* 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
* a value > 214748364, or equal but the next digit is > 7 (or 8),
* the number is too big, and we will return a range error.
*
* Set any if any `digits' consumed; make it negative to indicate
* overflow.
*/
cutoff = neg ? -(unsigned long)LONG_MIN : LONG_MAX;
cutlim = cutoff % (unsigned long)base;
cutoff /= (unsigned long)base;
for (acc = 0, any = 0;; c = *s++) {
if (ISDIGIT(c))
c -= '0';
else if (ISALPHA(c))
c -= ISUPPER(c) ? 'A' - 10 : 'a' - 10;
else
break;
if (c >= base)
break;
if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
any = -1;
else {
any = 1;
acc *= base;
acc += c;
}
}
if (any < 0) {
acc = neg ? LONG_MIN : LONG_MAX;
errno = ERANGE;
} else if (neg)
acc = -acc;
if (endptr != 0)
*endptr = const_cast<char *>(any ? s - 1 : nptr);
return (acc);
}
int strcmp(const char *lhs, const char *rhs) {
  while (*lhs != '\0' && *rhs != '\0') {
    if (*lhs != *rhs)
      break;
    ++lhs;
    ++rhs;
  }
  return (unsigned char)*lhs - (unsigned char)*rhs;
}
void *calloc(size_t num, size_t size) {
  size_t bytes = num * size;
  char *p = (char *)malloc(bytes);
  if (!p)
    return p;
  char *q = p;
  // Zero word-by-word while a full int fits, then finish byte-by-byte.
  while ((size_t)(q - p) + sizeof(int) <= bytes) {
    *(int *)q = 0;
    q += sizeof(int);
  }
  while ((size_t)(q - p) < bytes) {
    *q = 0;
    q++;
  }
  return p;
}
// PTX instruction; an AMDGCN build would need a target variant (e.g.
// __builtin_amdgcn_endpgm()).
void exit(int exit_code) { asm volatile("exit;"); }
size_t strlen(const char *str) {
  size_t r = 0;
  while (*str != '\0') {
    ++r;
    ++str;
  }
  return r;
}
char *strcpy(char *dest, const char *src) {
char *pd = dest;
const char *ps = src;
while (*ps != '\0')
*(pd++) = *(ps++);
*pd = '\0';
return dest;
}
int *__errno_location() { return &ErrNo; }
char *strcat(char *dest, const char *src) {
char *pd = dest;
const char *ps = src;
while (*pd != '\0')
++pd;
while (*ps != '\0')
*(pd++) = *(ps++);
*pd = '\0';
return dest;
}
void perror(const char *s) { printf("%s: errno %d\n", s, *__errno_location()); }
int strncmp(const char *lhs, const char *rhs, size_t count) {
  size_t c = 0;
  while (c < count && *lhs != '\0' && *rhs != '\0') {
    if (*lhs != *rhs)
      break;
    ++lhs;
    ++rhs;
    ++c;
  }
  if (c == count)
    return 0;
  return (unsigned char)*lhs - (unsigned char)*rhs;
}
char *strncpy(char *dest, const char *src, size_t count) {
  char *pd = dest;
  const char *ps = src;
  size_t c = 0;
  while (*ps != '\0' && c < count) {
    *(pd++) = *(ps++);
    ++c;
  }
  // Pad with NULs up to count, as the standard requires.
  while (c < count) {
    *(pd++) = '\0';
    ++c;
  }
  return dest;
}
char *strchr(const char *s, int c) {
do {
if (*s == c)
return const_cast<char *>(s);
} while (*s++);
return nullptr;
}
char *strtok(char *str, const char *delim) {
  static char *s = nullptr; // Shared parser state: not thread-safe on device.
char *tok;
if (str == nullptr) {
if (s == nullptr)
return nullptr;
} else
s = str;
  for (size_t i = 0; (*s != '\0'); s++) {
for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
;
if (delim[i] == '\0')
break;
}
if (*s == '\0')
return s = nullptr;
tok = s++;
  for (size_t i = 0; (*s != '\0'); s++) {
for (i = 0; (delim[i] != '\0') && (*s != delim[i]); i++)
;
if (delim[i] != '\0')
break;
}
if (*s != '\0') {
*s = '\0';
s++;
}
return tok;
}
void srand(unsigned seed) { RandomNext = seed; }
int rand() {
RandomNext = RandomNext * 1103515245 + 12345;
return static_cast<unsigned int>((RandomNext / 65536)) % 2147483647;
}
int abs(int n) { return n > 0 ? n : -n; }
void *memcpy(void *dest, const void *src, size_t count) {
__builtin_memcpy(dest, src, count);
return dest;
}
unsigned long strtoul(const char *str, char **str_end, int base) {
  // Note: only base-10 digits are handled; the base argument is ignored.
  unsigned long res = 0;
  while (*str != '\0') {
    if (*str == ' ') {
      ++str;
      continue;
    }
    if (*str >= '0' && *str <= '9') {
      res = res * 10 + *str - '0';
      ++str;
      continue;
    }
    break;
  }
  if (str_end)
    *str_end = const_cast<char *>(str);
  return res;
}
double atof(const char *s) { return strtod(s, nullptr); }
int atoi(const char *str) { return (int)strtol(str, nullptr, 10); }
double strtod(const char *str, char **ptr) {
  char *p = const_cast<char *>(str);
  double sign = 1.0;
  while (ISSPACE(*p))
    ++p;
  if (*p == '+' || *p == '-') {
    if (*p == '-')
      sign = -1.0;
    ++p;
  }
  /* INF or INFINITY. */
  if ((p[0] == 'i' || p[0] == 'I') && (p[1] == 'n' || p[1] == 'N') &&
      (p[2] == 'f' || p[2] == 'F')) {
    if ((p[3] == 'i' || p[3] == 'I') && (p[4] == 'n' || p[4] == 'N') &&
        (p[5] == 'i' || p[5] == 'I') && (p[6] == 't' || p[6] == 'T') &&
        (p[7] == 'y' || p[7] == 'Y'))
      p += 8;
    else
      p += 3;
    if (ptr)
      *ptr = p;
    return sign * __builtin_inf();
  }
  /* NAN or NAN(foo). */
  if ((p[0] == 'n' || p[0] == 'N') && (p[1] == 'a' || p[1] == 'A') &&
      (p[2] == 'n' || p[2] == 'N')) {
    p += 3;
    if (*p == '(') {
      ++p;
      while (*p != '\0' && *p != ')')
        ++p;
      if (*p == ')')
        ++p;
    }
    if (ptr)
      *ptr = p;
    return __builtin_nan("");
  }
  /* Digits, with 0 or 1 periods in it, then an optional exponent. */
  if (ISDIGIT(*p) || *p == '.') {
    double value = 0.0;
    while (ISDIGIT(*p)) {
      value = value * 10.0 + (*p - '0');
      ++p;
    }
    if (*p == '.') {
      double scale = 0.1;
      ++p;
      while (ISDIGIT(*p)) {
        value += scale * (*p - '0');
        scale *= 0.1;
        ++p;
      }
    }
    /* Exponent. */
    if (*p == 'e' || *p == 'E') {
      int i = 1;
      int esign = 1;
      if (p[i] == '+' || p[i] == '-') {
        if (p[i] == '-')
          esign = -1;
        ++i;
      }
      if (ISDIGIT(p[i])) {
        int exp10 = 0;
        while (ISDIGIT(p[i])) {
          exp10 = exp10 * 10 + (p[i] - '0');
          ++i;
        }
        while (exp10--)
          value = esign < 0 ? value / 10.0 : value * 10.0;
        p += i;
      }
    }
    if (ptr)
      *ptr = p;
    return sign * value;
  }
  /* Didn't find any digits. Doesn't look like a number. */
  if (ptr)
    *ptr = const_cast<char *>(str);
  return 0.0;
}
int tolower(int ch) {
if (ch >= 'A' && ch <= 'Z')
return ch + 32;
return ch;
}
char *strstr(const char *s1, const char *s2) {
  const size_t len = strlen(s2);
  while (*s1) {
    // strncmp stops at the terminator, so this never reads past s1's end.
    if (!strncmp(s1, s2, len))
      return const_cast<char *>(s1);
    ++s1;
  }
  return nullptr;
}
char *__xpg_basename(const char *path) {
  // Return the component after the last '/'; trailing-slash handling is
  // deliberately kept simple.
  const char *base = path;
  for (const char *p = path; *p != '\0'; ++p)
    if (*p == '/')
      base = p + 1;
  return const_cast<char *>(base);
}
const unsigned short **__ctype_b_loc(void) { return &BLocTablePTable; }
const int32_t **__ctype_tolower_loc(void) { return &ToLowerPTable; }
void qsort(void *b, size_t n, size_t s, __compar_fn_t cmp) {
return _quicksort(b, n, s, (__compar_d_fn_t)cmp, nullptr);
}
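// For illustration, a sketch of using qsort on the device (cmpInt and V are
// hypothetical):
//   int cmpInt(const void *A, const void *B) {
//     return *(const int *)A - *(const int *)B;
//   }
//   int V[4] = {3, 1, 4, 1};
//   qsort(V, 4, sizeof(int), cmpInt);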
int strcasecmp(const char *s1, const char *s2) {
const unsigned char *p1 = (const unsigned char *)s1;
const unsigned char *p2 = (const unsigned char *)s2;
int result;
if (p1 == p2)
return 0;
while ((result = tolower(*p1) - tolower(*p2++)) == 0)
if (*p1++ == '\0')
break;
return result;
}
}
#pragma omp end declare target

View File

@@ -279,9 +279,15 @@ uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
  // In generic mode the last warp is reserved for the main thread, so remove
  // it; if the last warp is only partially populated, reserve just that part.
if (IsSPMD)
return BlockSize;
uint32_t WarpSize = impl::getWarpSize();
bool IsFullLastWarp = !(BlockSize % WarpSize);
if (OMP_LIKELY(IsFullLastWarp))
return BlockSize - WarpSize;
return BlockSize - BlockSize % WarpSize;
}
uint32_t mapping::getMaxTeamThreads() {
return mapping::getMaxTeamThreads(mapping::isSPMDMode());
}
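For concreteness: with a 96-thread block and 64-lane warps the last warp is partial (96 % 64 == 32), so generic mode reserves only those 32 lanes and returns 64; with a 128-thread block the full last warp is reserved and the result is 128 - 64 = 64 as well.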

View File

@@ -0,0 +1,885 @@
#pragma omp begin declare target device_type(nohost)
#include "Utils.h"
#include "Synchronization.h"
#include "Interface.h"
#include "Memory.h"
#include "Debug.h"
#include "LibC.h"
#include "Mpi.h"
//using namespace ompx;
// Placement new, defined inline to avoid pulling in <new> on the device.
inline void *operator new(__SIZE_TYPE__ size, void *ptr) { return ptr; }
namespace mpiutils {
template <typename T>
struct LinkListNode {
template <typename R> friend struct LinkList;
public:
T *getPrev(){ return Prev; };
T *getNext(){ return Next; };
private:
T *Prev = nullptr;
T *Next = nullptr;
};
template <typename T>
struct LinkList {
public:
T *getHead(){return Head;};
T *getTail(){return Tail;};
void push(T *Node){ insertImpl(Tail, Node); }
T *remove(T *Node){ return removeImpl(Node); }
void lock(){ listlock.lock(); }
void unlock(){ listlock.unlock(); }
private:
T *Head = nullptr;
T *Tail = nullptr;
ompx::mutex::TicketLock listlock;
void insertImpl(T *Prev, T *Node) {
lock();
// Set current Node link
Node->Prev = Prev;
if (Node->Prev)
Node->Next = Node->Prev->Next;
else
Node->Next = Head;
// Set adjacent nodes links
if (Node->Prev)
Node->Prev->Next = Node;
if (Node->Next)
Node->Next->Prev = Node;
// Set head & tail
if (!Node->Prev)
Head = Node;
if (!Node->Next)
Tail = Node;
unlock();
}
T *removeImpl(T *Node) {
lock();
// Set adjacent nodes links
if (Node->Prev)
Node->Prev->Next = Node->Next;
if (Node->Next)
Node->Next->Prev = Node->Prev;
// Set Head and tail
if (!Node->Prev)
Head = Node->Next;
if (!Node->Next)
Tail = Node->Prev;
unlock();
// Set current nodes links
Node->Prev = nullptr;
Node->Next = nullptr;
return Node;
}
};
}
extern "C" {
const int MPI_MAX_BUF_SEND = 8 * 512 * 1024;
enum MPI_Request_type_e {
MPI_SEND,
MPI_RECV
};
struct MPI_Message_info_s {
int count; // nb. of items
MPI_Datatype datatype; // type of items
int rank; // sender or recv rank
int tag; // tag
struct MPI_Comm_s *comm; // communicator
};
struct MPI_Message_s : MPI_Message_info_s,
mpiutils::LinkListNode<struct MPI_Message_s> {
const void *send_data;
void *buf_data;
uint32_t status;
// 0 send done (may be waiting for recv)
// 1 recv done (done, sender can clean)
bool buffered;
};
struct MPI_Request_s : MPI_Message_info_s {
enum MPI_Request_type_e req_type;
MPI_Status mpi_status;
bool persistent;
bool enable;
// true: running (not finish)
// false: finish (send / recv done)
};
struct MPI_Send_Request_s : MPI_Request_s {
const void *send_data;
bool buffered;
struct MPI_Message_s *msg;
};
struct MPI_Recv_Request_s : MPI_Request_s {
void *recv_data;
};
struct MPI_Comm_s {
int id; // id = 0 -> MPI_COMM_WORLD (the only supported)
uint32_t size;
uint32_t barrier_counter;
uint32_t barrier_generation_counter;
int *ranks; // map teams to ranks
struct mpiutils::LinkList<struct MPI_Message_s> *messagebox;
};
}
namespace impl {
// Forward declaration for the variants below.
void yield(void);
// amdgcn
#pragma omp begin declare variant match(device = {arch(amdgcn)})
void yield(void){
// split kernel here (if it is ever implemented)
//__ompx_split();
__builtin_amdgcn_s_sleep(1);
}
#pragma omp end declare variant
// nvptx
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})
void yield(void){
// split kernel here
//__ompx_split();
}
#pragma omp end declare variant
void barrier(uint32_t *counter, uint32_t *gen_counter, uint32_t size){
  // Generation barrier: the last arrival bumps the generation counter, which
  // releases everyone spinning on it.
  uint32_t previous_gen = ompx::atomic::load(gen_counter, ompx::atomic::seq_cst);
  uint32_t current = ompx::atomic::inc(counter, size - 1,
      ompx::atomic::seq_cst, ompx::atomic::MemScopeTy::device);
if (current + 1 == size)
ompx::atomic::add(gen_counter, 1, ompx::atomic::seq_cst);
while(ompx::atomic::load(gen_counter, ompx::atomic::seq_cst) <= previous_gen){
yield();
}
}
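// For illustration (a sketch, assuming one rank per team): MPI_Barrier can
// sit directly on top of this generation barrier, e.g.
//   int MPI_Barrier(MPI_Comm comm) {
//     impl::barrier(&comm->barrier_counter,
//                   &comm->barrier_generation_counter, comm->size);
//     return MPI_SUCCESS;
//   }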
size_t mpi_type_size(MPI_Datatype datatype){
switch(datatype){
case MPI_CHAR : return sizeof(char) ;
case MPI_SHORT : return sizeof(signed short int) ;
case MPI_INT : return sizeof(signed int) ;
case MPI_LONG : return sizeof(signed long int) ;
case MPI_LONG_LONG_INT : return sizeof(signed long long int) ;
case MPI_LONG_LONG : return sizeof(signed long long int) ;
case MPI_SIGNED_CHAR : return sizeof(signed char) ;
case MPI_UNSIGNED_CHAR : return sizeof(unsigned char) ;
case MPI_UNSIGNED_SHORT : return sizeof(unsigned short int) ;
case MPI_UNSIGNED : return sizeof(unsigned int) ;
case MPI_UNSIGNED_LONG : return sizeof(unsigned long int) ;
case MPI_UNSIGNED_LONG_LONG : return sizeof(unsigned long long int) ;
case MPI_FLOAT : return sizeof(float) ;
case MPI_DOUBLE : return sizeof(double) ;
case MPI_LONG_DOUBLE : return sizeof(long double) ;
case MPI_WCHAR : return sizeof(wchar_t) ;
case MPI_C_BOOL : return sizeof(bool) ; // should be `_Bool` but it is not defined
case MPI_INT8_T : return sizeof(int8_t) ;
case MPI_INT16_T : return sizeof(int16_t) ;
case MPI_INT32_T : return sizeof(int32_t) ;
case MPI_INT64_T : return sizeof(int64_t) ;
case MPI_UINT8_T : return sizeof(uint8_t) ;
case MPI_UINT16_T : return sizeof(uint16_t) ;
case MPI_UINT32_T : return sizeof(uint32_t) ;
case MPI_UINT64_T : return sizeof(uint64_t) ;
case MPI_C_COMPLEX : return sizeof(float _Complex) ;
case MPI_C_FLOAT_COMPLEX : return sizeof(float _Complex) ;
case MPI_C_DOUBLE_COMPLEX : return sizeof(double _Complex) ;
case MPI_C_LONG_DOUBLE_COMPLEX : return sizeof(long double _Complex) ;
    case MPI_BYTE : return 1 ; // one byte per element
    //case MPI_PACKED : return // unsupported ;
//case MPI_AINT : return sizeof(MPI_Aint) ;
//case MPI_OFFSET : return sizeof(MPI_Offset) ;
//case MPI_COUNT : return sizeof(MPI_Count) ;
case MPI_CXX_BOOL : return sizeof(bool) ;
//case MPI_CXX_FLOAT_COMPLEX : return sizeof(std::complex<float>) ;
//case MPI_CXX_DOUBLE_COMPLEX : return sizeof(std::complex<double>) ;
//case MPI_CXX_LONG_DOUBLE_COMPLEX: return sizeof(std::complex<long double>);
default:
__builtin_unreachable();
}
__builtin_unreachable();
}
int mpi_rank(MPI_Comm comm) {
return comm->ranks[omp_get_team_num()];
}
void mpi_req_init(int count, MPI_Datatype datatype,
int rank, int tag, MPI_Comm comm,
enum MPI_Request_type_e req_type, struct MPI_Request_s *req)
{
req->req_type = req_type;
req->count = count;
req->datatype = datatype;
req->rank = rank;
req->tag = tag;
req->comm = comm;
req->enable = false;
}
struct MPI_Send_Request_s *mpi_send_init(
const void *buf, int count, MPI_Datatype datatype,
int recv_rank, int tag, MPI_Comm comm,
bool buffered, bool persistent)
{
struct MPI_Send_Request_s *req =
reinterpret_cast<struct MPI_Send_Request_s *>(
malloc(sizeof(struct MPI_Send_Request_s)));
req->send_data = buf;
req->buffered = buffered;
req->persistent = persistent;
mpi_req_init(count, datatype, recv_rank, tag, comm, MPI_SEND, req);
return req;
}
struct MPI_Recv_Request_s *mpi_recv_init(
void *buf, int count, MPI_Datatype datatype,
int send_rank, int tag, MPI_Comm comm)
{
struct MPI_Recv_Request_s *req =
reinterpret_cast<struct MPI_Recv_Request_s *>(
malloc(sizeof(struct MPI_Recv_Request_s)));
req->recv_data = buf;
req->mpi_status.MPI_SOURCE = MPI_ANY_SOURCE;
req->mpi_status.MPI_TAG = MPI_ANY_TAG;
req->mpi_status.MPI_ERROR = MPI_SUCCESS;
mpi_req_init(count, datatype, send_rank, tag, comm, MPI_RECV, req);
return req;
}
void mpi_req_free(struct MPI_Request_s **req){
free(*req);
*req = MPI_REQUEST_NULL;
}
bool mpi_msg_test(struct MPI_Message_s *msg)
{
return ompx::atomic::load(&msg->status, ompx::atomic::seq_cst) == 1;
}
void mpi_msg_wait(struct MPI_Message_s *msg)
{
while (! mpi_msg_test(msg))
yield();
}
void mpi_msg_free(struct MPI_Message_s *msg)
{
free(msg);
}
struct MPI_Message_s *mpi_send(
const void *buf, int count, MPI_Datatype datatype, int recv_rank,
int tag, MPI_Comm comm, bool buffered, bool blocking)
{
int data_size = mpi_type_size(datatype) * count;
int send_rank = mpi_rank(comm);
struct MPI_Message_s *msg = reinterpret_cast<struct MPI_Message_s *>(
malloc(sizeof(struct MPI_Message_s)));
msg->datatype = datatype;
msg->count = count;
msg->rank = send_rank;
msg->tag = tag;
msg->buffered = buffered;
ompx::atomic::store(&msg->status, 0, ompx::atomic::seq_cst);
if (buffered){
msg->buf_data = malloc(data_size);
memcpy(msg->buf_data, buf, data_size);
} else {
msg->send_data = buf;
}
comm->messagebox[recv_rank].push(msg);
if (blocking && !buffered) {// ssend
mpi_msg_wait(msg);
mpi_msg_free(msg);
}
return msg;
}
void mpi_send_start(struct MPI_Send_Request_s *req)
{
req->enable = true;
struct MPI_Message_s *msg = mpi_send(req->send_data, req->count, req->datatype,
req->rank, req->tag, req->comm, req->buffered, false);
req->msg = msg;
}
bool mpi_send_test(struct MPI_Send_Request_s *req)
{
if (req->buffered)
return true;
return mpi_msg_test(req->msg);
}
void mpi_send_wait(struct MPI_Send_Request_s *req)
{
if (req->buffered)
return;
mpi_msg_wait(req->msg);
}
struct MPI_Message_s *__mpi_recv_test(int count, MPI_Datatype datatype,
int source, int tag, MPI_Comm comm)
{
struct mpiutils::LinkList<struct MPI_Message_s> *messages =
&comm->messagebox[mpi_rank(comm)];
messages->lock();
for (struct MPI_Message_s *msg = messages->getHead();
msg != nullptr; msg = msg->getNext()){
if ((source == MPI_ANY_SOURCE || source == msg->rank)
&& (tag == MPI_ANY_TAG || tag == msg->tag)){
assert(count == msg->count && datatype == msg->datatype
&& "[MPI_recv]: count or datatype invalid");
messages->unlock();
// TODO: fix the race condition between unlock() and remove()
messages->remove(msg);
return msg;
}
}
messages->unlock();
return nullptr;
}
struct MPI_Message_s *__mpi_recv_wait(int count, MPI_Datatype datatype,
int source, int tag, MPI_Comm comm)
{
struct MPI_Message_s *msg = nullptr;
while ((msg = __mpi_recv_test(count, datatype, source, tag, comm)) == nullptr){
// we have not received any messages yet
yield();
}
return msg;
}
void __mpi_recv_do(struct MPI_Message_s *msg, void *buf, MPI_Status *status)
{
int data_size = mpi_type_size(msg->datatype) * msg->count;
if (msg->buffered) {
memcpy(buf, msg->buf_data, data_size);
free(msg->buf_data);
} else {
memcpy(buf, msg->send_data, data_size);
}
if (status != &MPI_STATUS_IGNORE && status != &MPI_STATUSES_IGNORE) {
status->MPI_SOURCE = msg->rank;
status->MPI_TAG = msg->tag;
}
if (msg->buffered)
mpi_msg_free(msg);
else
ompx::atomic::store(&msg->status, 1, ompx::atomic::seq_cst);
}
bool mpi_recv_test(void *buf, int count, MPI_Datatype datatype,
int source, int tag, MPI_Comm comm, MPI_Status *status)
{
struct MPI_Message_s *msg = __mpi_recv_test(count, datatype, source, tag, comm);
if (msg == nullptr)
return false;
__mpi_recv_do(msg, buf, status);
return true;
}
void mpi_recv_wait(void *buf, int count, MPI_Datatype datatype,
int source, int tag, MPI_Comm comm, MPI_Status *status)
{
struct MPI_Message_s *msg = __mpi_recv_wait(count, datatype, source, tag, comm);
__mpi_recv_do(msg, buf, status);
}
void mpi_recv(void *buf, int count, MPI_Datatype datatype,
int source, int tag, MPI_Comm comm, MPI_Status *status)
{
mpi_recv_wait(buf, count, datatype, source, tag, comm, status);
}
void mpi_recv_start(struct MPI_Recv_Request_s *req){
// attempt an early receive to reduce the deadlock window; completion is
// checked again through test/wait
(void)mpi_recv_test(req->recv_data, req->count, req->datatype,
req->rank, req->tag, req->comm, &req->mpi_status);
}
bool mpi_recv_test(struct MPI_Recv_Request_s *req){
bool res = mpi_recv_test(req->recv_data, req->count, req->datatype,
req->rank, req->tag, req->comm, &req->mpi_status);
return res;
}
void mpi_recv_wait(struct MPI_Recv_Request_s *req){
mpi_recv_wait(req->recv_data, req->count, req->datatype,
req->rank, req->tag, req->comm, &req->mpi_status);
}
void mpi_req_start(struct MPI_Request_s **reqp){
if (*reqp == MPI_REQUEST_NULL)
return;
struct MPI_Request_s *req = *reqp;
if (!req->enable)
return;
switch (req->req_type) {
case (MPI_SEND):
mpi_send_start(static_cast<struct MPI_Send_Request_s *>(req));
break;
case (MPI_RECV):
mpi_recv_start(static_cast<struct MPI_Recv_Request_s *>(req));
break;
default:
__builtin_unreachable();
}
}
bool mpi_req_test(struct MPI_Request_s **reqp){
if (*reqp == MPI_REQUEST_NULL)
return false;
struct MPI_Request_s *req = *reqp;
if (!req->enable)
return false;
bool res = false;
switch (req->req_type) {
case (MPI_SEND):
res = mpi_send_test(static_cast<struct MPI_Send_Request_s *>(req));
break;
case (MPI_RECV):
res = mpi_recv_test(static_cast<struct MPI_Recv_Request_s *>(req));
break;
default:
__builtin_unreachable();
}
return res;
}
void mpi_req_wait(struct MPI_Request_s **reqp){
if (*reqp == MPI_REQUEST_NULL)
return;
struct MPI_Request_s *req = *reqp;
if (!req->enable)
return;
switch (req->req_type) {
case (MPI_SEND):
mpi_send_wait(static_cast<struct MPI_Send_Request_s *>(req));
break;
case (MPI_RECV):
mpi_recv_wait(static_cast<struct MPI_Recv_Request_s *>(req));
break;
default:
__builtin_unreachable();
}
}
void mpi_req_deactivte(struct MPI_Request_s **reqp, MPI_Status *status) {
struct MPI_Request_s *req = *reqp;
if (status != &MPI_STATUS_IGNORE) {
status->MPI_SOURCE = req->mpi_status.MPI_SOURCE;
status->MPI_TAG = req->mpi_status.MPI_TAG;
status->MPI_ERROR = req->mpi_status.MPI_ERROR;
}
if (req->persistent) {
req->enable = false;
} else {
mpi_req_free(reqp);
}
}
} // namespace impl
extern "C" {
// global used for atomics
uint32_t global_counter = 0;
uint32_t global_generation_counter = 0;
int MPI_Init(int *argc, char **argv){
(void) argc;
(void) argv;
int size = omp_get_num_teams();
int rank = omp_get_team_num();
if (omp_get_team_num() == 0){
MPI_COMM_WORLD = reinterpret_cast<MPI_Comm>(
malloc(sizeof(struct MPI_Comm_s)));
MPI_COMM_WORLD->id = 0;
MPI_COMM_WORLD->size = size;
MPI_COMM_WORLD->barrier_counter = 0;
MPI_COMM_WORLD->barrier_generation_counter = 0;
MPI_COMM_WORLD->ranks = reinterpret_cast<int *>(
malloc(MPI_COMM_WORLD->size * sizeof(int)));
MPI_COMM_WORLD->messagebox =
reinterpret_cast<struct mpiutils::LinkList<struct MPI_Message_s> *>(
malloc(MPI_COMM_WORLD->size
* sizeof(struct mpiutils::LinkList<struct MPI_Message_s>)));
}
impl::barrier(&global_counter, &global_generation_counter,
omp_get_num_teams());
MPI_COMM_WORLD->ranks[omp_get_team_num()] = rank;
new (&MPI_COMM_WORLD->messagebox[rank])
mpiutils::LinkList<struct MPI_Message_s>();
impl::barrier(&global_counter, &global_generation_counter,
omp_get_num_teams());
return 0;
}
int MPI_Init_thread(int *argc, char ***argv, int required, int *provided)
{
if (required == MPI_THREAD_MULTIPLE)
return 1; // TODO: return a proper MPI error code
*provided = MPI_THREAD_FUNNELED;
return MPI_Init(argc, *argv);
}
int MPI_Finalize(void){
impl::barrier(&global_counter, &global_generation_counter,
omp_get_num_teams());
if(omp_get_team_num() == 0){
free(MPI_COMM_WORLD->ranks);
free(MPI_COMM_WORLD->messagebox);
free(MPI_COMM_WORLD);
}
return 0;
}
int MPI_Abort(MPI_Comm comm, int errorcode)
{
printf("MPI_Abort(%d)\n", errorcode);
// TODO: abort the GPU kernel here
return 0;
}
int MPI_Barrier(MPI_Comm comm) {
impl::barrier(&comm->barrier_counter, &comm->barrier_generation_counter,
comm->size);
return 0;
}
int MPI_Comm_rank(MPI_Comm comm, int *rank){
*rank = impl::mpi_rank(comm);
return 0;
}
int MPI_Comm_size(MPI_Comm comm, int *size){
*size = comm->size;
return 0;
}
int MPI_Type_size(MPI_Datatype datatype, int *size){
*size = impl::mpi_type_size(datatype);
return 0;
}
// Blocking Communications
int MPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)
{
impl::mpi_send(buf, count, datatype, dest, tag, comm,
(count * impl::mpi_type_size(datatype)) < MPI_MAX_BUF_SEND, true);
return 0;
}
int MPI_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)
{
impl::mpi_send(buf, count, datatype, dest, tag, comm, false, true);
return 0;
}
int MPI_Bsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)
{
impl::mpi_send(buf, count, datatype, dest, tag, comm, true, true);
return 0;
}
int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status)
{
impl::mpi_recv(buf, count, datatype, source, tag, comm, status);
return 0;
}
// Non-Blocking Communications
int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Send_Request_s *req =
impl::mpi_send_init(buf, count, datatype, dest, tag, comm,
(count * impl::mpi_type_size(datatype)) < MPI_MAX_BUF_SEND, false);
impl::mpi_send_start(req);
*request = req;
return 0;
}
int MPI_Issend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Send_Request_s *req =
impl::mpi_send_init(buf, count, datatype, dest, tag, comm, false, false);
impl::mpi_send_start(req);
*request = req;
return 0;
}
int MPI_Ibsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Send_Request_s *req =
impl::mpi_send_init(buf, count, datatype, dest, tag, comm, true, false);
impl::mpi_send_start(req);
*request = req;
return 0;
}
int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Recv_Request_s *req =
impl::mpi_recv_init(buf, count, datatype, source, tag, comm);
impl::mpi_recv_start(req); // try to receive early to reduce deadlock probability
*request = req;
return 0;
}
// Test & Wait
int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status)
{
*flag = impl::mpi_req_test(request);
if (*flag)
impl::mpi_req_deactivte(request, status);
return 0;
}
int MPI_Wait(MPI_Request *request, MPI_Status *status)
{
impl::mpi_req_wait(request);
impl::mpi_req_deactivte(request, status);
return 0;
}
int MPI_Testall(int count, MPI_Request array_of_requests[], int *flag, MPI_Status array_of_statuses[])
{
int finished = 0;
for (int i = 0; i < count; ++i)
finished += impl::mpi_req_test(&array_of_requests[i]);
*flag = (finished == count);
if (*flag)
for (int i = 0; i < count; ++i)
impl::mpi_req_deactivte(&array_of_requests[i], &array_of_statuses[i]);
return 0;
}
int MPI_Waitall(int count, MPI_Request array_of_requests[], MPI_Status array_of_statuses[])
{
// we don't want to wait for the requests one after the other
while (true) {
int finished = 0;
for (int i = 0; i < count; ++i)
finished += impl::mpi_req_test(&array_of_requests[i]);
if (finished >= count)
break;
impl::yield();
}
for (int i = 0; i < count; ++i)
impl::mpi_req_deactivte(&array_of_requests[i], &array_of_statuses[i]);
return 0;
}
int MPI_Testany(int count, MPI_Request array_of_requests[], int *index, int *flag, MPI_Status *status)
{
for (int i = 0; i < count; ++i) {
if (impl::mpi_req_test(&array_of_requests[i])) {
*flag = true;
*index = i;
impl::mpi_req_deactivte(&array_of_requests[i], status);
return 0;
}
}
*flag = false;
return 0;
}
int MPI_Waitany(int count, MPI_Request array_of_requests[], int *index, MPI_Status *status)
{
while (true) {
for (int i = 0; i < count; ++i) {
if (impl::mpi_req_test(&array_of_requests[i])) {
*index = i;
impl::mpi_req_deactivte(&array_of_requests[i], status);
return 0;
}
}
impl::yield();
}
return 0;
}
int MPI_Testsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[])
{
*outcount = 0;
for (int i = 0; i < incount; ++i) {
if(impl::mpi_req_test(&array_of_requests[i])) {
array_of_indices[*outcount] = i;
impl::mpi_req_deactivte(&array_of_requests[i],
&array_of_statuses[*outcount]);
(*outcount)++;
}
}
return 0;
}
int MPI_Waitsome(int incount, MPI_Request array_of_requests[], int *outcount, int array_of_indices[], MPI_Status array_of_statuses[])
{
while (true) {
for (int i = 0; i < incount; ++i)
if (impl::mpi_req_test(&array_of_requests[i]))
return MPI_Testsome(incount, array_of_requests,
outcount, array_of_indices, array_of_statuses);
impl::yield();
}
return 0;
}
// Persistent Communications setup
int MPI_Send_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Send_Request_s *req =
impl::mpi_send_init(buf, count, datatype, dest, tag, comm,
(count * impl::mpi_type_size(datatype)) < MPI_MAX_BUF_SEND, true);
*request = req;
return 0;
}
int MPI_Ssend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Send_Request_s *req =
impl::mpi_send_init(buf, count, datatype, dest, tag, comm, false, true);
*request = req;
return 0;
}
int MPI_Bsend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Send_Request_s *req =
impl::mpi_send_init(buf, count, datatype, dest, tag, comm, true, true);
*request = req;
return 0;
}
int MPI_Recv_init(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request)
{
struct MPI_Recv_Request_s *req =
impl::mpi_recv_init(buf, count, datatype, source, tag, comm);
*request = req;
return 0;
}
// Persistent Communications start and end
int MPI_Start(MPI_Request *request)
{
impl::mpi_req_start(request);
return 0;
}
int MPI_Startall(int count, MPI_Request array_of_requests[])
{
for (int i = 0; i < count; ++i)
MPI_Start(&array_of_requests[i]);
return 0;
}
int MPI_Request_free(MPI_Request *request)
{
impl::mpi_req_free(request);
return 0;
}
// Collective Operations
int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm)
{
// TODO: implement MPI_Reduce
return 0;
}
int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
// TODO: implement MPI_Allreduce
return 0;
}
// Timing
double MPI_Wtime(void)
{
return omp_get_wtime(); // delegate to the OpenMP runtime clock
}
} // extern "C"
#pragma omp end declare target
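Taken together, user code against this implementation reads like ordinary MPI. Below is a minimal sketch, assuming the repository's mpi.h declares the entry points above along with the usual basic datatypes such as MPI_INT, and that the program is built with the clang-mpi-gpu wrapper so that each OpenMP team becomes one rank; note that this MPI_Init takes char **, not the standard char ***.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[]) {
  MPI_Init(&argc, argv); // this implementation takes char **, not char ***
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  if (size < 2) { // the exchange below needs at least two ranks (teams)
    MPI_Finalize();
    return 0;
  }
  int token = -1;
  MPI_Status st;
  if (rank == 0) {
    token = 42;
    MPI_Send(&token, 1, MPI_INT, /*dest=*/1, /*tag=*/7, MPI_COMM_WORLD);
  } else if (rank == 1) {
    MPI_Recv(&token, 1, MPI_INT, /*source=*/0, /*tag=*/7, MPI_COMM_WORLD, &st);
    printf("rank 1 received %d from rank %d\n", token, st.MPI_SOURCE);
  }
  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();
  return 0;
}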

View File

@@ -47,6 +47,9 @@ using namespace ompx;
namespace {
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
if (NumThreadsClause != -1)
return NumThreadsClause;
uint32_t NThreadsICV =
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
uint32_t NumThreads = mapping::getMaxTeamThreads();

View File

@@ -17,6 +17,7 @@
#include "LibC.h"
#include "Mapping.h"
#include "State.h"
#include "Memory.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
@@ -83,6 +84,7 @@ struct SharedMemorySmartStackTy {
void *push(uint64_t Bytes);
/// Deallocate the last allocation made by the encountering thread and pointed
/// to by \p Ptr from the stack. Each thread can call this function.
void pop(void *Ptr, uint32_t Bytes);
@@ -274,7 +276,8 @@ void state::enterDataEnvironment(IdentTy *Ident) {
uintptr_t *ThreadStatesBitsPtr = reinterpret_cast<uintptr_t *>(&ThreadStates);
if (!atomic::load(ThreadStatesBitsPtr, atomic::seq_cst)) {
uint32_t Bytes =
sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
sizeof(ThreadStates[0]) *
(mapping::getNumberOfThreadsInBlock() + (mapping::isSPMDMode() ? 0 : 1));
void *ThreadStatesPtr =
memory::allocGlobal(Bytes, "Thread state array allocation");
memset(ThreadStatesPtr, 0, Bytes);
@@ -371,6 +374,15 @@ int omp_get_thread_num(void) {
return omp_get_ancestor_thread_num(omp_get_level());
}
int omp_get_bulk_thread_num() {
ASSERT(mapping::isSPMDMode(), "Not SPMD");
int BId = mapping::getBlockIdInKernel();
int BSize = mapping::getMaxTeamThreads();
int TId = mapping::getThreadIdInBlock();
int Id = BId * BSize + TId;
return returnValIfLevelIsActive(omp_get_level(), Id, 0);
}
int omp_get_team_size(int Level) {
return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1);
}
@@ -379,6 +391,11 @@ int omp_get_num_threads(void) {
return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize();
}
int omp_get_bulk_num_threads(void) {
ASSERT(mapping::isSPMDMode(), "Not SPMD");
return mapping::getNumberOfThreadsInKernel();
}
int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); }
int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

View File

@@ -333,7 +333,14 @@ void setCriticalLock(omp_lock_t *Lock) {
uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
atomic::MemScopeTy MemScope) {
return __nvvm_atom_inc_gen_ui(Address, Val);
switch(MemScope) {
case atomic::MemScopeTy::all:
return __nvvm_atom_sys_inc_gen_ui(Address, Val);
case atomic::MemScopeTy::device:
return __nvvm_atom_inc_gen_ui(Address, Val);
case atomic::MemScopeTy::cgroup:
return __nvvm_atom_cta_inc_gen_ui(Address, Val);
}
}
void namedBarrierInit() {}
@@ -509,6 +516,19 @@ void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); }
void mutex::TicketLock::lock() {
uint64_t MyTicket = atomic::add(&NextTicket, 1, atomic::seq_cst);
while (atomic::load(&NowServing, atomic::aquire) != MyTicket)
;
fence::kernel(atomic::aquire);
}
void mutex::TicketLock::unlock() {
fence::kernel(atomic::release);
atomic::add(&NowServing, 1, atomic::seq_cst);
}
extern "C" {
void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}

View File

@@ -0,0 +1,308 @@
//===------- WarpAllocator.cpp - Warp memory allocator ------- C++ -*-========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
using namespace ompx;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
__omp_rtl_device_memory_pool_tracker;
// TODO: implement Device Debug Allocation Tracker
namespace {
constexpr const size_t Alignment = 16;
constexpr const size_t FirstThreadRatio = 40;
constexpr const size_t SplitThreshold = Alignment * 4;
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator;
class WarpAllocatorEntry {
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> friend struct WarpAllocator;
/// If Size is less than 0, the entry is allocated (in use).
int64_t Size = 0;
/// PrevSize is also supposed to be greater than or equal to 0. When it is 0,
/// it is the first entry of the buffer.
int64_t PrevSize = 0;
public:
bool isFirst() const { return !PrevSize; }
size_t getSize() const { return abs(Size); }
void setSize(size_t V) { Size = V; }
void setPrevSize(WarpAllocatorEntry *Prev) {
PrevSize = Prev ? Prev->getSize() : 0;
}
size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }
// Note: isUsed cannot be defined as !isUnused (or vice versa) because when
// Size is 0 the entry is uninitialized.
bool isUsed() const { return Size < 0; }
bool isUnused() const { return Size > 0; }
void setUsed() {
assert(isUnused() && "the entry is in use");
Size *= -1;
}
void setUnused() {
assert(isUsed() && "the entry is not in use");
Size *= -1;
}
char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }
WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }
static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }
static WarpAllocatorEntry *fromPtr(void *Ptr) {
return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
}
};
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
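// Layout sketch: each block is an implicit boundary-tag list of 16-byte
// headers, each followed by its user data:
//
//   [E0|user...][E1|user...][E2|user...] ... watermark ... Limit
//
// Size < 0 marks an entry in use; |Size| is the distance to the next header
// and PrevSize the distance back to the previous one, so the list can be
// walked in both directions. Freeing the most recent entry rolls the
// watermark back over any trailing unused entries (see deallocate below).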
template <uint32_t WARP_SIZE, uint32_t TEAM_SIZE> struct WarpAllocator {
void init() {
if (mapping::isSPMDMode() &&
(mapping::getThreadIdInBlock() || mapping::getBlockIdInKernel()))
return;
size_t HeapSize = __omp_rtl_device_memory_pool.Size;
FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
size_t OtherThreadHeapSize =
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
size_t TeamHeapSize = FirstThreadHeapSize / TEAM_SIZE;
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
FirstTeamSize = TeamHeapSize;
char *LastLimit = reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr);
for (int I = 0; I < WARP_SIZE; ++I) {
for (int J = 0; J < TEAM_SIZE; ++J) {
Entries[I][J] = nullptr;
Limits[I][J] = LastLimit + TeamHeapSize * (J + 1);
}
LastLimit += I ? OtherThreadHeapSize : FirstThreadHeapSize;
Limits[I][TEAM_SIZE - 1] = LastLimit;
TeamHeapSize = OtherThreadHeapSize / TEAM_SIZE;
TeamHeapSize = utils::align_down(TeamHeapSize, Alignment);
}
}
void *allocate(size_t Size) {
int32_t TeamSlot = getTeamSlot();
int32_t TIdInWarp = mapping::getThreadIdInWarp();
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
// Error out early if the requested size is larger than the entire block.
if (Size > getBlockSize(TIdInWarp, TeamSlot))
return nullptr;
WarpAllocatorEntry *E = nullptr;
{
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
auto *LastEntry = Entries[TIdInWarp][TeamSlot];
auto *NewWatermark = (LastEntry ? LastEntry->getEndPtr()
: getBlockBegin(TIdInWarp, TeamSlot)) +
Size;
if (NewWatermark >= Limits[TIdInWarp][TeamSlot]) {
E = findMemorySlow(Size, TIdInWarp, TeamSlot);
} else {
E = LastEntry ? LastEntry->getNext()
: WarpAllocatorEntry::fromPtr(
getBlockBegin(TIdInWarp, TeamSlot));
E->setSize(Size);
E->setPrevSize(LastEntry);
Entries[TIdInWarp][TeamSlot] = E;
}
if (!E)
return nullptr;
E->setUsed();
}
return E->getUserPtr();
}
void deallocate(void *Ptr) {
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
auto TeamSlot = getTeamSlot();
auto TIdInWarp = mapping::getThreadIdInWarp();
mutex::LockGuard LG(Locks[TIdInWarp][TeamSlot]);
E->setUnused();
// Is last entry?
if (E == Entries[TIdInWarp][TeamSlot]) {
do {
E = E->getPrev();
} while (!E->isFirst() && !E->isUsed());
Entries[TIdInWarp][TeamSlot] = E;
}
}
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
if (!utils::isInRange(P, reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr),
__omp_rtl_device_memory_pool.Size))
return {};
auto TeamSlot = getTeamSlot();
auto TIdInWarp = mapping::getThreadIdInWarp();
for (int I = TIdInWarp; I < TIdInWarp + WARP_SIZE; ++I) {
int TId = I % WARP_SIZE;
for (int J = TeamSlot; J < TeamSlot + TEAM_SIZE; ++J) {
int SId = J % TEAM_SIZE;
if (P < getBlockBegin(TId, SId) || P >= getBlockEnd(TId, SId))
continue;
mutex::LockGuard LG(Locks[I][SId]);
WarpAllocatorEntry *E = Entries[I][SId];
if (!E)
return {};
if (E->getEndPtr() <= P)
return {};
bool isFirst = false;
while (!isFirst) {
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
if (!E->isUsed())
return {};
return {E->getUserPtr(), E->getUserSize()};
}
isFirst = E->isFirst();
E = E->getPrev();
}
}
}
return {};
}
private:
char *getBlockBegin(int32_t TIdInWarp, int32_t TeamSlot) const {
if (TeamSlot)
return Limits[TIdInWarp][TeamSlot - 1];
if (TIdInWarp)
return Limits[TIdInWarp - 1][TEAM_SIZE - 1];
return reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr);
}
char *getBlockEnd(int32_t TIdInWarp, int32_t TeamSlot) const {
return Limits[TIdInWarp][TeamSlot];
}
size_t getBlockSize(int32_t TIdInWarp, int32_t TeamSlot) const {
return getBlockEnd(TIdInWarp, TeamSlot) -
getBlockBegin(TIdInWarp, TeamSlot);
}
static int32_t getTeamSlot() { return mapping::getBlockIdInKernel() % TEAM_SIZE; }
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp,
int32_t TeamSlot) {
char *Ptr = getBlockBegin(TIdInWarp, TeamSlot);
char *Limit = getBlockEnd(TIdInWarp, TeamSlot);
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
do {
if (!E->isUsed() && E->getSize() >= Size)
break;
E = E->getNext();
if (reinterpret_cast<char *>(E) + Size > Limit)
return nullptr;
} while (1);
size_t OldSize = E->getSize();
if (OldSize - Size >= SplitThreshold) {
auto *OldNext = E->getNext();
E->setSize(Size);
auto *LeftOverE = E->getNext();
LeftOverE->setPrevSize(E);
LeftOverE->setSize(OldSize - Size);
OldNext->setPrevSize(LeftOverE);
}
return E;
}
WarpAllocatorEntry *Entries[WARP_SIZE][TEAM_SIZE];
char *Limits[WARP_SIZE][TEAM_SIZE];
mutex::TicketLock Locks[WARP_SIZE][TEAM_SIZE];
size_t FirstThreadHeapSize;
size_t FirstTeamSize;
};
WarpAllocator<32, 16> Allocator;
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
return Allocator.getMemoryAllocationInfo(P);
}
} // namespace memory
} // namespace ompx
extern "C" {
void *malloc(size_t Size) {
if (!Size)
return nullptr;
void *P = Allocator.allocate(Size);
assert(P && "allocator out of memory");
assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
"misaligned address");
return P;
}
void free(void *P) {
if (!P)
return;
Allocator.deallocate(P);
}
void *realloc(void *ptr, size_t new_size) {
void *NewPtr = malloc(new_size);
// realloc(nullptr, n) must behave like malloc(n)
if (!NewPtr || !ptr)
return NewPtr;
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
__builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
free(ptr);
return NewPtr;
}
void __kmpc_target_init_allocator() { Allocator.init(); }
}
#pragma omp end declare target

View File

@@ -0,0 +1,277 @@
//===------- WarpAllocator.cpp - Warp memory allocator ------- C++ -*-========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#pragma omp begin declare target device_type(nohost)
#include "Debug.h"
#include "Mapping.h"
#include "Memory.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"
using namespace ompx;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
__omp_rtl_device_memory_pool_tracker;
// TODO: implement Device Debug Allocation Tracker
namespace {
constexpr const size_t Alignment = 16;
constexpr const size_t FirstThreadRatio = 40;
constexpr const size_t SplitThreshold = Alignment * 16;
template <typename T> T abs(T V) { return V > 0 ? V : -V; }
template <int32_t WARP_SIZE> struct WarpAllocator;
class WarpAllocatorEntry {
template <int32_t WARP_SIZE> friend struct WarpAllocator;
/// If Size is less than 0, the entry is allocated (in use).
int64_t Size = 0;
/// PrevSize is also supposed to be greater than or equal to 0. When it is 0,
/// it is the first entry of the buffer.
int64_t PrevSize = 0;
public:
bool isFirst() const { return !PrevSize; }
size_t getSize() const { return abs(Size); }
void setSize(size_t V) { Size = V; }
void setPrevSize(WarpAllocatorEntry *Prev) {
PrevSize = Prev ? Prev->getSize() : 0;
}
size_t getUserSize() const { return getSize() - sizeof(WarpAllocatorEntry); }
// Note: isUsed cannot be defined as !isUnused (or vice versa) because when
// Size is 0 the entry is uninitialized.
bool isUsed() const { return Size < 0; }
bool isUnused() const { return Size > 0; }
void setUsed() {
assert(isUnused() && "the entry is in use");
Size *= -1;
}
void setUnused() {
assert(isUsed() && "the entry is not in use");
Size *= -1;
}
char *getUserPtr() { return reinterpret_cast<char *>(this + 1); }
char *getEndPtr() { return reinterpret_cast<char *>(getNext()); }
WarpAllocatorEntry *getPrev() { return utils::advance(this, -PrevSize); }
WarpAllocatorEntry *getNext() { return utils::advance(this, getSize()); }
static WarpAllocatorEntry *fromUserPtr(void *Ptr) { return fromPtr(Ptr) - 1; }
static WarpAllocatorEntry *fromPtr(void *Ptr) {
return reinterpret_cast<WarpAllocatorEntry *>(Ptr);
}
};
static_assert(sizeof(WarpAllocatorEntry) == 16, "entry size mismatch");
template <int32_t WARP_SIZE> struct WarpAllocator {
void init() {
if (mapping::isSPMDMode() &&
(mapping::getThreadIdInBlock() || mapping::getBlockId()))
return;
size_t HeapSize = __omp_rtl_device_memory_pool.Size;
size_t FirstThreadHeapSize = HeapSize * FirstThreadRatio / 100;
FirstThreadHeapSize = utils::align_down(FirstThreadHeapSize, Alignment);
size_t OtherThreadHeapSize =
(HeapSize - FirstThreadHeapSize) / (WARP_SIZE - 1);
OtherThreadHeapSize = utils::align_down(OtherThreadHeapSize, Alignment);
for (int I = 0; I < WARP_SIZE; ++I) {
Entries[I] = nullptr;
size_t PrivateOffset = OtherThreadHeapSize * I + FirstThreadHeapSize;
Limits[I] = reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr) + PrivateOffset;
}
}
void *allocate(size_t Size) {
int32_t TIdInWarp = mapping::getThreadIdInWarp();
Size = utils::align_up(Size + sizeof(WarpAllocatorEntry), Alignment);
// Error out early if the requested size is larger than the entire block.
if (Size > getBlockSize(TIdInWarp))
return nullptr;
WarpAllocatorEntry *E = nullptr;
{
mutex::LockGuard LG(Locks[TIdInWarp]);
auto *LastEntry = Entries[TIdInWarp];
auto *NewWatermark =
(LastEntry ? LastEntry->getEndPtr() : getBlockBegin(TIdInWarp)) +
Size;
if (NewWatermark >= Limits[TIdInWarp]) {
E = findMemorySlow(Size, TIdInWarp);
} else {
E = LastEntry ? LastEntry->getNext()
: WarpAllocatorEntry::fromPtr(getBlockBegin(TIdInWarp));
E->setSize(Size);
E->setPrevSize(LastEntry);
Entries[TIdInWarp] = E;
}
}
if (!E)
return nullptr;
assert(E->isUnused() && "entry is not set to use properly");
E->setUsed();
return E->getUserPtr();
}
void deallocate(void *Ptr) {
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(Ptr);
auto TIdInWarp = mapping::getThreadIdInWarp();
mutex::LockGuard LG(Locks[TIdInWarp]);
E->setUnused();
// Is last entry?
if (E == Entries[TIdInWarp]) {
do {
E = E->getPrev();
} while (!E->isFirst() && !E->isUsed());
Entries[TIdInWarp] = E;
}
}
memory::MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
if (!utils::isInRange(P, reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr),
__omp_rtl_device_memory_pool.Size))
return {};
auto TIdInWarp = mapping::getThreadIdInWarp();
for (int I = TIdInWarp; I < TIdInWarp + mapping::getWarpSize(); ++I) {
int TId = I % mapping::getWarpSize();
if (P < getBlockBegin(TId) || P >= getBlockEnd(TId))
continue;
mutex::LockGuard LG(Locks[I]);
WarpAllocatorEntry *E = Entries[I];
if (!E)
return {};
if (E->getEndPtr() <= P)
return {};
do {
if (E->getUserPtr() <= P && P < E->getEndPtr()) {
if (!E->isUsed())
return {};
return {E->getUserPtr(), E->getUserSize()};
}
E = E->getPrev();
} while (!E->isFirst());
}
return {};
}
private:
char *getBlockBegin(int32_t TIdInWarp) const {
return TIdInWarp ? Limits[TIdInWarp - 1] : reinterpret_cast<char *>(__omp_rtl_device_memory_pool.Ptr);
}
char *getBlockEnd(int32_t TIdInWarp) const { return Limits[TIdInWarp]; }
size_t getBlockSize(int32_t TIdInWarp) const {
return getBlockEnd(TIdInWarp) - getBlockBegin(TIdInWarp);
}
WarpAllocatorEntry *findMemorySlow(size_t Size, int32_t TIdInWarp) {
char *Ptr = getBlockBegin(TIdInWarp);
char *Limit = getBlockEnd(TIdInWarp);
WarpAllocatorEntry *E = WarpAllocatorEntry::fromPtr(Ptr);
do {
if (!E->isUsed() && E->getSize() >= Size)
break;
E = E->getNext();
if (reinterpret_cast<char *>(E) + Size > Limit)
return nullptr;
} while (1);
size_t OldSize = E->getSize();
if (OldSize - Size >= SplitThreshold) {
auto *OldNext = E->getNext();
E->setSize(Size);
auto *LeftOverE = E->getNext();
LeftOverE->setPrevSize(E);
LeftOverE->setSize(OldSize - Size);
OldNext->setPrevSize(LeftOverE);
}
return E;
}
WarpAllocatorEntry *Entries[WARP_SIZE];
char *Limits[WARP_SIZE];
mutex::TicketLock Locks[WARP_SIZE];
};
WarpAllocator<32> Allocator;
} // namespace
namespace ompx {
namespace memory {
MemoryAllocationInfo getMemoryAllocationInfo(void *P) {
return Allocator.getMemoryAllocationInfo(P);
}
} // namespace memory
} // namespace ompx
extern "C" {
void *malloc(size_t Size) {
if (!Size)
return nullptr;
void *P = Allocator.allocate(Size);
assert(P && "allocator out of memory");
assert(reinterpret_cast<intptr_t>(P) % Alignment == 0 &&
"misaligned address");
return P;
}
void free(void *P) {
if (!P)
return;
Allocator.deallocate(P);
}
void *realloc(void *ptr, size_t new_size) {
void *NewPtr = malloc(new_size);
// realloc(nullptr, n) must behave like malloc(n)
if (!NewPtr || !ptr)
return NewPtr;
WarpAllocatorEntry *E = WarpAllocatorEntry::fromUserPtr(ptr);
__builtin_memcpy(NewPtr, ptr, utils::min(E->getUserSize(), new_size));
free(ptr);
return NewPtr;
}
void __kmpc_target_init_allocator() { Allocator.init(); }
}
#pragma omp end declare target

View File

@@ -111,6 +111,38 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
stride = loopSize; // make sure we only do 1 chunk per warp
}
static void ForStaticNoLoopNoTrunk(int &last, T &lb, T &ub, ST &stride,
ST &chunk, T entityId,
T numberOfEntities) {
T loopSize = ub - lb + 1;
chunk = loopSize / numberOfEntities;
// We have more entities than iterations.
if (chunk == 0) {
chunk = 1;
lb = lb + entityId * chunk;
T inputUb = ub;
ub = lb + chunk - 1;
last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
return;
}
T leftOver = loopSize - chunk * numberOfEntities;
if (entityId < leftOver) {
chunk++;
lb = lb + entityId * chunk;
} else {
lb = lb + entityId * chunk + leftOver;
}
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
}
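// Worked example: lb = 0, ub = 9, numberOfEntities = 4 gives loopSize = 10,
// chunk = 2 and leftOver = 2, so entities 0 and 1 take 3 iterations each
// ([0,2] and [3,5]) while entities 2 and 3 take 2 ([6,7] and [8,9]); `last`
// is set for the entity whose range covers the original upper bound.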
////////////////////////////////////////////////////////////////////////////////
// Support for Static Init
@@ -184,6 +216,19 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_parallel_static_noloop_chunk: {
ForStaticChunk(lastiter, lb, ub, stride, chunk,
numberOfActiveOMPThreads * omp_get_team_num() + gtid,
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
case kmp_sched_distr_parallel_static_noloop_nochunk: {
ForStaticNoLoopNoTrunk(lastiter, lb, ub, stride, chunk,
numberOfActiveOMPThreads * omp_get_team_num() +
gtid,
omp_get_num_teams() * numberOfActiveOMPThreads);
break;
}
default: {
// ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,

View File

@@ -11,8 +11,65 @@ _ZN4ompx*
IsSPMDMode
malloc
free
memcmp
printf
__assert_fail
__ctype_b_loc
__ctype_tolower_loc
__xpg_basename
abs
atof
atoi
calloc
clock
clock_gettime
exit
fclose
feof
fflush
fgets
fopen
fprintf
fputs
fread
free
fscanf
fseek
ftell
fwrite
getc
gettimeofday
gmtime
malloc
memcmp
memcpy
memset
pclose
perror
popen
printf
qsort
rand
realloc
rewind
sprintf
srand
sscanf
stat
strcat
strchr
strcmp
strcpy
strftime
strlen
strncmp
strncpy
strstr
strtod
strtok
strtol
strtoul
time
tolower
strcasecmp
MPI_*

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=False, is_mpi=False)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=True, is_mpi=False)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=False, is_mpi=True)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=True, is_mpi=True)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,104 @@
//===------- Main.c - Direct compilation program start point ------ C -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
extern int __user_main(int, char *[]);
extern void __kmpc_target_init_allocator(void);
//#ifdef SINGLE_THREAD_EXECUTION
//#define THREAD_LIMIT 1
//#define TEAM_LIMIT 1
//#else
// 110 * 960 = 105 600 threads max for now
// could probably do a 110 * 1024 = 112640 if we fix offload
//#define TEAM_LIMIT 110
//#define THREAD_LIMIT 961 // limit is not included, max is 960
// 220 * 512 = 112 640 threads max
//#define TEAM_LIMIT 221
//#define THREAD_LIMIT 513 // limit is not included, max is 512
// 440 * 256 = 112 640 threads max
//#define TEAM_LIMIT 440
//#define THREAD_LIMIT 257 // limit is not included, max is 256
// 880 * 128 = 112 640 threads max
//#define TEAM_LIMIT 880
//#define THREAD_LIMIT 129 // limit is not included, max is 128
// 1320 * 64 = 84 480 threads max
//#define TEAM_LIMIT 1320 // will not do more than 1320 thread blocks per GPU, I don't know why
//#define THREAD_LIMIT 65 // limit is not included, max is 64
// /!\ Do not use fewer than 64 threads per team; that would put more than 1 warp
// per team, and the synchronization functions are not built to handle that.
// Light for testing
//#define TEAM_LIMIT 880
//#define THREAD_LIMIT 65 // limit is not included, max is 64
//#define TEAM_LIMIT 1760
//#define TEAM_LIMIT 1320
//#define TEAM_LIMIT 660
//#define THREAD_LIMIT 128
//#endif
//#pragma omp begin declare target device_type(nohost)
//void scheduler_init(void);
//#pragma omp begin declare variant match(device = {arch(amdgcn)})
//void scheduler_init(void){
// __builtin_amdgcn_ds_gws_init(TEAM_LIMIT - 1, 0);
//}
//#pragma omp end declare variant
//#pragma omp end declare target
int main(int argc, char *argv[]) {
char* MPI_Ranks = getenv("MPI_RANKS");
char* MPI_Threads = getenv("MPI_THREADS");
int nb_teams = 8;
int nb_threads = 64;
if (MPI_Ranks != NULL)
nb_teams = atoi(MPI_Ranks);
if (MPI_Threads != NULL)
nb_threads = atoi(MPI_Threads);
printf("Using %d MPI_Ranks and %d MPI_Threads\n", nb_teams, nb_threads);
#pragma omp target enter data map(to: argv[:argc])
for (int I = 0; I < argc; ++I) {
#pragma omp target enter data map(to: argv[I][:strlen(argv[I])])
}
int Ret = 0;
#pragma omp target enter data map(to: Ret)
#pragma omp target teams num_teams(1) thread_limit(1)
{
__kmpc_target_init_allocator();
}
#pragma omp target teams num_teams(nb_teams) thread_limit(nb_threads)
{
Ret = __user_main(argc, argv);
}
#pragma omp target exit data map(from: Ret)
return Ret;
}
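For reference, a sketch of the user side under the assumptions above: the -canonical-main-function-name=__user_main pass (enabled through the wrapper scripts below) renames the user's main, so an ordinary program like the following becomes the __user_main that the loader launches on the device.

// hello.c, built with the clang-gpu wrapper; main is renamed to __user_main
// by the canonicalization pass and runs inside the target region above.
#include <omp.h>
#include <stdio.h>

int main(int argc, char *argv[]) {
  (void)argc;
  (void)argv;
  // printf is serviced host-side through the RPC mechanism
  printf("hello from team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
  return 0; // becomes Ret, mapped back to the host by the loader
}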

View File

@@ -0,0 +1,14 @@
//===------- UserWrapper.h - User code wrapper --------------- C++ --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
#define OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
#pragma omp begin declare target device_type(nohost)
#endif

View File

@@ -0,0 +1,4 @@
#include <stdint.h>
#include "../../DeviceRTL/include/Mpi.h"

View File

@@ -0,0 +1,147 @@
import argparse
import os
import pathlib
import subprocess
import sys
import tempfile
cwd = os.path.dirname(os.path.realpath(__file__))
source_file_suffix = [".c", ".cpp", ".cu", ".hip", ".cc", ".cxx"]
def print_version():
cp = subprocess.run(["clang", "--version"])
if cp.returncode != 0:
sys.exit(cp.returncode)
def compile_loader(loader_name, args, targets, verbose, dry_run):
cmd = [
"clang",
"-c",
"-fopenmp",
"-foffload-lto",
"-fopenmp-offload-mandatory",
"-o",
loader_name,
]
if targets:
for arch in targets:
cmd.append("--offload-arch={}".format(arch))
else:
cmd.append("--offload-arch=native")
cmd += args
cmd.append(os.path.join(cwd, "Main.c"))
if verbose:
print(" ".join(cmd), file=sys.stderr)
if dry_run:
print(" ".join(cmd), file=sys.stderr)
return
cp = subprocess.run(cmd)
if cp.returncode != 0:
sys.exit(cp.returncode)
def invoke_clang(is_cpp, is_mpi, args, targets, verbose, dry_run):
cmd = [
"clang++" if is_cpp else "clang",
"-fopenmp",
"-foffload-lto",
"-fopenmp-offload-mandatory",
"-fopenmp-globalize-to-global-space",
"-include",
os.path.join(cwd, "UserWrapper.h"),
]
# add the MPI include path and header only for the MPI wrappers; appending
# "" placeholders would pass empty arguments to clang
if is_mpi:
    cmd += ["-I", cwd, "-include", os.path.join(cwd, "mpi.h")]
cmd += [
"--save-temps",
"-rdynamic",
"-mllvm",
"-enable-host-rpc",
"-mllvm",
"-openmp-opt-disable-state-machine-rewrite",
"-mllvm",
"-enable-canonicalize-main-function",
"-mllvm",
"-canonical-main-function-name=__user_main",
]
if targets:
for arch in targets:
cmd.append("--offload-arch={}".format(arch))
else:
cmd.append("--offload-arch=native")
cmd += args
if verbose:
print(" ".join(cmd), file=sys.stderr)
if dry_run:
print(" ".join(cmd))
return
cp = subprocess.run(cmd)
if cp.returncode != 0:
sys.exit(cp.returncode)
def run(is_cpp=False, is_mpi=False):
parser = argparse.ArgumentParser(
prog="clang-gpu", description="clang LLVM GPU compiler"
)
# should be changed; this prevents calls with -ccc-print-phases
parser.add_argument(
"-c",
action="store_true",
help="Only run preprocess, compile, and assemble steps",
)
parser.add_argument(
"-v", action="store_true", help="Show commands to run and use verbose output"
)
parser.add_argument(
"--version", action="store_true", help="Print version information"
)
parser.add_argument(
"-###",
action="store_true",
help="Print (but do not run) the commands to run for this compilation",
)
parser.add_argument(
"--offload-arch",
type=str,
help=(
"Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g."
" sm_35). If 'native' is used the compiler will detect locally installed"
" architectures. For HIP offloading, the device architecture can be"
" followed by target ID features delimited by a colon (e.g."
" gfx908:xnack+:sramecc-). May be specified more than once."
),
nargs="*",
)
args, fwd_args = parser.parse_known_args()
if args.version:
print_version()
return
if args.c:
fwd_args.append("-c")
if args.v:
fwd_args.append("-v")
dry_run = vars(args)["###"]
loader_name = None
temp_files = []
if not args.c:
tf = tempfile.NamedTemporaryFile()
loader_name = "{}.o".format(tf.name)
compile_loader(loader_name, ["-O1"], args.offload_arch, args.v, dry_run)
fwd_args.append(loader_name)
temp_files.append(loader_name)
invoke_clang(is_cpp, is_mpi, fwd_args, args.offload_arch, args.v, dry_run)
for f in temp_files:
if os.path.isfile(f):
os.unlink(f)

offload/include/HostRPC.h
View File

@@ -0,0 +1,67 @@
//===------- HostRPC.h - Host RPC ---------------------------- C++ --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
#define OPENMP_LIBOMPTARGET_INCLUDE_HOSTRPC_H
#ifdef OMPTARGET_DEVICE_RUNTIME
#include "Types.h"
#else
#include <cstdint>
#endif
namespace hostrpc {
/// RPC call identifier. Note: negative values only; non-negative values are
/// reserved for compiler-generated functions.
enum CallId {
CALLID___kmpc_launch_parallel_51_kernel = -1,
CALLID_invalid = -2147483648,
};
/// Execution status.
enum ExecutionStatus {
EXEC_STAT_CREATED = 0,
EXEC_STAT_DONE = 1,
};
enum Type {
ARG_LITERAL = 0,
ARG_POINTER = 1,
};
struct Argument {
intptr_t Value;
int64_t ArgType;
};
struct Descriptor {
// The following member will be used by both host and device.
int32_t Id;
struct Argument *Args;
int64_t NumArgs;
int64_t Status;
int64_t ReturnValue;
// The following members will only be used by device.
void **ArgInfo;
void *ArgMap;
};
/// A wrapper of Descriptor that is only used between plugins and
/// libomptarget. It contains the three stdio global variables.
struct DescriptorWrapper {
Descriptor D;
void *StdIn = nullptr;
void *StdOut = nullptr;
void *StdErr = nullptr;
};
} // namespace hostrpc
#endif
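To make the calling convention concrete, here is a hypothetical host-side wrapper sketch: the device ships a Descriptor plus its Argument array to the host, which dispatches on Id and reports the result through ReturnValue. The wrapper name and target function below are invented for illustration; real wrappers are emitted by the compiler and reached through __kmpc_host_rpc_invoke_host_wrapper.

#include <cstdint>

extern "C" int64_t __kmpc_host_rpc_get_arg(void *Descriptor, int32_t ArgNo);
extern "C" void __kmpc_host_rpc_set_ret_val(void *Descriptor, int64_t RetVal);

// Hypothetical wrapper for a host function `int add(int, int)`.
extern "C" void __host_rpc_wrapper_add(void *Desc) {
  int64_t A = __kmpc_host_rpc_get_arg(Desc, 0); // reads Args[0].Value
  int64_t B = __kmpc_host_rpc_get_arg(Desc, 1); // reads Args[1].Value
  __kmpc_host_rpc_set_ret_val(Desc, A + B);     // stored in Descriptor::ReturnValue
}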

View File

@@ -425,6 +425,10 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
void *VAddr, bool IsRecord, bool SaveOutput,
uint64_t &ReqPtrArgOffset);
// Host RPC support
int64_t __kmpc_host_rpc_get_arg(void *descriptor, int32_t ArgNo);
void __kmpc_host_rpc_set_ret_val(void *descriptor, int64_t RetVal);
#ifdef __cplusplus
}
#endif

View File

@@ -678,7 +678,7 @@ struct AMDGPUQueueTy {
if (Queue)
return Plugin::success();
hsa_status_t Status =
hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
hsa_queue_create(Agent, QueueSize, 2 /*HSA_QUEUE_TYPE_MULTI*/, callbackError,
nullptr, UINT32_MAX, UINT32_MAX, &Queue);
return Plugin::check(Status, "Error in hsa_queue_create: %s");
}
@@ -3315,16 +3315,33 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (GenericDevice.getRPCServer())
Stream->setRPCServer(GenericDevice.getRPCServer());
utils::AMDGPUMGSyncInfo *multigridSyncArgs = nullptr;
if (auto Err = ArgsMemoryManager.allocate(sizeof(utils::AMDGPUMGSyncInfo),
reinterpret_cast<void **>(&multigridSyncArgs)))
return Err;
multigridSyncArgs->mgs = nullptr; // multigrid sync barrier data pointer
multigridSyncArgs->sgs = {0, 0}; // singlegrid sync barrier data
multigridSyncArgs->grid_id = 0; // id of the current grid in multigpu launch
multigridSyncArgs->num_grids = 1; // number of grid in multigpu launch
multigridSyncArgs->prev_sum = 0; // sum of all thread that have already been run
multigridSyncArgs->all_sum = NumBlocks * NumThreads; // sum of all thread to run
multigridSyncArgs->num_wg = NumBlocks; // number of workgroup (nb. teams)
//printf("%lu\n", NumBlocks);
// Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
ImplArgs->BlockCountX = NumBlocks;
ImplArgs->BlockCountY = 1;
ImplArgs->BlockCountZ = 1;
ImplArgs->GroupSizeX = NumThreads;
ImplArgs->GroupSizeY = 1;
ImplArgs->GroupSizeZ = 1;
ImplArgs->GridDims = 1;
ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
ImplArgs->block_count_x = NumBlocks;
ImplArgs->block_count_y = 1;
ImplArgs->block_count_z = 1;
ImplArgs->group_size_x = NumThreads;
ImplArgs->group_size_y = 1;
ImplArgs->group_size_z = 1;
ImplArgs->grid_dims = 1;
ImplArgs->dynamic_lds_size = KernelArgs.DynCGroupMem;
ImplArgs->multigrid_sync_arg = multigridSyncArgs;
}
// Push the kernel launch into the stream.

View File

@@ -35,19 +35,55 @@ namespace target {
namespace plugin {
namespace utils {
// GPU struct for cooperative group / grid sync / multigrid sync
// The structures below for MGPU launch match the device library format
struct AMDGPUMGSyncData {
uint32_t w0;
uint32_t w1;
};
struct AMDGPUMGSyncInfo {
struct AMDGPUMGSyncData* mgs;
uint32_t grid_id;
uint32_t num_grids;
uint64_t prev_sum;
uint64_t all_sum;
struct AMDGPUMGSyncData sgs;
uint32_t num_wg;
};
// The implicit arguments of COV5 AMDGPU kernels.
// See: https://github.com/llvm/llvm-project/blob/6e86e11148474e4ecd49dbf0ca5dd9caddcdbd11/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp#L583
// static_assert(sizeof(void *) == 8)
struct AMDGPUImplicitArgsTy {
uint32_t BlockCountX;
uint32_t BlockCountY;
uint32_t BlockCountZ;
uint16_t GroupSizeX;
uint16_t GroupSizeY;
uint16_t GroupSizeZ;
uint8_t Unused0[46]; // 46 byte offset.
uint16_t GridDims;
uint8_t Unused1[54]; // 54 byte offset.
uint32_t DynamicLdsSize;
uint8_t Unused2[132]; // 132 byte offset.
uint32_t block_count_x;
uint32_t block_count_y;
uint32_t block_count_z;
uint16_t group_size_x;
uint16_t group_size_y;
uint16_t group_size_z;
uint16_t remainder_x;
uint16_t remainder_y;
uint16_t remainder_z;
uint8_t reserved0[16]; // reserved
uint64_t global_offset_x;
uint64_t global_offset_y;
uint64_t global_offset_z;
uint16_t grid_dims;
uint8_t reserved1[6]; // reserved
void * printf_buffer;
void * hostcall_buffer;
AMDGPUMGSyncInfo *multigrid_sync_arg;
void * heap_v1;
void * completion_action;
uint32_t dynamic_lds_size;
uint8_t reserved2[68]; // reserved
uint32_t private_base;
uint32_t shared_base;
void * queue_ptr;
uint8_t unused[48]; // 48 byte offset.
};
// Dummy struct for COV4 implicitargs.

View File

@@ -15,12 +15,45 @@
#if defined(LIBOMPTARGET_RPC_SUPPORT)
#include "llvm-libc-types/rpc_opcodes_t.h"
#include "llvmlibc_rpc_server.h"
#include "HostRPC.h"
#include "llvm/Support/DynamicLibrary.h"
#endif
using namespace llvm;
using namespace omp;
using namespace target;
#ifdef LIBOMPTARGET_RPC_SUPPORT
// GPUFirst Host Function Wrapper Invoker
class HostRPCInvokerWrapper {
void (*Invoker)(int32_t, void *) = nullptr;
std::unique_ptr<sys::DynamicLibrary> DL;
std::once_flag Flag;
void initInvoker() {
std::string ErrMsg;
DL = std::make_unique<sys::DynamicLibrary>(
sys::DynamicLibrary::getPermanentLibrary(nullptr, &ErrMsg));
assert(DL->isValid() && "invalid DL");
*((void **)&Invoker) =
DL->getAddressOfSymbol("__kmpc_host_rpc_invoke_host_wrapper");
assert(Invoker && "Invoker is nullptr");
}
public:
void invoke(int32_t CallNo, void *Desc) {
std::call_once(Flag, &HostRPCInvokerWrapper::initInvoker, this);
Invoker(CallNo, Desc);
}
};
HostRPCInvokerWrapper *Invoker;
// GPUFirst END
#endif
RPCServerTy::RPCServerTy(plugin::GenericPluginTy &Plugin)
: Handles(Plugin.getNumDevices()) {}
@@ -89,6 +122,75 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
"Failed to register RPC free handler for device %d: %d\n",
Device.getDeviceId(), Err);
// GPUFirst
// Register custom opcode handler for gpu first
auto GPUFirstHandler = [](rpc_port_t port, void *Data) {
// printf("[HostRPC] [Host]: GPUFirstHandler\n");
// // WORKING back & forth of an uint64_t
//
// printf("[HostRPC] [Host]: Start \n");
//
// uint64_t size_recv = 0;
// void *buf_recv = nullptr;
//
// rpc_recv_n(port, &buf_recv, &size_recv,
// [](uint64_t size, void* data){ return malloc(size); }, nullptr);
//
// printf("[HostRPC] [Host] [RECV]: %lu\n", *((uint64_t *) buf_recv));
// printf("[HostRPC] [Host] [RECV] Size: %lu\n", size_recv);
//
// uint64_t size_send = sizeof(uint64_t);
// void *buf_send = malloc(size_send);
// *((uint64_t *) buf_send) = 987654321;
//
// printf("[Hostrpc] [Host] [SEND]: %lu\n", *((uint64_t *) buf_send));
// printf("[HostRPC] [Host] [SEND] Size: %lu\n", size_send);
//
// rpc_send_n(port, &buf_send, &size_send);
//
// printf("[HostRPC] [Host]: End \n");
//
// // END of working part
auto _rpc_recv_n = [](rpc_port_t *handle, void **dst, size_t *size){
rpc_recv_n(*handle, dst, size,
[](uint64_t size, void* data){ return malloc(size); },
nullptr);
};
auto _rpc_send_n = [](rpc_port_t *handle, void *src, size_t size){
rpc_send_n(*handle, &src, &size);
};
uint64_t size_recv = 0;
hostrpc::Descriptor *D = nullptr;
hostrpc::Argument *Args = nullptr;
_rpc_recv_n(&port, reinterpret_cast<void **>(&D), &size_recv);
_rpc_recv_n(&port, reinterpret_cast<void **>(&Args), &size_recv);
D->Args = Args;
if(Invoker == nullptr)
Invoker = new HostRPCInvokerWrapper();
Invoker->invoke(D->Id, D);
_rpc_send_n(&port, D, sizeof(hostrpc::Descriptor));
_rpc_send_n(&port, D->Args, sizeof(hostrpc::Argument) * D->NumArgs);
free(D->Args);
free(D);
};
if (rpc_status_t Err =
rpc_register_callback(RPCDevice, RPC_GPUFIRST, GPUFirstHandler, &Invoker))
return plugin::Plugin::error(
"Failed to register RPC GPU First handler for device %d: %d\n", Device.getDeviceId(),
Err);
// GPUFirst END
// Get the address of the RPC client from the device.
void *ClientPtr;
plugin::GlobalTy ClientGlobal(rpc_client_symbol_name, sizeof(void *));

View File

@@ -70,7 +70,8 @@ VERS1.0 {
__tgt_interop_init;
__tgt_interop_use;
__tgt_interop_destroy;
ompt_libomptarget_connect;
__kmpc_host_rpc_get_arg;
__kmpc_host_rpc_set_ret_val;
local:
*;
};

View File

@@ -16,6 +16,8 @@
#include "PluginManager.h"
#include "private.h"
#include "HostRPC.h"
#include "Shared/EnvironmentVar.h"
#include "Shared/Profile.h"
@@ -510,3 +512,21 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
delete AsyncInfo;
*AsyncHandle = nullptr;
}
// Host RPC support functions.
EXTERN int64_t __kmpc_host_rpc_get_arg(void *descriptor, int32_t ArgNum) {
hostrpc::Descriptor &SD = *reinterpret_cast<hostrpc::Descriptor *>(descriptor);
assert(ArgNum < SD.NumArgs && "out-of-range argument");
int64_t ArgVal = SD.Args[ArgNum].Value;
DP("[host-rpc] get argno=%d arg=%lx...\n", ArgNum, ArgVal);
return ArgVal;
}
EXTERN void __kmpc_host_rpc_set_ret_val(void *descriptor, int64_t RetVal) {
hostrpc::Descriptor &SD = *reinterpret_cast<hostrpc::Descriptor *>(descriptor);
SD.ReturnValue = RetVal;
}

View File

@@ -49,10 +49,10 @@ else()
# When building in tree we install the runtime according to the LLVM settings.
if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT APPLE)
set(OPENMP_INSTALL_LIBDIR lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE} CACHE STRING
set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE}/openmp" CACHE STRING
"Path where built openmp libraries should be installed.")
else()
set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}" CACHE STRING
set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}/openmp" CACHE STRING
"Path where built OpenMP libraries should be installed.")
endif()