Compare commits

...

1 Commits

Author SHA1 Message Date
Shilei Tian
4979b021fc [OpenMP] Add the simple support for direct gpu compilation
This support doesn't include any RPC-related functionality. The simple libc
implementation is removed as well.
2023-10-26 22:25:50 -04:00
18 changed files with 391 additions and 13 deletions

View File

@@ -254,6 +254,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Globalize to global space for the globalized variables")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")

View File

@@ -3353,6 +3353,10 @@ def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested
HelpText<"Assert no nested parallel regions in the GPU">,
MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
def fopenmp_globalize_to_global_space : Flag<["-"], "fopenmp-globalize-to-global-space">,
HelpText<"Globalize to global space for the globalized variables">,
MarshallingInfoFlag<LangOpts<"OpenMPGlobalizeToGlobalSpace">>;
} // let Group = f_Group
} // let Visibility = [ClangOption, CC1Option, FC1Option]
} // let Flags = [NoArgumentUnused, HelpHidden]

View File

@@ -1067,10 +1067,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for the variable to be globalized
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
? OMPRTL_malloc
: OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
// FIXME: We should use the variables actual alignment as an argument.
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment,
@@ -1162,20 +1164,29 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
// globalized in the prolog (i.e. emitGenericVarsProlog).
for (const auto &AddrSizePair :
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{AddrSizePair.first});
else
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
}
// Deallocate the memory for each globalized value
for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
const auto *VD = cast<VarDecl>(Rec.first);
I->getSecond().MappedParams->restore(CGF);
llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
CGF.getTypeSize(VD->getType())};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
FreeArgs);
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{Rec.second.GlobalizedVal});
else
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
OMPRTL___kmpc_free_shared),
{Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())});
}
}
}

View File

@@ -6397,6 +6397,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism");
if (Args.hasArg(options::OPT_fopenmp_offload_mandatory))
CmdArgs.push_back("-fopenmp-offload-mandatory");
if (Args.hasArg(options::OPT_fopenmp_globalize_to_global_space))
CmdArgs.push_back("-fopenmp-globalize-to-global-space");
break;
default:
// By default, if Clang doesn't know how to generate useful OpenMP code

View File

@@ -476,6 +476,8 @@ __OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr, Int32,
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
__OMP_RTL(malloc, false, VoidPtr, SizeTy)
__OMP_RTL(free, false, Void, VoidPtr)
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)

View File

@@ -0,0 +1,30 @@
//===--------------- CanonicalizeMainFunction.h -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility pass to canonicalize the `main` function of a module.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
#define LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H

#include "llvm/IR/PassManager.h"

namespace llvm {

/// A module pass that rewrites `main` (when present) into the canonical
/// `int main(int argc, char *argv[])` form. The implementation can also
/// rename `main` via the `-canonical-main-function-name` option.
class CanonicalizeMainFunctionPass
    : public PassInfoMixin<CanonicalizeMainFunctionPass> {
public:
  /// Canonicalize `main` in \p M. Returns PreservedAnalyses::none() when the
  /// module was changed, PreservedAnalyses::all() otherwise.
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};

} // end namespace llvm

#endif // LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H

View File

@@ -233,6 +233,7 @@
#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/Debugify.h"
#include "llvm/Transforms/Utils/DXILUpgrade.h"

View File

@@ -124,6 +124,7 @@
#include "llvm/Transforms/Utils/AddDiscriminators.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
@@ -277,6 +278,10 @@ static cl::opt<bool> UseLoopVersioningLICM(
"enable-loop-versioning-licm", cl::init(false), cl::Hidden,
cl::desc("Enable the experimental Loop Versioning LICM pass"));
static cl::opt<bool> EnableCanonicalizeMainFunction(
"enable-canonicalize-main-function", cl::init(false), cl::Hidden,
cl::desc("Enable CanonicalizeMainFunction pass"));
namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
"enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
@@ -1060,6 +1065,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
}
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
// Try to perform OpenMP specific optimizations on the module. This is a
// (quick!) no-op if there are no OpenMP runtime calls present in the module.
MPM.addPass(OpenMPOptPass());
@@ -1611,6 +1619,9 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary));
}
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
if (Level == OptimizationLevel::O0) {
// Run a second time to clean up any type tests left behind by WPD for use
// in ICP.
@@ -2059,6 +2070,9 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
return MPM;
}

View File

@@ -48,6 +48,7 @@ MODULE_PASS("openmp-opt", OpenMPOptPass())
MODULE_PASS("openmp-opt-postlink", OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink))
MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass())
MODULE_PASS("canonicalize-main-function", CanonicalizeMainFunctionPass())
MODULE_PASS("cg-profile", CGProfilePass())
MODULE_PASS("check-debugify", NewPMCheckDebugifyPass())
MODULE_PASS("constmerge", ConstantMergePass())

View File

@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMTransformUtils
CallGraphUpdater.cpp
CanonicalizeAliases.cpp
CanonicalizeFreezeInLoops.cpp
CanonicalizeMainFunction.cpp
CloneFunction.cpp
CloneModule.cpp
CodeExtractor.cpp

View File

@@ -0,0 +1,103 @@
//===-------------- CanonicalizeMainFunction.cpp ----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility function to canonicalize main function.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "canonicalize-main-function"
static cl::opt<std::string>
MainFunctionName("canonical-main-function-name",
cl::desc("New main function name"),
cl::value_desc("main function name"));
/// Rewrite \p F into the canonical form `int main(int argc, char *argv[])`.
///
/// If \p F already has two parameters and an i32 return type it is left
/// untouched and this returns false. Otherwise a replacement function with
/// the canonical signature is created next to \p F, the body is spliced
/// over, `ret void` instructions are rewritten into `ret i32 0`, and — when
/// the argument counts match — the old arguments are RAUW'ed with the new
/// ones. The caller is responsible for erasing \p F afterwards.
///
/// \returns true iff a replacement function was created.
static bool rewriteMainFunction(Function &F) {
  // Already canonical: two parameters and an i32 return value.
  // NOTE(review): only arity and return type are checked here, not the
  // parameter types — confirm that is sufficient for the expected inputs.
  if (F.arg_size() == 2 && F.getReturnType()->isIntegerTy(32))
    return false;

  auto &Ctx = F.getContext();
  auto &DL = F.getParent()->getDataLayout();
  auto *Int32Ty = IntegerType::getInt32Ty(Ctx);
  auto *PtrTy = PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace());

  // Create the canonical replacement right before F and steal its name,
  // attributes, debug info, and body.
  FunctionType *NewFnTy =
      FunctionType::get(Int32Ty, {Int32Ty, PtrTy}, /* isVarArg */ false);
  Function *NewFn =
      Function::Create(NewFnTy, F.getLinkage(), F.getAddressSpace(), "");
  F.getParent()->getFunctionList().insert(F.getIterator(), NewFn);
  NewFn->takeName(&F);
  NewFn->copyAttributesFrom(&F);
  NewFn->setSubprogram(F.getSubprogram());
  F.setSubprogram(nullptr);
  NewFn->splice(NewFn->begin(), &F);

  // A `void main` returns nothing; turn every `ret void` into `ret i32 0`.
  // Collect first, then rewrite, so we do not invalidate the iteration.
  if (!F.getReturnType()->isIntegerTy(32)) {
    SmallVector<ReturnInst *> WorkList;
    for (BasicBlock &BB : *NewFn)
      for (Instruction &I : BB) {
        auto *RI = dyn_cast<ReturnInst>(&I);
        if (!RI)
          continue;
        assert(RI->getReturnValue() == nullptr &&
               "return value of a void main function is not nullptr");
        WorkList.push_back(RI);
      }
    for (auto *RI : WorkList) {
      (void)ReturnInst::Create(Ctx, ConstantInt::getNullValue(Int32Ty), RI);
      RI->eraseFromParent();
    }
  }

  // Remap the old arguments onto the new ones when the counts line up
  // (e.g. `main(void)` has no arguments to remap).
  // NOTE(review): if the counts differ but the old arguments have uses,
  // those uses are left referring to the dead function — confirm such
  // signatures cannot occur here.
  if (F.arg_size() == NewFn->arg_size())
    for (unsigned I = 0; I < NewFn->arg_size(); ++I) {
      Argument *OldArg = F.getArg(I);
      Argument *NewArg = NewFn->getArg(I);
      NewArg->takeName(OldArg);
      OldArg->replaceAllUsesWith(NewArg);
    }
  return true;
}
/// Find the module's `main`, optionally rename it, and rewrite it into the
/// canonical `int main(int argc, char *argv[])` form.
PreservedAnalyses CanonicalizeMainFunctionPass::run(Module &M,
                                                    ModuleAnalysisManager &AM) {
  // Locate `main`; the full scan (rather than an early break) lets the
  // assertion below catch duplicates in asserts builds.
  Function *Main = nullptr;
  for (Function &Fn : M) {
    if (Fn.getName() != "main")
      continue;
    assert(Main == nullptr && "more than one main function");
    Main = &Fn;
  }
  if (!Main)
    return PreservedAnalyses::all();

  bool Modified = false;

  // Honor -canonical-main-function-name if it requests a different name.
  if (!MainFunctionName.empty() && Main->getName() != MainFunctionName) {
    Main->setName(MainFunctionName);
    Modified = true;
  }

  // If a canonical replacement was created, the old function is now empty
  // and must be removed.
  if (rewriteMainFunction(*Main)) {
    Main->eraseFromParent();
    Modified = true;
  }

  return Modified ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

View File

@@ -116,7 +116,7 @@ set(src_files
# vectorized accesses to the shared state. Generally, those are "good" but
# the optimizer pipeline (esp. Attributor) does not fully support vectorized
# instructions yet and we end up missing out on way more important constant
# propagation. That said, we will run the vectorizer again after the runtime
# propagation. That said, we will run the vectorizer again after the runtime
# has been linked into the user program.
set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512 -mllvm -vectorize-slp=false )
set(link_opt_flags -O3 -openmp-opt-disable -attributor-enable=module -vectorize-slp=false )

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=False)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=True)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,31 @@
//===------- Main.c - Direct compilation program start point ------ C -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <string.h>
extern int __user_main(int, char *[]);
// Host-side program start point: map the command line onto the device, run
// the user's (renamed) main inside a target region, and return its status.
int main(int argc, char *argv[]) {
// Map the argv pointer array itself onto the device.
#pragma omp target enter data map(to: argv[:argc])
  // Map the bytes of each argument string as well, one by one.
  // NOTE(review): strlen excludes the terminating NUL — confirm the device
  // copy is still NUL-terminated if user code depends on that.
  for (int I = 0; I < argc; ++I) {
#pragma omp target enter data map(to: argv[I][:strlen(argv[I])])
  }
  int Ret = 0;
#pragma omp target enter data map(to: Ret)
// Launch the user's program on the device. __user_main is the original main,
// renamed via -canonical-main-function-name=__user_main by the driver.
#pragma omp target teams num_teams(1) thread_limit(1024)
  {
    Ret = __user_main(argc, argv);
  }
// Copy the exit status back from the device.
#pragma omp target exit data map(from: Ret)
  return Ret;
}

View File

@@ -0,0 +1,14 @@
//===------- UserWrapper.h - User code wrapper --------------- C++ --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
#define OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
// Force-included (via `-include`) ahead of the user's sources so that all
// following user code is compiled for the device only.
// NOTE(review): there is no matching `#pragma omp end declare target` in this
// header — confirm the region is intentionally left open for the rest of the
// translation unit (or closed elsewhere).
#pragma omp begin declare target device_type(nohost)
#endif

View File

@@ -0,0 +1,139 @@
import argparse
import os
import pathlib
import subprocess
import sys
import tempfile
# Directory containing this driver module; used to locate the bundled
# Main.c loader stub and UserWrapper.h.
cwd = os.path.dirname(os.path.realpath(__file__))
# Recognized source-file extensions.
# NOTE(review): not referenced anywhere in this file — confirm other modules
# use it, or remove it.
source_file_suffix = [".c", ".cpp", ".cu", ".hip", ".cc", ".cxx"]
def print_version():
    """Print the underlying clang's version banner.

    Exits the driver with clang's return code if the invocation fails.
    """
    proc = subprocess.run(["clang", "--version"])
    if proc.returncode:
        sys.exit(proc.returncode)
def compile_loader(loader_name, targets, verbose, dry_run):
    """Compile the device loader stub (Main.c) into an object file.

    loader_name -- path of the object file to produce.
    targets     -- offload architectures; falls back to 'native' when empty.
    verbose     -- echo the clang command line to stderr.
    dry_run     -- echo the command line to stderr and return without running.

    Exits the driver with clang's return code on failure.
    """
    cmd = [
        "clang",
        "-c",
        "-fopenmp",
        "-foffload-lto",
        "-fopenmp-offload-mandatory",
        "-o",
        loader_name,
    ]
    # One --offload-arch per requested target; 'native' autodetects the GPU.
    for arch in targets if targets else ["native"]:
        cmd.append("--offload-arch={}".format(arch))
    cmd.append(os.path.join(cwd, "Main.c"))

    cmd_line = " ".join(cmd)
    if verbose:
        print(cmd_line, file=sys.stderr)
    if dry_run:
        print(cmd_line, file=sys.stderr)
        return
    proc = subprocess.run(cmd)
    if proc.returncode:
        sys.exit(proc.returncode)
def invoke_clang(is_cpp, args, targets, verbose, dry_run):
    """Invoke clang/clang++ on the user's sources for direct GPU compilation.

    is_cpp  -- use clang++ instead of clang.
    args    -- forwarded user arguments (sources, flags, loader object).
    targets -- offload architectures; falls back to 'native' when empty.
    verbose -- echo the clang command line to stderr.
    dry_run -- echo the command line to stderr and return without running.

    Exits the driver with clang's return code on failure.
    """
    cmd = [
        "clang++" if is_cpp else "clang",
        "-fopenmp",
        "-foffload-lto",
        "-fopenmp-offload-mandatory",
        "-fopenmp-globalize-to-global-space",
        # Wrap all user code in a device-only declare-target region.
        "-include",
        os.path.join(cwd, "UserWrapper.h"),
        "--save-temps",
        "-rdynamic",
        "-mllvm",
        "-openmp-opt-disable-state-machine-rewrite",
        # Rename the user's main so the Main.c loader can call it on device.
        "-mllvm",
        "-enable-canonicalize-main-function",
        "-mllvm",
        "-canonical-main-function-name=__user_main",
    ]
    if targets:
        for arch in targets:
            cmd.append("--offload-arch={}".format(arch))
    else:
        cmd.append("--offload-arch=native")
    cmd += args
    # Echo to stderr, matching compile_loader and clang's own -### behavior
    # (previously this went to stdout, inconsistent with its sibling).
    if verbose:
        print(" ".join(cmd), file=sys.stderr)
    if dry_run:
        print(" ".join(cmd), file=sys.stderr)
        return
    cp = subprocess.run(cmd)
    if cp.returncode != 0:
        sys.exit(cp.returncode)
def run(is_cpp=False):
    """Parse the driver's own options and drive a direct-GPU compilation.

    is_cpp -- invoke clang++ instead of clang for the user's sources.

    Options not recognized here are forwarded verbatim to clang. Unless
    compiling only (-c), a loader object built from Main.c is compiled into a
    temporary file, linked into the final program, and removed afterwards —
    even when a tool fails and exits the driver.
    """
    parser = argparse.ArgumentParser(
        prog="clang-gpu", description="clang LLVM GPU compiler"
    )
    parser.add_argument(
        "-c",
        action="store_true",
        help="Only run preprocess, compile, and assemble steps",
    )
    parser.add_argument(
        "-v", action="store_true", help="Show commands to run and use verbose output"
    )
    parser.add_argument(
        "--version", action="store_true", help="Print version information"
    )
    parser.add_argument(
        "-###",
        action="store_true",
        help="Print (but do not run) the commands to run for this compilation",
    )
    parser.add_argument(
        "--offload-arch",
        type=str,
        help=(
            "Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g."
            " sm_35). If 'native' is used the compiler will detect locally installed"
            " architectures. For HIP offloading, the device architecture can be"
            " followed by target ID features delimited by a colon (e.g."
            " gfx908:xnack+:sramecc-). May be specified more than once."
        ),
        nargs="*",
    )
    args, fwd_args = parser.parse_known_args()
    if args.version:
        print_version()
        return
    if args.c:
        fwd_args.append("-c")
    if args.v:
        fwd_args.append("-v")
    # argparse stores "-###" under an attribute name only reachable via vars().
    dry_run = vars(args)["###"]
    temp_files = []
    try:
        if not args.c:
            # NamedTemporaryFile is used only to obtain a unique path; the
            # loader object gets its own ".o" suffix alongside it.
            tf = tempfile.NamedTemporaryFile()
            loader_name = "{}.o".format(tf.name)
            compile_loader(loader_name, args.offload_arch, args.v, dry_run)
            fwd_args.append(loader_name)
            temp_files.append(loader_name)
        invoke_clang(is_cpp, fwd_args, args.offload_arch, args.v, dry_run)
    finally:
        # Run cleanup even when a failing tool exits via sys.exit():
        # SystemExit unwinds through this frame, so the temporary loader
        # object no longer leaks on failure.
        for f in temp_files:
            if os.path.isfile(f):
                os.unlink(f)