Compare commits

...

1 Commits

Author SHA1 Message Date
Shilei Tian
4979b021fc [OpenMP] Add the simple support for direct gpu compilation
This support doesn't include any RPC-related functionality. The simple libc
implementation is removed as well.
2023-10-26 22:25:50 -04:00
18 changed files with 391 additions and 13 deletions

View File

@@ -254,6 +254,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Globalize to global space for the globalized variables")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")

View File

@@ -3353,6 +3353,10 @@ def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested
HelpText<"Assert no nested parallel regions in the GPU">,
MarshallingInfoFlag<LangOpts<"OpenMPNoNestedParallelism">>;
def fopenmp_globalize_to_global_space : Flag<["-"], "fopenmp-globalize-to-global-space">,
HelpText<"Globalize to global space for the globalized variables">,
MarshallingInfoFlag<LangOpts<"OpenMPGlobalizeToGlobalSpace">>;
} // let Group = f_Group
} // let Visibility = [ClangOption, CC1Option, FC1Option]
} // let Flags = [NoArgumentUnused, HelpHidden]

View File

@@ -1067,10 +1067,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
// Allocate space for the variable to be globalized
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace
? OMPRTL_malloc
: OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
// FIXME: We should use the variables actual alignment as an argument.
VoidPtr->addRetAttr(llvm::Attribute::get(
CGM.getLLVMContext(), llvm::Attribute::Alignment,
@@ -1162,20 +1164,29 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
// globalized in the prolog (i.e. emitGenericVarsProlog).
for (const auto &AddrSizePair :
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{AddrSizePair.first});
else
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
}
// Deallocate the memory for each globalized value
for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
const auto *VD = cast<VarDecl>(Rec.first);
I->getSecond().MappedParams->restore(CGF);
llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
CGF.getTypeSize(VD->getType())};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
FreeArgs);
if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace)
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free),
{Rec.second.GlobalizedVal});
else
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
OMPRTL___kmpc_free_shared),
{Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())});
}
}
}

View File

@@ -6397,6 +6397,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism");
if (Args.hasArg(options::OPT_fopenmp_offload_mandatory))
CmdArgs.push_back("-fopenmp-offload-mandatory");
if (Args.hasArg(options::OPT_fopenmp_globalize_to_global_space))
CmdArgs.push_back("-fopenmp-globalize-to-global-space");
break;
default:
// By default, if Clang doesn't know how to generate useful OpenMP code

View File

@@ -476,6 +476,8 @@ __OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr, Int32,
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
__OMP_RTL(malloc, false, VoidPtr, SizeTy)
__OMP_RTL(free, false, Void, VoidPtr)
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)

View File

@@ -0,0 +1,30 @@
//===--------------- CanonicalizeMainFunction.h -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility pass to canonicalize the `main` function of a module.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H
#define LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H

#include "llvm/IR/PassManager.h"

namespace llvm {

/// A module pass that rewrites `main` (when present) into the canonical
/// `int main(int argc, char *argv[])` form. The implementation can also
/// rename `main` via the `-canonical-main-function-name` option.
class CanonicalizeMainFunctionPass
    : public PassInfoMixin<CanonicalizeMainFunctionPass> {
public:
  /// Canonicalize `main` in \p M. Returns PreservedAnalyses::none() when the
  /// module was changed, PreservedAnalyses::all() otherwise.
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};

} // end namespace llvm

#endif // LLVM_TRANSFORMS_UTILS_CANONICALIZEMAINFUNCTION_H

View File

@@ -233,6 +233,7 @@
#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/Debugify.h"
#include "llvm/Transforms/Utils/DXILUpgrade.h"

View File

@@ -124,6 +124,7 @@
#include "llvm/Transforms/Utils/AddDiscriminators.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/Transforms/Utils/CountVisits.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
@@ -277,6 +278,10 @@ static cl::opt<bool> UseLoopVersioningLICM(
"enable-loop-versioning-licm", cl::init(false), cl::Hidden,
cl::desc("Enable the experimental Loop Versioning LICM pass"));
static cl::opt<bool> EnableCanonicalizeMainFunction(
"enable-canonicalize-main-function", cl::init(false), cl::Hidden,
cl::desc("Enable CanonicalizeMainFunction pass"));
namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
"enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
@@ -1060,6 +1065,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
}
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
// Try to perform OpenMP specific optimizations on the module. This is a
// (quick!) no-op if there are no OpenMP runtime calls present in the module.
MPM.addPass(OpenMPOptPass());
@@ -1611,6 +1619,9 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary));
}
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
if (Level == OptimizationLevel::O0) {
// Run a second time to clean up any type tests left behind by WPD for use
// in ICP.
@@ -2059,6 +2070,9 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));
if (EnableCanonicalizeMainFunction)
MPM.addPass(CanonicalizeMainFunctionPass());
return MPM;
}

View File

@@ -48,6 +48,7 @@ MODULE_PASS("openmp-opt", OpenMPOptPass())
MODULE_PASS("openmp-opt-postlink", OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink))
MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass())
MODULE_PASS("canonicalize-main-function", CanonicalizeMainFunctionPass())
MODULE_PASS("cg-profile", CGProfilePass())
MODULE_PASS("check-debugify", NewPMCheckDebugifyPass())
MODULE_PASS("constmerge", ConstantMergePass())

View File

@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMTransformUtils
CallGraphUpdater.cpp
CanonicalizeAliases.cpp
CanonicalizeFreezeInLoops.cpp
CanonicalizeMainFunction.cpp
CloneFunction.cpp
CloneModule.cpp
CodeExtractor.cpp

View File

@@ -0,0 +1,103 @@
//===-------------- CanonicalizeMainFunction.cpp ----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Utility function to canonicalize main function.
// The canonical main function is defined as: int main(int argc, char *argv[]);
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/CanonicalizeMainFunction.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "canonicalize-main-function"
static cl::opt<std::string>
MainFunctionName("canonical-main-function-name",
cl::desc("New main function name"),
cl::value_desc("main function name"));
/// Rewrite \p F into the canonical form `int main(int argc, char *argv[])`.
///
/// If \p F already has two parameters and an i32 return type it is left
/// untouched and this returns false. Otherwise a replacement function with
/// the canonical signature is created next to \p F, the body is spliced
/// over, `ret void` instructions are rewritten into `ret i32 0`, and — when
/// the argument counts match — the old arguments are RAUW'ed with the new
/// ones. The caller is responsible for erasing \p F afterwards.
///
/// \returns true iff a replacement function was created.
static bool rewriteMainFunction(Function &F) {
  // Already canonical: two parameters and an i32 return value.
  // NOTE(review): only arity and return type are checked here, not the
  // parameter types — confirm that is sufficient for the expected inputs.
  if (F.arg_size() == 2 && F.getReturnType()->isIntegerTy(32))
    return false;

  auto &Ctx = F.getContext();
  auto &DL = F.getParent()->getDataLayout();
  auto *Int32Ty = IntegerType::getInt32Ty(Ctx);
  auto *PtrTy = PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace());

  // Create the canonical replacement right before F and steal its name,
  // attributes, debug info, and body.
  FunctionType *NewFnTy =
      FunctionType::get(Int32Ty, {Int32Ty, PtrTy}, /* isVarArg */ false);
  Function *NewFn =
      Function::Create(NewFnTy, F.getLinkage(), F.getAddressSpace(), "");
  F.getParent()->getFunctionList().insert(F.getIterator(), NewFn);
  NewFn->takeName(&F);
  NewFn->copyAttributesFrom(&F);
  NewFn->setSubprogram(F.getSubprogram());
  F.setSubprogram(nullptr);
  NewFn->splice(NewFn->begin(), &F);

  // A `void main` returns nothing; turn every `ret void` into `ret i32 0`.
  // Collect first, then rewrite, so we do not invalidate the iteration.
  if (!F.getReturnType()->isIntegerTy(32)) {
    SmallVector<ReturnInst *> WorkList;
    for (BasicBlock &BB : *NewFn)
      for (Instruction &I : BB) {
        auto *RI = dyn_cast<ReturnInst>(&I);
        if (!RI)
          continue;
        assert(RI->getReturnValue() == nullptr &&
               "return value of a void main function is not nullptr");
        WorkList.push_back(RI);
      }
    for (auto *RI : WorkList) {
      (void)ReturnInst::Create(Ctx, ConstantInt::getNullValue(Int32Ty), RI);
      RI->eraseFromParent();
    }
  }

  // Remap the old arguments onto the new ones when the counts line up
  // (e.g. `main(void)` has no arguments to remap).
  // NOTE(review): if the counts differ but the old arguments have uses,
  // those uses are left referring to the dead function — confirm such
  // signatures cannot occur here.
  if (F.arg_size() == NewFn->arg_size())
    for (unsigned I = 0; I < NewFn->arg_size(); ++I) {
      Argument *OldArg = F.getArg(I);
      Argument *NewArg = NewFn->getArg(I);
      NewArg->takeName(OldArg);
      OldArg->replaceAllUsesWith(NewArg);
    }
  return true;
}
/// Find the module's `main`, optionally rename it, and rewrite it into the
/// canonical `int main(int argc, char *argv[])` form.
PreservedAnalyses CanonicalizeMainFunctionPass::run(Module &M,
                                                    ModuleAnalysisManager &AM) {
  // Locate `main`; the full scan (rather than an early break) lets the
  // assertion below catch duplicates in asserts builds.
  Function *Main = nullptr;
  for (Function &Fn : M) {
    if (Fn.getName() != "main")
      continue;
    assert(Main == nullptr && "more than one main function");
    Main = &Fn;
  }
  if (!Main)
    return PreservedAnalyses::all();

  bool Modified = false;

  // Honor -canonical-main-function-name if it requests a different name.
  if (!MainFunctionName.empty() && Main->getName() != MainFunctionName) {
    Main->setName(MainFunctionName);
    Modified = true;
  }

  // If a canonical replacement was created, the old function is now empty
  // and must be removed.
  if (rewriteMainFunction(*Main)) {
    Main->eraseFromParent();
    Modified = true;
  }

  return Modified ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

View File

@@ -116,7 +116,7 @@ set(src_files
# vectorized accesses to the shared state. Generally, those are "good" but
# the optimizer pipeline (esp. Attributor) does not fully support vectorized
# instructions yet and we end up missing out on way more important constant
# propagation. That said, we will run the vectorizer again after the runtime
# propagation. That said, we will run the vectorizer again after the runtime
# has been linked into the user program.
set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512 -mllvm -vectorize-slp=false )
set(link_opt_flags -O3 -openmp-opt-disable -attributor-enable=module -vectorize-slp=false )

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=False)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
from driver.wrapper import run
def main():
run(is_cpp=True)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,31 @@
//===------- Main.c - Direct compilation program start point ------ C -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <string.h>
extern int __user_main(int, char *[]);
// Host-side program start point: map the command line onto the device, run
// the user's (renamed) main inside a target region, and return its status.
int main(int argc, char *argv[]) {
// Map the argv pointer array itself onto the device.
#pragma omp target enter data map(to: argv[:argc])
  // Map the bytes of each argument string as well, one by one.
  // NOTE(review): strlen excludes the terminating NUL — confirm the device
  // copy is still NUL-terminated if user code depends on that.
  for (int I = 0; I < argc; ++I) {
#pragma omp target enter data map(to: argv[I][:strlen(argv[I])])
  }
  int Ret = 0;
#pragma omp target enter data map(to: Ret)
// Launch the user's program on the device. __user_main is the original main,
// renamed via -canonical-main-function-name=__user_main by the driver.
#pragma omp target teams num_teams(1) thread_limit(1024)
  {
    Ret = __user_main(argc, argv);
  }
// Copy the exit status back from the device.
#pragma omp target exit data map(from: Ret)
  return Ret;
}

View File

@@ -0,0 +1,14 @@
//===------- UserWrapper.h - User code wrapper --------------- C++ --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
#define OPENMP_LIBOMPTARGET_DIRECTGPUCOMPILATION_USERWRAPPER_H
// Force-included (via `-include`) ahead of the user's sources so that all
// following user code is compiled for the device only.
// NOTE(review): there is no matching `#pragma omp end declare target` in this
// header — confirm the region is intentionally left open for the rest of the
// translation unit (or closed elsewhere).
#pragma omp begin declare target device_type(nohost)
#endif

View File

@@ -0,0 +1,139 @@
import argparse
import os
import pathlib
import subprocess
import sys
import tempfile
# Directory containing this driver module; used to locate the bundled
# Main.c loader stub and UserWrapper.h.
cwd = os.path.dirname(os.path.realpath(__file__))
# Recognized source-file extensions.
# NOTE(review): not referenced anywhere in this file — confirm other modules
# use it, or remove it.
source_file_suffix = [".c", ".cpp", ".cu", ".hip", ".cc", ".cxx"]
def print_version():
    """Print the underlying clang's version banner.

    Exits the driver with clang's return code if the invocation fails.
    """
    proc = subprocess.run(["clang", "--version"])
    if proc.returncode:
        sys.exit(proc.returncode)
def compile_loader(loader_name, targets, verbose, dry_run):
    """Compile the device loader stub (Main.c) into an object file.

    loader_name -- path of the object file to produce.
    targets     -- offload architectures; falls back to 'native' when empty.
    verbose     -- echo the clang command line to stderr.
    dry_run     -- echo the command line to stderr and return without running.

    Exits the driver with clang's return code on failure.
    """
    cmd = [
        "clang",
        "-c",
        "-fopenmp",
        "-foffload-lto",
        "-fopenmp-offload-mandatory",
        "-o",
        loader_name,
    ]
    # One --offload-arch per requested target; 'native' autodetects the GPU.
    for arch in targets if targets else ["native"]:
        cmd.append("--offload-arch={}".format(arch))
    cmd.append(os.path.join(cwd, "Main.c"))

    cmd_line = " ".join(cmd)
    if verbose:
        print(cmd_line, file=sys.stderr)
    if dry_run:
        print(cmd_line, file=sys.stderr)
        return
    proc = subprocess.run(cmd)
    if proc.returncode:
        sys.exit(proc.returncode)
def invoke_clang(is_cpp, args, targets, verbose, dry_run):
    """Invoke clang/clang++ on the user's sources for direct GPU compilation.

    is_cpp  -- use clang++ instead of clang.
    args    -- forwarded user arguments (sources, flags, loader object).
    targets -- offload architectures; falls back to 'native' when empty.
    verbose -- echo the clang command line to stderr.
    dry_run -- echo the command line to stderr and return without running.

    Exits the driver with clang's return code on failure.
    """
    cmd = [
        "clang++" if is_cpp else "clang",
        "-fopenmp",
        "-foffload-lto",
        "-fopenmp-offload-mandatory",
        "-fopenmp-globalize-to-global-space",
        # Wrap all user code in a device-only declare-target region.
        "-include",
        os.path.join(cwd, "UserWrapper.h"),
        "--save-temps",
        "-rdynamic",
        "-mllvm",
        "-openmp-opt-disable-state-machine-rewrite",
        # Rename the user's main so the Main.c loader can call it on device.
        "-mllvm",
        "-enable-canonicalize-main-function",
        "-mllvm",
        "-canonical-main-function-name=__user_main",
    ]
    if targets:
        for arch in targets:
            cmd.append("--offload-arch={}".format(arch))
    else:
        cmd.append("--offload-arch=native")
    cmd += args
    # Echo to stderr, matching compile_loader and clang's own -### behavior
    # (previously this went to stdout, inconsistent with its sibling).
    if verbose:
        print(" ".join(cmd), file=sys.stderr)
    if dry_run:
        print(" ".join(cmd), file=sys.stderr)
        return
    cp = subprocess.run(cmd)
    if cp.returncode != 0:
        sys.exit(cp.returncode)
def run(is_cpp=False):
    """Parse the driver's own options and drive a direct-GPU compilation.

    is_cpp -- invoke clang++ instead of clang for the user's sources.

    Options not recognized here are forwarded verbatim to clang. Unless
    compiling only (-c), a loader object built from Main.c is compiled into a
    temporary file, linked into the final program, and removed afterwards —
    even when a tool fails and exits the driver.
    """
    parser = argparse.ArgumentParser(
        prog="clang-gpu", description="clang LLVM GPU compiler"
    )
    parser.add_argument(
        "-c",
        action="store_true",
        help="Only run preprocess, compile, and assemble steps",
    )
    parser.add_argument(
        "-v", action="store_true", help="Show commands to run and use verbose output"
    )
    parser.add_argument(
        "--version", action="store_true", help="Print version information"
    )
    parser.add_argument(
        "-###",
        action="store_true",
        help="Print (but do not run) the commands to run for this compilation",
    )
    parser.add_argument(
        "--offload-arch",
        type=str,
        help=(
            "Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g."
            " sm_35). If 'native' is used the compiler will detect locally installed"
            " architectures. For HIP offloading, the device architecture can be"
            " followed by target ID features delimited by a colon (e.g."
            " gfx908:xnack+:sramecc-). May be specified more than once."
        ),
        nargs="*",
    )
    args, fwd_args = parser.parse_known_args()
    if args.version:
        print_version()
        return
    if args.c:
        fwd_args.append("-c")
    if args.v:
        fwd_args.append("-v")
    # argparse stores "-###" under an attribute name only reachable via vars().
    dry_run = vars(args)["###"]
    temp_files = []
    try:
        if not args.c:
            # NamedTemporaryFile is used only to obtain a unique path; the
            # loader object gets its own ".o" suffix alongside it.
            tf = tempfile.NamedTemporaryFile()
            loader_name = "{}.o".format(tf.name)
            compile_loader(loader_name, args.offload_arch, args.v, dry_run)
            fwd_args.append(loader_name)
            temp_files.append(loader_name)
        invoke_clang(is_cpp, fwd_args, args.offload_arch, args.v, dry_run)
    finally:
        # Run cleanup even when a failing tool exits via sys.exit():
        # SystemExit unwinds through this frame, so the temporary loader
        # object no longer leaks on failure.
        for f in temp_files:
            if os.path.isfile(f):
                os.unlink(f)