Compare commits


1 Commit

Author:  Shilei Tian
SHA1:    5a4965b173
Message: [OpenMP][DeviceRTL] Add the initial support for simd directive
Date:    2021-09-21 19:00:33 -04:00
12 changed files with 249 additions and 46 deletions

View File

@@ -206,10 +206,10 @@ uint32_t __kmpc_get_hardware_thread_id_in_block();
///{
int8_t __kmpc_is_spmd_exec_mode();
-int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
+int32_t __kmpc_target_init(IdentTy *Ident, int Mode,
                            bool UseGenericStateMachine, bool);
-void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool);
+void __kmpc_target_deinit(IdentTy *Ident, int Mode, bool);
///}
@@ -281,6 +281,12 @@ void __kmpc_serialized_parallel(IdentTy *Loc, uint32_t);
/// TODO
void __kmpc_end_serialized_parallel(IdentTy *Loc, uint32_t);
/// TODO
bool __kmpc_kernel_simd(SIMDRegionFnTy *WorkFn);
/// TODO
void __kmpc_kernel_end_simd();
/// TODO
void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind);
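
The interface change above swaps the boolean IsSPMD flag for an integer Mode word and adds the __kmpc_kernel_simd / __kmpc_kernel_end_simd worker entry points. As a rough illustration of how a caller could compose the Mode argument, here is a sketch following the bit layout defined by mapping::utils later in this diff (0x1 = SPMD, 0x2 = SIMD); the enumerator names and the commented call are hypothetical, the patch itself just passes a plain int:

// Illustrative encoding only; these names are not part of the patch.
enum : int {
  OMP_MODE_GENERIC = 0x0, // no bits set: generic (non-SPMD) kernel
  OMP_MODE_SPMD    = 0x1, // bit 0: SPMD execution
  OMP_MODE_SIMD    = 0x2, // bit 1: simd machinery enabled
};

// Hypothetical kernel prologue/epilogue:
//   int32_t TId = __kmpc_target_init(Ident, OMP_MODE_SPMD | OMP_MODE_SIMD,
//                                    /*UseGenericStateMachine=*/false, true);
//   ...
//   __kmpc_target_deinit(Ident, OMP_MODE_SPMD | OMP_MODE_SIMD, true);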

View File

@@ -25,7 +25,7 @@ inline constexpr uint32_t MaxThreadsPerTeam = 1024;
#pragma omp end declare target
/// Initialize the mapping machinery.
-void init(bool IsSPMD);
+void init(int Mode);
/// Return true if the kernel is executed in SPMD mode.
bool isSPMDMode();
@@ -33,6 +33,9 @@ bool isSPMDMode();
/// Return true if the kernel is executed in generic mode.
bool isGenericMode();
/// Return true if the kernel is executed in SIMD mode.
bool isSIMDMode();
/// Return true if the executing thread is the main thread in generic mode.
bool isMainThreadInGenericMode();
@@ -55,6 +58,12 @@ uint32_t getThreadIdInWarp();
/// Return the thread Id in the block, in [0, getBlockSize()).
uint32_t getThreadIdInBlock();
/// Return the logic thread Id, which depends on how we map an OpenMP thread to
/// the target device. In non-SIMD mode, we map an OpenMP thread to a device
/// thread. In SIMD mode, we map an OpenMP thread to a warp, and each thread in
/// the warp is a SIMD lane.
uint32_t getLogicThreadId();
/// Return the warp id in the block.
uint32_t getWarpId();
@@ -79,6 +88,17 @@ uint32_t getKernelSize();
/// Return the number of processing elements on the device.
uint32_t getNumberOfProcessorElements();
namespace utils {
/// Return true if \p Mode indicates SPMD mode.
inline bool isSPMDMode(int Mode) { return Mode & 0x1; }
/// Return true if \p Mode indicates generic mode.
inline bool isGenericMode(int Mode) { return !isSPMDMode(Mode); }
/// Return true if \p Mode indicates SIMD mode.
inline bool isSIMDMode(int Mode) { return Mode & 0x2; }
} // namespace utils
} // namespace mapping
} // namespace _OMP
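
Since the two bits are independent, SPMD/generic and SIMD can be queried separately from the same Mode word. A minimal, self-contained sketch of the decoding (the helpers simply mirror mapping::utils above; the main() is only for illustration):

#include <cassert>

// Mirror of mapping::utils: bit 0x1 selects SPMD, bit 0x2 selects SIMD,
// and "generic" just means "not SPMD".
inline bool isSPMDMode(int Mode) { return Mode & 0x1; }
inline bool isGenericMode(int Mode) { return !isSPMDMode(Mode); }
inline bool isSIMDMode(int Mode) { return Mode & 0x2; }

int main() {
  assert(isGenericMode(0x0) && !isSIMDMode(0x0)); // plain generic kernel
  assert(isSPMDMode(0x1) && !isSIMDMode(0x1));    // plain SPMD kernel
  assert(isGenericMode(0x2) && isSIMDMode(0x2));  // generic kernel with simd
  assert(isSPMDMode(0x3) && isSIMDMode(0x3));     // SPMD kernel with simd
}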

View File

@@ -24,7 +24,7 @@ namespace state {
inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
/// Initialize the state machinery. Must be called by all threads.
-void init(bool IsSPMD);
+void init(int Mode);
/// TODO
enum ValueKind {
@@ -37,6 +37,10 @@ enum ValueKind {
VK_RunSchedChunk,
VK_ParallelRegionFn,
VK_ParallelTeamSize,
// SIMD
VK_SIMDLevel,
VK_SIMDRegionFn,
VK_SIMDLaneWidth,
};
/// TODO
@@ -145,10 +149,20 @@ inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize;
inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
ParallelRegionFn;
/// TODO
inline state::Value<uint32_t, state::VK_SIMDLaneWidth> SIMDLaneWidth;
/// TODO
inline state::PtrValue<SIMDRegionFnTy, state::VK_SIMDRegionFn> SIMDRegionFn;
void runAndCheckState(void(Func(void)));
void assumeInitialState(bool IsSPMD);
/// Propagate the thread state from the leader in the warp to the rest of SIMD
/// workers. This function should only be called in SIMD mode.
void propagateThreadState(unsigned SIMDLen);
} // namespace state
namespace icv {
@@ -171,6 +185,9 @@ inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels;
/// TODO
inline state::Value<uint32_t, state::VK_RunSched> RunSched;
/// TODO
inline state::Value<uint32_t, state::VK_SIMDLevel> SIMDLevel;
} // namespace icv
namespace memory {

View File

@@ -19,7 +19,7 @@ namespace _OMP {
namespace synchronize {
/// Initialize the synchronization machinery. Must be called by all threads.
-void init(bool IsSPMD);
+void init(int Mode);
/// Synchronize all threads in a warp identified by \p Mask.
void warp(LaneMaskTy Mask);

View File

@@ -150,6 +150,8 @@ using __kmpc_impl_lanemask_t = LaneMaskTy;
using ParallelRegionFnTy = void *;
using SIMDRegionFnTy = void *;
using CriticalNameTy = int32_t[8];
struct omp_lock_t {

View File

@@ -21,17 +21,17 @@ using namespace _OMP;
#pragma omp declare target
-static void inititializeRuntime(bool IsSPMD) {
+static void inititializeRuntime(int Mode) {
   // Order is important here.
-  synchronize::init(IsSPMD);
-  mapping::init(IsSPMD);
-  state::init(IsSPMD);
+  synchronize::init(Mode);
+  mapping::init(Mode);
+  state::init(Mode);
}
/// Simple generic state machine for worker threads.
static void genericStateMachine(IdentTy *Ident) {
-  uint32_t TId = mapping::getThreadIdInBlock();
+  uint32_t TId = mapping::getLogicThreadId();
do {
ParallelRegionFnTy WorkFn = 0;
@@ -58,23 +58,56 @@ static void genericStateMachine(IdentTy *Ident) {
} while (true);
}
namespace {
void runSIMDStateMachine(IdentTy *Ident) {
uint32_t LaneId = mapping::getThreadIdInWarp();
do {
SIMDRegionFnTy WorkFn = nullptr;
// Wait for the signal that we have a new work function.
synchronize::warp(mapping::activemask());
// Retrieve the work function from the runtime.
bool IsActive = __kmpc_kernel_simd(&WorkFn);
if (!WorkFn)
return;
if (IsActive) {
((void (*)(uint32_t, uint32_t))WorkFn)(0, LaneId);
__kmpc_kernel_end_simd();
}
synchronize::warp(mapping::activemask());
} while (true);
}
} // namespace
extern "C" {
/// Initialization
///
/// \param Ident Source location identification, can be NULL.
///
-int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
+int32_t __kmpc_target_init(IdentTy *Ident, int Mode,
                            bool UseGenericStateMachine, bool) {
-  if (IsSPMD) {
-    inititializeRuntime(/* IsSPMD */ true);
-    synchronize::threads();
-  } else {
-    inititializeRuntime(/* IsSPMD */ false);
-    // No need to wait since only the main threads will execute user
-    // code and workers will run into a barrier right away.
+  Mode = Mode | 0x2;
+  inititializeRuntime(Mode);
+  // For all SIMD workers, start the simd state machine.
+  if (mapping::utils::isSIMDMode(Mode)) {
+    uint32_t LaneId = mapping::getThreadIdInWarp();
+    if (LaneId) {
+      runSIMDStateMachine(Ident);
+      return LaneId;
+    }
+  }
+  const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
+  if (IsSPMD)
+    synchronize::threads();
   if (IsSPMD) {
     state::assumeInitialState(IsSPMD);
     return -1;
@@ -96,7 +129,9 @@ int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
///
/// \param Ident Source location identification, can be NULL.
///
-void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool) {
+void __kmpc_target_deinit(IdentTy *Ident, int Mode, bool) {
+  const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
   state::assumeInitialState(IsSPMD);
   if (IsSPMD)
     return;
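
runSIMDStateMachine mirrors the existing generic state machine, but at warp scope: every non-leader lane loops, waits on a warp barrier, pulls the work function its leader published via __kmpc_kernel_simd, runs it if its lane id is below the simd width, and synchronizes again; a null work function ends the loop. Below is a host-side model of that two-barrier handshake using std::thread and std::barrier in place of warp synchronization; the names, the termination convention, and the fact that the leader does not execute the region itself (matching the TODO in __kmpc_simd_51) are assumptions of the sketch, not device code from the patch:

#include <atomic>
#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

// One "warp" of WarpSize host threads; thread 0 plays the warp leader,
// the others play parked SIMD workers.
constexpr int WarpSize = 4;
std::barrier<> WarpBarrier(WarpSize);             // stands in for synchronize::warp()
std::atomic<void (*)(int)> SIMDRegionFn{nullptr}; // stands in for state::SIMDRegionFn
std::atomic<int> SIMDLaneWidth{0};                // stands in for state::SIMDLaneWidth

void simdWorker(int LaneId) {
  while (true) {
    WarpBarrier.arrive_and_wait();      // wait for the "new work" signal
    auto *WorkFn = SIMDRegionFn.load();
    if (!WorkFn)                        // nullptr is the termination signal
      return;
    if (LaneId < SIMDLaneWidth.load())  // only lanes inside the simd width run
      WorkFn(LaneId);
    WarpBarrier.arrive_and_wait();      // report that the region finished
  }
}

void warpLeader() {
  SIMDLaneWidth.store(WarpSize);
  SIMDRegionFn.store(+[](int Lane) { std::printf("lane %d executes\n", Lane); });
  WarpBarrier.arrive_and_wait();        // release the workers
  WarpBarrier.arrive_and_wait();        // wait until they are done
  SIMDRegionFn.store(nullptr);          // publish the termination signal
  WarpBarrier.arrive_and_wait();
}

int main() {
  std::vector<std::thread> Lanes;
  Lanes.emplace_back(warpLeader);
  for (int L = 1; L < WarpSize; ++L)
    Lanes.emplace_back(simdWorker, L);
  for (auto &T : Lanes)
    T.join();
}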

View File

@@ -178,7 +178,7 @@ bool mapping::isMainThreadInGenericMode() {
bool mapping::isLeaderInWarp() {
__kmpc_impl_lanemask_t Active = mapping::activemask();
__kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
-  return utils::popc(Active & LaneMaskLT) == 0;
+  return ::_OMP::utils::popc(Active & LaneMaskLT) == 0;
}
LaneMaskTy mapping::activemask() { return impl::activemask(); }
@@ -191,6 +191,13 @@ uint32_t mapping::getThreadIdInWarp() { return impl::getThreadIdInWarp(); }
uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); }
uint32_t mapping::getLogicThreadId() {
if (mapping::isSIMDMode())
return mapping::getWarpId();
return mapping::getThreadIdInBlock();
}
uint32_t mapping::getBlockSize() { return impl::getBlockSize(); }
uint32_t mapping::getKernelSize() { return impl::getKernelSize(); }
@@ -214,16 +221,20 @@ uint32_t mapping::getNumberOfWarpsInBlock() {
/// Execution mode
///
///{
-static int SHARED(IsSPMDMode);
+static int SHARED(ExecutionMode);
-void mapping::init(bool IsSPMD) {
+void mapping::init(int Mode) {
   if (!mapping::getThreadIdInBlock())
-    IsSPMDMode = IsSPMD;
+    ExecutionMode = Mode;
 }
-bool mapping::isSPMDMode() { return IsSPMDMode; }
+bool mapping::isSPMDMode() { return mapping::utils::isSPMDMode(ExecutionMode); }
-bool mapping::isGenericMode() { return !isSPMDMode(); }
+bool mapping::isGenericMode() {
+  return mapping::utils::isGenericMode(ExecutionMode);
+}
+bool mapping::isSIMDMode() { return mapping::utils::isSIMDMode(ExecutionMode); }
///}
extern "C" {

View File

@@ -49,20 +49,43 @@ namespace {
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
uint32_t NThreadsICV =
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
-  uint32_t NumThreads = mapping::getBlockSize();
+  const bool IsSIMDMode = mapping::isSIMDMode();
+  uint32_t NumThreads =
+      IsSIMDMode ? mapping::getNumberOfWarpsInBlock() : mapping::getBlockSize();
if (NThreadsICV != 0 && NThreadsICV < NumThreads)
NumThreads = NThreadsICV;
   // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
-  if (NumThreads < mapping::getWarpSize())
-    NumThreads = 1;
-  else
-    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+  // We don't need this for SIMD mode because an OpenMP thread is mapped to a
+  // warp on the device, so the thread count need not be a multiple of the
+  // warp size.
+  if (!IsSIMDMode) {
+    if (NumThreads < mapping::getWarpSize())
+      NumThreads = 1;
+    else
+      NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+  }
return NumThreads;
}
uint32_t determineSIMDLen(int32_t SIMDLen, int32_t SafeLen) {
ASSERT(mapping::isSIMDMode());
// TODO: This is probably not right if the schedule is different.
if (SafeLen < SIMDLen)
SIMDLen = SafeLen;
// We currently map an OpenMP thread to a warp in SIMD mode. If the simdlen
// is larger than the warp size, cap it at the warp size.
if (SIMDLen > mapping::getWarpSize())
SIMDLen = mapping::getWarpSize();
return SIMDLen;
}
// Invoke an outlined parallel function unwrapping arguments (up to 32).
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
void **args, int64_t nargs) {
@@ -78,11 +101,57 @@ void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
extern "C" {
void __kmpc_simd_51(IdentTy *ident, int32_t, int32_t if_expr, int32_t safelen,
int32_t simdlen, int order, void *fn, void *wrapper_fn,
void **args, int64_t nargs) {
// Handle the non-SIMD case first, which happens when:
// - the if clause evaluates to false,
// - simdlen (or safelen) is 1, or
// - we are already inside a simd region.
const uint32_t LogicThreadId = mapping::getLogicThreadId();
if (OMP_UNLIKELY(!if_expr || simdlen == 1 || safelen == 1 ||
icv::SIMDLevel)) {
invokeMicrotask(LogicThreadId, 0, fn, args, nargs);
return;
}
// Only the leader of each warp can execute the following code.
ASSERT(mapping::isLeaderInWarp());
const uint32_t SIMDLen = determineSIMDLen(simdlen, safelen);
if (LogicThreadId == 0)
state::SIMDLaneWidth = SIMDLen;
// Propagates the thread state to all SIMD workers from the leader.
state::propagateThreadState(SIMDLen);
// Synchronize all threads (leaders).
synchronize::threads();
{
state::ValueRAII SIMDRegionFnRAII(state::SIMDRegionFn, wrapper_fn,
(void *)nullptr, true);
state::ValueRAII SIMDLevelRAII(icv::SIMDLevel, 1u, 0u, true);
// Signal SIMD workers
synchronize::warp(mapping::activemask());
// TODO: Leader in warp also has to execute the SIMD region.
// What we need:
// - A work-sharing function that can take both thread id and lane id into
// consideration.
// Synchronize after execution of the SIMD region.
synchronize::warp(mapping::activemask());
}
}
void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
int32_t num_threads, int proc_bind, void *fn,
void *wrapper_fn, void **args, int64_t nargs) {
-  uint32_t TId = mapping::getThreadIdInBlock();
+  uint32_t TId = mapping::getLogicThreadId();
// Handle the serialized case first, same for SPMD/non-SPMD.
if (OMP_UNLIKELY(!if_expr || icv::Level)) {
__kmpc_serialized_parallel(ident, TId);
@@ -156,7 +225,7 @@ __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
return false;
// Set to true for workers participating in the parallel region.
-  uint32_t TId = mapping::getThreadIdInBlock();
+  uint32_t TId = mapping::getLogicThreadId();
bool ThreadIsActive = TId < state::ParallelTeamSize;
return ThreadIsActive;
}
@@ -170,6 +239,25 @@ __attribute__((noinline)) void __kmpc_kernel_end_parallel() {
ASSERT(!mapping::isSPMDMode());
}
__attribute__((noinline)) bool
__kmpc_kernel_simd(SIMDRegionFnTy *WorkFn) {
// Work function and arguments for L1 SIMD region.
*WorkFn = state::SIMDRegionFn;
// If this is the termination signal from the master, quit early.
if (!*WorkFn)
return false;
// Set to true for lanes participating in the simd region.
uint32_t LaneId = mapping::getThreadIdInWarp();
bool LaneActive = LaneId < state::SIMDLaneWidth;
return LaneActive;
}
__attribute__((noinline)) void __kmpc_kernel_end_simd() {
// TODO: Some clean-up of SIMD execution
}
void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) {
state::enterDataEnvironment();
++icv::Level;
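
determineNumberOfThreads and determineSIMDLen carry the new sizing rules: in SIMD mode the team is counted in warps and is not rounded to a warp multiple, and the effective simd length is min(simdlen, safelen) capped at the warp size. A standalone sketch of just that arithmetic, with a few sample values (the helper names and the sample numbers are illustrative):

#include <cassert>
#include <cstdint>

constexpr uint32_t WarpSize = 32;

// Non-SIMD rule: clamp to the num_threads clause / nthreads ICV, then round
// down to a multiple of the warp size (or to 1 below a full warp).
uint32_t numThreadsNonSIMD(uint32_t BlockSize, uint32_t NThreadsICV) {
  uint32_t NumThreads = BlockSize;
  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;
  return NumThreads < WarpSize ? 1 : (NumThreads & ~(WarpSize - 1));
}

// SIMD rule: one OpenMP thread per warp, clamped by the clause/ICV only.
uint32_t numThreadsSIMD(uint32_t WarpsInBlock, uint32_t NThreadsICV) {
  uint32_t NumThreads = WarpsInBlock;
  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;
  return NumThreads;
}

// Effective simd length: min(simdlen, safelen), capped at the warp size.
int32_t effectiveSIMDLen(int32_t SIMDLen, int32_t SafeLen) {
  if (SafeLen < SIMDLen)
    SIMDLen = SafeLen;
  return SIMDLen > (int32_t)WarpSize ? (int32_t)WarpSize : SIMDLen;
}

int main() {
  assert(numThreadsNonSIMD(/*BlockSize=*/128, /*num_threads(50)=*/50) == 32);
  assert(numThreadsSIMD(/*WarpsInBlock=*/4, /*num_threads(3)=*/3) == 3);
  assert(effectiveSIMDLen(/*simdlen=*/64, /*safelen=*/16) == 16);
  assert(effectiveSIMDLen(/*simdlen=*/64, /*safelen=*/128) == 32);
}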

View File

@@ -203,7 +203,7 @@ void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
struct TeamStateTy {
/// TODO: provide a proper init function.
-  void init(bool IsSPMD);
+  void init(int Mode);
bool operator==(const TeamStateTy &) const;
@@ -224,8 +224,13 @@ struct TeamStateTy {
TeamStateTy SHARED(TeamState);
-void TeamStateTy::init(bool IsSPMD) {
-  ICVState.NThreadsVar = mapping::getBlockSize();
+void TeamStateTy::init(int Mode) {
+  // In SIMD mode, we map an OpenMP thread to a warp.
+  if (mapping::utils::isSIMDMode(Mode))
+    ICVState.NThreadsVar = mapping::getNumberOfWarpsInBlock();
+  else
+    ICVState.NThreadsVar = mapping::getBlockSize();
ICVState.LevelVar = 0;
ICVState.ActiveLevelVar = 0;
ICVState.MaxActiveLevelsVar = 1;
@@ -357,7 +362,8 @@ void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
__builtin_unreachable();
}
-void state::init(bool IsSPMD) {
+void state::init(int Mode) {
+  const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
SharedMemorySmartStack.init(IsSPMD);
if (!mapping::getThreadIdInBlock())
TeamState.init(IsSPMD);
@@ -404,6 +410,15 @@ void state::assumeInitialState(bool IsSPMD) {
ASSERT(mapping::isSPMDMode() == IsSPMD);
}
void state::propagateThreadState(unsigned SIMDLen) {
ASSERT(mapping::isSIMDMode());
ASSERT(mapping::isLeaderInWarp());
const uint32_t TId = mapping::getThreadIdInBlock();
for (int I = 1; I < SIMDLen; ++I)
ThreadStates[I + TId] = ThreadStates[TId];
}
extern "C" {
void omp_set_dynamic(int V) {}
@@ -434,7 +449,7 @@ void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
}
int omp_get_ancestor_thread_num(int Level) {
-  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
+  return returnValIfLevelIsActive(Level, mapping::getLogicThreadId(), 0);
}
int omp_get_thread_num(void) {
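
state::propagateThreadState indexes the per-thread state table by thread id in the block: the leader at thread id TId replicates its slot into the next SIMDLen - 1 slots, which lands on its own warp's lanes because the leader is expected to be lane 0 of its warp. A toy illustration of that indexing (the state entries are placeholder ints, not the runtime's thread-state objects):

#include <array>
#include <cassert>

// Toy stand-in for the per-thread state table; each entry is just an int.
std::array<int, 128> ThreadStates{};

// Mirrors the loop in state::propagateThreadState(): copy the leader's slot
// into slots TId + 1 .. TId + SIMDLen - 1.
void propagateThreadState(unsigned TId, unsigned SIMDLen) {
  for (unsigned I = 1; I < SIMDLen; ++I)
    ThreadStates[TId + I] = ThreadStates[TId];
}

int main() {
  // The leader of warp 2 (threads 64..95 with a warp size of 32) publishes its
  // state to the first 8 lanes of its warp.
  ThreadStates[64] = 7;
  propagateThreadState(/*TId=*/64, /*SIMDLen=*/8);
  assert(ThreadStates[71] == 7); // last lane inside the simd width
  assert(ThreadStates[72] == 0); // lanes beyond the simd width are untouched
}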

View File

@@ -214,8 +214,8 @@ void setLock(omp_lock_t *Lock) {
} // namespace impl
-void synchronize::init(bool IsSPMD) {
-  if (!IsSPMD)
+void synchronize::init(int Mode) {
+  if (!mapping::utils::isSPMDMode(Mode) || mapping::utils::isSIMDMode(Mode))
impl::namedBarrierInit();
}

View File

@@ -210,7 +210,7 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
static void dispatch_init(IdentTy *loc, int32_t threadId,
kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
DynamicScheduleTracker *DST) {
-    int tid = mapping::getThreadIdInBlock();
+    int tid = mapping::getLogicThreadId();
T tnum = omp_get_num_threads();
T tripCount = ub - lb + 1; // +1 because ub is inclusive
ASSERT0(LT_FUSSY, threadId < tnum,
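
Switching dispatch_init to the logical thread id keeps tid in the same [0, omp_get_num_threads()) domain in both modes: device thread ids normally, warp ids in SIMD mode. The snippet below is a generic blocked split, not this runtime's schedule code, but it shows why a mismatched domain would silently drop iterations:

#include <cassert>
#include <cstdint>

// Generic blocked split of tripCount iterations over tnum workers; computes
// the half-open range [lb, ub) owned by worker tid.
void staticSplit(int64_t tripCount, int32_t tnum, int32_t tid,
                 int64_t &lb, int64_t &ub) {
  int64_t chunk = (tripCount + tnum - 1) / tnum;
  lb = (int64_t)tid * chunk;
  ub = lb + chunk > tripCount ? tripCount : lb + chunk;
}

int main() {
  int64_t lb, ub;
  // SIMD mode with 4 warps: omp_get_num_threads() == 4, tid must be a warp id.
  staticSplit(/*tripCount=*/100, /*tnum=*/4, /*warp id*/ 2, lb, ub);
  assert(lb == 50 && ub == 75);
  // Feeding a raw thread-in-block id (e.g. 70) yields an empty range instead.
  staticSplit(100, 4, /*raw thread id*/ 70, lb, ub);
  assert(lb >= 100); // nothing left to execute
}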

View File

@@ -1099,13 +1099,22 @@ public:
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
int CudaThreadsPerBlock;
+  // TODO: Set this mode accordingly.
+  bool IsSIMDMode = false;
   if (ThreadLimit > 0) {
-    DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
-    CudaThreadsPerBlock = ThreadLimit;
-    // Add master warp if necessary
-    if (KernelInfo->ExecutionMode == GENERIC) {
-      DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
-      CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+    if (IsSIMDMode) {
+      DP("Setting CUDA threads per block to requested %d\n",
+         ThreadLimit * DeviceData[DeviceId].WarpSize);
+      CudaThreadsPerBlock = ThreadLimit * DeviceData[DeviceId].WarpSize;
+    } else {
+      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
+      CudaThreadsPerBlock = ThreadLimit;
+      // Add master warp if necessary
+      if (KernelInfo->ExecutionMode == GENERIC) {
+        DP("Adding master warp: +%d threads\n",
+           DeviceData[DeviceId].WarpSize);
+        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+      }
+    }
} else {
DP("Setting CUDA threads per block to default %d\n",