Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5a4965b173 |
@@ -206,10 +206,10 @@ uint32_t __kmpc_get_hardware_thread_id_in_block();
|
|||||||
///{
|
///{
|
||||||
int8_t __kmpc_is_spmd_exec_mode();
|
int8_t __kmpc_is_spmd_exec_mode();
|
||||||
|
|
||||||
int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
|
int32_t __kmpc_target_init(IdentTy *Ident, int Mode,
|
||||||
bool UseGenericStateMachine, bool);
|
bool UseGenericStateMachine, bool);
|
||||||
|
|
||||||
void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool);
|
void __kmpc_target_deinit(IdentTy *Ident, int Mode, bool);
|
||||||
|
|
||||||
///}
|
///}
|
||||||
|
|
||||||
@@ -281,6 +281,12 @@ void __kmpc_serialized_parallel(IdentTy *Loc, uint32_t);
|
|||||||
/// TODO
|
/// TODO
|
||||||
void __kmpc_end_serialized_parallel(IdentTy *Loc, uint32_t);
|
void __kmpc_end_serialized_parallel(IdentTy *Loc, uint32_t);
|
||||||
|
|
||||||
|
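/// Store the work function of the current SIMD region in \p WorkFn. Returns
/// true if the calling lane takes part in that region; returns false if it
/// does not or if the stored work function is null.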
|
||||||
|
bool __kmpc_kernel_simd(SIMDRegionFnTy *WorkFn);
|
||||||
|
|
||||||
|
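/// Called by a SIMD worker lane after it has run the work function of a SIMD
/// region. Currently performs no clean-up.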
|
||||||
|
void __kmpc_kernel_end_simd();
|
||||||
|
|
||||||
/// TODO
|
/// TODO
|
||||||
void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind);
|
void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind);
|
||||||
|
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ inline constexpr uint32_t MaxThreadsPerTeam = 1024;
|
|||||||
#pragma omp end declare target
|
#pragma omp end declare target
|
||||||
|
|
||||||
/// Initialize the mapping machinery.
|
/// Initialize the mapping machinery.
|
||||||
void init(bool IsSPMD);
|
void init(int Mode);
|
||||||
|
|
||||||
/// Return true if the kernel is executed in SPMD mode.
|
/// Return true if the kernel is executed in SPMD mode.
|
||||||
bool isSPMDMode();
|
bool isSPMDMode();
|
||||||
@@ -33,6 +33,9 @@ bool isSPMDMode();
|
|||||||
/// Return true if the kernel is executed in generic mode.
|
/// Return true if the kernel is executed in generic mode.
|
||||||
bool isGenericMode();
|
bool isGenericMode();
|
||||||
|
|
||||||
|
/// Return true if the kernel is executed in SIMD mode.
|
||||||
|
bool isSIMDMode();
|
||||||
|
|
||||||
/// Return true if the executing thread is the main thread in generic mode.
|
/// Return true if the executing thread is the main thread in generic mode.
|
||||||
bool isMainThreadInGenericMode();
|
bool isMainThreadInGenericMode();
|
||||||
|
|
||||||
@@ -55,6 +58,12 @@ uint32_t getThreadIdInWarp();
|
|||||||
/// Return the thread Id in the block, in [0, getBlockSize()).
|
/// Return the thread Id in the block, in [0, getBlockSize()).
|
||||||
uint32_t getThreadIdInBlock();
|
uint32_t getThreadIdInBlock();
|
||||||
|
|
||||||
|
/// Return the logical thread Id, which depends on how we map an OpenMP thread to
|
||||||
|
/// the target device. In non-SIMD mode, we map an OpenMP thread to a device
|
||||||
|
/// thread. In SIMD mode, we map an OpenMP thread to a warp, and each thread in
|
||||||
|
/// the warp is a SIMD lane.
|
||||||
|
uint32_t getLogicThreadId();
|
||||||
|
|
||||||
/// Return the warp id in the block.
|
/// Return the warp id in the block.
|
||||||
uint32_t getWarpId();
|
uint32_t getWarpId();
|
||||||
|
|
||||||
@@ -79,6 +88,17 @@ uint32_t getKernelSize();
|
|||||||
/// Return the number of processing elements on the device.
|
/// Return the number of processing elements on the device.
|
||||||
uint32_t getNumberOfProcessorElements();
|
uint32_t getNumberOfProcessorElements();
|
||||||
|
|
||||||
|
namespace utils {

/// Bit that is set in an execution mode value when the kernel runs in SPMD
/// mode. A cleared bit means generic mode.
inline constexpr int ModeSPMDBit = 0x1;

/// Bit that is set in an execution mode value when the kernel runs in SIMD
/// mode.
inline constexpr int ModeSIMDBit = 0x2;

/// Return true if \p Mode indicates SPMD mode.
inline bool isSPMDMode(int Mode) { return (Mode & ModeSPMDBit) != 0; }

/// Return true if \p Mode indicates generic mode, i.e. it is not SPMD mode.
inline bool isGenericMode(int Mode) { return !isSPMDMode(Mode); }

/// Return true if \p Mode indicates SIMD mode. The SIMD bit is independent of
/// the SPMD/generic bit, so both predicates can hold for the same \p Mode.
inline bool isSIMDMode(int Mode) { return (Mode & ModeSIMDBit) != 0; }

} // namespace utils
|
||||||
|
|
||||||
} // namespace mapping
|
} // namespace mapping
|
||||||
|
|
||||||
} // namespace _OMP
|
} // namespace _OMP
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ namespace state {
|
|||||||
inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
|
inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
|
||||||
|
|
||||||
/// Initialize the state machinery. Must be called by all threads.
|
/// Initialize the state machinery. Must be called by all threads.
|
||||||
void init(bool IsSPMD);
|
void init(int Mode);
|
||||||
|
|
||||||
/// TODO
|
/// TODO
|
||||||
enum ValueKind {
|
enum ValueKind {
|
||||||
@@ -37,6 +37,10 @@ enum ValueKind {
|
|||||||
VK_RunSchedChunk,
|
VK_RunSchedChunk,
|
||||||
VK_ParallelRegionFn,
|
VK_ParallelRegionFn,
|
||||||
VK_ParallelTeamSize,
|
VK_ParallelTeamSize,
|
||||||
|
// SIMD
|
||||||
|
VK_SIMDLevel,
|
||||||
|
VK_SIMDRegionFn,
|
||||||
|
VK_SIMDLaneWidth,
|
||||||
};
|
};
|
||||||
|
|
||||||
/// TODO
|
/// TODO
|
||||||
@@ -145,10 +149,20 @@ inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize;
|
|||||||
inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
|
inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
|
||||||
ParallelRegionFn;
|
ParallelRegionFn;
|
||||||
|
|
||||||
|
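/// Number of lanes in a warp that take part in the current SIMD region; lanes
/// whose id in the warp is not below this value stay inactive.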
|
||||||
|
inline state::Value<uint32_t, state::VK_SIMDLaneWidth> SIMDLaneWidth;
|
||||||
|
|
||||||
|
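/// Work function of the current SIMD region. SIMD workers leave their state
/// machine when they read a null value here.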
|
||||||
|
inline state::PtrValue<SIMDRegionFnTy, state::VK_SIMDRegionFn> SIMDRegionFn;
|
||||||
|
|
||||||
void runAndCheckState(void(Func(void)));
|
void runAndCheckState(void(Func(void)));
|
||||||
|
|
||||||
void assumeInitialState(bool IsSPMD);
|
void assumeInitialState(bool IsSPMD);
|
||||||
|
|
||||||
|
/// Propagate the thread state from the leader in the warp to the rest of SIMD
|
||||||
|
/// workers. This function should only be called in SIMD mode.
|
||||||
|
void propagateThreadState(unsigned SIMDLen);
|
||||||
|
|
||||||
} // namespace state
|
} // namespace state
|
||||||
|
|
||||||
namespace icv {
|
namespace icv {
|
||||||
@@ -171,6 +185,9 @@ inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels;
|
|||||||
/// TODO
|
/// TODO
|
||||||
inline state::Value<uint32_t, state::VK_RunSched> RunSched;
|
inline state::Value<uint32_t, state::VK_RunSched> RunSched;
|
||||||
|
|
||||||
|
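/// SIMD nesting indicator: a non-zero value makes `__kmpc_simd_51` run the
/// region sequentially on the calling thread. NOTE(review): presumably 1 while
/// a SIMD region is active and 0 otherwise - confirm against state::ValueRAII.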
|
||||||
|
inline state::Value<uint32_t, state::VK_SIMDLevel> SIMDLevel;
|
||||||
|
|
||||||
} // namespace icv
|
} // namespace icv
|
||||||
|
|
||||||
namespace memory {
|
namespace memory {
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ namespace _OMP {
|
|||||||
namespace synchronize {
|
namespace synchronize {
|
||||||
|
|
||||||
/// Initialize the synchronization machinery. Must be called by all threads.
|
/// Initialize the synchronization machinery. Must be called by all threads.
|
||||||
void init(bool IsSPMD);
|
void init(int Mode);
|
||||||
|
|
||||||
/// Synchronize all threads in a warp identified by \p Mask.
|
/// Synchronize all threads in a warp identified by \p Mask.
|
||||||
void warp(LaneMaskTy Mask);
|
void warp(LaneMaskTy Mask);
|
||||||
|
|||||||
@@ -150,6 +150,8 @@ using __kmpc_impl_lanemask_t = LaneMaskTy;
|
|||||||
|
|
||||||
using ParallelRegionFnTy = void *;
|
using ParallelRegionFnTy = void *;
|
||||||
|
|
||||||
|
using SIMDRegionFnTy = void *;
|
||||||
|
|
||||||
using CriticalNameTy = int32_t[8];
|
using CriticalNameTy = int32_t[8];
|
||||||
|
|
||||||
struct omp_lock_t {
|
struct omp_lock_t {
|
||||||
|
|||||||
@@ -21,17 +21,17 @@ using namespace _OMP;
|
|||||||
|
|
||||||
#pragma omp declare target
|
#pragma omp declare target
|
||||||
|
|
||||||
static void inititializeRuntime(bool IsSPMD) {
|
static void inititializeRuntime(int Mode) {
|
||||||
// Order is important here.
|
// Order is important here.
|
||||||
synchronize::init(IsSPMD);
|
synchronize::init(Mode);
|
||||||
mapping::init(IsSPMD);
|
mapping::init(Mode);
|
||||||
state::init(IsSPMD);
|
state::init(Mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Simple generic state machine for worker threads.
|
/// Simple generic state machine for worker threads.
|
||||||
static void genericStateMachine(IdentTy *Ident) {
|
static void genericStateMachine(IdentTy *Ident) {
|
||||||
|
|
||||||
uint32_t TId = mapping::getThreadIdInBlock();
|
uint32_t TId = mapping::getLogicThreadId();
|
||||||
|
|
||||||
do {
|
do {
|
||||||
ParallelRegionFnTy WorkFn = 0;
|
ParallelRegionFnTy WorkFn = 0;
|
||||||
@@ -58,22 +58,55 @@ static void genericStateMachine(IdentTy *Ident) {
|
|||||||
} while (true);
|
} while (true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
void runSIMDStateMachine(IdentTy *Ident) {
|
||||||
|
uint32_t LaneId = mapping::getThreadIdInWarp();
|
||||||
|
do {
|
||||||
|
SIMDRegionFnTy WorkFn = nullptr;
|
||||||
|
|
||||||
|
// Wait for the signal that we have a new work function.
|
||||||
|
synchronize::warp(mapping::activemask());
|
||||||
|
|
||||||
|
// Retrieve the work function from the runtime.
|
||||||
|
bool IsActive = __kmpc_kernel_simd(&WorkFn);
|
||||||
|
|
||||||
|
if (!WorkFn)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (IsActive) {
|
||||||
|
((void (*)(uint32_t, uint32_t))WorkFn)(0, LaneId);
|
||||||
|
__kmpc_kernel_end_simd();
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronize::warp(mapping::activemask());
|
||||||
|
} while (true);
|
||||||
|
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
||||||
/// Initialization
|
/// Initialization
|
||||||
///
|
///
|
||||||
/// \param Ident Source location identification, can be NULL.
|
/// \param Ident Source location identification, can be NULL.
|
||||||
///
|
///
|
||||||
int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
|
int32_t __kmpc_target_init(IdentTy *Ident, int Mode,
|
||||||
bool UseGenericStateMachine, bool) {
|
bool UseGenericStateMachine, bool) {
|
||||||
if (IsSPMD) {
|
Mode = Mode | 0x2;
|
||||||
inititializeRuntime(/* IsSPMD */ true);
|
|
||||||
synchronize::threads();
|
inititializeRuntime(Mode);
|
||||||
} else {
|
|
||||||
inititializeRuntime(/* IsSPMD */ false);
|
// For all SIMD workers, start the simd state machine.
|
||||||
// No need to wait since only the main threads will execute user
|
if (mapping::utils::isSIMDMode(Mode)) {
|
||||||
// code and workers will run into a barrier right away.
|
uint32_t LaneId = mapping::getThreadIdInWarp();
|
||||||
|
if (LaneId) {
|
||||||
|
runSIMDStateMachine(Ident);
|
||||||
|
return LaneId;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
|
||||||
|
if (IsSPMD)
|
||||||
|
synchronize::threads();
|
||||||
|
|
||||||
if (IsSPMD) {
|
if (IsSPMD) {
|
||||||
state::assumeInitialState(IsSPMD);
|
state::assumeInitialState(IsSPMD);
|
||||||
@@ -96,7 +129,9 @@ int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
|
|||||||
///
|
///
|
||||||
/// \param Ident Source location identification, can be NULL.
|
/// \param Ident Source location identification, can be NULL.
|
||||||
///
|
///
|
||||||
void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool) {
|
void __kmpc_target_deinit(IdentTy *Ident, int Mode, bool) {
|
||||||
|
const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
|
||||||
|
|
||||||
state::assumeInitialState(IsSPMD);
|
state::assumeInitialState(IsSPMD);
|
||||||
if (IsSPMD)
|
if (IsSPMD)
|
||||||
return;
|
return;
|
||||||
|
|||||||
@@ -178,7 +178,7 @@ bool mapping::isMainThreadInGenericMode() {
|
|||||||
bool mapping::isLeaderInWarp() {
|
bool mapping::isLeaderInWarp() {
|
||||||
__kmpc_impl_lanemask_t Active = mapping::activemask();
|
__kmpc_impl_lanemask_t Active = mapping::activemask();
|
||||||
__kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
|
__kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
|
||||||
return utils::popc(Active & LaneMaskLT) == 0;
|
return ::_OMP::utils::popc(Active & LaneMaskLT) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
LaneMaskTy mapping::activemask() { return impl::activemask(); }
|
LaneMaskTy mapping::activemask() { return impl::activemask(); }
|
||||||
@@ -191,6 +191,13 @@ uint32_t mapping::getThreadIdInWarp() { return impl::getThreadIdInWarp(); }
|
|||||||
|
|
||||||
uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); }
|
uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); }
|
||||||
|
|
||||||
|
/// Return the id used to identify the calling OpenMP thread: the warp id when
/// the kernel runs in SIMD mode, the thread id in the block otherwise.
uint32_t mapping::getLogicThreadId() {
  return mapping::isSIMDMode() ? mapping::getWarpId()
                               : mapping::getThreadIdInBlock();
}
|
||||||
|
|
||||||
uint32_t mapping::getBlockSize() { return impl::getBlockSize(); }
|
uint32_t mapping::getBlockSize() { return impl::getBlockSize(); }
|
||||||
|
|
||||||
uint32_t mapping::getKernelSize() { return impl::getKernelSize(); }
|
uint32_t mapping::getKernelSize() { return impl::getKernelSize(); }
|
||||||
@@ -214,16 +221,20 @@ uint32_t mapping::getNumberOfWarpsInBlock() {
|
|||||||
/// Execution mode
|
/// Execution mode
|
||||||
///
|
///
|
||||||
///{
|
///{
|
||||||
static int SHARED(IsSPMDMode);
|
static int SHARED(ExecutionMode);
|
||||||
|
|
||||||
void mapping::init(bool IsSPMD) {
|
void mapping::init(int Mode) {
|
||||||
if (!mapping::getThreadIdInBlock())
|
if (!mapping::getThreadIdInBlock())
|
||||||
IsSPMDMode = IsSPMD;
|
ExecutionMode = Mode;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool mapping::isSPMDMode() { return IsSPMDMode; }
|
bool mapping::isSPMDMode() { return mapping::utils::isSPMDMode(ExecutionMode); }
|
||||||
|
|
||||||
bool mapping::isGenericMode() { return !isSPMDMode(); }
|
bool mapping::isGenericMode() {
|
||||||
|
return mapping::utils::isGenericMode(ExecutionMode);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool mapping::isSIMDMode() { return mapping::utils::isSIMDMode(ExecutionMode); }
|
||||||
///}
|
///}
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|||||||
@@ -49,20 +49,43 @@ namespace {
|
|||||||
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
|
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
|
||||||
uint32_t NThreadsICV =
|
uint32_t NThreadsICV =
|
||||||
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
|
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
|
||||||
uint32_t NumThreads = mapping::getBlockSize();
|
|
||||||
|
const bool IsSIMDMode = mapping::isSIMDMode();
|
||||||
|
|
||||||
|
uint32_t NumThreads =
|
||||||
|
IsSIMDMode ? mapping::getNumberOfWarpsInBlock() : mapping::getBlockSize();
|
||||||
|
|
||||||
if (NThreadsICV != 0 && NThreadsICV < NumThreads)
|
if (NThreadsICV != 0 && NThreadsICV < NumThreads)
|
||||||
NumThreads = NThreadsICV;
|
NumThreads = NThreadsICV;
|
||||||
|
|
||||||
// Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
|
// Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
|
||||||
|
// We don't need this for SIMD mode because an OpenMP thread is mapped to a
|
||||||
|
// warp on the device and it can be any number.
|
||||||
|
if (!IsSIMDMode) {
|
||||||
if (NumThreads < mapping::getWarpSize())
|
if (NumThreads < mapping::getWarpSize())
|
||||||
NumThreads = 1;
|
NumThreads = 1;
|
||||||
else
|
else
|
||||||
NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
|
NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
|
||||||
|
}
|
||||||
|
|
||||||
return NumThreads;
|
return NumThreads;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Compute the number of lanes used for a SIMD region.
///
/// The result is the smaller of \p SIMDLen and \p SafeLen, limited to the
/// warp size. Must only be called in SIMD mode.
///
/// NOTE(review): non-positive clause values are passed through unchanged
/// apart from the warp-size limit - confirm how callers encode an absent
/// simdlen/safelen clause.
uint32_t determineSIMDLen(int32_t SIMDLen, int32_t SafeLen) {
  ASSERT(mapping::isSIMDMode());

  // TODO: This is probably not right if the schedule is different.
  int32_t Lanes = SafeLen < SIMDLen ? SafeLen : SIMDLen;

  // An OpenMP thread is currently mapped to a warp in SIMD mode, so a request
  // larger than the warp size has to be capped at the warp size.
  if (Lanes > mapping::getWarpSize())
    Lanes = mapping::getWarpSize();

  return Lanes;
}
|
||||||
|
|
||||||
// Invoke an outlined parallel function unwrapping arguments (up to 32).
|
// Invoke an outlined parallel function unwrapping arguments (up to 32).
|
||||||
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
|
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
|
||||||
void **args, int64_t nargs) {
|
void **args, int64_t nargs) {
|
||||||
@@ -78,11 +101,57 @@ void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
|
|||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
||||||
|
/// Entry point for a SIMD region.
///
/// Runs \p fn sequentially on the calling thread when the region must not be
/// executed by SIMD workers. Otherwise it publishes \p wrapper_fn as the SIMD
/// work function and synchronizes with the warp before and after the region.
///
/// \param ident      Source location identification; not used here.
/// \param if_expr    Value of the if clause; zero selects the sequential path.
/// \param safelen    Value of the safelen clause; 1 selects the sequential path.
/// \param simdlen    Value of the simdlen clause; 1 selects the sequential path.
/// \param order      Not used by this function.
/// \param fn         Outlined region, called through invokeMicrotask on the
///                   sequential path.
/// \param wrapper_fn Function stored in state::SIMDRegionFn for SIMD workers.
/// \param args       Arguments forwarded to \p fn on the sequential path.
/// \param nargs      Number of entries in \p args.
void __kmpc_simd_51(IdentTy *ident, int32_t, int32_t if_expr, int32_t safelen,
                    int32_t simdlen, int order, void *fn, void *wrapper_fn,
                    void **args, int64_t nargs) {
  const uint32_t OmpThreadId = mapping::getLogicThreadId();

  // The non-SIMD cases are handled first. The region runs sequentially if
  // - the if clause evaluated to false,
  // - simdlen or safelen is 1, or
  // - we are already inside a SIMD region.
  const bool RunSequentially =
      !if_expr || simdlen == 1 || safelen == 1 || icv::SIMDLevel;
  if (OMP_UNLIKELY(RunSequentially)) {
    invokeMicrotask(OmpThreadId, 0, fn, args, nargs);
    return;
  }

  // From here on only the leader of each warp may be executing.
  ASSERT(mapping::isLeaderInWarp());

  const uint32_t Lanes = determineSIMDLen(simdlen, safelen);

  if (OmpThreadId == 0)
    state::SIMDLaneWidth = Lanes;

  // Hand the leader's thread state to the SIMD workers of its warp.
  state::propagateThreadState(Lanes);

  // Synchronize all threads (leaders).
  synchronize::threads();

  {
    state::ValueRAII SIMDRegionFnRAII(state::SIMDRegionFn, wrapper_fn,
                                      static_cast<void *>(nullptr), true);
    state::ValueRAII SIMDLevelRAII(icv::SIMDLevel, 1u, 0u, true);

    // Wake up the SIMD workers.
    synchronize::warp(mapping::activemask());

    // TODO: Leader in warp also has to execute the SIMD region.
    // What we need:
    // - A work-sharing function that can take both thread id and lane id into
    //   consideration.

    // Wait until the SIMD region has been executed.
    synchronize::warp(mapping::activemask());
  }
}
|
||||||
|
|
||||||
void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
|
void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
|
||||||
int32_t num_threads, int proc_bind, void *fn,
|
int32_t num_threads, int proc_bind, void *fn,
|
||||||
void *wrapper_fn, void **args, int64_t nargs) {
|
void *wrapper_fn, void **args, int64_t nargs) {
|
||||||
|
|
||||||
uint32_t TId = mapping::getThreadIdInBlock();
|
uint32_t TId = mapping::getLogicThreadId();
|
||||||
// Handle the serialized case first, same for SPMD/non-SPMD.
|
// Handle the serialized case first, same for SPMD/non-SPMD.
|
||||||
if (OMP_UNLIKELY(!if_expr || icv::Level)) {
|
if (OMP_UNLIKELY(!if_expr || icv::Level)) {
|
||||||
__kmpc_serialized_parallel(ident, TId);
|
__kmpc_serialized_parallel(ident, TId);
|
||||||
@@ -156,7 +225,7 @@ __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
// Set to true for workers participating in the parallel region.
|
// Set to true for workers participating in the parallel region.
|
||||||
uint32_t TId = mapping::getThreadIdInBlock();
|
uint32_t TId = mapping::getLogicThreadId();
|
||||||
bool ThreadIsActive = TId < state::ParallelTeamSize;
|
bool ThreadIsActive = TId < state::ParallelTeamSize;
|
||||||
return ThreadIsActive;
|
return ThreadIsActive;
|
||||||
}
|
}
|
||||||
@@ -170,6 +239,25 @@ __attribute__((noinline)) void __kmpc_kernel_end_parallel() {
|
|||||||
ASSERT(!mapping::isSPMDMode());
|
ASSERT(!mapping::isSPMDMode());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__attribute__((noinline)) bool
|
||||||
|
__kmpc_kernel_simd(SIMDRegionFnTy *WorkFn) {
|
||||||
|
// Work function and arguments for L1 SIMD region.
|
||||||
|
*WorkFn = state::SIMDRegionFn;
|
||||||
|
|
||||||
|
// If this is the termination signal from the master, quit early.
|
||||||
|
if (!*WorkFn)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// Set to true for workers participating in the parallel region.
|
||||||
|
uint32_t LaneId = mapping::getThreadIdInWarp();
|
||||||
|
bool LaneActive = LaneId < state::SIMDLaneWidth;
|
||||||
|
return LaneActive;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Hook called by a SIMD worker lane after it has run a SIMD region work
/// function. It does nothing at the moment.
__attribute__((noinline)) void __kmpc_kernel_end_simd() {
  // TODO: Some clean-up of SIMD execution
}
|
||||||
|
|
||||||
void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) {
|
void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) {
|
||||||
state::enterDataEnvironment();
|
state::enterDataEnvironment();
|
||||||
++icv::Level;
|
++icv::Level;
|
||||||
|
|||||||
@@ -203,7 +203,7 @@ void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
|
|||||||
|
|
||||||
struct TeamStateTy {
|
struct TeamStateTy {
|
||||||
/// TODO: provide a proper init function.
|
/// TODO: provide a proper init function.
|
||||||
void init(bool IsSPMD);
|
void init(int Mode);
|
||||||
|
|
||||||
bool operator==(const TeamStateTy &) const;
|
bool operator==(const TeamStateTy &) const;
|
||||||
|
|
||||||
@@ -224,8 +224,13 @@ struct TeamStateTy {
|
|||||||
|
|
||||||
TeamStateTy SHARED(TeamState);
|
TeamStateTy SHARED(TeamState);
|
||||||
|
|
||||||
void TeamStateTy::init(bool IsSPMD) {
|
void TeamStateTy::init(int Mode) {
|
||||||
|
// In SIMD mode, we map an OpenMP thread to a warp.
|
||||||
|
if (mapping::utils::isSIMDMode(Mode))
|
||||||
|
ICVState.NThreadsVar = mapping::getNumberOfWarpsInBlock();
|
||||||
|
else
|
||||||
ICVState.NThreadsVar = mapping::getBlockSize();
|
ICVState.NThreadsVar = mapping::getBlockSize();
|
||||||
|
|
||||||
ICVState.LevelVar = 0;
|
ICVState.LevelVar = 0;
|
||||||
ICVState.ActiveLevelVar = 0;
|
ICVState.ActiveLevelVar = 0;
|
||||||
ICVState.MaxActiveLevelsVar = 1;
|
ICVState.MaxActiveLevelsVar = 1;
|
||||||
@@ -357,7 +362,8 @@ void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
|
|||||||
__builtin_unreachable();
|
__builtin_unreachable();
|
||||||
}
|
}
|
||||||
|
|
||||||
void state::init(bool IsSPMD) {
|
void state::init(int Mode) {
|
||||||
|
const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
|
||||||
SharedMemorySmartStack.init(IsSPMD);
|
SharedMemorySmartStack.init(IsSPMD);
|
||||||
if (!mapping::getThreadIdInBlock())
|
if (!mapping::getThreadIdInBlock())
|
||||||
TeamState.init(IsSPMD);
|
TeamState.init(IsSPMD);
|
||||||
@@ -404,6 +410,15 @@ void state::assumeInitialState(bool IsSPMD) {
|
|||||||
ASSERT(mapping::isSPMDMode() == IsSPMD);
|
ASSERT(mapping::isSPMDMode() == IsSPMD);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void state::propagateThreadState(unsigned SIMDLen) {
|
||||||
|
ASSERT(mapping::isSIMDMode());
|
||||||
|
ASSERT(mapping::isLeaderInWarp());
|
||||||
|
|
||||||
|
const uint32_t TId = mapping::getThreadIdInBlock();
|
||||||
|
for (int I = 1; I < SIMDLen; ++I)
|
||||||
|
ThreadStates[I + TId] = ThreadStates[TId];
|
||||||
|
}
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
void omp_set_dynamic(int V) {}
|
void omp_set_dynamic(int V) {}
|
||||||
|
|
||||||
@@ -434,7 +449,7 @@ void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int omp_get_ancestor_thread_num(int Level) {
|
int omp_get_ancestor_thread_num(int Level) {
|
||||||
return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
|
return returnValIfLevelIsActive(Level, mapping::getLogicThreadId(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
int omp_get_thread_num(void) {
|
int omp_get_thread_num(void) {
|
||||||
|
|||||||
@@ -214,8 +214,8 @@ void setLock(omp_lock_t *Lock) {
|
|||||||
|
|
||||||
} // namespace impl
|
} // namespace impl
|
||||||
|
|
||||||
void synchronize::init(bool IsSPMD) {
|
void synchronize::init(int Mode) {
|
||||||
if (!IsSPMD)
|
if (!mapping::utils::isSPMDMode(Mode) || mapping::utils::isSIMDMode(Mode))
|
||||||
impl::namedBarrierInit();
|
impl::namedBarrierInit();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -210,7 +210,7 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
|
|||||||
static void dispatch_init(IdentTy *loc, int32_t threadId,
|
static void dispatch_init(IdentTy *loc, int32_t threadId,
|
||||||
kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
|
kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
|
||||||
DynamicScheduleTracker *DST) {
|
DynamicScheduleTracker *DST) {
|
||||||
int tid = mapping::getThreadIdInBlock();
|
int tid = mapping::getLogicThreadId();
|
||||||
T tnum = omp_get_num_threads();
|
T tnum = omp_get_num_threads();
|
||||||
T tripCount = ub - lb + 1; // +1 because ub is inclusive
|
T tripCount = ub - lb + 1; // +1 because ub is inclusive
|
||||||
ASSERT0(LT_FUSSY, threadId < tnum,
|
ASSERT0(LT_FUSSY, threadId < tnum,
|
||||||
|
|||||||
@@ -1099,14 +1099,23 @@ public:
|
|||||||
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
|
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
|
||||||
|
|
||||||
int CudaThreadsPerBlock;
|
int CudaThreadsPerBlock;
|
||||||
|
// TODO: Set this mode accordingly.
|
||||||
|
bool IsSIMDMode = false;
|
||||||
if (ThreadLimit > 0) {
|
if (ThreadLimit > 0) {
|
||||||
|
if (IsSIMDMode) {
|
||||||
|
DP("Setting CUDA threads per block to requested %d\n",
|
||||||
|
ThreadLimit * DeviceData[DeviceId].WarpSize);
|
||||||
|
CudaThreadsPerBlock = ThreadLimit * DeviceData[DeviceId].WarpSize;
|
||||||
|
} else {
|
||||||
DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
|
DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
|
||||||
CudaThreadsPerBlock = ThreadLimit;
|
CudaThreadsPerBlock = ThreadLimit;
|
||||||
// Add master warp if necessary
|
// Add master warp if necessary
|
||||||
if (KernelInfo->ExecutionMode == GENERIC) {
|
if (KernelInfo->ExecutionMode == GENERIC) {
|
||||||
DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
|
DP("Adding master warp: +%d threads\n",
|
||||||
|
DeviceData[DeviceId].WarpSize);
|
||||||
CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
|
CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
DP("Setting CUDA threads per block to default %d\n",
|
DP("Setting CUDA threads per block to default %d\n",
|
||||||
DeviceData[DeviceId].NumThreads);
|
DeviceData[DeviceId].NumThreads);
|
||||||
|
|||||||
Reference in New Issue
Block a user