Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5a4965b173 |
@@ -206,10 +206,10 @@ uint32_t __kmpc_get_hardware_thread_id_in_block();
|
||||
///{
|
||||
int8_t __kmpc_is_spmd_exec_mode();
|
||||
|
||||
int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
|
||||
int32_t __kmpc_target_init(IdentTy *Ident, int Mode,
|
||||
bool UseGenericStateMachine, bool);
|
||||
|
||||
void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool);
|
||||
void __kmpc_target_deinit(IdentTy *Ident, int Mode, bool);
|
||||
|
||||
///}
|
||||
|
||||
@@ -281,6 +281,12 @@ void __kmpc_serialized_parallel(IdentTy *Loc, uint32_t);
|
||||
/// TODO
|
||||
void __kmpc_end_serialized_parallel(IdentTy *Loc, uint32_t);
|
||||
|
||||
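/// Store the current SIMD region function in \p WorkFn. Returns true if the
/// calling lane should execute it; a null \p WorkFn signals termination.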
|
||||
bool __kmpc_kernel_simd(SIMDRegionFnTy *WorkFn);
|
||||
|
||||
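/// Called by a SIMD worker after it has executed a SIMD region function.
/// TODO: currently performs no clean-up.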
|
||||
void __kmpc_kernel_end_simd();
|
||||
|
||||
/// TODO
|
||||
void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind);
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ inline constexpr uint32_t MaxThreadsPerTeam = 1024;
|
||||
#pragma omp end declare target
|
||||
|
||||
/// Initialize the mapping machinery.
|
||||
void init(bool IsSPMD);
|
||||
void init(int Mode);
|
||||
|
||||
/// Return true if the kernel is executed in SPMD mode.
|
||||
bool isSPMDMode();
|
||||
@@ -33,6 +33,9 @@ bool isSPMDMode();
|
||||
/// Return true if the kernel is executed in generic mode.
|
||||
bool isGenericMode();
|
||||
|
||||
/// Return true if the kernel is executed in SIMD mode.
|
||||
bool isSIMDMode();
|
||||
|
||||
/// Return true if the executing thread is the main thread in generic mode.
|
||||
bool isMainThreadInGenericMode();
|
||||
|
||||
@@ -55,6 +58,12 @@ uint32_t getThreadIdInWarp();
|
||||
/// Return the thread Id in the block, in [0, getBlockSize()).
|
||||
uint32_t getThreadIdInBlock();
|
||||
|
||||
/// Return the logic thread Id, which depends on how we map an OpenMP thread to
|
||||
/// the target device. In non-SIMD mode, we map an OpenMP thread to a device
|
||||
/// thread. In SIMD mode, we map an OpenMP thread to a warp, and each thread in
|
||||
/// the warp is a SIMD lane.
|
||||
uint32_t getLogicThreadId();
|
||||
|
||||
/// Return the warp id in the block.
|
||||
uint32_t getWarpId();
|
||||
|
||||
@@ -79,6 +88,17 @@ uint32_t getKernelSize();
|
||||
/// Return the number of processing elements on the device.
|
||||
uint32_t getNumberOfProcessorElements();
|
||||
|
||||
namespace utils {

/// Bit of the execution mode value that is set when the kernel runs in SPMD
/// mode.
inline constexpr int SPMDModeBit = 0x1;

/// Bit of the execution mode value that is set when the kernel runs in SIMD
/// mode.
inline constexpr int SIMDModeBit = 0x2;

/// Return true if \p Mode indicates SPMD mode.
inline bool isSPMDMode(int Mode) { return (Mode & SPMDModeBit) != 0; }

/// Return true if \p Mode indicates generic mode, i.e. the SPMD bit is clear.
inline bool isGenericMode(int Mode) { return !isSPMDMode(Mode); }

/// Return true if \p Mode indicates SIMD mode. The SIMD bit is independent of
/// the SPMD bit, so a mode can be both SPMD/generic and SIMD.
inline bool isSIMDMode(int Mode) { return (Mode & SIMDModeBit) != 0; }

} // namespace utils
|
||||
|
||||
} // namespace mapping
|
||||
|
||||
} // namespace _OMP
|
||||
|
||||
@@ -24,7 +24,7 @@ namespace state {
|
||||
inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
|
||||
|
||||
/// Initialize the state machinery. Must be called by all threads.
|
||||
void init(bool IsSPMD);
|
||||
void init(int Mode);
|
||||
|
||||
/// TODO
|
||||
enum ValueKind {
|
||||
@@ -37,6 +37,10 @@ enum ValueKind {
|
||||
VK_RunSchedChunk,
|
||||
VK_ParallelRegionFn,
|
||||
VK_ParallelTeamSize,
|
||||
// SIMD
|
||||
VK_SIMDLevel,
|
||||
VK_SIMDRegionFn,
|
||||
VK_SIMDLaneWidth,
|
||||
};
|
||||
|
||||
/// TODO
|
||||
@@ -145,10 +149,20 @@ inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize;
|
||||
inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
|
||||
ParallelRegionFn;
|
||||
|
||||
/// TODO
|
||||
inline state::Value<uint32_t, state::VK_SIMDLaneWidth> SIMDLaneWidth;
|
||||
|
||||
/// TODO
|
||||
inline state::PtrValue<SIMDRegionFnTy, state::VK_SIMDRegionFn> SIMDRegionFn;
|
||||
|
||||
void runAndCheckState(void(Func(void)));
|
||||
|
||||
void assumeInitialState(bool IsSPMD);
|
||||
|
||||
/// Propagate the thread state from the leader in the warp to the rest of SIMD
|
||||
/// workers. This function should only be called in SIMD mode.
|
||||
void propagateThreadState(unsigned SIMDLen);
|
||||
|
||||
} // namespace state
|
||||
|
||||
namespace icv {
|
||||
@@ -171,6 +185,9 @@ inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels;
|
||||
/// TODO
|
||||
inline state::Value<uint32_t, state::VK_RunSched> RunSched;
|
||||
|
||||
/// TODO
|
||||
inline state::Value<uint32_t, state::VK_SIMDLevel> SIMDLevel;
|
||||
|
||||
} // namespace icv
|
||||
|
||||
namespace memory {
|
||||
|
||||
@@ -19,7 +19,7 @@ namespace _OMP {
|
||||
namespace synchronize {
|
||||
|
||||
/// Initialize the synchronization machinery. Must be called by all threads.
|
||||
void init(bool IsSPMD);
|
||||
void init(int Mode);
|
||||
|
||||
/// Synchronize all threads in a warp identified by \p Mask.
|
||||
void warp(LaneMaskTy Mask);
|
||||
|
||||
@@ -150,6 +150,8 @@ using __kmpc_impl_lanemask_t = LaneMaskTy;
|
||||
|
||||
using ParallelRegionFnTy = void *;
|
||||
|
||||
using SIMDRegionFnTy = void *;
|
||||
|
||||
using CriticalNameTy = int32_t[8];
|
||||
|
||||
struct omp_lock_t {
|
||||
|
||||
@@ -21,17 +21,17 @@ using namespace _OMP;
|
||||
|
||||
#pragma omp declare target
|
||||
|
||||
static void inititializeRuntime(bool IsSPMD) {
|
||||
static void inititializeRuntime(int Mode) {
|
||||
// Order is important here.
|
||||
synchronize::init(IsSPMD);
|
||||
mapping::init(IsSPMD);
|
||||
state::init(IsSPMD);
|
||||
synchronize::init(Mode);
|
||||
mapping::init(Mode);
|
||||
state::init(Mode);
|
||||
}
|
||||
|
||||
/// Simple generic state machine for worker threads.
|
||||
static void genericStateMachine(IdentTy *Ident) {
|
||||
|
||||
uint32_t TId = mapping::getThreadIdInBlock();
|
||||
uint32_t TId = mapping::getLogicThreadId();
|
||||
|
||||
do {
|
||||
ParallelRegionFnTy WorkFn = 0;
|
||||
@@ -58,23 +58,56 @@ static void genericStateMachine(IdentTy *Ident) {
|
||||
} while (true);
|
||||
}
|
||||
|
||||
namespace {
|
||||
void runSIMDStateMachine(IdentTy *Ident) {
|
||||
uint32_t LaneId = mapping::getThreadIdInWarp();
|
||||
do {
|
||||
SIMDRegionFnTy WorkFn = nullptr;
|
||||
|
||||
// Wait for the signal that we have a new work function.
|
||||
synchronize::warp(mapping::activemask());
|
||||
|
||||
// Retrieve the work function from the runtime.
|
||||
bool IsActive = __kmpc_kernel_simd(&WorkFn);
|
||||
|
||||
if (!WorkFn)
|
||||
return;
|
||||
|
||||
if (IsActive) {
|
||||
((void (*)(uint32_t, uint32_t))WorkFn)(0, LaneId);
|
||||
__kmpc_kernel_end_simd();
|
||||
}
|
||||
|
||||
synchronize::warp(mapping::activemask());
|
||||
} while (true);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
extern "C" {
|
||||
|
||||
/// Initialization
|
||||
///
|
||||
/// \param Ident Source location identification, can be NULL.
|
||||
///
|
||||
int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
|
||||
int32_t __kmpc_target_init(IdentTy *Ident, int Mode,
|
||||
bool UseGenericStateMachine, bool) {
|
||||
if (IsSPMD) {
|
||||
inititializeRuntime(/* IsSPMD */ true);
|
||||
synchronize::threads();
|
||||
} else {
|
||||
inititializeRuntime(/* IsSPMD */ false);
|
||||
// No need to wait since only the main threads will execute user
|
||||
// code and workers will run into a barrier right away.
|
||||
Mode = Mode | 0x2;
|
||||
|
||||
inititializeRuntime(Mode);
|
||||
|
||||
// For all SIMD workers, start the simd state machine.
|
||||
if (mapping::utils::isSIMDMode(Mode)) {
|
||||
uint32_t LaneId = mapping::getThreadIdInWarp();
|
||||
if (LaneId) {
|
||||
runSIMDStateMachine(Ident);
|
||||
return LaneId;
|
||||
}
|
||||
}
|
||||
|
||||
const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
|
||||
if (IsSPMD)
|
||||
synchronize::threads();
|
||||
|
||||
if (IsSPMD) {
|
||||
state::assumeInitialState(IsSPMD);
|
||||
return -1;
|
||||
@@ -96,7 +129,9 @@ int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
|
||||
///
|
||||
/// \param Ident Source location identification, can be NULL.
|
||||
///
|
||||
void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool) {
|
||||
void __kmpc_target_deinit(IdentTy *Ident, int Mode, bool) {
|
||||
const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
|
||||
|
||||
state::assumeInitialState(IsSPMD);
|
||||
if (IsSPMD)
|
||||
return;
|
||||
|
||||
@@ -178,7 +178,7 @@ bool mapping::isMainThreadInGenericMode() {
|
||||
bool mapping::isLeaderInWarp() {
|
||||
__kmpc_impl_lanemask_t Active = mapping::activemask();
|
||||
__kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
|
||||
return utils::popc(Active & LaneMaskLT) == 0;
|
||||
return ::_OMP::utils::popc(Active & LaneMaskLT) == 0;
|
||||
}
|
||||
|
||||
LaneMaskTy mapping::activemask() { return impl::activemask(); }
|
||||
@@ -191,6 +191,13 @@ uint32_t mapping::getThreadIdInWarp() { return impl::getThreadIdInWarp(); }
|
||||
|
||||
uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); }
|
||||
|
||||
/// Return the logic thread Id: the warp id when the kernel runs in SIMD mode,
/// otherwise the thread id in the block.
uint32_t mapping::getLogicThreadId() {
  return mapping::isSIMDMode() ? mapping::getWarpId()
                               : mapping::getThreadIdInBlock();
}
|
||||
|
||||
uint32_t mapping::getBlockSize() { return impl::getBlockSize(); }
|
||||
|
||||
uint32_t mapping::getKernelSize() { return impl::getKernelSize(); }
|
||||
@@ -214,16 +221,20 @@ uint32_t mapping::getNumberOfWarpsInBlock() {
|
||||
/// Execution mode
|
||||
///
|
||||
///{
|
||||
static int SHARED(IsSPMDMode);
|
||||
static int SHARED(ExecutionMode);
|
||||
|
||||
void mapping::init(bool IsSPMD) {
|
||||
void mapping::init(int Mode) {
|
||||
if (!mapping::getThreadIdInBlock())
|
||||
IsSPMDMode = IsSPMD;
|
||||
ExecutionMode = Mode;
|
||||
}
|
||||
|
||||
bool mapping::isSPMDMode() { return IsSPMDMode; }
|
||||
bool mapping::isSPMDMode() { return mapping::utils::isSPMDMode(ExecutionMode); }
|
||||
|
||||
bool mapping::isGenericMode() { return !isSPMDMode(); }
|
||||
bool mapping::isGenericMode() {
|
||||
return mapping::utils::isGenericMode(ExecutionMode);
|
||||
}
|
||||
|
||||
/// Return true if the stored execution mode has the SIMD bit set.
bool mapping::isSIMDMode() {
  const int Mode = ExecutionMode;
  return mapping::utils::isSIMDMode(Mode);
}
|
||||
///}
|
||||
|
||||
extern "C" {
|
||||
|
||||
@@ -49,20 +49,43 @@ namespace {
|
||||
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
|
||||
uint32_t NThreadsICV =
|
||||
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
|
||||
uint32_t NumThreads = mapping::getBlockSize();
|
||||
|
||||
const bool IsSIMDMode = mapping::isSIMDMode();
|
||||
|
||||
uint32_t NumThreads =
|
||||
IsSIMDMode ? mapping::getNumberOfWarpsInBlock() : mapping::getBlockSize();
|
||||
|
||||
if (NThreadsICV != 0 && NThreadsICV < NumThreads)
|
||||
NumThreads = NThreadsICV;
|
||||
|
||||
// Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
|
||||
if (NumThreads < mapping::getWarpSize())
|
||||
NumThreads = 1;
|
||||
else
|
||||
NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
|
||||
// We don't need this for SIMD mode because an OpenMP thread is mapped to a
|
||||
// warp on the device and it can be any number.
|
||||
if (!IsSIMDMode) {
|
||||
if (NumThreads < mapping::getWarpSize())
|
||||
NumThreads = 1;
|
||||
else
|
||||
NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
|
||||
}
|
||||
|
||||
return NumThreads;
|
||||
}
|
||||
|
||||
/// Compute the number of lanes used for a SIMD region.
///
/// The result is the smaller of \p SIMDLen and \p SafeLen, limited to the warp
/// size. Must only be called in SIMD mode.
uint32_t determineSIMDLen(int32_t SIMDLen, int32_t SafeLen) {
  ASSERT(mapping::isSIMDMode());

  // TODO: This is probably not right if the schedule is different.
  int32_t Lanes = (SafeLen < SIMDLen) ? SafeLen : SIMDLen;

  // An OpenMP thread is currently mapped to a warp in SIMD mode, so a request
  // larger than the warp size has to be capped at the warp size.
  if (Lanes > mapping::getWarpSize())
    Lanes = mapping::getWarpSize();

  return Lanes;
}
|
||||
|
||||
// Invoke an outlined parallel function unwrapping arguments (up to 32).
|
||||
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
|
||||
void **args, int64_t nargs) {
|
||||
@@ -78,11 +101,57 @@ void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
|
||||
|
||||
extern "C" {
|
||||
|
||||
/// Runtime entry point for a SIMD region.
///
/// When the region must not be distributed over lanes, \p fn is called
/// directly through invokeMicrotask. Otherwise the lane width and
/// \p wrapper_fn are published for the SIMD workers and their execution is
/// bracketed by two warp-level synchronizations.
///
/// \param ident      Source location identification; not used here.
/// \param if_expr    Value of the if clause; zero selects the direct call.
/// \param safelen    Value of the safelen clause.
/// \param simdlen    Value of the simdlen clause.
/// \param order      Not used here.
/// \param fn         Region function called directly on the non-SIMD path.
/// \param wrapper_fn Function stored in state::SIMDRegionFn for the workers.
/// \param args       Arguments forwarded to \p fn on the non-SIMD path.
/// \param nargs      Number of entries in \p args.
void __kmpc_simd_51(IdentTy *ident, int32_t, int32_t if_expr, int32_t safelen,
                    int32_t simdlen, int order, void *fn, void *wrapper_fn,
                    void **args, int64_t nargs) {
  const uint32_t OmpThreadId = mapping::getLogicThreadId();

  // Non-SIMD cases are handled first. They are:
  //  - the if clause evaluated to false,
  //  - simdlen or safelen is 1,
  //  - we are already inside a SIMD region.
  const bool RunDirectly =
      !if_expr || simdlen == 1 || safelen == 1 || icv::SIMDLevel;
  if (OMP_UNLIKELY(RunDirectly)) {
    invokeMicrotask(OmpThreadId, 0, fn, args, nargs);
    return;
  }

  // From here on only the leader of each warp is expected to be executing.
  ASSERT(mapping::isLeaderInWarp());

  const uint32_t SIMDLen = determineSIMDLen(simdlen, safelen);

  // Only logic thread 0 records the lane width.
  if (OmpThreadId == 0)
    state::SIMDLaneWidth = SIMDLen;

  // Copy the leader's thread state to its SIMD workers.
  state::propagateThreadState(SIMDLen);

  // Synchronize all threads (leaders).
  synchronize::threads();

  {
    // NOTE(review): presumably these guards set the first value now and the
    // second one when the scope ends -- confirm against state::ValueRAII.
    state::ValueRAII RegionFnGuard(state::SIMDRegionFn, wrapper_fn,
                                   (void *)nullptr, true);
    state::ValueRAII LevelGuard(icv::SIMDLevel, 1u, 0u, true);

    // Signal the SIMD workers.
    synchronize::warp(mapping::activemask());

    // TODO: Leader in warp also has to execute the SIMD region.
    // What we need:
    // - A work-sharing function that can take both thread id and lane id into
    //   consideration.

    // Synchronize after execution of the SIMD region.
    synchronize::warp(mapping::activemask());
  }
}
|
||||
|
||||
void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
|
||||
int32_t num_threads, int proc_bind, void *fn,
|
||||
void *wrapper_fn, void **args, int64_t nargs) {
|
||||
|
||||
uint32_t TId = mapping::getThreadIdInBlock();
|
||||
uint32_t TId = mapping::getLogicThreadId();
|
||||
// Handle the serialized case first, same for SPMD/non-SPMD.
|
||||
if (OMP_UNLIKELY(!if_expr || icv::Level)) {
|
||||
__kmpc_serialized_parallel(ident, TId);
|
||||
@@ -156,7 +225,7 @@ __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
|
||||
return false;
|
||||
|
||||
// Set to true for workers participating in the parallel region.
|
||||
uint32_t TId = mapping::getThreadIdInBlock();
|
||||
uint32_t TId = mapping::getLogicThreadId();
|
||||
bool ThreadIsActive = TId < state::ParallelTeamSize;
|
||||
return ThreadIsActive;
|
||||
}
|
||||
@@ -170,6 +239,25 @@ __attribute__((noinline)) void __kmpc_kernel_end_parallel() {
|
||||
ASSERT(!mapping::isSPMDMode());
|
||||
}
|
||||
|
||||
__attribute__((noinline)) bool
|
||||
__kmpc_kernel_simd(SIMDRegionFnTy *WorkFn) {
|
||||
// Work function and arguments for L1 SIMD region.
|
||||
*WorkFn = state::SIMDRegionFn;
|
||||
|
||||
// If this is the termination signal from the master, quit early.
|
||||
if (!*WorkFn)
|
||||
return false;
|
||||
|
||||
// Set to true for workers participating in the parallel region.
|
||||
uint32_t LaneId = mapping::getThreadIdInWarp();
|
||||
bool LaneActive = LaneId < state::SIMDLaneWidth;
|
||||
return LaneActive;
|
||||
}
|
||||
|
||||
/// Hook invoked by a SIMD worker after it has run a SIMD region function.
/// The body is intentionally empty for now.
__attribute__((noinline)) void __kmpc_kernel_end_simd() {
  // TODO: Perform clean-up of SIMD execution once there is state to release.
}
|
||||
|
||||
void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) {
|
||||
state::enterDataEnvironment();
|
||||
++icv::Level;
|
||||
|
||||
@@ -203,7 +203,7 @@ void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
|
||||
|
||||
struct TeamStateTy {
|
||||
/// TODO: provide a proper init function.
|
||||
void init(bool IsSPMD);
|
||||
void init(int Mode);
|
||||
|
||||
bool operator==(const TeamStateTy &) const;
|
||||
|
||||
@@ -224,8 +224,13 @@ struct TeamStateTy {
|
||||
|
||||
TeamStateTy SHARED(TeamState);
|
||||
|
||||
void TeamStateTy::init(bool IsSPMD) {
|
||||
ICVState.NThreadsVar = mapping::getBlockSize();
|
||||
void TeamStateTy::init(int Mode) {
|
||||
// In SIMD mode, we map an OpenMP thread to a warp.
|
||||
if (mapping::utils::isSIMDMode(Mode))
|
||||
ICVState.NThreadsVar = mapping::getNumberOfWarpsInBlock();
|
||||
else
|
||||
ICVState.NThreadsVar = mapping::getBlockSize();
|
||||
|
||||
ICVState.LevelVar = 0;
|
||||
ICVState.ActiveLevelVar = 0;
|
||||
ICVState.MaxActiveLevelsVar = 1;
|
||||
@@ -357,7 +362,8 @@ void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
|
||||
__builtin_unreachable();
|
||||
}
|
||||
|
||||
void state::init(bool IsSPMD) {
|
||||
void state::init(int Mode) {
|
||||
const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
|
||||
SharedMemorySmartStack.init(IsSPMD);
|
||||
if (!mapping::getThreadIdInBlock())
|
||||
TeamState.init(IsSPMD);
|
||||
@@ -404,6 +410,15 @@ void state::assumeInitialState(bool IsSPMD) {
|
||||
ASSERT(mapping::isSPMDMode() == IsSPMD);
|
||||
}
|
||||
|
||||
void state::propagateThreadState(unsigned SIMDLen) {
|
||||
ASSERT(mapping::isSIMDMode());
|
||||
ASSERT(mapping::isLeaderInWarp());
|
||||
|
||||
const uint32_t TId = mapping::getThreadIdInBlock();
|
||||
for (int I = 1; I < SIMDLen; ++I)
|
||||
ThreadStates[I + TId] = ThreadStates[TId];
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
void omp_set_dynamic(int V) {}
|
||||
|
||||
@@ -434,7 +449,7 @@ void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
|
||||
}
|
||||
|
||||
int omp_get_ancestor_thread_num(int Level) {
|
||||
return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
|
||||
return returnValIfLevelIsActive(Level, mapping::getLogicThreadId(), 0);
|
||||
}
|
||||
|
||||
int omp_get_thread_num(void) {
|
||||
|
||||
@@ -214,8 +214,8 @@ void setLock(omp_lock_t *Lock) {
|
||||
|
||||
} // namespace impl
|
||||
|
||||
void synchronize::init(bool IsSPMD) {
|
||||
if (!IsSPMD)
|
||||
void synchronize::init(int Mode) {
|
||||
if (!mapping::utils::isSPMDMode(Mode) || mapping::utils::isSIMDMode(Mode))
|
||||
impl::namedBarrierInit();
|
||||
}
|
||||
|
||||
|
||||
@@ -210,7 +210,7 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
|
||||
static void dispatch_init(IdentTy *loc, int32_t threadId,
|
||||
kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
|
||||
DynamicScheduleTracker *DST) {
|
||||
int tid = mapping::getThreadIdInBlock();
|
||||
int tid = mapping::getLogicThreadId();
|
||||
T tnum = omp_get_num_threads();
|
||||
T tripCount = ub - lb + 1; // +1 because ub is inclusive
|
||||
ASSERT0(LT_FUSSY, threadId < tnum,
|
||||
|
||||
@@ -1099,13 +1099,22 @@ public:
|
||||
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
|
||||
|
||||
int CudaThreadsPerBlock;
|
||||
// TODO: Set this mode accordingly.
|
||||
bool IsSIMDMode = false;
|
||||
if (ThreadLimit > 0) {
|
||||
DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
|
||||
CudaThreadsPerBlock = ThreadLimit;
|
||||
// Add master warp if necessary
|
||||
if (KernelInfo->ExecutionMode == GENERIC) {
|
||||
DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
|
||||
CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
|
||||
if (IsSIMDMode) {
|
||||
DP("Setting CUDA threads per block to requested %d\n",
|
||||
ThreadLimit * DeviceData[DeviceId].WarpSize);
|
||||
CudaThreadsPerBlock = ThreadLimit * DeviceData[DeviceId].WarpSize;
|
||||
} else {
|
||||
DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
|
||||
CudaThreadsPerBlock = ThreadLimit;
|
||||
// Add master warp if necessary
|
||||
if (KernelInfo->ExecutionMode == GENERIC) {
|
||||
DP("Adding master warp: +%d threads\n",
|
||||
DeviceData[DeviceId].WarpSize);
|
||||
CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
DP("Setting CUDA threads per block to default %d\n",
|
||||
|
||||
Reference in New Issue
Block a user