Compare commits


1 Commit

Author:  Shilei Tian
SHA1:    5a4965b173
Message: [OpenMP][DeviceRTL] Add the initial support for simd directive
Date:    2021-09-21 19:00:33 -04:00
12 changed files with 249 additions and 46 deletions

View File

@@ -206,10 +206,10 @@ uint32_t __kmpc_get_hardware_thread_id_in_block();
///{
int8_t __kmpc_is_spmd_exec_mode();
-int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
+int32_t __kmpc_target_init(IdentTy *Ident, int Mode,
                            bool UseGenericStateMachine, bool);
-void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool);
+void __kmpc_target_deinit(IdentTy *Ident, int Mode, bool);
///}
@@ -281,6 +281,12 @@ void __kmpc_serialized_parallel(IdentTy *Loc, uint32_t);
/// TODO
void __kmpc_end_serialized_parallel(IdentTy *Loc, uint32_t);
/// TODO
bool __kmpc_kernel_simd(SIMDRegionFnTy *WorkFn);
/// TODO
void __kmpc_kernel_end_simd();
/// TODO
void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind);
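
The interface change above swaps the boolean IsSPMD flag for an integer Mode word and adds the __kmpc_kernel_simd / __kmpc_kernel_end_simd worker entry points. As a rough illustration of how a caller could compose the Mode argument, here is a sketch following the bit layout defined by mapping::utils later in this diff (0x1 = SPMD, 0x2 = SIMD); the enumerator names and the commented call are hypothetical, the patch itself just passes a plain int:

// Illustrative encoding only; these names are not part of the patch.
enum : int {
  OMP_MODE_GENERIC = 0x0, // no bits set: generic (non-SPMD) kernel
  OMP_MODE_SPMD    = 0x1, // bit 0: SPMD execution
  OMP_MODE_SIMD    = 0x2, // bit 1: simd machinery enabled
};

// Hypothetical kernel prologue/epilogue:
//   int32_t TId = __kmpc_target_init(Ident, OMP_MODE_SPMD | OMP_MODE_SIMD,
//                                    /*UseGenericStateMachine=*/false, true);
//   ...
//   __kmpc_target_deinit(Ident, OMP_MODE_SPMD | OMP_MODE_SIMD, true);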

View File

@@ -25,7 +25,7 @@ inline constexpr uint32_t MaxThreadsPerTeam = 1024;
#pragma omp end declare target
/// Initialize the mapping machinery.
-void init(bool IsSPMD);
+void init(int Mode);
/// Return true if the kernel is executed in SPMD mode.
bool isSPMDMode();
@@ -33,6 +33,9 @@ bool isSPMDMode();
/// Return true if the kernel is executed in generic mode.
bool isGenericMode();
/// Return true if the kernel is executed in SIMD mode.
bool isSIMDMode();
/// Return true if the executing thread is the main thread in generic mode.
bool isMainThreadInGenericMode();
@@ -55,6 +58,12 @@ uint32_t getThreadIdInWarp();
/// Return the thread Id in the block, in [0, getBlockSize()).
uint32_t getThreadIdInBlock();
/// Return the logic thread Id, which depends on how we map an OpenMP thread to
/// the target device. In non-SIMD mode, we map an OpenMP thread to a device
/// thread. In SIMD mode, we map an OpenMP thread to a warp, and each thread in
/// the warp is a SIMD lane.
uint32_t getLogicThreadId();
/// Return the warp id in the block.
uint32_t getWarpId();
@@ -79,6 +88,17 @@ uint32_t getKernelSize();
/// Return the number of processing elements on the device.
uint32_t getNumberOfProcessorElements();
namespace utils {
/// Return true if \p Mode indicates SPMD mode.
inline bool isSPMDMode(int Mode) { return Mode & 0x1; }
/// Return true if \p Mode indicates generic mode.
inline bool isGenericMode(int Mode) { return !isSPMDMode(Mode); }
/// Return true if \p Mode indicates SIMD mode.
inline bool isSIMDMode(int Mode) { return Mode & 0x2; }
} // namespace utils
} // namespace mapping
} // namespace _OMP
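
Since the two bits are independent, SPMD/generic and SIMD can be queried separately from the same Mode word. A minimal, self-contained sketch of the decoding (the helpers simply mirror mapping::utils above; the main() is only for illustration):

#include <cassert>

// Mirror of mapping::utils: bit 0x1 selects SPMD, bit 0x2 selects SIMD,
// and "generic" just means "not SPMD".
inline bool isSPMDMode(int Mode) { return Mode & 0x1; }
inline bool isGenericMode(int Mode) { return !isSPMDMode(Mode); }
inline bool isSIMDMode(int Mode) { return Mode & 0x2; }

int main() {
  assert(isGenericMode(0x0) && !isSIMDMode(0x0)); // plain generic kernel
  assert(isSPMDMode(0x1) && !isSIMDMode(0x1));    // plain SPMD kernel
  assert(isGenericMode(0x2) && isSIMDMode(0x2));  // generic kernel with simd
  assert(isSPMDMode(0x3) && isSIMDMode(0x3));     // SPMD kernel with simd
}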

View File

@@ -24,7 +24,7 @@ namespace state {
inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
/// Initialize the state machinery. Must be called by all threads.
-void init(bool IsSPMD);
+void init(int Mode);
/// TODO
enum ValueKind {
@@ -37,6 +37,10 @@ enum ValueKind {
VK_RunSchedChunk,
VK_ParallelRegionFn,
VK_ParallelTeamSize,
// SIMD
VK_SIMDLevel,
VK_SIMDRegionFn,
VK_SIMDLaneWidth,
};
/// TODO
@@ -145,10 +149,20 @@ inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize;
inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
ParallelRegionFn;
/// TODO
inline state::Value<uint32_t, state::VK_SIMDLaneWidth> SIMDLaneWidth;
/// TODO
inline state::PtrValue<SIMDRegionFnTy, state::VK_SIMDRegionFn> SIMDRegionFn;
void runAndCheckState(void(Func(void)));
void assumeInitialState(bool IsSPMD);
/// Propagate the thread state from the leader in the warp to the rest of SIMD
/// workers. This function should only be called in SIMD mode.
void propagateThreadState(unsigned SIMDLen);
} // namespace state
namespace icv {
@@ -171,6 +185,9 @@ inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels;
/// TODO
inline state::Value<uint32_t, state::VK_RunSched> RunSched;
/// TODO
inline state::Value<uint32_t, state::VK_SIMDLevel> SIMDLevel;
} // namespace icv
namespace memory {

View File

@@ -19,7 +19,7 @@ namespace _OMP {
namespace synchronize {
/// Initialize the synchronization machinery. Must be called by all threads.
-void init(bool IsSPMD);
+void init(int Mode);
/// Synchronize all threads in a warp identified by \p Mask.
void warp(LaneMaskTy Mask);

View File

@@ -150,6 +150,8 @@ using __kmpc_impl_lanemask_t = LaneMaskTy;
using ParallelRegionFnTy = void *;
using SIMDRegionFnTy = void *;
using CriticalNameTy = int32_t[8];
struct omp_lock_t {

View File

@@ -21,17 +21,17 @@ using namespace _OMP;
#pragma omp declare target
-static void inititializeRuntime(bool IsSPMD) {
+static void inititializeRuntime(int Mode) {
   // Order is important here.
-  synchronize::init(IsSPMD);
-  mapping::init(IsSPMD);
-  state::init(IsSPMD);
+  synchronize::init(Mode);
+  mapping::init(Mode);
+  state::init(Mode);
}
/// Simple generic state machine for worker threads.
static void genericStateMachine(IdentTy *Ident) {
-  uint32_t TId = mapping::getThreadIdInBlock();
+  uint32_t TId = mapping::getLogicThreadId();
do {
ParallelRegionFnTy WorkFn = 0;
@@ -58,23 +58,56 @@ static void genericStateMachine(IdentTy *Ident) {
} while (true);
}
namespace {
void runSIMDStateMachine(IdentTy *Ident) {
uint32_t LaneId = mapping::getThreadIdInWarp();
do {
SIMDRegionFnTy WorkFn = nullptr;
// Wait for the signal that we have a new work function.
synchronize::warp(mapping::activemask());
// Retrieve the work function from the runtime.
bool IsActive = __kmpc_kernel_simd(&WorkFn);
if (!WorkFn)
return;
if (IsActive) {
((void (*)(uint32_t, uint32_t))WorkFn)(0, LaneId);
__kmpc_kernel_end_simd();
}
synchronize::warp(mapping::activemask());
} while (true);
}
} // namespace
extern "C" {
/// Initialization
///
/// \param Ident Source location identification, can be NULL.
///
-int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
+int32_t __kmpc_target_init(IdentTy *Ident, int Mode,
                            bool UseGenericStateMachine, bool) {
-  if (IsSPMD) {
-    inititializeRuntime(/* IsSPMD */ true);
-    synchronize::threads();
-  } else {
-    inititializeRuntime(/* IsSPMD */ false);
-    // No need to wait since only the main threads will execute user
-    // code and workers will run into a barrier right away.
+  Mode = Mode | 0x2;
+  inititializeRuntime(Mode);
+  // For all SIMD workers, start the simd state machine.
+  if (mapping::utils::isSIMDMode(Mode)) {
+    uint32_t LaneId = mapping::getThreadIdInWarp();
+    if (LaneId) {
+      runSIMDStateMachine(Ident);
+      return LaneId;
+    }
+  }
+  const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
+  if (IsSPMD)
+    synchronize::threads();
   if (IsSPMD) {
     state::assumeInitialState(IsSPMD);
     return -1;
@@ -96,7 +129,9 @@ int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
///
/// \param Ident Source location identification, can be NULL.
///
-void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool) {
+void __kmpc_target_deinit(IdentTy *Ident, int Mode, bool) {
+  const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
   state::assumeInitialState(IsSPMD);
   if (IsSPMD)
     return;
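
runSIMDStateMachine mirrors the existing generic state machine, but at warp scope: every non-leader lane loops, waits on a warp barrier, pulls the work function its leader published via __kmpc_kernel_simd, runs it if its lane id is below the simd width, and synchronizes again; a null work function ends the loop. Below is a host-side model of that two-barrier handshake using std::thread and std::barrier in place of warp synchronization; the names, the termination convention, and the fact that the leader does not execute the region itself (matching the TODO in __kmpc_simd_51) are assumptions of the sketch, not device code from the patch:

#include <atomic>
#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

// One "warp" of WarpSize host threads; thread 0 plays the warp leader,
// the others play parked SIMD workers.
constexpr int WarpSize = 4;
std::barrier<> WarpBarrier(WarpSize);             // stands in for synchronize::warp()
std::atomic<void (*)(int)> SIMDRegionFn{nullptr}; // stands in for state::SIMDRegionFn
std::atomic<int> SIMDLaneWidth{0};                // stands in for state::SIMDLaneWidth

void simdWorker(int LaneId) {
  while (true) {
    WarpBarrier.arrive_and_wait();      // wait for the "new work" signal
    auto *WorkFn = SIMDRegionFn.load();
    if (!WorkFn)                        // nullptr is the termination signal
      return;
    if (LaneId < SIMDLaneWidth.load())  // only lanes inside the simd width run
      WorkFn(LaneId);
    WarpBarrier.arrive_and_wait();      // report that the region finished
  }
}

void warpLeader() {
  SIMDLaneWidth.store(WarpSize);
  SIMDRegionFn.store(+[](int Lane) { std::printf("lane %d executes\n", Lane); });
  WarpBarrier.arrive_and_wait();        // release the workers
  WarpBarrier.arrive_and_wait();        // wait until they are done
  SIMDRegionFn.store(nullptr);          // publish the termination signal
  WarpBarrier.arrive_and_wait();
}

int main() {
  std::vector<std::thread> Lanes;
  Lanes.emplace_back(warpLeader);
  for (int L = 1; L < WarpSize; ++L)
    Lanes.emplace_back(simdWorker, L);
  for (auto &T : Lanes)
    T.join();
}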

View File

@@ -178,7 +178,7 @@ bool mapping::isMainThreadInGenericMode() {
bool mapping::isLeaderInWarp() {
__kmpc_impl_lanemask_t Active = mapping::activemask();
__kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
-  return utils::popc(Active & LaneMaskLT) == 0;
+  return ::_OMP::utils::popc(Active & LaneMaskLT) == 0;
}
LaneMaskTy mapping::activemask() { return impl::activemask(); }
@@ -191,6 +191,13 @@ uint32_t mapping::getThreadIdInWarp() { return impl::getThreadIdInWarp(); }
uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); }
uint32_t mapping::getLogicThreadId() {
if (mapping::isSIMDMode())
return mapping::getWarpId();
return mapping::getThreadIdInBlock();
}
uint32_t mapping::getBlockSize() { return impl::getBlockSize(); }
uint32_t mapping::getKernelSize() { return impl::getKernelSize(); }
@@ -214,16 +221,20 @@ uint32_t mapping::getNumberOfWarpsInBlock() {
/// Execution mode
///
///{
-static int SHARED(IsSPMDMode);
+static int SHARED(ExecutionMode);
-void mapping::init(bool IsSPMD) {
+void mapping::init(int Mode) {
   if (!mapping::getThreadIdInBlock())
-    IsSPMDMode = IsSPMD;
+    ExecutionMode = Mode;
 }
-bool mapping::isSPMDMode() { return IsSPMDMode; }
+bool mapping::isSPMDMode() { return mapping::utils::isSPMDMode(ExecutionMode); }
-bool mapping::isGenericMode() { return !isSPMDMode(); }
+bool mapping::isGenericMode() {
+  return mapping::utils::isGenericMode(ExecutionMode);
+}
+bool mapping::isSIMDMode() { return mapping::utils::isSIMDMode(ExecutionMode); }
///}
extern "C" {

View File

@@ -49,20 +49,43 @@ namespace {
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
uint32_t NThreadsICV =
NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
-  uint32_t NumThreads = mapping::getBlockSize();
+  const bool IsSIMDMode = mapping::isSIMDMode();
+  uint32_t NumThreads =
+      IsSIMDMode ? mapping::getNumberOfWarpsInBlock() : mapping::getBlockSize();
if (NThreadsICV != 0 && NThreadsICV < NumThreads)
NumThreads = NThreadsICV;
   // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
-  if (NumThreads < mapping::getWarpSize())
-    NumThreads = 1;
-  else
-    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+  // We don't need this for SIMD mode because an OpenMP thread is mapped to a
+  // warp on the device, so the thread count need not be a multiple of the
+  // warp size.
+  if (!IsSIMDMode) {
+    if (NumThreads < mapping::getWarpSize())
+      NumThreads = 1;
+    else
+      NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+  }
return NumThreads;
}
uint32_t determineSIMDLen(int32_t SIMDLen, int32_t SafeLen) {
ASSERT(mapping::isSIMDMode());
// TODO: This is probably not right if the schedule is different.
if (SafeLen < SIMDLen)
SIMDLen = SafeLen;
// We currently map an OpenMP thread to a warp in SIMD mode. If the simdlen
// is larger than the warp size, cap it at the warp size.
if (SIMDLen > mapping::getWarpSize())
SIMDLen = mapping::getWarpSize();
return SIMDLen;
}
// Invoke an outlined parallel function unwrapping arguments (up to 32).
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
void **args, int64_t nargs) {
@@ -78,11 +101,57 @@ void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
extern "C" {
void __kmpc_simd_51(IdentTy *ident, int32_t, int32_t if_expr, int32_t safelen,
int32_t simdlen, int order, void *fn, void *wrapper_fn,
void **args, int64_t nargs) {
// Handle the non-SIMD case first, which happens when:
// - the if clause evaluates to false,
// - simdlen (or safelen) is 1, or
// - we are already inside a simd region.
const uint32_t LogicThreadId = mapping::getLogicThreadId();
if (OMP_UNLIKELY(!if_expr || simdlen == 1 || safelen == 1 ||
icv::SIMDLevel)) {
invokeMicrotask(LogicThreadId, 0, fn, args, nargs);
return;
}
// Only the leader of each warp can execute the following code.
ASSERT(mapping::isLeaderInWarp());
const uint32_t SIMDLen = determineSIMDLen(simdlen, safelen);
if (LogicThreadId == 0)
state::SIMDLaneWidth = SIMDLen;
// Propagates the thread state to all SIMD workers from the leader.
state::propagateThreadState(SIMDLen);
// Synchronize all threads (leaders).
synchronize::threads();
{
state::ValueRAII SIMDRegionFnRAII(state::SIMDRegionFn, wrapper_fn,
(void *)nullptr, true);
state::ValueRAII SIMDLevelRAII(icv::SIMDLevel, 1u, 0u, true);
// Signal SIMD workers
synchronize::warp(mapping::activemask());
// TODO: Leader in warp also has to execute the SIMD region.
// What we need:
// - A work-sharing function that can take both thread id and lane id into
// consideration.
// Synchronize after execution of the SIMD region.
synchronize::warp(mapping::activemask());
}
}
void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
int32_t num_threads, int proc_bind, void *fn,
void *wrapper_fn, void **args, int64_t nargs) {
-  uint32_t TId = mapping::getThreadIdInBlock();
+  uint32_t TId = mapping::getLogicThreadId();
// Handle the serialized case first, same for SPMD/non-SPMD.
if (OMP_UNLIKELY(!if_expr || icv::Level)) {
__kmpc_serialized_parallel(ident, TId);
@@ -156,7 +225,7 @@ __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
return false;
// Set to true for workers participating in the parallel region.
-  uint32_t TId = mapping::getThreadIdInBlock();
+  uint32_t TId = mapping::getLogicThreadId();
bool ThreadIsActive = TId < state::ParallelTeamSize;
return ThreadIsActive;
}
@@ -170,6 +239,25 @@ __attribute__((noinline)) void __kmpc_kernel_end_parallel() {
ASSERT(!mapping::isSPMDMode());
}
__attribute__((noinline)) bool
__kmpc_kernel_simd(SIMDRegionFnTy *WorkFn) {
// Work function and arguments for L1 SIMD region.
*WorkFn = state::SIMDRegionFn;
// If this is the termination signal from the master, quit early.
if (!*WorkFn)
return false;
// Set to true for lanes participating in the simd region.
uint32_t LaneId = mapping::getThreadIdInWarp();
bool LaneActive = LaneId < state::SIMDLaneWidth;
return LaneActive;
}
__attribute__((noinline)) void __kmpc_kernel_end_simd() {
// TODO: Some clean-up of SIMD execution
}
void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) {
state::enterDataEnvironment();
++icv::Level;
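
determineNumberOfThreads and determineSIMDLen carry the new sizing rules: in SIMD mode the team is counted in warps and is not rounded to a warp multiple, and the effective simd length is min(simdlen, safelen) capped at the warp size. A standalone sketch of just that arithmetic, with a few sample values (the helper names and the sample numbers are illustrative):

#include <cassert>
#include <cstdint>

constexpr uint32_t WarpSize = 32;

// Non-SIMD rule: clamp to the num_threads clause / nthreads ICV, then round
// down to a multiple of the warp size (or to 1 below a full warp).
uint32_t numThreadsNonSIMD(uint32_t BlockSize, uint32_t NThreadsICV) {
  uint32_t NumThreads = BlockSize;
  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;
  return NumThreads < WarpSize ? 1 : (NumThreads & ~(WarpSize - 1));
}

// SIMD rule: one OpenMP thread per warp, clamped by the clause/ICV only.
uint32_t numThreadsSIMD(uint32_t WarpsInBlock, uint32_t NThreadsICV) {
  uint32_t NumThreads = WarpsInBlock;
  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;
  return NumThreads;
}

// Effective simd length: min(simdlen, safelen), capped at the warp size.
int32_t effectiveSIMDLen(int32_t SIMDLen, int32_t SafeLen) {
  if (SafeLen < SIMDLen)
    SIMDLen = SafeLen;
  return SIMDLen > (int32_t)WarpSize ? (int32_t)WarpSize : SIMDLen;
}

int main() {
  assert(numThreadsNonSIMD(/*BlockSize=*/128, /*num_threads(50)=*/50) == 32);
  assert(numThreadsSIMD(/*WarpsInBlock=*/4, /*num_threads(3)=*/3) == 3);
  assert(effectiveSIMDLen(/*simdlen=*/64, /*safelen=*/16) == 16);
  assert(effectiveSIMDLen(/*simdlen=*/64, /*safelen=*/128) == 32);
}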

View File

@@ -203,7 +203,7 @@ void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
struct TeamStateTy {
/// TODO: provide a proper init function.
-  void init(bool IsSPMD);
+  void init(int Mode);
bool operator==(const TeamStateTy &) const;
@@ -224,8 +224,13 @@ struct TeamStateTy {
TeamStateTy SHARED(TeamState);
-void TeamStateTy::init(bool IsSPMD) {
-  ICVState.NThreadsVar = mapping::getBlockSize();
+void TeamStateTy::init(int Mode) {
+  // In SIMD mode, we map an OpenMP thread to a warp.
+  if (mapping::utils::isSIMDMode(Mode))
+    ICVState.NThreadsVar = mapping::getNumberOfWarpsInBlock();
+  else
+    ICVState.NThreadsVar = mapping::getBlockSize();
ICVState.LevelVar = 0;
ICVState.ActiveLevelVar = 0;
ICVState.MaxActiveLevelsVar = 1;
@@ -357,7 +362,8 @@ void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
__builtin_unreachable();
}
-void state::init(bool IsSPMD) {
+void state::init(int Mode) {
+  const bool IsSPMD = mapping::utils::isSPMDMode(Mode);
SharedMemorySmartStack.init(IsSPMD);
if (!mapping::getThreadIdInBlock())
TeamState.init(IsSPMD);
@@ -404,6 +410,15 @@ void state::assumeInitialState(bool IsSPMD) {
ASSERT(mapping::isSPMDMode() == IsSPMD);
}
void state::propagateThreadState(unsigned SIMDLen) {
ASSERT(mapping::isSIMDMode());
ASSERT(mapping::isLeaderInWarp());
const uint32_t TId = mapping::getThreadIdInBlock();
for (int I = 1; I < SIMDLen; ++I)
ThreadStates[I + TId] = ThreadStates[TId];
}
extern "C" {
void omp_set_dynamic(int V) {}
@@ -434,7 +449,7 @@ void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
}
int omp_get_ancestor_thread_num(int Level) {
-  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
+  return returnValIfLevelIsActive(Level, mapping::getLogicThreadId(), 0);
}
int omp_get_thread_num(void) {
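
state::propagateThreadState indexes the per-thread state table by thread id in the block: the leader at thread id TId replicates its slot into the next SIMDLen - 1 slots, which lands on its own warp's lanes because the leader is expected to be lane 0 of its warp. A toy illustration of that indexing (the state entries are placeholder ints, not the runtime's thread-state objects):

#include <array>
#include <cassert>

// Toy stand-in for the per-thread state table; each entry is just an int.
std::array<int, 128> ThreadStates{};

// Mirrors the loop in state::propagateThreadState(): copy the leader's slot
// into slots TId + 1 .. TId + SIMDLen - 1.
void propagateThreadState(unsigned TId, unsigned SIMDLen) {
  for (unsigned I = 1; I < SIMDLen; ++I)
    ThreadStates[TId + I] = ThreadStates[TId];
}

int main() {
  // The leader of warp 2 (threads 64..95 with a warp size of 32) publishes its
  // state to the first 8 lanes of its warp.
  ThreadStates[64] = 7;
  propagateThreadState(/*TId=*/64, /*SIMDLen=*/8);
  assert(ThreadStates[71] == 7); // last lane inside the simd width
  assert(ThreadStates[72] == 0); // lanes beyond the simd width are untouched
}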

View File

@@ -214,8 +214,8 @@ void setLock(omp_lock_t *Lock) {
} // namespace impl
-void synchronize::init(bool IsSPMD) {
-  if (!IsSPMD)
+void synchronize::init(int Mode) {
+  if (!mapping::utils::isSPMDMode(Mode) || mapping::utils::isSIMDMode(Mode))
impl::namedBarrierInit();
}

View File

@@ -210,7 +210,7 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
static void dispatch_init(IdentTy *loc, int32_t threadId,
kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
DynamicScheduleTracker *DST) {
-    int tid = mapping::getThreadIdInBlock();
+    int tid = mapping::getLogicThreadId();
T tnum = omp_get_num_threads();
T tripCount = ub - lb + 1; // +1 because ub is inclusive
ASSERT0(LT_FUSSY, threadId < tnum,
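
Switching dispatch_init to the logical thread id keeps tid in the same [0, omp_get_num_threads()) domain in both modes: device thread ids normally, warp ids in SIMD mode. The snippet below is a generic blocked split, not this runtime's schedule code, but it shows why a mismatched domain would silently drop iterations:

#include <cassert>
#include <cstdint>

// Generic blocked split of tripCount iterations over tnum workers; computes
// the half-open range [lb, ub) owned by worker tid.
void staticSplit(int64_t tripCount, int32_t tnum, int32_t tid,
                 int64_t &lb, int64_t &ub) {
  int64_t chunk = (tripCount + tnum - 1) / tnum;
  lb = (int64_t)tid * chunk;
  ub = lb + chunk > tripCount ? tripCount : lb + chunk;
}

int main() {
  int64_t lb, ub;
  // SIMD mode with 4 warps: omp_get_num_threads() == 4, tid must be a warp id.
  staticSplit(/*tripCount=*/100, /*tnum=*/4, /*warp id*/ 2, lb, ub);
  assert(lb == 50 && ub == 75);
  // Feeding a raw thread-in-block id (e.g. 70) yields an empty range instead.
  staticSplit(100, 4, /*raw thread id*/ 70, lb, ub);
  assert(lb >= 100); // nothing left to execute
}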

View File

@@ -1099,13 +1099,22 @@ public:
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
int CudaThreadsPerBlock;
+  // TODO: Set this mode accordingly.
+  bool IsSIMDMode = false;
   if (ThreadLimit > 0) {
-    DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
-    CudaThreadsPerBlock = ThreadLimit;
-    // Add master warp if necessary
-    if (KernelInfo->ExecutionMode == GENERIC) {
-      DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
-      CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+    if (IsSIMDMode) {
+      DP("Setting CUDA threads per block to requested %d\n",
+         ThreadLimit * DeviceData[DeviceId].WarpSize);
+      CudaThreadsPerBlock = ThreadLimit * DeviceData[DeviceId].WarpSize;
+    } else {
+      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
+      CudaThreadsPerBlock = ThreadLimit;
+      // Add master warp if necessary
+      if (KernelInfo->ExecutionMode == GENERIC) {
+        DP("Adding master warp: +%d threads\n",
+           DeviceData[DeviceId].WarpSize);
+        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+      }
+    }
} else {
DP("Setting CUDA threads per block to default %d\n",