Proto  3.2
Proto_Reduction.H
Go to the documentation of this file.
1 #ifndef __PROTO_REDUCTION_H__
2 #define __PROTO_REDUCTION_H__
3 
4 #include <limits>
5 #include <typeinfo>
6 
7 #include "Proto_SPMD.H"
8 #include "Proto_MemType.H"
9 #include "Proto_Macros.H"
10 #include "Proto_Accel.H"
11 
12 
13 namespace Proto {
14 
// Granularity selector for an atomic/reduction step.
// NOTE(review): no use of these enumerators is visible in this listing —
// presumably Warp/Block pick the level at which partial results are combined;
// confirm against the implementation.
16 enum Atomic { Warp, Block, None };
17 
18 constexpr int line = 128; // bytes in a cache line; bounds gridDim.x in kernel() so all per-block partials fit in one line
19 
20 #ifdef PROTO_ACCEL // namespace collision in host builds
// Branch-free max/min usable from accelerator code, avoiding std::max/std::min.
// NOTE(review): Doxygen elides listing lines 22 and 25 here — per the member
// index they presumably carried an ACCEL_DECORATION (__host__ __device__)
// qualifier on each function; confirm in the original header.
21 template<typename T>
23 static constexpr T max(T a, T b) { return (a > b ? a : b); }
24 template<typename T>
26 static constexpr T min(T a, T b) { return (a < b ? a : b); }
27 #endif
28 
// Reduction: computes a running reduction (min/max/sum per OP) over device or
// host buffers of T, in memory space MEM.
// NOTE(review): this generated listing elides several declaration lines
// (33, 48 destructor, 57, 67, 103-104); per the member index the missing
// members include the destructor, ACCEL_DECORATION qualifiers on init()/update(),
// and host-side pointers m_hostTotal / m_hostTemp — confirm in the original header.
29 template<typename T, Operation OP, MemType MEM=MEMTYPE_DEFAULT>
30 class Reduction
31 {
32 public:
34 
35  /// Constructor
36  /**
37  Creates a Reduction operator with or without dynamic block allocation.
38  If <code>a_dynamic==true</code>, the minimum number of GPU blocks
39  needed to compute a reduction will be determined on each call to
40  <code>Reduction::reduce</code>. Otherwise, the maximum number of blocks
41  available on the device will be used instead.
42 
43  \param a_dynamic Determine number of GPU blocks dynamically?
44  */
45  Reduction<T,OP,MEM>(bool a_dynamic);
46 
47  /// Destructor
49 
50  /// Initialize Value
51  /**
52  Initialize a value based on OP (the identity element of the reduction).
53  Minima Ops: <code>a_value</code> -> max value of T
54  Maxima Ops: <code>a_value</code> -> min value of T
55  Sum Ops: <code>a_value</code> -> 0
56  */
58  static T init();
59 
60  /// Update Value
61  /**
62  Computes an updated value by executing OP on two values.
63 
64  \param a_v1 A value compared with OP
65  \param a_v2 A value compared with OP
66  */
68  static void update(T& a_v1, const T a_v2);
69 
70  /// Get Reduction
71  /**
72  Returns the computed reduction, waiting for kernel completion on GPUs.
73  The result is local to a single MPI process if MPI is enabled.
74  */
75  T fetchLocal();
76 
77  /// Get Reduction
78  /**
79  Returns the computed reduction, waiting for kernel completion on GPUs.
80  If MPI is enabled, the result is communicated to all processes.
81  */
82  T fetch();
83 
84  /// Compute Reduction
85  /**
86  Calculates a reduction on the buffer <code>a_data</code> of size <code>a_size</code>.
87  Here <code>a_size</code> is the number of elements of <code>a_data</code>.
88  Subsequent calls to <code>reduce</code> will update the reduction until
89  <code>Reduction::reset()</code> is called.
90 
91  \param a_data A buffer of data.
92  \param a_size The number of elements in <code>a_data</code>.
93  */
94  void reduce(const T *a_data, const size_t a_size); // configures and calls the kernel
95 
96  /// Reset Reduction
97  /**
98  Reinitializes the reduction.
99  */
100  void reset(); // initializes pointer based on operation
101 
102 private:
// GPU-only state; the host-only members (m_hostTotal/m_hostTemp, listing
// lines 103-104) are elided from this generated view.
105 #ifdef PROTO_ACCEL
106  bool m_dynamic;
107  T *m_deviTemp, *m_deviTotal;
108  int m_numThreads, m_numBlocks, m_warpSize;
109 #endif
110 };
111 
// Writes the identity element of OP into *ptr (used to (re)initialize the
// device-side accumulator before a reduction pass).
// NOTE(review): listing line 113 is elided — per the member index it carried
// the ACCEL_KERNEL launch qualifier; confirm in the original header.
112 template<typename T, Operation OP>
114 void initKernel(T* ptr)
115 {
116  *ptr = Reduction<T,OP>::init();
117 }
118 
119 #ifdef PROTO_ACCEL
120 
121 //=================================================================================================
122 // KERNEL AND DEVICE CODE
123 
// Shuffle-down tree reduction across one warp: delta halves from warpSize/2
// down to 1, folding each lane's val with the lane delta positions above it.
// After the loop only lane 0 holds the reduction of all lanes.
// NOTE(review): parameters idx and size are not used in this body.
124 template<typename T, Operation OP>
125 __device__ // reduction within a warp, only lane 0 gets right result
126 void warpOp(T& val, size_t idx, size_t size) {
127  unsigned mask = 0xffffffff; // FULL_MASK: every lane of the warp participates
128  for (unsigned int delta = warpSize/2; delta > 0; delta /= 2)
129  {
130 #if defined PROTO_HIP
131  // __shfl_down deprecated since CUDA 9.0 but __shfl_down_sync isn't available with hipcc (17/07/2020)
132  Reduction<T,OP>::update(val, (T)__shfl_down(val,delta));
133 #elif defined PROTO_CUDA
134  Reduction<T,OP>::update(val, (T)__shfl_down_sync(mask,val,delta));
135 #endif
136  }
137 }
138 
// Block-level reduction: each warp reduces via warpOp, lane 0 of each warp
// stages its partial in dynamic shared memory, then the first warp reduces
// those partials. Only threadIdx.x == 0 ends up with the block's result.
// NOTE(review): assumes the launch supplies at least warps*sizeof(T) bytes of
// dynamic shared memory — confirm against the reduce() launch configuration.
139 template<typename T, Operation OP>
140 __device__ // reduce within a block by storing partial warp reductions in shmem
141 void blockOp(T& val, int idx, int size) {
142  PR_assert(blockDim.x <= warpSize*warpSize); // otherwise block won't be reduced by end of function
143  extern __shared__ __align__(sizeof(T)) unsigned int shdata[];
144  T *shmem = reinterpret_cast<T*>(shdata); // reuse raw shared bytes as T storage
145  int lane = threadIdx.x % warpSize;
146  int wid = threadIdx.x / warpSize;
147  int warps = (blockDim.x+warpSize-1)/warpSize; // ceil-div: number of (possibly partial) warps
148 
149  warpOp<T,OP>(val, idx, size);
150  if (warps == 1) return; // uniform across the block (warps is the same for all threads), so safe w.r.t. the barrier below
151 
152  if (!lane) shmem[wid] = val; // first lane of each warp fills memory
153  __syncthreads(); // all warp partials visible before the first warp reads them
154  if (!wid) {
155  val = (threadIdx.x < warps ? shmem[lane] : Reduction<T,OP>::init()); // pad missing lanes with the identity
156  // only first lane of first warp ends up with real value
157  warpOp<T,OP>(val, threadIdx.x, warps);
158  }
159 }
160 
// Reduction kernel. Each thread folds a grid-strided slice of in[] into a
// local partial, blockOp combines those within the block, and thread 0 of
// each block publishes the block result: to out[blockIdx.x] when several
// blocks ran (a second launch then reduces out[]), or directly into *val
// when the grid has a single block (no cross-block race in that case).
// NOTE(review): idx is int — overflows for grids/sizes beyond INT_MAX
// elements; the loop variable i is size_t, so only idx itself is at risk.
161 template<typename T, Operation OP>
162 ACCEL_KERNEL // every block's sum is written to out[]
163 void kernel(size_t size, const T* in, T* out, T* val)
164 { // if called twice by reduce, out=nullptr
165  PR_assert(gridDim.x <= line/sizeof(T)); // each block writes to unique out[] index
166  int idx = blockIdx.x*blockDim.x + threadIdx.x;
167  T ret = Reduction<T,OP>::init(); // start from the identity element of OP
168  for (size_t i = idx; i < size; i += blockDim.x*gridDim.x) // grid-stride loop over in[]
169  {
170  Reduction<T,OP>::update(ret,in[i]);
171  }
172  blockOp<T,OP>(ret, idx, size);
173  if (!threadIdx.x)
174  {
175  if (gridDim.x > 1)
176  {
177  out[blockIdx.x] = ret; // block result is reused in 2nd call
178  }
179  else // only one block in this kernel -- always true for 2nd call
180  {
181  Reduction<T,OP>::update(*val, ret);
182  }
183  }
184 }
185 #endif
186 
187 } // end namespace Proto
188 
189 #endif // __PROTO_REDUCTION_H__
static ACCEL_DECORATION T init()
Initialize Value.
static ACCEL_DECORATION void update(T &a_v1, const T a_v2)
Update Value.
Operation
Definition: Proto_Reduction.H:15
Definition: Proto_Reduction.H:15
Definition: Proto_Reduction.H:16
void reset()
Reset Reduction.
Definition: Proto_Reduction.H:16
Definition: Proto_Reduction.H:15
#define ACCEL_KERNEL
Definition: Proto_Accel.H:13
constexpr int line
Definition: Proto_Reduction.H:18
T * m_hostTotal
Definition: Proto_Reduction.H:103
Definition: Proto_Reduction.H:15
Definition: Proto_Reduction.H:30
#define PR_assert(stmt)
Definition: Proto_PAssert.H:68
#define ACCEL_DECORATION
Definition: Proto_Accel.H:12
T fetchLocal()
Get Reduction.
T * m_hostTemp
Definition: Proto_Reduction.H:104
Definition: Proto_Reduction.H:16
Definition: Proto_Array.H:17
ACCEL_KERNEL void initKernel(T *ptr)
Definition: Proto_Reduction.H:114
T fetch()
Get Reduction.
Atomic
Definition: Proto_Reduction.H:16
void reduce(const T *a_data, const size_t a_size)
Compute Reduction.
Definition: Proto_Reduction.H:15
Definition: Proto_Reduction.H:15
Definition: Proto_Reduction.H:15