Proto_Reduction.H
#ifndef __PROTO_REDUCTION_H__
#define __PROTO_REDUCTION_H__

//#include "Proto_gpu.H"
#include "Proto_cuda.H"
#include "Proto_macros.H"

#include <limits>
#include <typeinfo>

namespace Proto {

enum Operation { Max, Min, Abs, Sum };
enum Atomic { Warp, Block, None };

constexpr int line = 128; // bytes in a cache line

#ifdef PROTO_CUDA // guarded: these would collide with std::min/max in host builds
template<typename T>
CUDA_DECORATION
static constexpr T max(T a, T b) { return a > b ? a : b; }
template<typename T>
CUDA_DECORATION
static constexpr T min(T a, T b) { return a < b ? a : b; }
#endif

template<typename T, Operation op>
CUDA_DECORATION // combines an accumulated value with the next input according to op
T update(T last, T next) {
  T res;
  switch(op) {
  case Max:
    res = max<T>(last,next);
    break;
  case Min:
    res = min<T>(last,next);
    break;
  case Abs:
    res = (next > 0 ? max<T>(last,next) : max<T>(last,-next));
    break;
  case Sum:
    res = last + next;
  }
  return res;
}
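
// Illustrative note: for Abs the expression above equals max(last, |next|),
// so a full Abs reduction yields the max-norm (infinity norm) of the data,
// provided the accumulator starts at 0 as set by init below.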

template<typename T, Operation op>
CUDA_KERNEL
void init(T* ptr) {
  switch (op) {
  case Max:
    *ptr = std::numeric_limits<T>::lowest(); // lowest(), not min(): min() is the smallest positive value for floating-point T
    break;
  case Min:
    *ptr = std::numeric_limits<T>::max();
    break;
  default: // Abs and Sum
    *ptr = static_cast<T>(0);
    break;
  }
}
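
// Illustrative host-build example (hypothetical variable acc): after init the
// accumulator holds the operation's identity, so the first update adopts the
// first input:
//
//   double acc;
//   init<double,Max>(&acc);               // acc == std::numeric_limits<double>::lowest()
//   acc = update<double,Max>(acc, -3.5);  // acc == -3.5
//   acc = update<double,Max>(acc, -7.0);  // still -3.5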

#ifdef PROTO_CUDA

template<typename T, Operation op>
__device__ // reduction within a warp; only lane 0 gets the right result
T warpOp(T val, int idx, int size) {
  unsigned mask = 0xffffffff; // FULL_MASK
  if ((size/warpSize)*warpSize <= idx) // is this thread in the top (partial) warp?
    mask = (1 << size%warpSize) - 1; // set bits = # of active threads in the top warp
  for (unsigned int delta = warpSize/2; delta > 0; delta /= 2)
#ifdef PROTO_HIP
    val = update<T,op>(val, (T)__shfl_down(val,delta)); // __shfl_down is deprecated since CUDA 9.0, but __shfl_down_sync isn't available with hipcc (17/07/2020)
#else
    val = update<T,op>(val, (T)__shfl_down_sync(mask,val,delta));
#endif
  return val;
}
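
// Illustrative trace, assuming warpSize == 32: each iteration folds lane
// i+delta into lane i, halving the number of live partial results:
//
//   delta = 16: lane 0 <- update(lane 0, lane 16), lane 1 <- update(lane 1, lane 17), ...
//   delta =  8: lane 0 <- update(lane 0, lane  8), ...
//   delta =  1: lane 0 <- update(lane 0, lane  1)  // lane 0 now holds the warp's result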

template<typename T, Operation op>
__device__ // reduce within a block by storing partial warp reductions in shared memory
T blockOp(T val, int idx, int size) {
  extern __shared__ __align__(sizeof(T)) unsigned char shdata[];
  T *shmem = reinterpret_cast<T*>(shdata);
  int lane = threadIdx.x % warpSize;
  int wid = threadIdx.x / warpSize;

  val = warpOp<T,op>(val, idx, size);

  if (!lane) shmem[wid] = val; // the first lane of each warp stores its partial result

  int warps = (blockDim.x+warpSize-1)/warpSize;
  __syncthreads();
  if (threadIdx.x < warps)
    val = shmem[lane];
  // only the first lane of the first warp ends up with the real value
  if (!wid) val = warpOp<T,op>(val, threadIdx.x, warps);

  return val;
}
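
// Note: the dynamically sized shared buffer must hold one T per warp, and the
// final warpOp is performed by warp 0 alone, so this scheme relies on
// warps <= warpSize, i.e. blockDim.x <= warpSize*warpSize (1024 for warpSize 32).
// reduce() below sizes each launch accordingly.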

template<typename T, Operation op>
__global__ // every block's partial result is written to out[]
void kernel(size_t size, const T* in, T* out, T* val) { // out == nullptr on the second call from reduce
  PR_assert(gridDim.x <= line/sizeof(T)); // each block writes to a unique out[] index
  int idx = blockIdx.x*blockDim.x + threadIdx.x;
  T ret = *val; // ensures that each reduction starts with the result of the previous one

  for (size_t i = idx; i < size; i += blockDim.x*gridDim.x)
    ret = update<T,op>(ret,in[i]);

  ret = blockOp<T,op>(ret, idx, size);
  if (!threadIdx.x) {
    if (gridDim.x > 1)
      out[blockIdx.x] = ret; // the block's result is reused in the 2nd call
    else // only one block in this kernel -- always true for the 2nd call
      *val = ret;
  }
}
#endif
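
// Illustrative walk-through with hypothetical numbers: reducing 10^6 doubles
// with 16 blocks of 1024 threads takes two launches. The first kernel
// grid-strides over in[] and leaves one partial result per block in
// out[0..15]; the second runs a single block that reduces those 16 values
// and stores the final result in *val.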

template<typename T, Operation op = Abs>
class Reduction
{
public:
  Reduction<T,op>() : Reduction<T,op>(false) {}

  // static allocation at construction, or dynamic allocation in reduce
  Reduction<T,op>(bool dynamic);

  ~Reduction<T,op>();

  T fetch(); // returns the value, waiting for kernel completion in a GPU run

  void reduce(const T *data, const size_t size); // configures and calls the kernel

  void reset(); // initializes the result according to the operation

private:
  T *host;
  int size;
#ifdef PROTO_CUDA
  bool dyn;
  T *temp, *ult;
  int threads, blocks, warp;
#endif
};
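
// Usage sketch (illustrative; devPtr and N are hypothetical):
//
//   Reduction<double, Max> rxn;   // static allocation sized to the device
//   rxn.reduce(devPtr, N);        // asynchronous kernel launch(es) on a GPU build
//   double result = rxn.fetch();  // copies the result back and returns it
//   rxn.reset();                  // reinitialize before reusing the object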

template<typename T, Operation op>
void Reduction<T,op>::reset() {
#ifdef PROTO_CUDA
  init<T,op><<<1,1>>>(ult);
#else
  init<T,op>(host);
#endif
}

template<typename T, Operation op>
Reduction<T,op>::Reduction(bool dynamic) {
#ifdef PROTO_CUDA
  // TODO: get the number of streams at runtime
  //dyn = dynamic;
  dyn = false;
  protoDeviceProp prop;
  protoGetDeviceProperties(&prop, 0); // assumes the system's GPUs are identical
  threads = prop.maxThreadsPerBlock; // threads in a block
  warp = prop.warpSize; // threads in a warp
  protoMallocHost(host,sizeof(T));
  protoMalloc(MEMTYPE_DEFAULT,ult,sizeof(T));
  init<T,op><<<1,1>>>(ult);
  if (!dyn) {
    // fill the device with blocks for this kernel
    blocks = (prop.maxThreadsPerMultiProcessor/threads)*prop.multiProcessorCount; // TODO: divide by streams
    protoMalloc(DEVICE,temp,blocks*sizeof(T)); // TODO: multiply by streams
  }
#else
  host = new T;
  init<T,op>(host);
#endif
}
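
// Illustrative arithmetic (hypothetical device numbers): with
// prop.maxThreadsPerMultiProcessor = 2048, threads = 1024 and 8
// multiprocessors, the static configuration is blocks = (2048/1024)*8 = 16,
// i.e. two resident blocks per SM, and temp holds 16 values of sizeof(T) bytes.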

template<typename T, Operation op>
Reduction<T,op>::~Reduction() {
#ifdef PROTO_CUDA
  protoFreeHost(host);
  protoFree(MEMTYPE_DEFAULT,ult);
  protoFree(DEVICE,temp);
#else
  delete host;
#endif
}

template<typename T, Operation op>
T Reduction<T,op>::fetch() {
#ifdef PROTO_CUDA
  // TODO: synchronize multiple streams
  protoMemcpy(DEVICE,host,ult,sizeof(T),protoMemcpyDeviceToHost);
#endif
  return *host;
}

template<typename T, Operation op>
void Reduction<T,op>::reduce(const T *data, const size_t size) {
#ifdef PROTO_CUDA
  if (dyn) {
    protoDeviceProp prop;
    protoGetDeviceProperties(&prop, 0); // assumes the system's GPUs are identical
    threads = min(size,(size_t)threads); // threads in a block
    blocks = (size+threads-1)/threads; // blocks in the kernel
    protoMalloc(DEVICE,temp,blocks*sizeof(T)); // the first kernel call produces one reduction per block
  }
  int shmem = (threads+warp-1)/warp; // warps in a block

  protoLaunchKernelMemAsyncGPU((kernel<T,op>),blocks,shmem*warp,shmem*sizeof(T), (protoStream_t) 0, size, data, temp, ult);
  if (blocks > 1) {
    shmem = (blocks+warp-1)/warp; // each block of the first kernel left a partial reduction
    protoLaunchKernelMemAsyncGPU((kernel<T,op>),1,shmem*warp,shmem*sizeof(T), (protoStream_t) 0, blocks, temp, nullptr, ult); // updates ult
  }
#else
  T val = *host;
  for (size_t it = 0; it < size; it++) // size_t index avoids a signed/unsigned comparison
    val = update<T,op>(val,data[it]);
  *host = val;
#endif
}
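
// Illustrative launch arithmetic, assuming threads = 1024 and warp = 32:
// shmem = (1024+31)/32 = 32 warps, so the first launch runs `blocks` blocks
// of 32*32 = 1024 threads with 32*sizeof(T) bytes of shared memory; the
// second launch uses a single block of (blocks+31)/32 warps to fold the
// per-block partials in temp[] into *ult.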

} // end namespace Proto

#endif // __PROTO_REDUCTION_H__