#ifndef __PROTO_REDUCTION_H__
#define __PROTO_REDUCTION_H__

#include "Proto_cuda.H"
#include "Proto_macros.H"

#include <algorithm> // std::min / std::max used by host-side code paths
#include <cstddef>   // size_t
#include <limits>    // identity elements for Max/Min reductions

/// Reduction operations supported by Reduction<T,op>.
enum Operation { Max, Min, Abs, Sum };

/// Granularity at which an atomic reduction could be applied.
/// NOTE(review): not referenced elsewhere in this header — presumably used
/// by client code; confirm before removing.
enum Atomic { Warp, Block, None };

/// Bytes in a cache line. kernel() asserts that the per-block partial
/// results (one T per block) fit within a single cache line.
constexpr int line = 128;
#ifdef PROTO_CUDA // namespace collision in host builds: host code uses std::max/std::min instead

/// Device-side binary max (callable from kernels; constexpr-foldable).
template<typename T>
__device__
static constexpr T max(T a, T b) { return a > b ? a : b; }

/// Device-side binary min (callable from kernels; constexpr-foldable).
template<typename T>
__device__
static constexpr T min(T a, T b) { return a < b ? a : b; }

#endif // PROTO_CUDA
/// Combine the running value `last` with the next input `next` under the
/// reduction operation `op`. `op` is a template parameter, so the switch
/// below is resolved at compile time and dead branches are eliminated.
template<typename T, Operation op>
#ifdef PROTO_CUDA
__device__ // utility for the reduction kernels below
#endif
T update(T last, T next)
{
#ifndef PROTO_CUDA
    // host builds have no device max/min helpers (see guard above);
    // fall back to the standard library
    using std::max;
    using std::min;
#endif
    T res;
    switch (op) {
        case Max:
            res = max<T>(last, next);
            break;
        case Min:
            res = min<T>(last, next);
            break;
        case Abs:
            // accumulate the largest magnitude seen so far
            res = (next > 0 ? max<T>(last, next) : max<T>(last, -next));
            break;
        case Sum:
            res = last + next;
            break;
    }
    return res;
}
#ifdef PROTO_CUDA
/// Single-thread kernel: seed *ptr with the identity element of op
/// (lowest representable value for Max, highest for Min, zero for Abs/Sum).
/// Launch as init<T,op><<<1,1>>>(ptr).
template<typename T, Operation op>
__global__
void init(T* ptr)
{
    switch (op) {
        case Max:
            // lowest(), not min(): numeric_limits<T>::min() is the smallest
            // POSITIVE value for floating-point T, which would be a wrong
            // identity for a max over negative data
            *ptr = std::numeric_limits<T>::lowest();
            break;
        case Min:
            *ptr = std::numeric_limits<T>::max();
            break;
        default: // Abs and Sum both accumulate from zero
            *ptr = static_cast<T>(0);
    }
}
#endif // PROTO_CUDA
#ifdef PROTO_CUDA
/// Reduce val across the calling warp with shuffle instructions.
/// idx/size locate this thread in the overall reduction so the final,
/// partially filled warp can restrict the participation mask to live lanes.
template<typename T, Operation op>
__device__
T warpOp(T val, int idx, int size)
{
    unsigned mask = 0xffffffff;
    // thread belongs to the last, partial warp: mask in only the
    // size % warpSize lanes that hold valid data
    if ((size/warpSize)*warpSize <= idx)
        mask = (1u << size % warpSize) - 1; // 1u: avoid signed-shift UB at 31
    for (unsigned int delta = warpSize/2; delta > 0; delta /= 2)
#ifdef PROTO_HIP
        // HIP has no *_sync shuffle variants
        val = update<T,op>(val, (T)__shfl_down(val, delta));
#else
        val = update<T,op>(val, (T)__shfl_down_sync(mask, val, delta));
#endif
    return val; // fully reduced value lives in the warp's first live lane
}
#endif // PROTO_CUDA
#ifdef PROTO_CUDA
/// Reduce val across the thread block: shuffle-reduce each warp, publish the
/// per-warp results to dynamic shared memory, then reduce those in warp 0.
/// Requires ceil(blockDim.x/warpSize)*sizeof(T) bytes of dynamic shared
/// memory (passed as the third launch parameter). The result is valid in
/// thread 0; other threads return partial values.
template<typename T, Operation op>
__device__
T blockOp(T val, int idx, int size)
{
    extern __shared__ __align__(sizeof(T)) unsigned char shdata[];
    T *shmem = reinterpret_cast<T*>(shdata);
    int lane = threadIdx.x % warpSize;
    int wid  = threadIdx.x / warpSize;

    val = warpOp<T,op>(val, idx, size);

    if (!lane) shmem[wid] = val; // lane 0 of each warp publishes its result

    int warps = (blockDim.x + warpSize - 1)/warpSize;
    // every per-warp result must be visible before warp 0 reads them back
    __syncthreads();
    if (threadIdx.x < warps)
        val = shmem[lane]; // lane == threadIdx.x here since warps <= warpSize
    if (!wid) // only warp 0 folds the per-warp partials together
        val = warpOp<T,op>(val, threadIdx.x, warps);

    return val;
}
#endif // PROTO_CUDA
#ifdef PROTO_CUDA
/// Grid-stride reduction kernel.
/// Each block reduces its strided slice of `in`; with more than one block
/// the per-block partials go to out[blockIdx.x] (reduced by a second,
/// single-block launch), while a single-block launch folds the result
/// directly into *val (so `out` may be nullptr on that pass — see
/// Reduction::reduce). *val supplies the running value, letting successive
/// reduce() calls accumulate. Dynamic shared memory: one T per warp.
template<typename T, Operation op>
__global__
void kernel(size_t size, const T* in, T* out, T* val)
{
    // the per-block partials must fit inside one cache line of the buffer
    PR_assert(gridDim.x <= line/sizeof(T));
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    T ret = *val; // seed with the running result (identity after init/reset)

    for (size_t i = idx; i < size; i += blockDim.x*gridDim.x)
        ret = update<T,op>(ret, in[i]);

    ret = blockOp<T,op>(ret, idx, size); // valid in thread 0 of each block

    if (!threadIdx.x) {
        if (gridDim.x > 1)
            out[blockIdx.x] = ret; // partial: a second launch finishes the job
        else
            *val = ret; // final single-block pass: never touches out
    }
}
#endif // PROTO_CUDA
/// Running reduction of an array under op (defaults to Abs, i.e. max norm).
/// Usage: construct once, call reduce() any number of times (data must be
/// device memory in GPU builds), then fetch() the accumulated result;
/// reset() restarts the accumulation at op's identity.
template<typename T, Operation op = Abs>
class Reduction
{
public:
    /// Default: statically sized scratch buffer allocated up front.
    Reduction() : Reduction(false) {}
    /// dynamic = true defers allocation of the per-block scratch buffer
    /// to each reduce() call, sized to that call's block count.
    Reduction(bool dynamic);
    ~Reduction();

    /// Copy the running result to the host and return it.
    T fetch();
    /// Restart the accumulation at the identity element of op.
    void reset();
    /// Fold size elements of data into the running result.
    void reduce(const T *data, const size_t size);

private:
#ifdef PROTO_CUDA
    bool dyn;  // scratch buffer allocated per reduce() call?
    T *temp;   // device buffer of per-block partial results
    T *ult;    // device slot holding the running result
    T *host;   // pinned host mirror filled by fetch()
    int threads, blocks, warp; // launch configuration from device properties
#else
    T ult;     // running result for host-only builds
#endif
};
/// Restart the accumulation: store op's identity element into the running
/// result (on the device via the one-thread init kernel, or directly on host).
template<typename T, Operation op>
void Reduction<T,op>::reset()
{
#ifdef PROTO_CUDA
    init<T,op><<<1,1>>>(ult);
#else
    if (op == Max)
        ult = std::numeric_limits<T>::lowest(); // lowest(), not min(): correct for floating T
    else if (op == Min)
        ult = std::numeric_limits<T>::max();
    else // Abs, Sum
        ult = static_cast<T>(0);
#endif
}
/// Query device 0, allocate the running-result slots, seed them with op's
/// identity, and (unless dynamic) allocate the per-block scratch buffer at
/// its maximum useful size.
template<typename T, Operation op>
Reduction<T,op>::Reduction(bool dynamic)
{
#ifdef PROTO_CUDA
    dyn = dynamic;
    protoDeviceProp prop;
    protoGetDeviceProperties(&prop, 0); // NOTE(review): device 0 assumed throughout
    threads = prop.maxThreadsPerBlock;
    warp = prop.warpSize;
    protoMallocHost(host, sizeof(T));             // pinned mirror for fetch()
    protoMalloc(MEMTYPE_DEFAULT, ult, sizeof(T)); // running result on device
    init<T,op><<<1,1>>>(ult);                     // seed with op's identity
    if (!dyn) {
        // enough blocks to fill every SM at maximum resident-thread count
        blocks = (prop.maxThreadsPerMultiProcessor/threads)*prop.multiProcessorCount;
        protoMalloc(DEVICE, temp, blocks*sizeof(T));
    }
#else
    if (op == Max)
        ult = std::numeric_limits<T>::lowest();
    else if (op == Min)
        ult = std::numeric_limits<T>::max();
    else // Abs, Sum
        ult = static_cast<T>(0);
#endif
}
/// Release the device buffers and the pinned host mirror allocated by the
/// constructor.
template<typename T, Operation op>
Reduction<T,op>::~Reduction()
{
#ifdef PROTO_CUDA
    protoFreeHost(host); // matches protoMallocHost in the constructor — confirm API name
    protoFree(MEMTYPE_DEFAULT, ult);
    protoFree(DEVICE, temp);
#endif
}
/// Copy the running result from the device into the pinned host mirror and
/// return it (blocking copy). On host-only builds simply return the value.
template<typename T, Operation op>
T Reduction<T,op>::fetch()
{
#ifdef PROTO_CUDA
    protoMemcpy(DEVICE, host, ult, sizeof(T), protoMemcpyDeviceToHost);
    return *host;
#else
    return ult;
#endif
}
/// Fold size elements of data into the running result.
/// GPU builds: data must be device memory. A grid-wide pass reduces each
/// block's slice into the scratch buffer, then (if more than one block ran)
/// a single-block pass folds those partials into the running result.
/// Host builds: a plain sequential loop.
template<typename T, Operation op>
void Reduction<T,op>::reduce(const T *data, const size_t size)
{
#ifdef PROTO_CUDA
    protoDeviceProp prop;
    protoGetDeviceProperties(&prop, 0);
    // std::min, not the device-only ::min helper: this runs on the host
    threads = static_cast<int>(std::min(size, (size_t)threads));
    blocks = (size + threads - 1)/threads;
    if (dyn) // static buffers were already allocated by the constructor
        protoMalloc(DEVICE, temp, blocks*sizeof(T));

    // dynamic shared memory: one T per warp in the block (see blockOp)
    int shmem = (threads + warp - 1)/warp;
    protoLaunchKernelMemAsyncGPU((kernel<T,op>), blocks, shmem*warp, shmem*sizeof(T),
                                 (protoStream_t) 0, size, data, temp, ult);
    if (blocks > 1) {
        // second, single-block pass: reduces the per-block partials in temp
        // directly into ult; kernel() never touches out on this pass
        shmem = (blocks + warp - 1)/warp;
        protoLaunchKernelMemAsyncGPU((kernel<T,op>), 1, shmem*warp, shmem*sizeof(T),
                                     (protoStream_t) 0, blocks, temp, nullptr, ult);
    }
#else
    T val = ult;
    for (int it = 0; it < size; it++)
        val = update<T,op>(val, data[it]);
    ult = val;
#endif
}
#endif // __PROTO_REDUCTION_H__