OpenCV 2.4.8 components for OpenCVgrabber.
[mmanager-3rdparty.git] OpenCV2.4.8/build/include/opencv2/gpu/device/detail/reduce_key_val.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__

#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"

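// Implementation detail for the device-side key/value reductions exposed through
// "../reduce.hpp": template machinery that merges (key, value) pairs across a warp
// or a thread block, keeping the key preferred by a caller-supplied comparator
// together with its associated value(s). Keys and values may be single scalars or
// 10-element thrust::tuples of them.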
namespace cv { namespace gpu { namespace device
{
    namespace reduce_key_val_detail
    {
        template <typename T> struct GetType;
        template <typename T> struct GetType<T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<volatile T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<T&>
        {
            typedef T type;
        };

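        // For<I, N>: compile-time loop over tuple elements I..N-1. Each static member
        // applies one element-wise step and recurses into For<I + 1, N>; the For<N, N>
        // specialization below terminates the recursion with empty bodies.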
        template <unsigned int I, unsigned int N>
        struct For
        {
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
            {
                thrust::get<I>(smem)[tid] = thrust::get<I>(data);

                For<I + 1, N>::loadToSmem(smem, data, tid);
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
            {
                thrust::get<I>(data) = thrust::get<I>(smem)[tid];

                For<I + 1, N>::loadFromSmem(smem, data, tid);
            }

            template <class ReferenceTuple>
            static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
            {
                thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);

                For<I + 1, N>::copyShfl(val, delta, width);
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
            {
                thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];

                For<I + 1, N>::copy(svals, val, tid, delta);
            }

            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
            {
                typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);

                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
                {
                    thrust::get<I>(key) = reg;
                    thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
                }

                For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
            }
            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
                                         const ValPointerTuple& svals, const ValReferenceTuple& val,
                                         const CmpTuple& cmp,
                                         unsigned int tid, unsigned int delta)
            {
                typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];

                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
                {
                    thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
                    thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
                }

                For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
            }
        };
        template <unsigned int N>
        struct For<N, N>
        {
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
            {
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
            {
            }

            template <class ReferenceTuple>
            static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
            {
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
            {
            }

            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
            {
            }
            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
                                         const ValPointerTuple&, const ValReferenceTuple&,
                                         const CmpTuple&,
                                         unsigned int, unsigned int)
            {
            }
        };

        //////////////////////////////////////////////////////
        // loadToSmem

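        // The scalar overloads below move a single key or value between registers and
        // shared memory; the tuple overloads fan the same operation out over every
        // element of a 10-element thrust::tuple via For<0, tuple_size>.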
        template <typename T>
        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
        {
            smem[tid] = data;
        }
        template <typename T>
        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
        {
            data = smem[tid];
        }
        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                                   const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                                   unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
        }
        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                                     const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                                     unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
        }

        //////////////////////////////////////////////////////
        // copyVals

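        // Once a partner's key has won a comparison, copyValsShfl/copyVals pull that
        // partner's value(s) into the current thread's slot, via warp shuffle or
        // shared memory respectively.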
        template <typename V>
        __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
        {
            val = shfl_down(val, delta, width);
        }
        template <typename V>
        __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
        {
            svals[tid] = val = svals[tid + delta];
        }
        template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                     unsigned int delta,
                                                     int width)
        {
            For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
        }
        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                 unsigned int tid, unsigned int delta)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
        }

        //////////////////////////////////////////////////////
        // merge

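        // merge/mergeShfl compare the key held delta slots (or lanes) away with this
        // thread's key; when the comparator prefers the remote key, both the key and
        // its associated value(s) are taken, so (key, value) pairs always travel together.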
        template <typename K, typename V, class Cmp>
        __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
        {
            K reg = shfl_down(key, delta, width);

            if (cmp(reg, key))
            {
                key = reg;
                copyValsShfl(val, delta, width);
            }
        }
        template <typename K, typename V, class Cmp>
        __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
        {
            K reg = skeys[tid + delta];

            if (cmp(reg, key))
            {
                skeys[tid] = key = reg;
                copyVals(svals, val, tid, delta);
            }
        }
        template <typename K,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp>
        __device__ __forceinline__ void mergeShfl(K& key,
                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                  const Cmp& cmp,
                                                  unsigned int delta, int width)
        {
            K reg = shfl_down(key, delta, width);

            if (cmp(reg, key))
            {
                key = reg;
                copyValsShfl(val, delta, width);
            }
        }
        template <typename K,
                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp>
        __device__ __forceinline__ void merge(volatile K* skeys, K& key,
                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                              const Cmp& cmp, unsigned int tid, unsigned int delta)
        {
            K reg = skeys[tid + delta];

            if (cmp(reg, key))
            {
                skeys[tid] = key = reg;
                copyVals(svals, val, tid, delta);
            }
        }
        template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
        __device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                  const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                                  unsigned int delta, int width)
        {
            For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
        }
        template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
                  typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
        __device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                              const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                              const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                              unsigned int tid, unsigned int delta)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
        }

        //////////////////////////////////////////////////////
        // Generic

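        // Generic<N>: shared-memory tree reduction for an arbitrary block size N.
        // Each pass merges element tid with element tid + delta and halves delta,
        // separating passes with __syncthreads() until only one warp remains; the
        // final strides (16 down to 1) run without explicit synchronization.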
        template <unsigned int N> struct Generic
        {
            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                loadToSmem(skeys, key, tid);
                loadToSmem(svals, val, tid);
                if (N >= 32)
                    __syncthreads();

                if (N >= 2048)
                {
                    if (tid < 1024)
                        merge(skeys, key, svals, val, cmp, tid, 1024);

                    __syncthreads();
                }
                if (N >= 1024)
                {
                    if (tid < 512)
                        merge(skeys, key, svals, val, cmp, tid, 512);

                    __syncthreads();
                }
                if (N >= 512)
                {
                    if (tid < 256)
                        merge(skeys, key, svals, val, cmp, tid, 256);

                    __syncthreads();
                }
                if (N >= 256)
                {
                    if (tid < 128)
                        merge(skeys, key, svals, val, cmp, tid, 128);

                    __syncthreads();
                }
                if (N >= 128)
                {
                    if (tid < 64)
                        merge(skeys, key, svals, val, cmp, tid, 64);

                    __syncthreads();
                }
                if (N >= 64)
                {
                    if (tid < 32)
                        merge(skeys, key, svals, val, cmp, tid, 32);
                }

                if (tid < 16)
                {
                    merge(skeys, key, svals, val, cmp, tid, 16);
                    merge(skeys, key, svals, val, cmp, tid, 8);
                    merge(skeys, key, svals, val, cmp, tid, 4);
                    merge(skeys, key, svals, val, cmp, tid, 2);
                    merge(skeys, key, svals, val, cmp, tid, 1);
                }
            }
        };

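        // Unroll<I, ...> emits a fully unrolled sequence of merges with strides
        // I, I/2, ..., 1; the I == 0 specialization below ends the unrolling.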
        template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
        struct Unroll
        {
            static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
            {
                mergeShfl(key, val, cmp, I, N);
                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
            }
            static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                merge(skeys, key, svals, val, cmp, tid, I);
                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
            }
        };
        template <class KP, class KR, class VP, class VR, class Cmp>
        struct Unroll<0, KP, KR, VP, VR, Cmp>
        {
            static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
            {
            }
            static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
            {
            }
        };

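        // WarpOptimized<N>: reduction within a single warp (N <= 32). The warp-shuffle
        // path is compiled out here (#if 0), so the shared-memory Unroll loop is
        // always taken.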
        template <unsigned int N> struct WarpOptimized
        {
            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
            #if 0 // __CUDA_ARCH__ >= 300
                (void) skeys;
                (void) svals;
                (void) tid;

                Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
            #else
                loadToSmem(skeys, key, tid);
                loadToSmem(svals, val, tid);

                if (tid < N / 2)
                    Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
            #endif
            }
        };

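        // GenericOptimized32<N>: reduction for power-of-two block sizes up to 1024.
        // Each warp first reduces its own 32 elements, lane 0 of every warp writes the
        // per-warp result to slot tid / 32, and the first 32 threads then reduce the
        // M = N / 32 partial results.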
        template <unsigned int N> struct GenericOptimized32
        {
            enum { M = N / 32 };

            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                const unsigned int laneId = Warp::laneId();

            #if 0 // __CUDA_ARCH__ >= 300
                Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);

                if (laneId == 0)
                {
                    loadToSmem(skeys, key, tid / 32);
                    loadToSmem(svals, val, tid / 32);
                }
            #else
                loadToSmem(skeys, key, tid);
                loadToSmem(svals, val, tid);

                if (laneId < 16)
                    Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);

                __syncthreads();

                if (laneId == 0)
                {
                    loadToSmem(skeys, key, tid / 32);
                    loadToSmem(svals, val, tid / 32);
                }
            #endif

                __syncthreads();

                loadFromSmem(skeys, key, tid);

                if (tid < 32)
                {
                #if 0 // __CUDA_ARCH__ >= 300
                    loadFromSmem(svals, val, tid);

                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
                #else
                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
                #endif
                }
            }
        };

        template <bool val, class T1, class T2> struct StaticIf;
        template <class T1, class T2> struct StaticIf<true, T1, T2>
        {
            typedef T1 type;
        };
        template <class T1, class T2> struct StaticIf<false, T1, T2>
        {
            typedef T2 type;
        };

        template <unsigned int N> struct IsPowerOf2
        {
            enum { value = ((N != 0) && !(N & (N - 1))) };
        };

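        // Dispatcher<N> picks the reduction strategy at compile time:
        //   N <= 32 and a power of two    -> WarpOptimized<N>
        //   N <= 1024 and a power of two  -> GenericOptimized32<N>
        //   anything else                 -> Generic<N>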
        template <unsigned int N> struct Dispatcher
        {
            typedef typename StaticIf<
                (N <= 32) && IsPowerOf2<N>::value,
                WarpOptimized<N>,
                typename StaticIf<
                    (N <= 1024) && IsPowerOf2<N>::value,
                    GenericOptimized32<N>,
                    Generic<N>
                >::type
            >::type reductor;
        };
    }
}}}
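// Illustrative sketch (not part of the original header): how a kernel might drive the
// selected reductor directly for a 256-thread block doing an argmin-style reduction.
// The kernel, shared-memory array names, and the Less comparator are hypothetical;
// in OpenCV this machinery is normally reached through the key/value reduction helpers
// in ../reduce.hpp rather than called by hand.
//
//   struct Less { __device__ bool operator()(float a, float b) const { return a < b; } };
//
//   __global__ void argmin_kernel(const float* data, int n, float* out_key, int* out_idx)
//   {
//       __shared__ float skeys[256];   // per-thread candidate keys
//       __shared__ int   sidxs[256];   // per-thread candidate indices
//
//       float key = 1e30f;
//       int   idx = -1;
//       for (int i = threadIdx.x; i < n; i += 256)
//           if (data[i] < key) { key = data[i]; idx = i; }
//
//       using namespace cv::gpu::device::reduce_key_val_detail;
//       Dispatcher<256>::reductor::reduce<volatile float*, float&, volatile int*, int&, const Less&>(
//           skeys, key, sidxs, idx, threadIdx.x, Less());
//
//       // After the reduction, thread 0 holds the winning key and its index.
//       if (threadIdx.x == 0) { *out_key = key; *out_idx = idx; }
//   }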

#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__