#if defined (_WIN32) || defined (__i386__)
#define BT_USE_SSE_IN_API
#endif

#include "btVector3.h"

#if defined BT_USE_SIMD_VECTOR3

typedef float float4 __attribute__ ((vector_size(16)));
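// For orientation, a minimal scalar sketch of the contract the SIMD kernels in
// this file implement. The name _maxdot_large_scalar is ours, for illustration
// only; it is not part of Bullet's API. Points are packed 4 floats apart and
// the w element must be ignored; the return value is the index of the point
// with the largest dot product against vec, which is written to *dotResult.
static inline long _maxdot_large_scalar( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    long maxIndex = -1L;
    float maxDot = -BT_INFINITY;
    for( unsigned long i = 0; i < count; i++ )
    {
        const float *v = vv + 4*i;  // 16-byte stride; v[3] (w) is ignored
        float dot = v[0]*vec[0] + v[1]*vec[1] + v[2]*vec[2];
        if( dot > maxDot )
        {
            maxDot = dot;
            maxIndex = (long) i;
        }
    }
    *dotResult = maxDot;
    return maxIndex;
}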
#if defined BT_USE_SSE || defined _WIN32

#define LOG2_ARRAY_SIZE 6
#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)

#include <emmintrin.h>

long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    const float4 *vertices = (const float4*) vv;
    static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
    float4 dotMax = btAssign128( -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY );
    float4 vvec = _mm_loadu_ps( vec );
    float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));    // zzzz
    float4 vLo = _mm_movelh_ps( vvec, vvec );                                       // xyxy

    long maxIndex = -1L;

    size_t segment = 0;
    float4 stack_array[ STACK_ARRAY_COUNT ];
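    // Strategy note (our summary of the code below): dot products are computed
    // four at a time and every 4-wide result is parked in stack_array. The only
    // value maintained in the hot loop is the running 4-lane max; the expensive
    // search for *where* the max lives is deferred until a whole tile finishes,
    // and runs only when that tile actually improved on dotMax.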
    size_t index;
    float4 max;
    // Faster loop without cleanup code for full tiles
    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
    {
        max = dotMax;

        for( index = 0; index < STACK_ARRAY_COUNT; index += 4 )
        {   // do four dot products at a time, carefully avoiding the w element
            float4 v0 = vertices[0];
            float4 v1 = vertices[1];
            float4 v2 = vertices[2];
            float4 v3 = vertices[3];  vertices += 4;

            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0 * vLo;
            lo1 = lo1 * vLo;
            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z * vHi;
            x = x + y;
            x = x + z;
            stack_array[index] = x;
            max = _mm_max_ps( x, max );     // control the order here so that max is never NaN, even if x is NaN

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3];  vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0 * vLo;
            lo1 = lo1 * vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z * vHi;
            x = x + y;
            x = x + z;
            stack_array[index+1] = x;
            max = _mm_max_ps( x, max );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3];  vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0 * vLo;
            lo1 = lo1 * vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z * vHi;
            x = x + y;
            x = x + z;
            stack_array[index+2] = x;
            max = _mm_max_ps( x, max );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3];  vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0 * vLo;
            lo1 = lo1 * vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z * vHi;
            x = x + y;
            x = x + z;
            stack_array[index+3] = x;
            max = _mm_max_ps( x, max );
        }

        // if we found a new max
        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
        {
            // copy the new max across all lanes of our max accumulator
            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
            max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));

            dotMax = max;

            // find the first occurrence of that max
            size_t test;
            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )
            {
            }
            // record where it is
            maxIndex = 4*index + segment + indexTable[test];
        }
    }
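    // Worked example of the index recovery above (our note): _mm_movemask_ps
    // packs the four lane sign bits of the compare into bits 0..3, so a max
    // found in lane 2 yields mask 0b0100 = 4, and indexTable[4] == 2.
    // indexTable maps every nonzero mask to its lowest set bit; entry 0 is
    // (unsigned char)-1 and is unreachable, because the scan loop only stops
    // on a nonzero mask.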
    // account for work we've already done
    count -= segment;

    // deal with the last < STACK_ARRAY_COUNT*4 vectors
    max = dotMax;
    index = 0;

    for( ; index + 4 <= count / 4; index += 4 )
    {   // do four dot products at a time, carefully avoiding the w element
        float4 v0 = vertices[0];
        float4 v1 = vertices[1];
        float4 v2 = vertices[2];
        float4 v3 = vertices[3];  vertices += 4;

        float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
        float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
        float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
        float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

        lo0 = lo0 * vLo;
        lo1 = lo1 * vLo;
        float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
        float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
        float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
        z = z * vHi;
        x = x + y;
        x = x + z;
        stack_array[index] = x;
        max = _mm_max_ps( x, max );

        v0 = vertices[0];
        v1 = vertices[1];
        v2 = vertices[2];
        v3 = vertices[3];  vertices += 4;

        lo0 = _mm_movelh_ps( v0, v1);
        hi0 = _mm_movehl_ps( v1, v0);
        lo1 = _mm_movelh_ps( v2, v3);
        hi1 = _mm_movehl_ps( v3, v2);

        lo0 = lo0 * vLo;
        lo1 = lo1 * vLo;
        z = _mm_shuffle_ps(hi0, hi1, 0x88);
        x = _mm_shuffle_ps(lo0, lo1, 0x88);
        y = _mm_shuffle_ps(lo0, lo1, 0xdd);
        z = z * vHi;
        x = x + y;
        x = x + z;
        stack_array[index+1] = x;
        max = _mm_max_ps( x, max );

        v0 = vertices[0];
        v1 = vertices[1];
        v2 = vertices[2];
        v3 = vertices[3];  vertices += 4;

        lo0 = _mm_movelh_ps( v0, v1);
        hi0 = _mm_movehl_ps( v1, v0);
        lo1 = _mm_movelh_ps( v2, v3);
        hi1 = _mm_movehl_ps( v3, v2);

        lo0 = lo0 * vLo;
        lo1 = lo1 * vLo;
        z = _mm_shuffle_ps(hi0, hi1, 0x88);
        x = _mm_shuffle_ps(lo0, lo1, 0x88);
        y = _mm_shuffle_ps(lo0, lo1, 0xdd);
        z = z * vHi;
        x = x + y;
        x = x + z;
        stack_array[index+2] = x;
        max = _mm_max_ps( x, max );

        v0 = vertices[0];
        v1 = vertices[1];
        v2 = vertices[2];
        v3 = vertices[3];  vertices += 4;

        lo0 = _mm_movelh_ps( v0, v1);
        hi0 = _mm_movehl_ps( v1, v0);
        lo1 = _mm_movelh_ps( v2, v3);
        hi1 = _mm_movehl_ps( v3, v2);

        lo0 = lo0 * vLo;
        lo1 = lo1 * vLo;
        z = _mm_shuffle_ps(hi0, hi1, 0x88);
        x = _mm_shuffle_ps(lo0, lo1, 0x88);
        y = _mm_shuffle_ps(lo0, lo1, 0xdd);
        z = z * vHi;
        x = x + y;
        x = x + z;
        stack_array[index+3] = x;
        max = _mm_max_ps( x, max );
    }
    size_t localCount = (count & -4L) - 4*index;
    if( localCount )
    {
#ifdef __APPLE__
        float4 t0, t1, t2, t3, t4;
        float4 *sap = &stack_array[index + localCount / 4];
        vertices += localCount;     // counter the negative byteIndex below
        size_t byteIndex = -(localCount) * sizeof(float);
        // AT&T-style assembly
        asm volatile(
            ".align 4                                               \n\
     0: movaps  %[max], %[t2]                           // move max out of the way to avoid propagating NaNs in max \n\
        movaps  (%[vertices], %[byteIndex], 4), %[t0]   // vertices[0]      \n\
        movaps  16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1]      \n\
        movaps  %[t0], %[max]                           // vertices[0]      \n\
        movlhps %[t1], %[max]                           // x0y0x1y1         \n\
        movaps  32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2]      \n\
        movaps  48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3]      \n\
        mulps   %[vLo], %[max]                          // x0y0x1y1 * vLo   \n\
        movhlps %[t0], %[t1]                            // z0w0z1w1         \n\
        movaps  %[t3], %[t0]                            // vertices[2]      \n\
        movlhps %[t4], %[t0]                            // x2y2x3y3         \n\
        mulps   %[vLo], %[t0]                           // x2y2x3y3 * vLo   \n\
        movhlps %[t3], %[t4]                            // z2w2z3w3         \n\
        shufps  $0x88, %[t4], %[t1]                     // z0z1z2z3         \n\
        mulps   %[vHi], %[t1]                           // z0z1z2z3 * vHi   \n\
        movaps  %[max], %[t3]                           // x0y0x1y1 * vLo   \n\
        shufps  $0x88, %[t0], %[max]                    // x0x1x2x3 * vLo.x \n\
        shufps  $0xdd, %[t0], %[t3]                     // y0y1y2y3 * vLo.y \n\
        addps   %[t3], %[max]                           // x + y            \n\
        addps   %[t1], %[max]                           // x + y + z        \n\
        movaps  %[max], (%[sap], %[byteIndex])          // record result for later scrutiny \n\
        maxps   %[t2], %[max]                           // record max, restore max \n\
        add     $16, %[byteIndex]                       // advance loop counter \n\
        jnz     0b                                      \n\
        "
            : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
            : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
            : "memory", "cc");
        index += localCount/4;
#else
        {
            for( unsigned int i=0; i<localCount/4; i++, index++ )
            {   // do four dot products at a time, carefully avoiding the w element
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3];  vertices += 4;

                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

                lo0 = lo0 * vLo;
                lo1 = lo1 * vLo;
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z * vHi;
                x = x + y;
                x = x + z;
                stack_array[index] = x;
                max = _mm_max_ps( x, max );
            }
        }
#endif  // __APPLE__
    }
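    // Note on the asm tile above (our summary): byteIndex starts negative and
    // counts up by 16 toward zero, so the final add both advances the pointer
    // math and sets the zero flag that jnz tests -- no separate cmp is needed.
    // Results are still spilled through sap into stack_array so the same
    // movemask/indexTable scan below can locate the winner.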
    // process the last few points
    if( count & 3 )
    {
        float4 v0, v1, v2, x, y, z;
        switch( count & 3 )
        {
            case 3:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];

                // calculate 3 dot products, transpose, duplicate v2
                float4 lo0 = _mm_movelh_ps( v0, v1);    // xyxy.lo
                float4 hi0 = _mm_movehl_ps( v1, v0);    // z?z?.lo
                lo0 = lo0 * vLo;
                z = _mm_shuffle_ps(hi0, v2, 0xa8 );     // z0z1z2z2
                z = z * vHi;
                float4 lo1 = _mm_movelh_ps(v2, v2);     // xyxy
                lo1 = lo1 * vLo;
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            }
            break;
            case 2:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                float4 xy = _mm_movelh_ps(v0, v1);
                z = _mm_movehl_ps(v1, v0);
                xy = xy * vLo;
                z = _mm_shuffle_ps( z, z, 0xa8);
                x = _mm_shuffle_ps( xy, xy, 0xa8);
                y = _mm_shuffle_ps( xy, xy, 0xfd);
                z = z * vHi;
            }
            break;
            case 1:
            {
                float4 xy = vertices[0];
                z = _mm_shuffle_ps( xy, xy, 0xaa);
                xy = xy * vLo;
                z = z * vHi;
                x = _mm_shuffle_ps(xy, xy, 0);
                y = _mm_shuffle_ps(xy, xy, 0x55);
            }
            break;
        }
        x = x + y;
        x = x + z;
        stack_array[index] = x;
        max = _mm_max_ps( x, max );     // control the order here so that max is never NaN, even if x is NaN
        index++;
    }
    // if we found a new max
    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
    {   // we found a new max; search for it
        // find max across the max vector, place in all elements of max -- big latency hit here
        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
        max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));

        dotMax = max;

        // scan for the first occurrence of max in the array
        size_t test;
        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ )
        {
        }
        maxIndex = 4*index + segment + indexTable[test];
    }

    _mm_store_ss( dotResult, dotMax );
    return maxIndex;
}
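// A hedged usage sketch (variable names are ours; btMaxDot in btVector3.h is
// the public wrapper). The point array is read with aligned SIMD loads, so it
// must be 16-byte aligned and padded to 4-float slots:
//
//     float bestDot;
//     long  best = _maxdot_large( (const float *)points, (const float *)&dir,
//                                 pointCount, &bestDot );
//     // best    == index of the supporting point along dir
//     // bestDot == that point's dot product with dir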
long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    const float4 *vertices = (const float4*) vv;
    static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
    float4 dotmin = btAssign128( BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY );
    float4 vvec = _mm_loadu_ps( vec );
    float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));    // zzzz
    float4 vLo = _mm_movelh_ps( vvec, vvec );                                       // xyxy

    long minIndex = -1L;

    size_t segment = 0;
    float4 stack_array[ STACK_ARRAY_COUNT ];
    size_t index;
    float4 min;
    // Faster loop without cleanup code for full tiles
    for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
    {
        min = dotmin;

        for( index = 0; index < STACK_ARRAY_COUNT; index += 4 )
        {   // do four dot products at a time, carefully avoiding the w element
            float4 v0 = vertices[0];
            float4 v1 = vertices[1];
            float4 v2 = vertices[2];
            float4 v3 = vertices[3];  vertices += 4;

            float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
            float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
            float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
            float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

            lo0 = lo0 * vLo;
            lo1 = lo1 * vLo;
            float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
            float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
            float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z * vHi;
            x = x + y;
            x = x + z;
            stack_array[index] = x;
            min = _mm_min_ps( x, min );     // control the order here so that min is never NaN, even if x is NaN

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3];  vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0 * vLo;
            lo1 = lo1 * vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z * vHi;
            x = x + y;
            x = x + z;
            stack_array[index+1] = x;
            min = _mm_min_ps( x, min );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3];  vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0 * vLo;
            lo1 = lo1 * vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z * vHi;
            x = x + y;
            x = x + z;
            stack_array[index+2] = x;
            min = _mm_min_ps( x, min );

            v0 = vertices[0];
            v1 = vertices[1];
            v2 = vertices[2];
            v3 = vertices[3];  vertices += 4;

            lo0 = _mm_movelh_ps( v0, v1);
            hi0 = _mm_movehl_ps( v1, v0);
            lo1 = _mm_movelh_ps( v2, v3);
            hi1 = _mm_movehl_ps( v3, v2);

            lo0 = lo0 * vLo;
            lo1 = lo1 * vLo;
            z = _mm_shuffle_ps(hi0, hi1, 0x88);
            x = _mm_shuffle_ps(lo0, lo1, 0x88);
            y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            z = z * vHi;
            x = x + y;
            x = x + z;
            stack_array[index+3] = x;
            min = _mm_min_ps( x, min );
        }

        // if we found a new min
        if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
        {
            // copy the new min across all lanes of our min accumulator
            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
            min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));

            dotmin = min;

            // find the first occurrence of that min
            size_t test;
            for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )
            {
            }
            // record where it is
            minIndex = 4*index + segment + indexTable[test];
        }
    }
    // account for work we've already done
    count -= segment;

    // deal with the last < STACK_ARRAY_COUNT*4 vectors
    min = dotmin;
    index = 0;

    for( ; index + 4 <= count / 4; index += 4 )
    {   // do four dot products at a time, carefully avoiding the w element
        float4 v0 = vertices[0];
        float4 v1 = vertices[1];
        float4 v2 = vertices[2];
        float4 v3 = vertices[3];  vertices += 4;

        float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
        float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
        float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
        float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

        lo0 = lo0 * vLo;
        lo1 = lo1 * vLo;
        float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
        float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
        float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
        z = z * vHi;
        x = x + y;
        x = x + z;
        stack_array[index] = x;
        min = _mm_min_ps( x, min );

        v0 = vertices[0];
        v1 = vertices[1];
        v2 = vertices[2];
        v3 = vertices[3];  vertices += 4;

        lo0 = _mm_movelh_ps( v0, v1);
        hi0 = _mm_movehl_ps( v1, v0);
        lo1 = _mm_movelh_ps( v2, v3);
        hi1 = _mm_movehl_ps( v3, v2);

        lo0 = lo0 * vLo;
        lo1 = lo1 * vLo;
        z = _mm_shuffle_ps(hi0, hi1, 0x88);
        x = _mm_shuffle_ps(lo0, lo1, 0x88);
        y = _mm_shuffle_ps(lo0, lo1, 0xdd);
        z = z * vHi;
        x = x + y;
        x = x + z;
        stack_array[index+1] = x;
        min = _mm_min_ps( x, min );

        v0 = vertices[0];
        v1 = vertices[1];
        v2 = vertices[2];
        v3 = vertices[3];  vertices += 4;

        lo0 = _mm_movelh_ps( v0, v1);
        hi0 = _mm_movehl_ps( v1, v0);
        lo1 = _mm_movelh_ps( v2, v3);
        hi1 = _mm_movehl_ps( v3, v2);

        lo0 = lo0 * vLo;
        lo1 = lo1 * vLo;
        z = _mm_shuffle_ps(hi0, hi1, 0x88);
        x = _mm_shuffle_ps(lo0, lo1, 0x88);
        y = _mm_shuffle_ps(lo0, lo1, 0xdd);
        z = z * vHi;
        x = x + y;
        x = x + z;
        stack_array[index+2] = x;
        min = _mm_min_ps( x, min );

        v0 = vertices[0];
        v1 = vertices[1];
        v2 = vertices[2];
        v3 = vertices[3];  vertices += 4;

        lo0 = _mm_movelh_ps( v0, v1);
        hi0 = _mm_movehl_ps( v1, v0);
        lo1 = _mm_movelh_ps( v2, v3);
        hi1 = _mm_movehl_ps( v3, v2);

        lo0 = lo0 * vLo;
        lo1 = lo1 * vLo;
        z = _mm_shuffle_ps(hi0, hi1, 0x88);
        x = _mm_shuffle_ps(lo0, lo1, 0x88);
        y = _mm_shuffle_ps(lo0, lo1, 0xdd);
        z = z * vHi;
        x = x + y;
        x = x + z;
        stack_array[index+3] = x;
        min = _mm_min_ps( x, min );
    }
    size_t localCount = (count & -4L) - 4*index;
    if( localCount )
    {
#ifdef __APPLE__
        vertices += localCount;     // counter the negative byteIndex below
        float4 t0, t1, t2, t3, t4;
        size_t byteIndex = -(localCount) * sizeof(float);
        float4 *sap = &stack_array[index + localCount / 4];
        // AT&T-style assembly
        asm volatile(
            ".align 4                                               \n\
     0: movaps  %[min], %[t2]                           // move min out of the way to avoid propagating NaNs in min \n\
        movaps  (%[vertices], %[byteIndex], 4), %[t0]   // vertices[0]      \n\
        movaps  16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1]      \n\
        movaps  %[t0], %[min]                           // vertices[0]      \n\
        movlhps %[t1], %[min]                           // x0y0x1y1         \n\
        movaps  32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2]      \n\
        movaps  48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3]      \n\
        mulps   %[vLo], %[min]                          // x0y0x1y1 * vLo   \n\
        movhlps %[t0], %[t1]                            // z0w0z1w1         \n\
        movaps  %[t3], %[t0]                            // vertices[2]      \n\
        movlhps %[t4], %[t0]                            // x2y2x3y3         \n\
        movhlps %[t3], %[t4]                            // z2w2z3w3         \n\
        mulps   %[vLo], %[t0]                           // x2y2x3y3 * vLo   \n\
        shufps  $0x88, %[t4], %[t1]                     // z0z1z2z3         \n\
        mulps   %[vHi], %[t1]                           // z0z1z2z3 * vHi   \n\
        movaps  %[min], %[t3]                           // x0y0x1y1 * vLo   \n\
        shufps  $0x88, %[t0], %[min]                    // x0x1x2x3 * vLo.x \n\
        shufps  $0xdd, %[t0], %[t3]                     // y0y1y2y3 * vLo.y \n\
        addps   %[t3], %[min]                           // x + y            \n\
        addps   %[t1], %[min]                           // x + y + z        \n\
        movaps  %[min], (%[sap], %[byteIndex])          // record result for later scrutiny \n\
        minps   %[t2], %[min]                           // record min, restore min \n\
        add     $16, %[byteIndex]                       // advance loop counter \n\
        jnz     0b                                      \n\
        "
            : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
            : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
            : "memory", "cc");
        index += localCount/4;
#else
        {
            for( unsigned int i=0; i<localCount/4; i++, index++ )
            {   // do four dot products at a time, carefully avoiding the w element
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3];  vertices += 4;

                float4 lo0 = _mm_movelh_ps( v0, v1);    // x0y0x1y1
                float4 hi0 = _mm_movehl_ps( v1, v0);    // z0?0z1?1
                float4 lo1 = _mm_movelh_ps( v2, v3);    // x2y2x3y3
                float4 hi1 = _mm_movehl_ps( v3, v2);    // z2?2z3?3

                lo0 = lo0 * vLo;
                lo1 = lo1 * vLo;
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z * vHi;
                x = x + y;
                x = x + z;
                stack_array[index] = x;
                min = _mm_min_ps( x, min );
            }
        }
#endif  // __APPLE__
    }
    // process the last few points
    if( count & 3 )
    {
        float4 v0, v1, v2, x, y, z;
        switch( count & 3 )
        {
            case 3:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];

                // calculate 3 dot products, transpose, duplicate v2
                float4 lo0 = _mm_movelh_ps( v0, v1);    // xyxy.lo
                float4 hi0 = _mm_movehl_ps( v1, v0);    // z?z?.lo
                lo0 = lo0 * vLo;
                z = _mm_shuffle_ps(hi0, v2, 0xa8 );     // z0z1z2z2
                z = z * vHi;
                float4 lo1 = _mm_movelh_ps(v2, v2);     // xyxy
                lo1 = lo1 * vLo;
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            }
            break;
            case 2:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                float4 xy = _mm_movelh_ps(v0, v1);
                z = _mm_movehl_ps(v1, v0);
                xy = xy * vLo;
                z = _mm_shuffle_ps( z, z, 0xa8);
                x = _mm_shuffle_ps( xy, xy, 0xa8);
                y = _mm_shuffle_ps( xy, xy, 0xfd);
                z = z * vHi;
            }
            break;
            case 1:
            {
                float4 xy = vertices[0];
                z = _mm_shuffle_ps( xy, xy, 0xaa);
                xy = xy * vLo;
                z = z * vHi;
                x = _mm_shuffle_ps(xy, xy, 0);
                y = _mm_shuffle_ps(xy, xy, 0x55);
            }
            break;
        }
        x = x + y;
        x = x + z;
        stack_array[index] = x;
        min = _mm_min_ps( x, min );     // control the order here so that min is never NaN, even if x is NaN
        index++;
    }
    // if we found a new min
    if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
    {   // we found a new min; search for it
        // find min across the min vector, place in all elements of min -- big latency hit here
        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
        min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));

        dotmin = min;

        // scan for the first occurrence of min in the array
        size_t test;
        for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ )
        {
        }
        minIndex = 4*index + segment + indexTable[test];
    }

    _mm_store_ss( dotResult, dotmin );
    return minIndex;
}
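// An illustrative self-check sketch against the scalar reference near the top
// of this file. The guard macro and function name are ours; nothing in Bullet
// defines or calls them. The w lanes are deliberately poisoned to verify the
// kernels never read them.
#ifdef BT_DOT_KERNEL_SELF_TEST
static void _maxdot_large_self_test( void )
{
    __attribute__((aligned(16))) float pts[8 * 4];
    for( int i = 0; i < 8; i++ )
    {
        pts[4*i + 0] = (float) i;           // x
        pts[4*i + 1] = (float) (7 - i);     // y
        pts[4*i + 2] = 0.5f * (float) i;    // z
        pts[4*i + 3] = 1e30f;               // w: must be ignored
    }
    __attribute__((aligned(16))) float dir[4] = { 1.0f, 2.0f, 3.0f, 0.0f };
    float simdVal, refVal;
    long simdIdx = _maxdot_large( pts, dir, 8, &simdVal );
    long refIdx = _maxdot_large_scalar( pts, dir, 8, &refVal );
    btAssert( simdIdx == refIdx );
    btAssert( btFabs( simdVal - refVal ) <= SIMD_EPSILON );
}
#endif  // BT_DOT_KERNEL_SELF_TEST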
#elif defined BT_USE_NEON

#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>
#include <sys/types.h>
#include <sys/sysctl.h>  // for sysctlbyname

static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );

long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
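// Dispatch note (our summary): _maxdot_large and _mindot_large start out
// pointing at the *_sel thunks below. The first call probes the CPU once,
// overwrites the pointer with the v0 or v1 kernel, and forwards the call, so
// every later call jumps straight to the selected kernel with no extra branch.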
static inline uint32_t btGetCpuCapabilities( void )
{
    static uint32_t capabilities = 0;
    static bool testedCapabilities = false;

    if( 0 == testedCapabilities )
    {
        uint32_t hasFeature = 0;
        size_t featureSize = sizeof( hasFeature );
        int err = sysctlbyname( "hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0 );

        if( 0 == err && hasFeature )
            capabilities |= 0x2000;

        testedCapabilities = true;
    }

    return capabilities;
}
static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    if( btGetCpuCapabilities() & 0x2000 )
        _maxdot_large = _maxdot_large_v1;
    else
        _maxdot_large = _maxdot_large_v0;

    return _maxdot_large(vv, vec, count, dotResult);
}
static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    if( btGetCpuCapabilities() & 0x2000 )
        _mindot_large = _mindot_large_v1;
    else
        _mindot_large = _mindot_large_v0;

    return _mindot_large(vv, vec, count, dotResult);
}
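// The loader below is a GNU statement-expression macro. On 32-bit ARM it emits
// a vld1.f32 with a :128 alignment hint and post-increment writeback, so the
// pointer advances as a side effect of the load; elsewhere it falls back to a
// plain 16-byte load plus an explicit pointer bump.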
#if defined __arm__
#define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); _r; })
#else
// 64-bit NEON has no such instruction available; emulate the load and increment
#define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); _r; })
#endif

long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    unsigned long i = 0;
    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    float32x2_t vLo = vget_low_f32(vvec);
    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
    float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
    float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
    uint32x2_t indexLo = (uint32x2_t) {0, 1};
    uint32x2_t indexHi = (uint32x2_t) {2, 3};
    uint32x2_t iLo = (uint32x2_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
    uint32x2_t iHi = (uint32x2_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
    const uint32x2_t four = (uint32x2_t) {4, 4};
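    // Our note: this kernel keeps two independent 2-lane accumulators. The
    // "Lo" pair tracks candidates 0 and 1 of each group of four, the "Hi" pair
    // tracks candidates 2 and 3; indexLo/indexHi carry the running point
    // indices while iLo/iHi latch the indices of the current best dots. The
    // two halves are merged only once, after all the loops finish.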
    for( ; i+8 <= count; i += 8 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
        float32x2_t zHi = vmul_f32( z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32( xy0, xy1);
        float32x2_t rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);

        v0 = vld1q_f32_aligned_postincrement( vv );
        v1 = vld1q_f32_aligned_postincrement( vv );
        v2 = vld1q_f32_aligned_postincrement( vv );
        v3 = vld1q_f32_aligned_postincrement( vv );

        xy0 = vmul_f32( vget_low_f32(v0), vLo);
        xy1 = vmul_f32( vget_low_f32(v1), vLo);
        xy2 = vmul_f32( vget_low_f32(v2), vLo);
        xy3 = vmul_f32( vget_low_f32(v3), vLo);

        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        zLo = vmul_f32( z0.val[0], vHi);
        zHi = vmul_f32( z1.val[0], vHi);

        rLo = vpadd_f32( xy0, xy1);
        rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        maskLo = vcgt_f32( rLo, dotMaxLo );
        maskHi = vcgt_f32( rHi, dotMaxHi );
        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }
    for( ; i+4 <= count; i += 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
        float32x2_t zHi = vmul_f32( z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32( xy0, xy1);
        float32x2_t rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
        uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
        dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
        dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }
    switch( count & 3 )
    {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            float32x2_t rHi = vpadd_f32( xy2, xy2);
            rLo = vadd_f32(rLo, zLo);
            rHi = vadd_f32(rHi, zHi);

            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
            uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
            dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
            iHi = vbsl_u32(maskHi, indexHi, iHi);
        }
        break;
        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            rLo = vadd_f32(rLo, zLo);

            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
            float32x2_t zLo = vmul_f32( z0, vHi);
            float32x2_t rLo = vpadd_f32( xy0, xy0);
            rLo = vadd_f32(rLo, zLo);
            uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
            dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
        default:
            break;
    }
    // select the best answer between the hi and lo halves
    uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    // select the best answer between even and odd results
    dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
    iHi = vdup_lane_u32(iLo, 1);
    mask = vcgt_f32( dotMaxHi, dotMaxLo );
    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    *dotResult = vget_lane_f32( dotMaxLo, 0);
    return vget_lane_u32(iLo, 0);
}
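// Our note on the two variants: _maxdot_large_v0 above sticks to 64-bit d
// registers (float32x2_t), which stays safe on cores with a limited NEON
// floating-point unit, while _maxdot_large_v1 below widens the same algorithm
// to 128-bit q registers (float32x4_t). The "hw.optional.neon_hpfp" probe in
// btGetCpuCapabilities is what steers callers between them.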
long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
    const uint32x4_t four = (uint32x4_t) { 4, 4, 4, 4 };
    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
    uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
    float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };

    unsigned long i = 0;
    for( ; i + 8 <= count; i += 8 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));

        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcgtq_f32(x, maxDot);
        maxDot = vbslq_f32( mask, x, maxDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);

        v0 = vld1q_f32_aligned_postincrement( vv );
        v1 = vld1q_f32_aligned_postincrement( vv );
        v2 = vld1q_f32_aligned_postincrement( vv );
        v3 = vld1q_f32_aligned_postincrement( vv );

        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));

        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        zb = vuzpq_f32( z0, z1);
        z = vmulq_f32( zb.val[0], vHi);
        xy = vuzpq_f32( xy0, xy1);
        x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        mask = vcgtq_f32(x, maxDot);
        maxDot = vbslq_f32( mask, x, maxDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }
    for( ; i + 4 <= count; i += 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));

        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcgtq_f32(x, maxDot);
        maxDot = vbslq_f32( mask, x, maxDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }
    switch( count & 3 )
    {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            // duplicate v2 to fill the tile; the w element is irrelevant
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));

            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));

            xy0 = vmulq_f32(xy0, vLo);
            xy1 = vmulq_f32(xy1, vLo);

            float32x4x2_t zb = vuzpq_f32( z0, z1);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcgtq_f32(x, maxDot);
            maxDot = vbslq_f32( mask, x, maxDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;
        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));

            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));

            xy0 = vmulq_f32(xy0, vLo);

            float32x4x2_t zb = vuzpq_f32( z0, z0);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcgtq_f32(x, maxDot);
            maxDot = vbslq_f32( mask, x, maxDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;
        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );

            // duplicate the point so every lane computes the same dot
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));

            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

            xy0 = vmulq_f32(xy0, vLo);

            z = vmulq_f32( z, vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcgtq_f32(x, maxDot);
            maxDot = vbslq_f32( mask, x, maxDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;
        default:
            break;
    }
    // select the best answer between the hi and lo halves
    uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
    float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

    // select the best answer between even and odd results
    float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
    mask = vcgt_f32( maxDotO, maxDot2 );
    maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
    index2 = vbsl_u32(mask, indexHi, index2);

    *dotResult = vget_lane_f32( maxDot2, 0);
    return vget_lane_u32(index2, 0);
}
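// Worked example of the vuzpq deinterleave used in the q-register kernels
// above and below (our note): with xy0 = {x0, y0, x1, y1} and
// xy1 = {x2, y2, x3, y3}, vuzpq_f32(xy0, xy1) returns
// val[0] = {x0, x1, x2, x3} (even lanes) and val[1] = {y0, y1, y2, y3}
// (odd lanes). Since xy0/xy1 were pre-multiplied by vLo = {vx, vy, vx, vy},
// val[0] + val[1] is exactly x*vx + y*vy for four points at once, and adding
// the z*vz vector finishes the four dot products.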
long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    unsigned long i = 0;
    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    float32x2_t vLo = vget_low_f32(vvec);
    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
    float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
    float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
    uint32x2_t indexLo = (uint32x2_t) {0, 1};
    uint32x2_t indexHi = (uint32x2_t) {2, 3};
    uint32x2_t iLo = (uint32x2_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
    uint32x2_t iHi = (uint32x2_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
    const uint32x2_t four = (uint32x2_t) {4, 4};
    for( ; i+8 <= count; i += 8 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
        float32x2_t zHi = vmul_f32( z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32( xy0, xy1);
        float32x2_t rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);

        v0 = vld1q_f32_aligned_postincrement( vv );
        v1 = vld1q_f32_aligned_postincrement( vv );
        v2 = vld1q_f32_aligned_postincrement( vv );
        v3 = vld1q_f32_aligned_postincrement( vv );

        xy0 = vmul_f32( vget_low_f32(v0), vLo);
        xy1 = vmul_f32( vget_low_f32(v1), vLo);
        xy2 = vmul_f32( vget_low_f32(v2), vLo);
        xy3 = vmul_f32( vget_low_f32(v3), vLo);

        z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        zLo = vmul_f32( z0.val[0], vHi);
        zHi = vmul_f32( z1.val[0], vHi);

        rLo = vpadd_f32( xy0, xy1);
        rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        maskLo = vclt_f32( rLo, dotMinLo );
        maskHi = vclt_f32( rHi, dotMinHi );
        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }
    for( ; i+4 <= count; i += 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
        float32x2_t zHi = vmul_f32( z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32( xy0, xy1);
        float32x2_t rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }
    switch( count & 3 )
    {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            float32x2_t rHi = vpadd_f32( xy2, xy2);
            rLo = vadd_f32(rLo, zLo);
            rHi = vadd_f32(rHi, zHi);

            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
            uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
            dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
            iHi = vbsl_u32(maskHi, indexHi, iHi);
        }
        break;
        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            rLo = vadd_f32(rLo, zLo);

            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
            float32x2_t zLo = vmul_f32( z0, vHi);
            float32x2_t rLo = vpadd_f32( xy0, xy0);
            rLo = vadd_f32(rLo, zLo);
            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
        default:
            break;
    }
    // select the best answer between the hi and lo halves
    uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    // select the best answer between even and odd results
    dotMinHi = vdup_lane_f32(dotMinLo, 1);
    iHi = vdup_lane_u32(iLo, 1);
    mask = vclt_f32( dotMinHi, dotMinLo );
    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    *dotResult = vget_lane_f32( dotMinLo, 0);
    return vget_lane_u32(iLo, 0);
}
long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
    const uint32x4_t four = (uint32x4_t) { 4, 4, 4, 4 };
    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
    uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
    float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };

    unsigned long i = 0;
    for( ; i + 8 <= count; i += 8 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));

        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcltq_f32(x, minDot);
        minDot = vbslq_f32( mask, x, minDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);

        v0 = vld1q_f32_aligned_postincrement( vv );
        v1 = vld1q_f32_aligned_postincrement( vv );
        v2 = vld1q_f32_aligned_postincrement( vv );
        v3 = vld1q_f32_aligned_postincrement( vv );

        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));

        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        zb = vuzpq_f32( z0, z1);
        z = vmulq_f32( zb.val[0], vHi);
        xy = vuzpq_f32( xy0, xy1);
        x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        mask = vcltq_f32(x, minDot);
        minDot = vbslq_f32( mask, x, minDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }
    for( ; i + 4 <= count; i += 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));

        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcltq_f32(x, minDot);
        minDot = vbslq_f32( mask, x, minDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }
    switch( count & 3 )
    {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            // duplicate v2 to fill the tile; the w element is irrelevant
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));

            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));

            xy0 = vmulq_f32(xy0, vLo);
            xy1 = vmulq_f32(xy1, vLo);

            float32x4x2_t zb = vuzpq_f32( z0, z1);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32( mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;
        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));

            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));

            xy0 = vmulq_f32(xy0, vLo);

            float32x4x2_t zb = vuzpq_f32( z0, z0);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32( mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;
        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );

            // duplicate the point so every lane computes the same dot
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));

            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

            xy0 = vmulq_f32(xy0, vLo);

            z = vmulq_f32( z, vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32( mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;
        default:
            break;
    }
    // select the best answer between the hi and lo halves
    uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
    float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

    // select the best answer between even and odd results
    float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
    mask = vclt_f32( minDotO, minDot2 );
    minDot2 = vbsl_f32(mask, minDotO, minDot2);
    index2 = vbsl_u32(mask, indexHi, index2);

    *dotResult = vget_lane_f32( minDot2, 0);
    return vget_lane_u32(index2, 0);
}
#else
#error Unhandled __APPLE__ arch
#endif

#endif  // BT_USE_SIMD_VECTOR3