gdalsse_priv.h
/******************************************************************************
 * $Id: gdalsse_priv.h 60827fdad75b8bd6178c450fcc15ca6abdd467f0 2019-04-01 17:20:03 +0200 Raul Marin $
 *
 * Project:  GDAL
 * Purpose:  SSE2 helper
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 ****************************************************************************/

#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#ifndef DOXYGEN_SKIP

#include "cpl_port.h"

/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
/* Could possibly be used too on 32bit, but we would need to check at runtime */
#if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>
#include <string.h>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#include "gdal_priv_templates.hpp"

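/* Scalar <-> XMM copy helpers. On CPUs that fault on misaligned accesses
 * (CPL_CPU_REQUIRES_ALIGNED_ACCESS), the value goes through memcpy, which
 * compilers typically lower to a safe unaligned load/store; elsewhere a
 * direct dereference is sufficient. */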
static inline __m128i GDALCopyInt16ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
#else
    return _mm_cvtsi32_si128(*static_cast<const unsigned short*>(ptr));
#endif
}

static inline __m128i GDALCopyInt32ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
#else
    return _mm_cvtsi32_si128(*static_cast<const GInt32*>(ptr));
#endif
}

static inline __m128i GDALCopyInt64ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#else
    return _mm_cvtsi64_si128(*static_cast<const GInt64*>(ptr));
#endif
}

static inline void GDALCopyXMMToInt16(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
#else
    *static_cast<GInt16*>(pDest) = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
#endif
}

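/* Wraps a __m128d register holding two packed doubles, with typed load/store
 * helpers plus arithmetic and comparison operators. */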
class XMMReg2Double
{
  public:
    __m128d xmm;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    /* coverity[uninit_member] */
    XMMReg2Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val)) {}
    XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

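    /* Branchless select: cond is expected to be an all-ones/all-zeros lane
     * mask, as produced by Equals/NotEquals/Greater; lanes where the mask is
     * set take true_expr, the others take false_expr. */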
    static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm), _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double* ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double* ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float* ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(xmm_i, 16);        /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(xmm, xmm, _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }

    inline void Store2Val(double* ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double* ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float* ptr) const
    {
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
    }

    inline void Store2Val(unsigned char* ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        // X X X X 0 B 0 A --> 0 X X X X 0 B 0 (srli)
        // or             X X X X 0 B 0 A
        tmp = _mm_or_si128(tmp, _mm_srli_si128(tmp, 2));
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16*>(ptr));
    }

    inline void Store2Val(unsigned short* ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        // X X X X 0 B 0 A --> 0 X X X X 0 B 0 (srli)
        // or             X X X X 0 B 0 A
        tmp = _mm_or_si128(tmp, _mm_srli_si128(tmp, 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32*>(ptr));

        //ptr[0] = (GUInt16)_mm_extract_epi16(tmp, 0);
        //ptr[1] = (GUInt16)_mm_extract_epi16(tmp, 2);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), _mm_castpd_si128(xmm));
    }

    inline operator double () const
    {
        return _mm_cvtsd_f64(xmm);
    }
};

#else

#warning "Software emulation of SSE2 !"

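/* Portable fallback for non-x86_64 builds (or when USE_SSE2_EMULATION is
 * defined): same interface as the SSE2 version, implemented with two plain
 * doubles. */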
class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() {}
    XMMReg2Double(double val) { low = val; high = 0.0; }
    XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
    {
        XMMReg2Double reg;
        if( cond.low )
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if( cond.high )
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    void Store2Val(unsigned char* ptr) const
    {
        ptr[0] = (unsigned char)(low + 0.5);
        ptr[1] = (unsigned char)(high + 0.5);
    }

    void Store2Val(unsigned short* ptr) const
    {
        ptr[0] = (GUInt16)(low + 0.5);
        ptr[1] = (GUInt16)(high + 0.5);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double () const
    {
        return low;
    }
};

#endif /* defined(__x86_64) || defined(_M_X64) */

#ifdef __AVX__

#include <immintrin.h>

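/* AVX variant: the four doubles live in a single 256-bit ymm register
 * instead of a pair of 128-bit halves. */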
class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() : ymm(_mm256_setzero_pd()) {}
    XMMReg4Double(const XMMReg4Double& other) : ymm(other.ymm) {}

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }

    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i); // ok to use signed conversion since we are in the ushort range, so cannot be interpreted as negative int32
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double* ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double* ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float* ptr)
    {
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
    }

    static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm), _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    inline XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow( const XMMReg2Double& other )
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }

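    /* Horizontal sum of the four lanes: hadd_pd sums pairs within each
     * 128-bit lane, then the cross-lane permute brings the other lane's
     * partial sum into position so a single add yields the total. */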
    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }

    inline void Store4Val(unsigned char* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        //xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
        //xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
        xmm_i = _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24))); // SSSE3
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32*>(ptr));
    }

    inline void Store4Val(unsigned short* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i); // Pack uint32 to uint16
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
    }

    inline void Store4Val(float* ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double* ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), _mm256_castpd_si256(ymm));
    }
};

#else

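/* Without AVX, XMMReg4Double is modelled as two XMMReg2Double halves (low
 * holds elements 0-1, high holds elements 2-3), so every operation simply
 * forwards to both halves. */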
class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    /* coverity[uninit_member] */
    XMMReg4Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high = XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }

    inline XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow( const XMMReg2Double& other )
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }

    inline void Store4Val(unsigned char* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void Store4Val(unsigned short* ptr) const
    {
#if 1
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#if __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0); // Pack uint32 to uint16
#else
        xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
        xmm0 = _mm_packs_epi32(xmm0, xmm0);
        xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64*)ptr);
#endif
    }

    inline void Store4Val(float* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void Store4Val(double* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr + 16);
    }

};

#endif

#endif /* #ifndef DOXYGEN_SKIP */

#endif /* GDALSSE_PRIV_H_INCLUDED */
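
For context, here is a minimal usage sketch (not part of the header) of how these wrappers are typically combined: load four doubles per iteration into an XMMReg4Double, accumulate, then reduce with GetHorizSum(). The function name DotProduct, the array parameters, and the assumption that nCount is a multiple of 4 are illustrative only; the same code compiles against the SSE2, AVX, and emulated code paths.

/* Hypothetical example: dot product of two double arrays, 4 elements at a
 * time. nCount is assumed to be a multiple of 4 to keep the sketch short. */
#include "gdalsse_priv.h"

static double DotProduct(const double* padfX, const double* padfY, int nCount)
{
    XMMReg4Double oSum = XMMReg4Double::Zero();
    for( int i = 0; i < nCount; i += 4 )
    {
        XMMReg4Double oX = XMMReg4Double::Load4Val(padfX + i);
        XMMReg4Double oY = XMMReg4Double::Load4Val(padfY + i);
        oSum += oX * oY;  // per-lane multiply-accumulate
    }
    return oSum.GetHorizSum();  // collapse the 4 lanes to a scalar
}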