// -*-C++-*- // Vectorise using Intel's or AMD's AVX // Use the type __m256 directly, without introducing a wrapper class #include #include #include #ifdef __FMA4__ # include #endif #ifdef __FMA4__ # define vec4_architecture_FMA4 "+FMA4" #else # define vec4_architecture_FMA4 "" #endif #define vec4_architecture "AVX" vec4_architecture_FMA4 " (32-bit precision)" // Vector type corresponding to CCTK_REAL // Note: some boolean masks (e.g. ~0) correspond to nan when // interpreted as floating point number. gcc 4.8 is clever enough to // optimize away such constants with fast-math. We therefore need to // handle this constant as integer number. typedef __m256 CCTK_REAL4_VEC; typedef __m256i CCTK_INTEGER4_VEC; typedef __m256i CCTK_BOOLEAN4_VEC; // Number of vector elements in a CCTK_REAL_VEC #define CCTK_REAL4_VEC_SIZE 8 vec_static_assert(sizeof(CCTK_REAL4_VEC) == sizeof(CCTK_REAL4) * CCTK_REAL4_VEC_SIZE); // Integer and boolean types corresponding to this real type typedef CCTK_INT4 CCTK_INTEGER4; typedef CCTK_INT4 CCTK_BOOLEAN4; // These macros are undefined at the end of this file -- use them only // within functions, not within macros that are exported #define I2R(x) _mm256_castsi256_ps(x) #define R2I(x) _mm256_castps_si256(x) union k4const_t { CCTK_INTEGER4 i[CCTK_REAL4_VEC_SIZE]; CCTK_INTEGER4_VEC vi; }; #define k4sign (vec4_set1i( (CCTK_INTEGER4)(1UL << 31UL))) #define k4notsign (vec4_set1i(~ (CCTK_INTEGER4)(1UL << 31UL))) // Create vectors, extract vector elements static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC vec4_set1(CCTK_REAL4 const a) { return _mm256_set1_ps(a); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_INTEGER4_VEC vec4_set1i(CCTK_INT4 const a) { return _mm256_set1_epi32(a); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC vec4_set(CCTK_REAL4 const a, CCTK_REAL4 const b, CCTK_REAL4 const c, CCTK_REAL4 const d, CCTK_REAL4 const e, CCTK_REAL4 const f, CCTK_REAL4 const g, CCTK_REAL4 const h) { return _mm256_set_ps(h,g,f,e,d,c,b,a); // note reversed arguments } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4 vec4_elt(CCTK_REAL4_VEC const x, std::ptrdiff_t const d) { CCTK_REAL4 e; std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_INTEGER4 vec4_elti(CCTK_INTEGER4_VEC const x, std::ptrdiff_t const d) { CCTK_INTEGER4 e; std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4 vec4_eltb(CCTK_BOOLEAN4_VEC const x, std::ptrdiff_t const d) { CCTK_BOOLEAN4 e; std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); return e; } // Load and store vectors // Load a vector from memory (aligned and unaligned); this loads from // a reference to a scalar static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC vec4_load(CCTK_REAL4 const& p) { return _mm256_load_ps(&p); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC vec4_loadu(CCTK_REAL4 const& p) { return _mm256_loadu_ps(&p); } #if VECTORISE_ALWAYS_USE_ALIGNED_LOADS # error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS not yet supported" #endif // Load a vector from memory that may or may not be aligned, as // decided by the offset off and the vector size #if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS // Implementation: Always use unaligned load static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC vec4_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL4 const& p) { return vec4_loadu(p); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1, std::ptrdiff_t const off2, std::ptrdiff_t const off3, CCTK_REAL4 const& p) { return vec4_loadu(p); } #else static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC vec4_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL4 const& p) { return off % CCTK_REAL4_VEC_SIZE == 0 ? vec4_load(p) : vec4_loadu(p); } # if VECTORISE_ALIGNED_ARRAYS // Assume all array x sizes are multiples of the vector size static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1, std::ptrdiff_t const off2, std::ptrdiff_t const off3, CCTK_REAL4 const& p) { return vec4_loadu_maybe(off1, p); } # else static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1, std::ptrdiff_t const off2, std::ptrdiff_t const off3, CCTK_REAL4 const& p) { return off2 % CCTK_REAL4_VEC_SIZE != 0 or off3 % CCTK_REAL4_VEC_SIZE != 0 ? vec4_loadu(p) : vec4_loadu_maybe(off1, p); } # endif #endif // Store a vector to memory (aligned and non-temporal); this stores to // a reference to a scalar static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec4_store(CCTK_REAL4& p, CCTK_REAL4_VEC const x) { return _mm256_store_ps(&p, x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec4_storeu(CCTK_REAL4& p, CCTK_REAL4_VEC const x) { return _mm256_storeu_ps(&p, x); } #if ! VECTORISE_STREAMING_STORES static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec4_store_nta(CCTK_REAL4& p, CCTK_REAL4_VEC const x) { return vec4_store(p, x); } #else static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec4_store_nta(CCTK_REAL4& p, CCTK_REAL4_VEC const x) { return _mm256_stream_ps(&p, x); } #endif // Store a partial vector (aligned and non-temporal) #define vec4_store_partial_prepare(i,imin,imax) \ bool v4stp_all; \ __m256i v4stp_mask; \ vec4_store_partial_prepare_(v4stp_all, v4stp_mask, i, imin, imax); static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec4_store_partial_prepare_(bool& all, __m256i& mask, std::ptrdiff_t const i, std::ptrdiff_t const imin, std::ptrdiff_t const imax) { all = i>=imin and i+CCTK_REAL4_VEC_SIZE-1 1400 return _mm256_fmod_ps(x,y); #else return K4REPL2(fmodf,x,y); #endif } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4log(CCTK_REAL4_VEC const x) { return _mm256_log_ps(x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4pow(CCTK_REAL4_VEC const x, CCTK_REAL4 const a) { return _mm256_pow_ps(x, _mm256_set1_ps(a)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4sin(CCTK_REAL4_VEC const x) { return _mm256_sin_ps(x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4sinh(CCTK_REAL4_VEC const x) { return _mm256_sinh_ps(x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4tan(CCTK_REAL4_VEC const x) { return _mm256_tan_ps(x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4tanh(CCTK_REAL4_VEC const x) { return _mm256_tanh_ps(x); } #else static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4acos(CCTK_REAL4_VEC const x) { return K4REPL(acosf,x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4acosh(CCTK_REAL4_VEC const x) { return K4REPL(acoshf,x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4asin(CCTK_REAL4_VEC const x) { return K4REPL(asinf,x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4asinh(CCTK_REAL4_VEC const x) { return K4REPL(asinhf,x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4atan(CCTK_REAL4_VEC const x) { return K4REPL(atanf,x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4atan2(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { return K4REPL2(atan2f,x,y); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4atanh(CCTK_REAL4_VEC const x) { return K4REPL(atanhf,x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4cos(CCTK_REAL4_VEC const x) { return K4REPL(cosf,x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4cosh(CCTK_REAL4_VEC const x) { return K4REPL(coshf,x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4exp(CCTK_REAL4_VEC const x) { return K4REPL(expf,x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4fmod(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { return K4REPL2(fmodf,x,y); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4log(CCTK_REAL4_VEC const x) { return K4REPL(logf,x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4pow(CCTK_REAL4_VEC const x, CCTK_REAL4 const a) { return K4REPL2S(powf,x,a); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4sin(CCTK_REAL4_VEC const x) { return K4REPL(sinf,x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4sinh(CCTK_REAL4_VEC const x) { return K4REPL(sinhf,x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4tan(CCTK_REAL4_VEC const x) { return K4REPL(tanf,x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4tanh(CCTK_REAL4_VEC const x) { return K4REPL(tanhf,x); } #endif #define k4lfalse (vec4_set1i( 0)) #define k4ltrue (vec4_set1i(~0)) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4lnot(CCTK_BOOLEAN4_VEC const x) { return R2I(_mm256_xor_ps(I2R(k4ltrue), I2R(x))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4land(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y) { return R2I(_mm256_and_ps(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4lor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y) { return R2I(_mm256_or_ps(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4lxor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y) { return R2I(_mm256_xor_ps(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4ifthen(CCTK_BOOLEAN4_VEC const x, CCTK_REAL4_VEC const y, CCTK_REAL4_VEC const z) { return _mm256_blendv_ps(z, y, I2R(x)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmpeq(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { return R2I(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmpne(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { return R2I(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmpgt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { return R2I(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmpge(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { return R2I(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmplt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { return R2I(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmple(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { return R2I(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4sgn(CCTK_REAL4_VEC const x) { CCTK_BOOLEAN4_VEC const iszero = k4cmpeq(x, vec4_set1(0.0)); CCTK_REAL4_VEC const sign = _mm256_and_ps(I2R(k4sign), x); CCTK_REAL4_VEC const signedone = _mm256_or_ps(sign, vec4_set1(1.0)); return k4ifthen(iszero, vec4_set1(0.0), signedone); } #undef I2R #undef R2I