Cycles: use SSE intrinsics directly for float4 math and enable global SSE types

* CMakeLists.txt: force Eigen's SSE3..SSE4.2 code paths for MSVC 11/12.
* util_math.h: pass float4 straight to the intrinsics, add abs() and
  float4x(), use _mm_div_ps for float4 division, expand cross() and
  reduce_add() into raw intrinsics.
* util_optimization.h: enable the global SSE types block (#if 0 to #if 1).
  The comment above that block still describes it as disabled.
* util_types.h: wrap the type definitions in #pragma pack(push,16)/pop
  for MSVC.

NOTE(review): leading whitespace and blank context lines were reconstructed
from a whitespace-collapsed copy of this patch, and the three system header
names in the first util_math.h hunk are assumed. Verify against the target
tree (or apply with whitespace-insensitive matching) before use.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7026665..39c8ab0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -978,6 +978,9 @@ elseif(WIN32)
 		# MSVC11 and MSVC12 need _ALLOW_KEYWORD_MACROS to build
 		if(MSVC11 OR MSVC12)
 			add_definitions(/D_ALLOW_KEYWORD_MACROS)
+			# TODO: Eigen cannot detect SSE3+ instructions with MSVC, so force them on.
+			# NOTE(review): presumably this makes SSE4.2 a run-time CPU requirement - confirm.
+			add_definitions(/DEIGEN_VECTORIZE_SSE3 /DEIGEN_VECTORIZE_SSSE3 /DEIGEN_VECTORIZE_SSE4_1 /DEIGEN_VECTORIZE_SSE4_2)
 		endif()
 
 		set(CMAKE_CXX_FLAGS "/nologo /J /Gd /EHsc /MP" CACHE STRING "MSVC MT C++ flags " FORCE)
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 2e73639..bc3b50b 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -31,7 +31,6 @@
 #include <float.h>
 #include <math.h>
 #include <stdio.h>
-
 #endif
 
 #include "util_types.h"
@@ -655,8 +654,8 @@ template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
 ccl_device_inline float4 operator-(const float4& a)
 {
 #ifdef __KERNEL_SSE__
-	__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
-	return _mm_xor_ps(a.m128, mask);
+	const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
+	return _mm_xor_ps(a, mask);
 #else
 	return make_float4(-a.x, -a.y, -a.z, -a.w);
 #endif
@@ -665,7 +664,7 @@ ccl_device_inline float4 operator-(const float4& a)
 ccl_device_inline float4 operator*(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	return _mm_mul_ps(a.m128, b.m128);
+	return _mm_mul_ps(a, b);
 #else
 	return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
 #endif
@@ -688,13 +687,24 @@ ccl_device_inline float4 operator*(float f, const float4& a)
 ccl_device_inline float4 rcp(const float4& a)
 {
 #ifdef __KERNEL_SSE__
-	float4 r = _mm_rcp_ps(a.m128);
+	const float4 r = _mm_rcp_ps(a);
 	return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
 #else
 	return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
 #endif
 }
 
+/* Component-wise absolute value (the SSE path clears each lane's sign bit). */
+ccl_device_inline float4 abs(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	const float4 mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF));
+	return _mm_and_ps(a, mask);
+#else
+	return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+#endif
+}
+
 ccl_device_inline float4 operator/(const float4& a, float f)
 {
 	return a * (1.0f/f);
@@ -703,7 +713,8 @@ ccl_device_inline float4 operator/(const float4& a, float f)
 ccl_device_inline float4 operator/(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	return a * rcp(b);
+	/* Full-precision divide; a * rcp(b) relies on an approximate reciprocal. */
+	return _mm_div_ps(a, b);
 #else
 	return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
 #endif
@@ -713,7 +724,7 @@ ccl_device_inline float4 operator/(const float4& a, const float4& b)
 ccl_device_inline float4 operator+(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	return _mm_add_ps(a.m128, b.m128);
+	return _mm_add_ps(a, b);
 #else
 	return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
 #endif
@@ -722,7 +733,7 @@ ccl_device_inline float4 operator+(const float4& a, const float4& b)
 ccl_device_inline float4 operator-(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	return _mm_sub_ps(a.m128, b.m128);
+	return _mm_sub_ps(a, b);
 #else
 	return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
 #endif
@@ -782,7 +793,12 @@ ccl_device_inline bool operator==(const float4 a, const float4 b)
 ccl_device_inline float4 cross(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b));
+	/* Lane orders (y, z, x, x) and (z, x, y, x): same as shuffle<1,2,0,0> and shuffle<2,0,1,0>. */
+	const __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 2, 1));
+	const __m128 a_zxy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 2));
+	const __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 0, 2, 1));
+	const __m128 b_zxy = _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 0, 2));
+	return _mm_sub_ps(_mm_mul_ps(a_yzx, b_zxy), _mm_mul_ps(a_zxy, b_yzx));
 #else
 	return make_float4(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f);
 #endif
@@ -797,14 +813,39 @@ ccl_device_inline bool is_zero(const float4& a)
 #endif
 }
 
+/* Return the first (x) lane of a float4 as a scalar. */
+ccl_device_inline float float4x(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
+	return a.m128.m128_f32[0];
+#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+	/* The temporary variable avoids a possible internal compiler error in
+	 * VS <= 2008 and a wrong-result bug in VS 2010. */
+	float x = _mm_cvtss_f32(a);
+	return x;
+#else /* Other compilers. */
+	return _mm_cvtss_f32(a);
+#endif /* compilers */
+#else
+	return a.x;
+#endif /* __KERNEL_SSE__ */
+}
+
 ccl_device_inline float reduce_add(const float4& a)
 {
 #ifdef __KERNEL_SSE__
-	float4 h = shuffle<1,0,3,2>(a) + a;
-	return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); /* todo: efficiency? */
+#ifdef __KERNEL_SSE3__
+	const float4 h = _mm_hadd_ps(a, a);
+	return float4x(_mm_hadd_ps(h, h));
+#else
+	/* SSE2 fallback: add the high pair onto the low pair, then lane 1 onto lane 0. */
+	const float4 h = _mm_add_ps(a, _mm_movehl_ps(a, a));
+	return float4x(_mm_add_ss(h, _mm_shuffle_ps(h, h, _MM_SHUFFLE(0, 0, 0, 1))));
+#endif /* __KERNEL_SSE3__ */
 #else
 	return ((a.x + a.y) + (a.z + a.w));
-#endif
+#endif /* __KERNEL_SSE__ */
 }
 
 ccl_device_inline float average(const float4& a)
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index f901513..763f733 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -78,7 +78,7 @@
  * This is disabled code for an experiment to use SSE types globally for types
  * such as float3 and float4. Currently this gives an overall slowdown. */
 
-#if 0
+#if 1
 #define __KERNEL_SSE__
 #ifndef __KERNEL_SSE2__
 #define __KERNEL_SSE2__
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index d5f67ab..b79106b 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -90,6 +90,10 @@ CCL_NAMESPACE_BEGIN
  * Also vector types, named to be compatible with OpenCL builtin types, while
  * working for CUDA and C++ too. */
 
+#ifdef _MSC_VER
+#pragma pack(push,16) /* Allow members to be aligned on up to 16-byte boundaries. */
+#endif /* _MSC_VER */
+
 /* Shorter Unsigned Names */
 
 #ifndef __KERNEL_OPENCL__
@@ -450,6 +454,10 @@ ccl_device_inline int4 make_int4(const float3& f)
 
 #endif
 
+#ifdef _MSC_VER
+#pragma pack(pop)
+#endif /* _MSC_VER */
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_TYPES_H__ */