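/*
 *  SSE2 intrinsics implementation of vector operations on 64-bit
 *  (double-precision) floating-point values.  Every vector handled by the
 *  macros below is assumed to be 16-byte aligned and padded to a multiple of
 *  the loop-unrolling width (4 or 8 doubles, as noted per macro).
 */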
#include <stdlib.h>
#include <malloc.h>     /* _aligned_malloc(), _aligned_free() */
#include <memory.h>     /* memset() */

#if 1400 <= _MSC_VER
#include <intrin.h>
#else
#include <emmintrin.h>  /* SSE2 intrinsics (__m128d, _mm_*_pd) for compilers without <intrin.h> */
#endif
/*
 *  Allocate a zero-initialized block on a 16-byte boundary, as required by the
 *  aligned loads/stores (_mm_load_pd/_mm_store_pd) used by the macros below.
 *  Note: _aligned_malloc()/_aligned_free() are MSVC CRT functions; on other
 *  platforms _mm_malloc()/_mm_free() or posix_memalign() are the usual
 *  replacements.
 */
inline static void* vecalloc(size_t size)
{
    void *memblock = _aligned_malloc(size, 16);
    if (memblock != NULL) {
        memset(memblock, 0, size);
    }
    return memblock;
}

/* Release a block obtained from vecalloc(). */
inline static void vecfree(void *memblock)
{
    _aligned_free(memblock);
}
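
/*
 *  Illustrative usage sketch (not part of the original header): the macros
 *  below never run a remainder loop, so the caller is assumed to round the
 *  allocation up to the unrolling width before using them.
 *
 *      const int n = 100;
 *      const int padded = (n + 7) & ~7;                  // multiple of 8
 *      double *x = (double*)vecalloc(padded * sizeof(double));
 *      vecset(x, 1.0, padded);
 *      vecfree(x);
 */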

/*
 *  Nonzero if and only if *x and *y have different signs.  _mm_movemask_pd()
 *  packs the two sign bits into bits 0-1 of the result, so the mask is 0 or 3
 *  when the signs agree and 1 or 2 when they differ; adding 1 and masking with
 *  0x002 isolates the latter case.
 */
#define fsigndiff(x, y) \
    ((_mm_movemask_pd(_mm_set_pd(*(x), *(y))) + 1) & 0x002)

/* x[i] = c for 0 <= i < n; n must be a multiple of 8. */
#define vecset(x, c, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_set1_pd(c); \
    for (i = 0;i < (n);i += 8) { \
        _mm_store_pd((x)+i , XMM0); \
        _mm_store_pd((x)+i+2, XMM0); \
        _mm_store_pd((x)+i+4, XMM0); \
        _mm_store_pd((x)+i+6, XMM0); \
    } \
}

/* y[i] = x[i] for 0 <= i < n; n must be a multiple of 8. */
#define veccpy(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_load_pd((x)+i ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((x)+i+4); \
        __m128d XMM3 = _mm_load_pd((x)+i+6); \
        _mm_store_pd((y)+i , XMM0); \
        _mm_store_pd((y)+i+2, XMM1); \
        _mm_store_pd((y)+i+4, XMM2); \
        _mm_store_pd((y)+i+6, XMM3); \
    } \
}

/* y[i] = -x[i] for 0 <= i < n; n must be a multiple of 8. */
#define vecncpy(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_setzero_pd(); \
        __m128d XMM1 = _mm_setzero_pd(); \
        __m128d XMM2 = _mm_setzero_pd(); \
        __m128d XMM3 = _mm_setzero_pd(); \
        __m128d XMM4 = _mm_load_pd((x)+i ); \
        __m128d XMM5 = _mm_load_pd((x)+i+2); \
        __m128d XMM6 = _mm_load_pd((x)+i+4); \
        __m128d XMM7 = _mm_load_pd((x)+i+6); \
        XMM0 = _mm_sub_pd(XMM0, XMM4); \
        XMM1 = _mm_sub_pd(XMM1, XMM5); \
        XMM2 = _mm_sub_pd(XMM2, XMM6); \
        XMM3 = _mm_sub_pd(XMM3, XMM7); \
        _mm_store_pd((y)+i , XMM0); \
        _mm_store_pd((y)+i+2, XMM1); \
        _mm_store_pd((y)+i+4, XMM2); \
        _mm_store_pd((y)+i+6, XMM3); \
    } \
}

/* y[i] += c * x[i] for 0 <= i < n; n must be a multiple of 4. */
#define vecadd(y, x, c, n) \
{ \
    int i; \
    __m128d XMM7 = _mm_set1_pd(c); \
    for (i = 0;i < (n);i += 4) { \
        __m128d XMM0 = _mm_load_pd((x)+i ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((y)+i ); \
        __m128d XMM3 = _mm_load_pd((y)+i+2); \
        XMM0 = _mm_mul_pd(XMM0, XMM7); \
        XMM1 = _mm_mul_pd(XMM1, XMM7); \
        XMM2 = _mm_add_pd(XMM2, XMM0); \
        XMM3 = _mm_add_pd(XMM3, XMM1); \
        _mm_store_pd((y)+i , XMM2); \
        _mm_store_pd((y)+i+2, XMM3); \
    } \
}

/* z[i] = x[i] - y[i] for 0 <= i < n; n must be a multiple of 8. */
#define vecdiff(z, x, y, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_load_pd((x)+i ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((x)+i+4); \
        __m128d XMM3 = _mm_load_pd((x)+i+6); \
        __m128d XMM4 = _mm_load_pd((y)+i ); \
        __m128d XMM5 = _mm_load_pd((y)+i+2); \
        __m128d XMM6 = _mm_load_pd((y)+i+4); \
        __m128d XMM7 = _mm_load_pd((y)+i+6); \
        XMM0 = _mm_sub_pd(XMM0, XMM4); \
        XMM1 = _mm_sub_pd(XMM1, XMM5); \
        XMM2 = _mm_sub_pd(XMM2, XMM6); \
        XMM3 = _mm_sub_pd(XMM3, XMM7); \
        _mm_store_pd((z)+i , XMM0); \
        _mm_store_pd((z)+i+2, XMM1); \
        _mm_store_pd((z)+i+4, XMM2); \
        _mm_store_pd((z)+i+6, XMM3); \
    } \
}

/* y[i] *= c for 0 <= i < n; n must be a multiple of 4. */
#define vecscale(y, c, n) \
{ \
    int i; \
    __m128d XMM7 = _mm_set1_pd(c); \
    for (i = 0;i < (n);i += 4) { \
        __m128d XMM0 = _mm_load_pd((y)+i ); \
        __m128d XMM1 = _mm_load_pd((y)+i+2); \
        XMM0 = _mm_mul_pd(XMM0, XMM7); \
        XMM1 = _mm_mul_pd(XMM1, XMM7); \
        _mm_store_pd((y)+i , XMM0); \
        _mm_store_pd((y)+i+2, XMM1); \
    } \
}

/* y[i] *= x[i] for 0 <= i < n; n must be a multiple of 8. */
#define vecmul(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_load_pd((x)+i ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((x)+i+4); \
        __m128d XMM3 = _mm_load_pd((x)+i+6); \
        __m128d XMM4 = _mm_load_pd((y)+i ); \
        __m128d XMM5 = _mm_load_pd((y)+i+2); \
        __m128d XMM6 = _mm_load_pd((y)+i+4); \
        __m128d XMM7 = _mm_load_pd((y)+i+6); \
        XMM4 = _mm_mul_pd(XMM4, XMM0); \
        XMM5 = _mm_mul_pd(XMM5, XMM1); \
        XMM6 = _mm_mul_pd(XMM6, XMM2); \
        XMM7 = _mm_mul_pd(XMM7, XMM3); \
        _mm_store_pd((y)+i , XMM4); \
        _mm_store_pd((y)+i+2, XMM5); \
        _mm_store_pd((y)+i+4, XMM6); \
        _mm_store_pd((y)+i+6, XMM7); \
    } \
}


/*
 *  __horizontal_sum(r, rw) sums the four single-precision lanes of the __m128
 *  register r into every lane of r.  It operates on float registers and is not
 *  referenced by the double-precision macros in this file.
 */
#if 3 <= __SSE__ || defined(__SSE3__)
/* SSE3: horizontal add with haddps; the work register (rw) is unused. */
#define __horizontal_sum(r, rw) \
    r = _mm_hadd_ps(r, r); \
    r = _mm_hadd_ps(r, r);

#else
/* SSE: horizontal add with shuffles; the work register (rw) is used. */
#define __horizontal_sum(r, rw) \
    rw = r; \
    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(1, 0, 3, 2)); \
    r = _mm_add_ps(r, rw); \
    rw = r; \
    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(2, 3, 0, 1)); \
    r = _mm_add_ps(r, rw);

#endif

/* *s = sum_i x[i] * y[i] for 0 <= i < n; n must be a multiple of 4. */
#define vecdot(s, x, y, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_setzero_pd(); \
    __m128d XMM1 = _mm_setzero_pd(); \
    __m128d XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 4) { \
        XMM2 = _mm_load_pd((x)+i ); \
        XMM3 = _mm_load_pd((x)+i+2); \
        XMM4 = _mm_load_pd((y)+i ); \
        XMM5 = _mm_load_pd((y)+i+2); \
        XMM2 = _mm_mul_pd(XMM2, XMM4); \
        XMM3 = _mm_mul_pd(XMM3, XMM5); \
        XMM0 = _mm_add_pd(XMM0, XMM2); \
        XMM1 = _mm_add_pd(XMM1, XMM3); \
    } \
    /* Combine the two accumulators, then add the high lane to the low lane. */ \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    _mm_store_sd((s), XMM0); \
}

/* *s = sqrt(sum_i x[i]^2) for 0 <= i < n; n must be a multiple of 4. */
#define vecnorm(s, x, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_setzero_pd(); \
    __m128d XMM1 = _mm_setzero_pd(); \
    __m128d XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 4) { \
        XMM2 = _mm_load_pd((x)+i ); \
        XMM3 = _mm_load_pd((x)+i+2); \
        XMM4 = XMM2; \
        XMM5 = XMM3; \
        XMM2 = _mm_mul_pd(XMM2, XMM4); \
        XMM3 = _mm_mul_pd(XMM3, XMM5); \
        XMM0 = _mm_add_pd(XMM0, XMM2); \
        XMM1 = _mm_add_pd(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM0 = _mm_sqrt_pd(XMM0); \
    _mm_store_sd((s), XMM0); \
}


/* *s = 1 / sqrt(sum_i x[i]^2) for 0 <= i < n; n must be a multiple of 4. */
#define vecrnorm(s, x, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_setzero_pd(); \
    __m128d XMM1 = _mm_setzero_pd(); \
    __m128d XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 4) { \
        XMM2 = _mm_load_pd((x)+i ); \
        XMM3 = _mm_load_pd((x)+i+2); \
        XMM4 = XMM2; \
        XMM5 = XMM3; \
        XMM2 = _mm_mul_pd(XMM2, XMM4); \
        XMM3 = _mm_mul_pd(XMM3, XMM5); \
        XMM0 = _mm_add_pd(XMM0, XMM2); \
        XMM1 = _mm_add_pd(XMM1, XMM3); \
    } \
    XMM2 = _mm_set1_pd(1.0); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM0 = _mm_sqrt_pd(XMM0); \
    XMM2 = _mm_div_pd(XMM2, XMM0); \
    _mm_store_sd((s), XMM2); \
}
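
/*
 *  Illustrative sketch (not part of the original header) of how these macros
 *  compose; `n` is assumed to already be padded to a multiple of 8 doubles:
 *
 *      double dot, norm;
 *      double *x = (double*)vecalloc(n * sizeof(double));
 *      double *y = (double*)vecalloc(n * sizeof(double));
 *      vecset(x, 2.0, n);
 *      veccpy(y, x, n);
 *      vecadd(y, x, 0.5, n);       // y[i] += 0.5 * x[i]
 *      vecdot(&dot, x, y, n);      // dot  = sum x[i] * y[i]
 *      vecnorm(&norm, y, n);       // norm = sqrt(sum y[i]^2)
 *      vecfree(y);
 *      vecfree(x);
 */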