#include <stdlib.h>
#ifndef __APPLE__
#include <malloc.h>
#endif
#include <memory.h>
#include <math.h>       /* fabs() in the portable fsigndiff fallback */
#include <stdint.h>     /* uint32_t */

#if 1400 <= _MSC_VER
#include <intrin.h>
#endif

#include <xmmintrin.h>  /* SSE intrinsics: __m128, _mm_load_ps, _mm_store_ps, ... */
#if 3 <= __SSE__ || defined(__SSE3__)
#include <pmmintrin.h>  /* SSE3: _mm_hadd_ps */
#endif

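/*
 * fsigndiff(x, y) is nonzero when *x and *y have opposite signs.  For 32-bit
 * IEEE-754 floats the sign bits are compared directly; the portable fallback
 * recovers the sign of *y by dividing it by fabs(*y).
 */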
#if LBFGS_FLOAT == 32 && LBFGS_IEEE_FLOAT
#define fsigndiff(x, y) (((*(uint32_t*)(x)) ^ (*(uint32_t*)(y))) & 0x80000000U)
#else
#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
#endif

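/*
 * vecalloc()/vecfree() provide zero-initialized, 16-byte aligned buffers, as
 * required by the aligned loads/stores (_mm_load_ps/_mm_store_ps) in the vec*
 * macros below.  The loops process 8 or 16 floats per iteration, so callers
 * are expected to round the array length n up accordingly and pad with zeros.
 */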
inline static void* vecalloc(size_t size)
{
#if defined(_MSC_VER)
    void *memblock = _aligned_malloc(size, 16);
#elif defined(__APPLE__)
    void *memblock = malloc(size);      /* OS X malloc() returns 16-byte aligned blocks */
#else
    void *memblock = memalign(16, size);
#endif
    if (memblock != NULL) {
        memset(memblock, 0, size);
    }
    return memblock;
}

inline static void vecfree(void *memblock)
{
#if defined(_MSC_VER)
    _aligned_free(memblock);
#else
    free(memblock);
#endif
}

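/* vecset: x[i] = c for all i (16 floats per iteration). */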
#define vecset(x, c, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_set_ps1(c); \
    for (i = 0;i < (n);i += 16) { \
        _mm_store_ps((x)+i   , XMM0); \
        _mm_store_ps((x)+i+ 4, XMM0); \
        _mm_store_ps((x)+i+ 8, XMM0); \
        _mm_store_ps((x)+i+12, XMM0); \
    } \
}

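/* veccpy: y[i] = x[i] (16 floats per iteration). */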
#define veccpy(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 16) { \
        __m128 XMM0 = _mm_load_ps((x)+i   ); \
        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
        __m128 XMM3 = _mm_load_ps((x)+i+12); \
        _mm_store_ps((y)+i   , XMM0); \
        _mm_store_ps((y)+i+ 4, XMM1); \
        _mm_store_ps((y)+i+ 8, XMM2); \
        _mm_store_ps((y)+i+12, XMM3); \
    } \
}

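/* vecncpy: y[i] = -x[i]; the negation flips the IEEE-754 sign bit with an XOR. */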
#define vecncpy(y, x, n) \
{ \
    int i; \
    const uint32_t mask = 0x80000000; \
    __m128 XMM4 = _mm_load_ps1((float*)&mask); \
    for (i = 0;i < (n);i += 16) { \
        __m128 XMM0 = _mm_load_ps((x)+i   ); \
        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
        __m128 XMM3 = _mm_load_ps((x)+i+12); \
        XMM0 = _mm_xor_ps(XMM0, XMM4); \
        XMM1 = _mm_xor_ps(XMM1, XMM4); \
        XMM2 = _mm_xor_ps(XMM2, XMM4); \
        XMM3 = _mm_xor_ps(XMM3, XMM4); \
        _mm_store_ps((y)+i   , XMM0); \
        _mm_store_ps((y)+i+ 4, XMM1); \
        _mm_store_ps((y)+i+ 8, XMM2); \
        _mm_store_ps((y)+i+12, XMM3); \
    } \
}

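/* vecadd: y[i] += c * x[i] (axpy, 8 floats per iteration). */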
#define vecadd(y, x, c, n) \
{ \
    int i; \
    __m128 XMM7 = _mm_set_ps1(c); \
    for (i = 0;i < (n);i += 8) { \
        __m128 XMM0 = _mm_load_ps((x)+i  ); \
        __m128 XMM1 = _mm_load_ps((x)+i+4); \
        __m128 XMM2 = _mm_load_ps((y)+i  ); \
        __m128 XMM3 = _mm_load_ps((y)+i+4); \
        XMM0 = _mm_mul_ps(XMM0, XMM7); \
        XMM1 = _mm_mul_ps(XMM1, XMM7); \
        XMM2 = _mm_add_ps(XMM2, XMM0); \
        XMM3 = _mm_add_ps(XMM3, XMM1); \
        _mm_store_ps((y)+i  , XMM2); \
        _mm_store_ps((y)+i+4, XMM3); \
    } \
}

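/* vecdiff: z[i] = x[i] - y[i] (16 floats per iteration). */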
#define vecdiff(z, x, y, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 16) { \
        __m128 XMM0 = _mm_load_ps((x)+i   ); \
        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
        __m128 XMM3 = _mm_load_ps((x)+i+12); \
        __m128 XMM4 = _mm_load_ps((y)+i   ); \
        __m128 XMM5 = _mm_load_ps((y)+i+ 4); \
        __m128 XMM6 = _mm_load_ps((y)+i+ 8); \
        __m128 XMM7 = _mm_load_ps((y)+i+12); \
        XMM0 = _mm_sub_ps(XMM0, XMM4); \
        XMM1 = _mm_sub_ps(XMM1, XMM5); \
        XMM2 = _mm_sub_ps(XMM2, XMM6); \
        XMM3 = _mm_sub_ps(XMM3, XMM7); \
        _mm_store_ps((z)+i   , XMM0); \
        _mm_store_ps((z)+i+ 4, XMM1); \
        _mm_store_ps((z)+i+ 8, XMM2); \
        _mm_store_ps((z)+i+12, XMM3); \
    } \
}

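/* vecscale: y[i] *= c (8 floats per iteration). */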
#define vecscale(y, c, n) \
{ \
    int i; \
    __m128 XMM7 = _mm_set_ps1(c); \
    for (i = 0;i < (n);i += 8) { \
        __m128 XMM0 = _mm_load_ps((y)+i  ); \
        __m128 XMM1 = _mm_load_ps((y)+i+4); \
        XMM0 = _mm_mul_ps(XMM0, XMM7); \
        XMM1 = _mm_mul_ps(XMM1, XMM7); \
        _mm_store_ps((y)+i  , XMM0); \
        _mm_store_ps((y)+i+4, XMM1); \
    } \
}

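/* vecmul: y[i] *= x[i], element-wise (16 floats per iteration). */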
#define vecmul(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 16) { \
        __m128 XMM0 = _mm_load_ps((x)+i   ); \
        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
        __m128 XMM3 = _mm_load_ps((x)+i+12); \
        __m128 XMM4 = _mm_load_ps((y)+i   ); \
        __m128 XMM5 = _mm_load_ps((y)+i+ 4); \
        __m128 XMM6 = _mm_load_ps((y)+i+ 8); \
        __m128 XMM7 = _mm_load_ps((y)+i+12); \
        XMM4 = _mm_mul_ps(XMM4, XMM0); \
        XMM5 = _mm_mul_ps(XMM5, XMM1); \
        XMM6 = _mm_mul_ps(XMM6, XMM2); \
        XMM7 = _mm_mul_ps(XMM7, XMM3); \
        _mm_store_ps((y)+i   , XMM4); \
        _mm_store_ps((y)+i+ 4, XMM5); \
        _mm_store_ps((y)+i+ 8, XMM6); \
        _mm_store_ps((y)+i+12, XMM7); \
    } \
}

#if 3 <= __SSE__ || defined(__SSE3__)
/*
 * Horizontal add using the SSE3 haddps instruction; the work register
 * (rw) is unused.
 */
#define __horizontal_sum(r, rw) \
    r = _mm_hadd_ps(r, r); \
    r = _mm_hadd_ps(r, r);

#else
/*
 * Horizontal add using SSE shuffles and adds; the work register (rw)
 * is clobbered.
 */
#define __horizontal_sum(r, rw) \
    rw = r; \
    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(1, 0, 3, 2)); \
    r = _mm_add_ps(r, rw); \
    rw = r; \
    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(2, 3, 0, 1)); \
    r = _mm_add_ps(r, rw);

#endif

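/* vecdot: *s = sum_i x[i] * y[i] (single-precision dot product). */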
#define vecdot(s, x, y, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_setzero_ps(); \
    __m128 XMM1 = _mm_setzero_ps(); \
    __m128 XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 8) { \
        XMM2 = _mm_load_ps((x)+i  ); \
        XMM3 = _mm_load_ps((x)+i+4); \
        XMM4 = _mm_load_ps((y)+i  ); \
        XMM5 = _mm_load_ps((y)+i+4); \
        XMM2 = _mm_mul_ps(XMM2, XMM4); \
        XMM3 = _mm_mul_ps(XMM3, XMM5); \
        XMM0 = _mm_add_ps(XMM0, XMM2); \
        XMM1 = _mm_add_ps(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_ps(XMM0, XMM1); \
    __horizontal_sum(XMM0, XMM1); \
    _mm_store_ss((s), XMM0); \
}

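/*
 * vecnorm: *s = sqrt(sum_i x[i]^2).  The square root is obtained from
 * _mm_rsqrt_ss refined by one Newton-Raphson step, then multiplied back
 * by the sum of squares.
 */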
#define vecnorm(s, x, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_setzero_ps(); \
    __m128 XMM1 = _mm_setzero_ps(); \
    __m128 XMM2, XMM3; \
    for (i = 0;i < (n);i += 8) { \
        XMM2 = _mm_load_ps((x)+i  ); \
        XMM3 = _mm_load_ps((x)+i+4); \
        XMM2 = _mm_mul_ps(XMM2, XMM2); \
        XMM3 = _mm_mul_ps(XMM3, XMM3); \
        XMM0 = _mm_add_ps(XMM0, XMM2); \
        XMM1 = _mm_add_ps(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_ps(XMM0, XMM1); \
    __horizontal_sum(XMM0, XMM1); \
    XMM2 = XMM0; \
    XMM1 = _mm_rsqrt_ss(XMM0); \
    XMM3 = XMM1; \
    XMM1 = _mm_mul_ss(XMM1, XMM1); \
    XMM1 = _mm_mul_ss(XMM1, XMM3); \
    XMM1 = _mm_mul_ss(XMM1, XMM0); \
    XMM1 = _mm_mul_ss(XMM1, _mm_set_ss(-0.5f)); \
    XMM3 = _mm_mul_ss(XMM3, _mm_set_ss(1.5f)); \
    XMM3 = _mm_add_ss(XMM3, XMM1); \
    XMM3 = _mm_mul_ss(XMM3, XMM2); \
    _mm_store_ss((s), XMM3); \
}

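/*
 * vecrnorm: *s = 1 / sqrt(sum_i x[i]^2), the reciprocal of the norm, again
 * via _mm_rsqrt_ss plus one Newton-Raphson refinement step.
 */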
#define vecrnorm(s, x, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_setzero_ps(); \
    __m128 XMM1 = _mm_setzero_ps(); \
    __m128 XMM2, XMM3; \
    for (i = 0;i < (n);i += 8) { \
        XMM2 = _mm_load_ps((x)+i  ); \
        XMM3 = _mm_load_ps((x)+i+4); \
        XMM2 = _mm_mul_ps(XMM2, XMM2); \
        XMM3 = _mm_mul_ps(XMM3, XMM3); \
        XMM0 = _mm_add_ps(XMM0, XMM2); \
        XMM1 = _mm_add_ps(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_ps(XMM0, XMM1); \
    __horizontal_sum(XMM0, XMM1); \
    XMM2 = XMM0; \
    XMM1 = _mm_rsqrt_ss(XMM0); \
    XMM3 = XMM1; \
    XMM1 = _mm_mul_ss(XMM1, XMM1); \
    XMM1 = _mm_mul_ss(XMM1, XMM3); \
    XMM1 = _mm_mul_ss(XMM1, XMM0); \
    XMM1 = _mm_mul_ss(XMM1, _mm_set_ss(-0.5f)); \
    XMM3 = _mm_mul_ss(XMM3, _mm_set_ss(1.5f)); \
    XMM3 = _mm_add_ss(XMM3, XMM1); \
    _mm_store_ss((s), XMM3); \
}
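
/*
 * Usage sketch (illustration only, not part of the library): the length n
 * is kept a multiple of 16 so every macro above can run its full-width loop.
 *
 *     const int n = 16;
 *     float *x = (float*)vecalloc(n * sizeof(float));
 *     float *y = (float*)vecalloc(n * sizeof(float));
 *     float dot, norm;
 *     vecset(x, 1.0f, n);          // x[i] = 1
 *     vecset(y, 2.0f, n);          // y[i] = 2
 *     vecadd(y, x, 3.0f, n);       // y[i] += 3 * x[i], giving 5
 *     vecdot(&dot, x, y, n);       // dot  = 16 * 1 * 5 = 80
 *     vecnorm(&norm, y, n);        // norm = sqrt(16 * 25) = 20
 *     vecfree(x);
 *     vecfree(y);
 */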