1 #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
2 #define INCLUDED_volk_32f_s32f_convert_8i_a_H
/*
 * Scale a float vector and convert it to saturated signed 8-bit integers
 * (SSE2, aligned variant).
 *
 * Each output is inputVector[i] * scalar, clamped to [-128, 127] and then
 * rounded by the SSE float->int32 conversion (which honours the current MXCSR
 * rounding mode).
 *
 * outputVector: destination for num_points int8_t values; written with
 *               _mm_store_si128, so it must be 16-byte aligned.
 * inputVector:  num_points source floats; read with _mm_load_ps, so it must
 *               be 16-byte aligned.
 * scalar:       factor applied to every input before conversion.
 * num_points:   number of elements to convert.
 */
static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    const float* inputVectorPtr = inputVector;
    int8_t* outputVectorPtr = outputVector;

    /* Representable range of int8_t, kept as floats for the SIMD clamp. */
    const float min_val = -128.0f;
    const float max_val = 127.0f;

    const __m128 vScalar = _mm_set_ps1(scalar);
    const __m128 vmin_val = _mm_set_ps1(min_val);
    const __m128 vmax_val = _mm_set_ps1(max_val);
    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;

    /* Main loop: 16 floats in, 16 int8_t out per iteration. */
    for (; number < sixteenthPoints; number++) {
        inputVal1 = _mm_load_ps(inputVectorPtr);
        inputVectorPtr += 4;
        inputVal2 = _mm_load_ps(inputVectorPtr);
        inputVectorPtr += 4;
        inputVal3 = _mm_load_ps(inputVectorPtr);
        inputVectorPtr += 4;
        inputVal4 = _mm_load_ps(inputVectorPtr);
        inputVectorPtr += 4;

        /* Scale, then clamp while still in float so the integer conversion
         * below can never see an out-of-range value. */
        inputVal1 =
            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
        inputVal2 =
            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
        inputVal3 =
            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
        inputVal4 =
            _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(inputVal1);
        intInputVal2 = _mm_cvtps_epi32(inputVal2);
        intInputVal3 = _mm_cvtps_epi32(inputVal3);
        intInputVal4 = _mm_cvtps_epi32(inputVal4);

        /* Narrow 4x(4 x int32) -> 2x(8 x int16) -> 16 x int8, preserving
         * element order. */
        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
        intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
        intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);

        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 16;
    }

    /* Tail: the remaining (num_points % 16) elements go through the same
     * multiply / clamp / convert sequence using the scalar SSE forms, so an
     * element's result does not depend on whether it fell in the vector body
     * or in the tail. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        const __m128 scaled = _mm_mul_ss(_mm_load_ss(&inputVector[number]), vScalar);
        const __m128 clamped = _mm_max_ss(_mm_min_ss(scaled, vmax_val), vmin_val);
        outputVector[number] = (int8_t)_mm_cvtss_si32(clamped);
    }
}
73 #include <xmmintrin.h>
/*
 * Scale a float vector and convert it to saturated signed 8-bit integers
 * (SSE, aligned variant).
 *
 * Each output is inputVector[i] * scalar, clamped to [-128, 127] and then
 * converted with a C cast (i.e. truncated toward zero).
 *
 * outputVector: destination for num_points int8_t values (written one byte
 *               at a time, so no alignment is required by this routine).
 * inputVector:  num_points source floats; read with _mm_load_ps, so it must
 *               be 16-byte aligned.
 * scalar:       factor applied to every input before conversion.
 * num_points:   number of elements to convert.
 */
static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
                                                  const float* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    const float* inputVectorPtr = inputVector;
    int8_t* outputVectorPtr = outputVector;

    /* Representable range of int8_t, kept as floats for the clamp. */
    const float min_val = -128.0f;
    const float max_val = 127.0f;
    float r;

    const __m128 vScalar = _mm_set_ps1(scalar);
    const __m128 vmin_val = _mm_set_ps1(min_val);
    const __m128 vmax_val = _mm_set_ps1(max_val);
    __m128 ret;

    /* Scratch space for one vector of clamped floats; stored with the
     * unaligned form so no alignment attribute is needed on this local. */
    float outputFloatBuffer[4];

    /* Main loop: SSE1 has no packed float->int conversion, so scale and
     * clamp four floats at a time and narrow each lane with a scalar cast. */
    for (; number < quarterPoints; number++) {
        ret = _mm_load_ps(inputVectorPtr);
        inputVectorPtr += 4; /* advance to the next group of four inputs */

        ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);

        _mm_storeu_ps(outputFloatBuffer, ret);
        *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
        *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
        *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
        *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
    }

    /* Tail: remaining (num_points % 4) elements. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        r = inputVector[number] * scalar;
        /* The first test is written negated so that NaN is also replaced by
         * max_val instead of reaching the float->int cast, which is
         * undefined for NaN and for out-of-range values. */
        if (!(r < max_val)) {
            r = max_val;
        } else if (r < min_val) {
            r = min_val;
        }
        outputVector[number] = (int8_t)(r);
    }
}
125 #ifdef LV_HAVE_GENERIC
133 static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector,
const float* inputVector,
const float scalar,
unsigned int num_points){
134 int8_t* outputVectorPtr = outputVector;
135 const float* inputVectorPtr = inputVector;
136 unsigned int number = 0;
137 float min_val = -128;
141 for(number = 0; number < num_points; number++){
142 r = *inputVectorPtr++ * scalar;
147 *outputVectorPtr++ = (int8_t)(r);