). So far I've got C++ code done. " /> ). So far I've got C++ code done. " /> ). So far I've got C++ code done. "/>

SSE: Counter in a loop

269 Views Asked by At

I'm converting this Mathematica sources by user "J. M. ain't a mathematician" to C++ and also support SIMD (SSE2 -->). So far I've got C++ code done. Here's the sine version on Godbolt .

I (as in noobie regarding SSE) have not managed to solve this yet and having troubles when trying to control which elements are calculated in a while loop. Here's my code :

#include <iostream>
#include <cmath>
#include <immintrin.h>
#include <cstdint>


 // simple version 
float pade_sine(float x){  
 float f=0.25f;  
 float z=x*0.5f;  
    int k=0;  
    while (std::abs(z)>f){  
        z*=0.5f;  
        k++;  
        }  
    float z2=std::pow(z,2.0f);  
    float r=z*(1.0f+(z2/105.0f-1.0f)*std::pow(z/3.0f,2.0f))/  
          (1.0f+(z2/7.0f-4.0f)*std::pow(z/3.0f,2.0f));  
    while(k > 0){  
        r = (2.0f*r)/(1.0f-r*r);  
        k--;
    } 
    return (2.0f*r)/(1.0f+r*r);  
}  

__m128 sincos_sse(__m128 x, __m128 f, bool sine, int maxloops) {

const __m128 c1 = _mm_set1_ps(105.0f);
const __m128 c2 = _mm_set1_ps(3.0f);
const __m128 c3 = _mm_set1_ps(7.0f);
const __m128 zero = _mm_set1_ps(0.0f);
const __m128 half = _mm_set1_ps(0.5f);
const __m128 one = _mm_set1_ps(1.0f);
const __m128 two = _mm_set1_ps(2.0f);
const __m128 four = _mm_set1_ps(4.0f);
const __m128 signmask = half; //_mm_set1_ps(-1.0f);

int k = 0;
int mask, oldmask = 0;
    __m128  z  = _mm_mul_ps(x, half);
    __m128  flags, abs_z;
    __m128 kvec;

    // repeat halving           
    while (k < maxloops) {
                
        abs_z = _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), z), z);
        // make flag either 0 or 0.5 (this value is used as a multiplier later)
        flags = _mm_cmpgt_ps(abs_z, f);  
        flags = _mm_mul_ps(_mm_and_ps(flags, signmask), one);
        // calculate k for each element
        kvec = _mm_add_ps(kvec,_mm_mul_ps(_mm_and_ps(flags, signmask), two));  
        // get mask value to control the loop
        mask = _mm_movemask_epi8((__m128i)abs_z);
        // break loop if         
        if( mask == oldmask){
            //k--;
            //kvec = _mm_sub_ps(kvec, _mm_and_ps(_mm_cmpgt_ps(kvec, half), one));
            break;
        }
        oldmask = mask;
        k++;
        // calculate new z values by flags
        z  = _mm_sub_ps(z, _mm_mul_ps(z, flags));
    }

    __m128  z2 = _mm_mul_ps(z, z);
    __m128  t  = _mm_div_ps(z, c2);
    __m128  p  = _mm_mul_ps(t, t);
            t  = _mm_div_ps(z2, c1);
            p  = _mm_mul_ps(p, _mm_sub_ps(t, one));
            p  = _mm_add_ps(one, p);
            p  = _mm_mul_ps(z, p);
            t  = _mm_div_ps(z, c2);
    __m128  q  = _mm_mul_ps(t, t); 
            t  = _mm_div_ps(z2, c3);
            q  = _mm_mul_ps(q, _mm_sub_ps(t, four));
            q  = _mm_add_ps(q, one);
        
    __m128  r  = _mm_div_ps(p, q);


    // double-angle identity
    __m128 tmp = zero;
    __m128 multipliers;
        while (k){
            multipliers = _mm_and_ps(_mm_cmpgt_ps(kvec, half), one);
            tmp = _mm_mul_ps(_mm_div_ps(_mm_mul_ps(two, r), _mm_sub_ps(one, _mm_mul_ps(r,r))), multipliers);
            r = _mm_add_ps(r, tmp);
            --k;
            kvec = _mm_sub_ps(kvec, multipliers);
        }

        if (sine){
            return _mm_div_ps(_mm_mul_ps(two, r), _mm_add_ps(one, _mm_mul_ps(r,r)));
        }
        return _mm_div_ps(_mm_sub_ps(one, _mm_mul_ps(r, r)), _mm_add_ps(one, _mm_mul_ps(r,r)));
}

int main() {
    
    bool sine = true;
    float res[4] = { 0 };
    
    float in = 0.5;
    if (std::abs(in) > M_PI/2.0){ sine = false;}
    
    __m128 x = _mm_set_ps(in, 1.01f, -0.23f, 0.99f);
    __m128 f = _mm_set1_ps(0.25f);

    _mm_storeu_ps(res, sincos_sse(x, f, sine, 10));

    std::cout << "\n\nsincos_sse  = " << res[3] << "  " << res[2] << "  " << res[1] << "  "<< res[0] << "  " << std::endl;    
    std::cout << "std::sin    = " << std::sin(x[3]) << "  " << std::sin(x[2]) << "  "<< std::sin(x[1]) << "  "<< std::sin(x[0]) << "  " << std::endl;
    
    for (int i = 3; i >= 0; i--){
        if (sine){
            std::cout << "std::sin       = " << std::sin(x[i]) << std::endl;
            std::cout << "std::sin <> sincos_sse " << std::sin(x[i]) - res[i]<< std::endl;
        }
        else{
            std::cout << "std::cos       = " << std::cos(x[i]) << std::endl;
            std::cout << "std::cos <> sincos_sse " << std::cos(x[i]) - res[i]<< std::endl;
        }

    }
    
   
}

Online at https://godbolt.org/z/T5r1xPv8f

My non-SSE code uses variable k to control the double-angle identity loop but, in SSE implementation one needs to know k for each element (which is/can be solved in repeat halving loop).

I think the issue is following and is located in double-angle identity loop:

EDIT: Lets say f = 0.25 and the input vector x to be calculated is

{0.5, 1.01, -0.23, 0.99}

X values are halved before loop. After repeat halving loop (where all x[n] > f[n] elements are halved as many times as needed to get x[n] <= f[n]), k's (in kvec) for x vector elements are:

{0, 2, 0, 1}

so, in double-angle identity loop, one element needs to pass the calculation twice, one element once and rest two jumps over.

Loop control is handled by kvec so that kvec value decreases by one after each loop. kvec value (1 or 0) is used as a multiplier in calculation.

Problem is how to sum the values in loop because of loop is executed by highest k value (twice in this example case) when the kvec related values (i.e. multipliers 1 or 0) are in each round as:

  1. {0 1 0 1}
  2. {0 1 0 0}
  3. {0 0 0 0}

i.e. last kvec element zeroes the result in 2nd calculation round and 2nd element does it too if we go to 3rd round. So, I would need to get that values saved before it is zeroed. I do it now by summing the values but that is done wrongly because of the result is wrong. If _mm_max_ps() is used the result is OK for positive input values but for negative it is -0.

Any suggestions?

1

There are 1 best solutions below

2
Juha P On

After good sleep I had an answer in mind to get the function work correctly. What was needed was just to make all calculations done through positive input values. So had to take signs of input values and then restore them before final calculation. This answer is based on my own code given in question but, I do make changes @PeterCordes suggested to get the performance improved. I have not yet tested if performance is worse or better compared to std library functions.

Here's the code:

#include <iostream>
#include <cmath>
#include <immintrin.h>
#include <cstdint>

__m128 sincos_sse(__m128 x, __m128 f, bool sine, int maxloops) {

const __m128 c1 = _mm_set1_ps(105.0f);
const __m128 c2 = _mm_set1_ps(3.0f);
const __m128 c3 = _mm_set1_ps(7.0f);
const __m128 zero = _mm_set1_ps(0.0f);
const __m128 half = _mm_set1_ps(0.5f);
const __m128 one = _mm_set1_ps(1.0f);
const __m128 two = _mm_set1_ps(2.0f);
const __m128 four = _mm_set1_ps(4.0f);
const __m128 multipliermask = half;

const __m128 signmask = _mm_set1_ps(-0.0f);
__m128 signs = _mm_and_ps(x, signmask); // save signs

// make all values positive
x = _mm_andnot_ps(signmask, x);

int k = 0;
int mask, oldmask = 0;

    __m128  z  = _mm_mul_ps(x, half);
    __m128  flags;
    __m128 kvec;
    // repeat halving           
    while (k < maxloops) {
                
        // make flag either 0 or 0.5 (this value is used as a multiplier later)
        flags = _mm_cmpgt_ps(z, f);  
        flags = _mm_mul_ps(_mm_and_ps(flags, multipliermask), one);
        // calculate k for each element
        kvec = _mm_add_ps(kvec,_mm_mul_ps(_mm_and_ps(flags, multipliermask), two));  
        // get mask value to control the loop
        mask = _mm_movemask_epi8((__m128i)z);
        // break loop when all needed done       
        if( mask == oldmask){
                           std::cout << "BREAK " << k << "\n";
            k--;
            break;
        }
        oldmask = mask;
        k++;
        // calculate new z values by flags
        z  = _mm_sub_ps(z, _mm_mul_ps(z, flags));
    }

    __m128  z2 = _mm_mul_ps(z, z);
    __m128  t  = _mm_div_ps(z, c2);
    __m128  p  = _mm_mul_ps(t, t);
            t  = _mm_div_ps(z2, c1);
            p  = _mm_mul_ps(p, _mm_sub_ps(t, one));
            p  = _mm_add_ps(one, p);
            p  = _mm_mul_ps(z, p);
            t  = _mm_div_ps(z, c2);
    __m128  q  = _mm_mul_ps(t, t); 
            t  = _mm_div_ps(z2, c3);
            q  = _mm_mul_ps(q, _mm_sub_ps(t, four));
            q  = _mm_add_ps(q, one);
        
    __m128  r  = _mm_div_ps(p, q);

    // double-angle identity
    __m128 tmp = r;
    __m128 multipliers;
    while (k>=0){  // release looped continuously if no condition check
        multipliers = _mm_and_ps(_mm_cmpgt_ps(kvec, half), one);
        tmp = _mm_mul_ps(_mm_div_ps(_mm_mul_ps(two, tmp), _mm_sub_ps(one, _mm_mul_ps(tmp,tmp))), multipliers);
        tmp = _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), tmp), tmp);
        r = _mm_max_ps(r, tmp);  
        kvec = _mm_sub_ps(kvec, multipliers);
        --k;
    }
    // restore signs
    r = _mm_xor_ps(signs,r);
    // finalize calculations and return values
    if (sine){
            __m128 res = _mm_div_ps(_mm_mul_ps(two, r), _mm_add_ps(one, _mm_mul_ps(r,r)));
            return res;
        }
        // cosine
        return _mm_div_ps(_mm_sub_ps(one, _mm_mul_ps(r, r)), _mm_add_ps(one, _mm_mul_ps(r,r)));
}

int main() {
    
    bool sine = true;
    float res[4] = { 0 };
    float in = 0.05;
    int maxloops = 10;
    if (std::abs(in) > M_PI/2.0){ sine = false;}

    __m128 x = _mm_set_ps(in, 1.31f, -0.23f, 0.79f);
    __m128 f = _mm_set_ps(0.5f, 0.25f, 0.5f, 0.5f);

    _mm_storeu_ps(res, sincos_sse(x, f, true, maxloops));
    
    std::cout << "\n\nsincos_sse  = " << res[3] << "  " << res[2] << "  " << res[1] << "  "<< res[0] << "  " << std::endl;    
    std::cout << "std::sin    = " << std::sin(x[3]) << "  " << std::sin(x[2]) << "  "<< std::sin(x[1]) << "  "<< std::sin(x[0]) << "  " << std::endl;
    
    std::cout << "difference  = ";
    for (int i = 3; i >= 0; i--){
        if (sine){
            std::cout << std::sin(x[i]) - res[i] << "\t";
        }
        else{
            std::cout << std::cos(x[i]) - res[i] << "\t";
        }

    }
    
   
}

With outputs at GodBolt

Here is the version based on Peter Cordes's suggestions.

https://godbolt.org/z/T177YYPPr (cleaned: https://godbolt.org/z/csK19EEP3 )