The Azimuth Project
Experiments in programming style for HPC (changes)

Showing changes from revision #5 to #6: Added | Removed | Changed

Experiments in programming style for HPC

This page links to a simple experiment in styles for high-performance computing (HPC). (Being more precise, it’s realy high-performance numerical computing.) The source code is simplel C++ file.


In case the above link is temporarily unrecognised by the wiki, here is an inline version:

/*A very, very simple example that shows some examples of different
 *programming styles (using C++). Only absolutely necessary parts has
 *been written, and because to keep the line count down in this standalone
 *example some NASTY macros have been used for structure setup and
 *timing. In a real program they could be dealt with in a more structured
 *way requiring more boilerplate code.
 *Also there's no claim that this task is necessarily useful or a performance
 *bottleneck in any real-world task are made, and this kind of optimisation
 *should only be applied to the SMALL areas of code which are bottlenecks.
 *
 * g++ -msse -msse2 -O2 runtime.cc
 *
 *On DavidTweed's netbook typical results are:
Timing: 29740728 Neat code result=3.84e+08
Timing: 10612368 Optimised code result=3.84e+08
Timing: 10034928 Optimised code alternative result=3.84e+08
 *These results may not be typical (and exhibit worrying variability).
 */
#include "xmmintrin.h"
#include "emmintrin.h"
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <vector>
#include <algorithm>

#define rdtscll(val) do { \
     unsigned int __a,__d; \
     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
     (val) = uint64_t((unsigned long)__a) | (uint64_t((unsigned long)__d)<<32); \
} while(0)

class VoxelFlowVectorO { public:
float flowComponent[3];
    float sqrMagnitude(void) { float energy=0;
        int i;
        for(i=0;i<3;++i){
            energy+=flowComponent[i]*flowComponent[i];
        }
        return energy;
    }
};

class VolumeVectorFieldO {
public:
    int limits[3];
    VoxelFlowVectorO ***flowVecs;
    float getEnergy();
};

float VolumeVectorFieldO::getEnergy()
{
//The sumsJ and sumsK variables aren't "logically" needed, but are a quick
//and dirty attempt to deal with the issue that adding floating point numbers
//of dramatically different magnitudes, as would occur if you just used the
//sums variable on the marked line, is a bad idea for accuracy. More sophisticated
//accumulation schemes could be used if they proved necessary in practice.
    float sums=0;
    int i=0;
    do{
        float sumsJ=0;
        int j=0;
        do{
            float sumsK=0;
            int k=0;
            do{
                //extract the block of 4 elements for each component
                sumsK+=flowVecs[i][j][k].sqrMagnitude();
            }while(++k<limits[2]);
            sumsJ+=sumsK;
        }while(++j<limits[1]);
        sums+=sumsJ;
    }while(++i<limits[0]);
    return sums;
}

union M128 {
    __m128 m128;
    float arr[4];
};

class VolumeVectorField {
public:
    int limits[3];
    __m128 ***flowVecs[3];
    float getEnergy();
    float getEnergyAlt();
};

float VolumeVectorField::getEnergy()
{
//The sumsJ and sumsK variables aren't "logically" needed, but are a quick
//and dirty attempt to deal with the issue that adding floating point numbers
//of dramatically different magnitudes, as would occur if you just used the
//sums variable on the marked line, is a bad idea for accuracy. More sophisticated
//accumulation schemes could be used if they proved necessary in practice.
    __m128 sums=_mm_setzero_ps();
    int i=0;
    do{
        __m128 sumsJ=_mm_setzero_ps();
        int j=0;
        do{
            __m128 sumsK=_mm_setzero_ps();
            int k=0;
            do{
                //extract the block of 4 elements for each component
                __m128 x=flowVecs[0][i][j][k];
                __m128 y=flowVecs[1][i][j][k];
                __m128 z=flowVecs[2][i][j][k];
                //add to the accumulator of squared sums for this dimension
                sumsK=_mm_add_ps(sumsK,_mm_add_ps(_mm_add_ps(_mm_mul_ps(x,x),_mm_mul_ps(y,y)),_mm_mul_ps(z,z)));
            }while(++k<limits[2]);
            sumsJ=_mm_add_ps(sumsJ,sumsK);
        }while(++j<limits[1]);
        sums=_mm_add_ps(sums,sumsJ);
    }while(++i<limits[0]);
    //sideways-add the 4 sum elements into one sum
    M128 transfer;
    transfer.m128=sums;
    float energy=transfer.arr[0]+transfer.arr[1]+transfer.arr[2]+transfer.arr[3];
    return energy;
}

//variant that tries to remove "bottleneck" on sumsK: few more variables but marginally more efficient
float VolumeVectorField::getEnergyAlt()
{
//The sumsJ and sumsK variables aren't "logically" needed, but are a quick
//and dirty attempt to deal with the issue that adding floating point numbers
//of dramatically different magnitudes, as would occur if you just used the
//sums variable on the marked line, is a bad idea for accuracy. More sophisticated
//accumulation schemes could be used if they proved necessary in practice.
    __m128 sums=_mm_setzero_ps();
    int i=0;
    do{
        __m128 sumsJ=_mm_setzero_ps();
        int j=0;
        do{
            __m128 sumsKX=_mm_setzero_ps(),sumsKY=_mm_setzero_ps(),sumsKZ=_mm_setzero_ps();
            int k=0;
            do{
                //extract the block of 4 elements for each component
                __m128 x=flowVecs[0][i][j][k];
                __m128 y=flowVecs[1][i][j][k];
                __m128 z=flowVecs[2][i][j][k];
                //square them and sum to get 4-elements of complete squared magnitudes
                sumsKX=_mm_add_ps(sumsKX,_mm_mul_ps(x,x));
                sumsKY=_mm_add_ps(sumsKY,_mm_mul_ps(y,y));
                sumsKZ=_mm_add_ps(sumsKZ,_mm_mul_ps(z,z));
            }while(++k<limits[2]);
            sumsJ=_mm_add_ps(sumsJ,_mm_add_ps(sumsKX,_mm_add_ps(sumsKY,sumsKZ)));
        }while(++j<limits[1]);
        sums=_mm_add_ps(sums,sumsJ);
    }while(++i<limits[0]);
    //sideways-add the 4 sum elements into one sum
    M128 transfer;
    transfer.m128=sums;
    float energy=transfer.arr[0]+transfer.arr[1]+transfer.arr[2]+transfer.arr[3];
    return energy;
}

#define DODGY_MACRO(objType,LIM,LIM2,ptr,val)   \
      ptr=new objType**[LIM];\
      for(i=0;i<LIM;++i){\
          void* t;\
          int dummy=posix_memalign(&t,16,sizeof(objType)*LIM*LIM2);\
          objType *t2=(objType*)t;\
          ptr[i]=new objType*[LIM];\
          for(j=0;j<LIM;++j){\
              ptr[i][j]=(t2[j*LIM2]);\
              for(k=0;kLIM2;++k){\
                  ptr[i][j][k]=val;         \
              }\
          }\
    }\

void
setup(VolumeVectorFieldO *v1,VolumeVectorField *v2)
{
    int i,j,k;
    static const int LIMIT=100;
    M128 transfer;
    transfer.arr[0]=1;
    transfer.arr[1]=1;
    transfer.arr[2]=1;
    transfer.arr[3]=1;

    VoxelFlowVectorO unitVec;
    for(i=0;i3;++i){
        v1-limits[i]=LIMIT;
        v2-limits[i]=(i==2?LIMIT/4:LIMIT);
        unitVec.flowComponent[i]=1;
    }
    DODGY_MACRO(VoxelFlowVectorO,LIMIT,LIMIT,v1-flowVecs,unitVec);
    DODGY_MACRO(__m128,LIMIT,LIMIT/4,v2-flowVecs[0],transfer.m128);
    DODGY_MACRO(__m128,LIMIT,LIMIT/4,v2-flowVecs[1],transfer.m128);
    DODGY_MACRO(__m128,LIMIT,LIMIT/4,v2-flowVecs[2],transfer.m128);
}

//return the median timing (to avoid outliers)
#define TIMING_MACRO(msg,v,m) times.clear();          \
e=0;        \
for(k=0;k128;++k){\
rdtscll(t1);\
e+=v-m();\
rdtscll(t2);\
times.push_back(t2-t1);                         \
}\
std::sort(times.begin(),times.end());\
printf("Timing: %llu %s result=%g\n",times[63],msg,e)

int
main(int argc,char* argv[])
{
    VolumeVectorFieldO *v1=new VolumeVectorFieldO;
    VolumeVectorField *v2=new VolumeVectorField;
    setup(v1,v2);
    int k;
    float e;
    std::vectoruint64_t times;
    uint64_t t1,t2;
    TIMING_MACRO("Neat code",v1,getEnergy);
    TIMING_MACRO("Optimised code",v2,getEnergy);
    TIMING_MACRO("Optimised code alternative",v2,getEnergyAlt);
    return 0;
}

category: experiments, software

sex shop sex shop sex shop sex shop sex shop lingerie sex shop atacado calcinhas uniformes profissionais uniformes dicas de sexo