#include #ifndef USE_SSE2 #error Woops, DMatrix4x2 is not yet implemented for non-SSE2 architectures #else class DMatrix4x2{ public: DMatrix4x2() : mA( ALIGNED_16_BLOCK_PTR(union dvec, 2, mA) ) { for (unsigned int j=0;j<2;j++){ mA[0].v[j]=_mm_setzero_pd(); mA[1].v[j]=_mm_setzero_pd(); } } DMatrix4x2(double a11, double a12, double a21, double a22, double a31, double a32, double a41, double a42) : mA( ALIGNED_16_BLOCK_PTR(union dvec, 2, mA) ) { mA[0].d[0]=a11; mA[0].d[1]=a21; mA[0].d[2]=a31; mA[0].d[3]=a41; mA[1].d[0]=a12; mA[1].d[1]=a22; mA[1].d[2]=a32; mA[1].d[3]=a42; } DMatrix4x2(const __m128d aa, const __m128d ab, const __m128d ba, const __m128d bb) : mA( ALIGNED_16_BLOCK_PTR(union dvec, 2, mA) ) { mA[0].v[0]=aa; mA[0].v[1]=ba; mA[1].v[0]=ab; mA[1].v[1]=bb; } DMatrix4x2(const DMatrix4x2& dm) : mA( ALIGNED_16_BLOCK_PTR(union dvec, 2, mA) ) { mA[0].v[0]=dm.mA[0].v[0]; mA[0].v[1]=dm.mA[0].v[1]; mA[1].v[0]=dm.mA[1].v[0]; mA[1].v[1]=dm.mA[1].v[1]; } ~DMatrix4x2(){}; __m128d GetV(const int pair,const int col) const{ return mA[col].v[pair]; } void SetV(int pair,int col,__m128d v){ mA[col].v[pair]=v; } // Assignment DMatrix4x2& operator=(const DMatrix4x2& dm){ mA[0].v[0]=dm.mA[0].v[0]; mA[0].v[1]=dm.mA[0].v[1]; mA[1].v[0]=dm.mA[1].v[0]; mA[1].v[1]=dm.mA[1].v[1]; return *this; } double &operator() (int row,int col){ return mA[col].d[row]; } double operator() (int row,int col) const{ return mA[col].d[row]; } DMatrix4x2 operator*(const DMatrix2x2 &m2){ ALIGNED_16_BLOCK_WITH_PTR(__m128d, 4, p) __m128d &a11=p[0]; // row,col __m128d &a12=p[1]; __m128d &a21=p[2]; __m128d &a22=p[3]; a11=_mm_set1_pd(m2(0,0)); a12=_mm_set1_pd(m2(0,1)); a21=_mm_set1_pd(m2(1,0)); a22=_mm_set1_pd(m2(1,1)); return DMatrix4x2(_mm_add_pd(_mm_mul_pd(GetV(0,0),a11), _mm_mul_pd(GetV(0,1),a21)), _mm_add_pd(_mm_mul_pd(GetV(0,0),a12), _mm_mul_pd(GetV(0,1),a22)), _mm_add_pd(_mm_mul_pd(GetV(1,0),a11), _mm_mul_pd(GetV(1,1),a21)), _mm_add_pd(_mm_mul_pd(GetV(1,0),a12), _mm_mul_pd(GetV(1,1),a22))); } void Print(){ cout << "DMatrix4x2:" <