// Values stored in SSE2ALREADYDETECTED by NoSSE2() and CPUDetect().
#define NOSSE2          1
#define HAVESSE2        2

ALIGN16 DBL RTable[] ;

// Replace any earlier definitions of the hash macros with the variants
// used by the SSE2 noise routines below.
#undef Hash1d
#undef Hash2d

// Hash2dSSE2 assumed values in the range 0..8191
//#define Hash2d(a,b)   \
//  hashTable[(int)(hashTable[(int)((a) & 0xfffL)] ^ ((b) & 0xfffL))]

// Two-level lookup hashTable[hashTable[a] ^ b] with no masking of a or b.
// The SSE2 routines pass ix+1 / iy+1 without the 0xFFF wrap, so arguments
// may reach 4096; this presumably relies on hashTable holding 8192 entries
// with the upper half mirroring the lower one - TODO confirm at the
// definition of hashTable.
#define Hash2d(a,b)   \
  hashTable[(int)(hashTable[(int)(a)] ^ (b))]

// Hash1dRTableIndex assumed values in the range 0..8191
// Single lookup hashTable[a ^ (b & 0xFFF)]; only b is reduced to 12 bits here.
#define Hash1d(a,b)   \
  hashTable[(int)(a) ^ ((b) & 0xfffL)]

/********************************************* SSE2 Table ******************************************************************/
// The zeros help the cache-line alignment; keep them.

// Coefficient table for the SSE2 noise routines: 267 values, each followed
// by a 0.0 pad, giving 534 doubles in a 16-byte-aligned array.  The noise code
// addresses it with an even index 2*(hash & 0xFF), so every entry pointer is
// 16-byte aligned, and reads the logical value mp[k] at offset 2*k.
// Only 533 initializers are written out; the pad after the final value is
// zero-initialized implicitly by the array size.
__declspec(align(16)) static DBL SSE2RTable[267*2] =
{
         -1, 0.0 ,   0.604974, 0.0 ,  -0.937102, 0.0 ,   0.414115, 0.0 ,   0.576226, 0.0 , -0.0161593,0.0,
   0.432334, 0.0 ,   0.103685, 0.0 ,   0.590539, 0.0 ,  0.0286412, 0.0 ,    0.46981, 0.0 ,   -0.84622,0.0,
 -0.0734112, 0.0 ,  -0.304097, 0.0 ,   -0.40206, 0.0 ,  -0.210132, 0.0 ,  -0.919127, 0.0 ,   0.652033,0.0,
   -0.83151, 0.0 ,  -0.183948, 0.0 ,  -0.671107, 0.0 ,   0.852476, 0.0 ,   0.043595, 0.0 ,  -0.404532,0.0,
    0.75494, 0.0 ,  -0.335653, 0.0 ,   0.618433, 0.0 ,   0.605707, 0.0 ,   0.708583, 0.0 ,  -0.477195,0.0,
   0.899474, 0.0 ,   0.490623, 0.0 ,   0.221729, 0.0 ,  -0.400381, 0.0 ,  -0.853727, 0.0 ,  -0.932586,0.0,
   0.659113, 0.0 ,   0.961303, 0.0 ,   0.325948, 0.0 ,  -0.750851, 0.0 ,   0.842466, 0.0 ,   0.734401,0.0,
  -0.649866, 0.0 ,   0.394491, 0.0 ,  -0.466056, 0.0 ,  -0.434073, 0.0 ,   0.109026, 0.0 ,  0.0847028,0.0,
  -0.738857, 0.0 ,   0.241505, 0.0 ,    0.16228, 0.0 ,   -0.71426, 0.0 ,  -0.883665, 0.0 ,  -0.150408,0.0,
   -0.90396, 0.0 ,  -0.686549, 0.0 ,  -0.785214, 0.0 ,   0.488548, 0.0 ,  0.0246433, 0.0 ,   0.142473,0.0,
  -0.602136, 0.0 ,   0.375845, 0.0 ,-0.00779736, 0.0 ,   0.498955, 0.0 ,  -0.268147, 0.0 ,   0.856382,0.0,
  -0.386007, 0.0 ,  -0.596094, 0.0 ,  -0.867735, 0.0 ,  -0.570977, 0.0 ,  -0.914366, 0.0 ,    0.28896,0.0,
   0.672206, 0.0 ,  -0.233783, 0.0 ,    0.94815, 0.0 ,   0.895262, 0.0 ,   0.343252, 0.0 ,  -0.173388,0.0,
  -0.767971, 0.0 ,  -0.314748, 0.0 ,   0.824308, 0.0 ,  -0.342092, 0.0 ,   0.721431, 0.0 ,   -0.24004,0.0,
   -0.63653, 0.0 ,   0.553277, 0.0 ,   0.376272, 0.0 ,   0.158984, 0.0 ,  -0.452659, 0.0 ,   0.396323,0.0,
  -0.420676, 0.0 ,  -0.454154, 0.0 ,   0.122179, 0.0 ,   0.295857, 0.0 ,  0.0664225, 0.0 ,  -0.202075,0.0,
  -0.724788, 0.0 ,   0.453513, 0.0 ,   0.224567, 0.0 ,  -0.908812, 0.0 ,   0.176349, 0.0 ,  -0.320516,0.0,
  -0.697139, 0.0 ,   0.742702, 0.0 ,  -0.900786, 0.0 ,   0.471489, 0.0 ,  -0.133532, 0.0 ,   0.119127,0.0,
  -0.889769, 0.0 ,   -0.23183, 0.0 ,  -0.669673, 0.0 ,  -0.046891, 0.0 ,  -0.803433, 0.0 ,  -0.966735,0.0,
   0.475578, 0.0 ,  -0.652644, 0.0 ,  0.0112459, 0.0 ,  -0.730007, 0.0 ,   0.128283, 0.0 ,   0.145647,0.0,
  -0.619318, 0.0 ,   0.272023, 0.0 ,   0.392966, 0.0 ,   0.646418, 0.0 , -0.0207675, 0.0 ,  -0.315908,0.0,
   0.480797, 0.0 ,   0.535668, 0.0 ,  -0.250172, 0.0 ,   -0.83093, 0.0 ,  -0.653773, 0.0 ,  -0.443809,0.0,
   0.119982, 0.0 ,  -0.897642, 0.0 ,    0.89453, 0.0 ,   0.165789, 0.0 ,   0.633875, 0.0 ,  -0.886839,0.0,
   0.930877, 0.0 ,  -0.537194, 0.0 ,   0.587732, 0.0 ,   0.722011, 0.0 ,  -0.209461, 0.0 , -0.0424659,0.0,
  -0.814267, 0.0 ,  -0.919432, 0.0 ,   0.280262, 0.0 ,   -0.66302, 0.0 ,  -0.558099, 0.0 ,  -0.537469,0.0,
  -0.598779, 0.0 ,   0.929656, 0.0 ,  -0.170794, 0.0 ,  -0.537163, 0.0 ,   0.312581, 0.0 ,   0.959442,0.0,
   0.722652, 0.0 ,   0.499931, 0.0 ,   0.175616, 0.0 ,  -0.534874, 0.0 ,  -0.685115, 0.0 ,   0.444999,0.0,
    0.17171, 0.0 ,   0.108202, 0.0 ,  -0.768704, 0.0 ,  -0.463828, 0.0 ,   0.254231, 0.0 ,   0.546014,0.0,
   0.869474, 0.0 ,   0.875212, 0.0 ,  -0.944427, 0.0 ,   0.130724, 0.0 ,  -0.110185, 0.0 ,   0.312184,0.0,
   -0.33138, 0.0 ,  -0.629206, 0.0 ,  0.0606546, 0.0 ,   0.722866, 0.0 , -0.0979477, 0.0 ,   0.821561,0.0,
  0.0931258, 0.0 ,  -0.972808, 0.0 ,  0.0318151, 0.0 ,  -0.867033, 0.0 ,  -0.387228, 0.0 ,   0.280995,0.0,
  -0.218189, 0.0 ,  -0.539178, 0.0 ,  -0.427359, 0.0 ,  -0.602075, 0.0 ,   0.311971, 0.0 ,   0.277974,0.0,
   0.773159, 0.0 ,   0.592493, 0.0 , -0.0331884, 0.0 ,  -0.630854, 0.0 ,  -0.269947, 0.0 ,   0.339132,0.0,
   0.581079, 0.0 ,   0.209461, 0.0 ,  -0.317433, 0.0 ,  -0.284993, 0.0 ,   0.181323, 0.0 ,   0.341634,0.0,
   0.804959, 0.0 ,  -0.229572, 0.0 ,  -0.758907, 0.0 ,  -0.336721, 0.0 ,   0.605463, 0.0 ,  -0.991272,0.0,
 -0.0188754, 0.0 ,  -0.300191, 0.0 ,   0.368307, 0.0 ,  -0.176135, 0.0 ,    -0.3832, 0.0 ,  -0.749569,0.0,
    0.62356, 0.0 ,  -0.573938, 0.0 ,   0.278309, 0.0 ,  -0.971313, 0.0 ,   0.839994, 0.0 ,  -0.830686,0.0,
   0.439078, 0.0 ,    0.66128, 0.0 ,   0.694514, 0.0 ,  0.0565042, 0.0 ,    0.54342, 0.0 ,  -0.438804,0.0,
 -0.0228428, 0.0 ,  -0.687068, 0.0 ,   0.857267, 0.0 ,   0.301991, 0.0 ,  -0.494255, 0.0 ,  -0.941039,0.0,
   0.775509, 0.0 ,   0.410575, 0.0 ,  -0.362081, 0.0 ,  -0.671534, 0.0 ,  -0.348379, 0.0 ,   0.932433,0.0,
   0.886442, 0.0 ,   0.868681, 0.0 ,  -0.225666, 0.0 ,  -0.062211, 0.0 , -0.0976425, 0.0 ,  -0.641444,0.0,
  -0.848112, 0.0 ,   0.724697, 0.0 ,   0.473503, 0.0 ,   0.998749, 0.0 ,   0.174701, 0.0 ,   0.559625,0.0,
  -0.029099, 0.0 ,  -0.337392, 0.0 ,  -0.958129, 0.0 ,  -0.659785, 0.0 ,   0.236042, 0.0 ,  -0.246937,0.0,
   0.659449, 0.0 ,  -0.027512, 0.0 ,   0.821897, 0.0 ,  -0.226215, 0.0 ,  0.0181735, 0.0 ,   0.500481,0.0,
  -0.420127, 0.0 ,  -0.427878, 0.0 ,   0.5661860
};

/********************************************* SSE2 Table ******************************************************************/

// Records that SSE2 is unavailable by storing NOSSE2 in the
// SSE2ALREADYDETECTED flag.  CPUDetect() invokes this from its exception
// filter when the probe instruction faults.
void NoSSE2()
{
  SSE2ALREADYDETECTED = NOSSE2;
}

// Probes for usable SSE2 support and records the outcome in
// SSE2ALREADYDETECTED.
//
// The flag is first set to HAVESSE2, then one SSE2 instruction (movapd) is
// executed inside a structured-exception block.  If that instruction raises
// an exception, the filter expression calls NoSSE2() - which resets the flag
// to NOSSE2 - and evaluates to 1, so the (empty) handler runs and execution
// simply continues.
//
// Always returns 0; the detection result is available only through
// SSE2ALREADYDETECTED.
inline int CPUDetect ()
{
  // Assume SSE2 is present until the probe below proves otherwise.
  SSE2ALREADYDETECTED = HAVESSE2 ;
  __try
  {
    // Register-to-register SSE2 move: touches no memory.
    __asm { movapd xmm0,xmm1 }
  }
  // The comma expression runs NoSSE2() for its side effect, then yields 1.
  __except(NoSSE2(),1)
  {
  }
  return (0) ;
} ;

DBL SSE2Noise(VECTOR EPoint, TPATTERN *TPat)
{


 // __declspec(align(16)) DBL x, y, z;
 // __declspec(align(16))DBL *mp;
  __declspec(align(8)) long ix, iy, iz, jx, jy, jz, tmp;
  __declspec(align(8)) int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
  __declspec(align(8)) int noise_generator = 0;
  

  __declspec(align(8)) DBL sx, sy, sz, tx, ty, tz;
  __declspec(align(8)) DBL sum = 0.0;
  
  __declspec(align(8)) DBL x_ix, x_jx, y_iy, y_jy, z_iz, z_jz, txty, sxty, txsy, sxsy;


  
  __declspec(align(8)) __m128d epsi;
  __declspec(align(8)) __m64 MIN,MASK; 
  __declspec(align(8)) __m64 One32; 

  __declspec(align(8)) __m128d xy;
  __declspec(align(8)) __m128d onlyz;
  __declspec(align(8)) __m128d xy_ixy;
  __declspec(align(8)) __m128d zw_izw;
  __declspec(align(8)) __m128d xy_jxy;
  __declspec(align(8)) __m128d zw_jzw;
    __declspec(align(8)) __m128d sxy;

    __declspec(align(8)) __m128d szw;
  __declspec(align(8)) __m128d txy;
  __declspec(align(8)) __m128d tzw;
  __declspec(align(8)) __m128d txtysxsy;
  __declspec(align(8)) __m128d  sxtytxsy;
  __declspec(align(8)) __m128d SIMDmp;
  __declspec(align(8)) __m128d SIMDmp2;
  

    __declspec(align(8)) __m128d SIMDs;
  __declspec(align(8)) __m128d Half;

     __declspec(align(8)) __m128d ResultXY;
     __declspec(align(8)) __m128d ResultZW;
   __declspec(align(8)) __m64 inttmp; 
   __declspec(align(8)) __m64 inttmp2; 
  __declspec(align(8)) __m64 jxy; __m64 ixy; 
  __declspec(align(8)) __m64 jzw; __m64 izw; 

  
 __declspec(align(8)) __m128d s1s6;
 __declspec(align(8)) __m128d s5s2;
 __declspec(align(8)) __m128d s4s3;
 __declspec(align(8)) __m128d s0s7;
  __declspec(align(8)) __m128d Mp0s,Mp1s,Mp2s,Mp3s;

  __declspec(align(8)) __m128d SIMDXmp,SIMDXmp2,SIMDYmp,SIMDYmp2,SIMDZmp,SIMDZmp2;
 __declspec(align(8)) __m128d Xs,Ys,Zs;
 __declspec(align(8))   __m128d One ;
  __m128d zero,Two,Tree;

register __declspec(align(8)) long t;



register   __m128d SSE2temp;
register   __m128d SSE2temp2;



  Increase_Counter(stats[Calls_To_Noise]);

  if (TPat != NULL)
    noise_generator = (TPat->Flags & NOISE_FLAGS) >> 4;
  if (!noise_generator)
    noise_generator=opts.Noise_Generator;

  if ((noise_generator==3) && opts.Language_Version >= 350) 
  {
    // The 1.59 and 0.985 are to correct for some biasing problems with
    // the random # generator used to create the noise tables.  Final
    // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
    // value is 0.49 (ideally it would be 0.5).
    sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

    // Clamp final value to 0-1 range
      if (sum < 0.0) sum = 0.0;
      if (sum > 1.0) sum = 1.0;

    return sum;
  }

/*
  x = EPoint[X];
  y = EPoint[Y];
  z = EPoint[Z];
  
  // its equivalent integer lattice point. 
  // ix = (long)x; iy = (long)y; iz = (long)z; 
        // JB fix for the range problem 
  tmp = (x>=0)?(long)x:(long)(x-(1-EPSILON));
  ix = (tmp-MINX)&0xFFF;
  x_ix = x-tmp;
    
  tmp = (y>=0)?(long)y:(long)(y-(1-EPSILON));
  iy = (tmp-MINY)&0xFFF;
  y_iy = y-tmp;
  
  tmp = (z>=0)?(long)z:(long)(z-(1-EPSILON));
  iz = (tmp-MINZ)&0xFFF;
  z_iz = z-tmp;

  jx = (ix+1)&0xFFF; jy = (iy+1)&0xFFF; jz = (iz+1)&0xFFF;
  
  x_jx = x_ix-1; y_jy = y_iy-1; z_jz = z_iz-1;

  sx = SCURVE(x_ix); sy = SCURVE(y_iy); sz = SCURVE(z_iz);
  
  // the complement values of sx,sy,sz 
  tx = 1 - sx; ty = 1 - sy; tz = 1 - sz;
  
  //
   //  interpolate!
   //
  txty = tx * ty;
  sxty = sx * ty;
  txsy = tx * sy;
  sxsy = sx * sy;
  */
    

  zero=_mm_setzero_pd();
 One=_mm_set_pd(1.0,1.0);
  Two=_mm_set_pd(2.0,2.0);
  Tree=_mm_set_pd(3.0,3.0);
  epsi=_mm_set_pd(1-EPSILON,1-EPSILON);
  



 
    


  Half=_mm_set_pd(0.5,0.5);

 Increase_Counter(stats[Calls_To_DNoise]);
 
  xy= _mm_loadu_pd(EPoint); // Load x and y in xy
  onlyz=_mm_load_sd(EPoint+2); // Load only z;
 
// Here
 zero=_mm_setzero_pd();

  SSE2temp=_mm_cmplt_pd(xy,zero);

  SSE2temp=_mm_and_pd(SSE2temp, epsi);
  MIN=_mm_setr_pi32(MINX,MINY);
  SSE2temp=_mm_sub_pd(xy, SSE2temp);
  inttmp=_mm_cvttpd_pi32(SSE2temp);

  ixy=_mm_sub_pi32(inttmp,MIN);
  MASK=_mm_setr_pi32(0xFFF,0xFFF);
  ixy=_mm_and_si64(ixy,MASK);
  SSE2temp=_mm_cvtpi32_pd(inttmp); 
  xy_ixy=_mm_sub_pd(xy,SSE2temp);
  One32=_mm_setr_pi32(1,1);
  jxy=_mm_add_pi32(ixy,One32);
//  jxy=_mm_and_si64(jxy,MASK);
  One=_mm_set_pd(1.0,1.0);
  xy_jxy=_mm_sub_pd(xy_ixy,One);

zero=_mm_setzero_pd(); 
 SSE2temp=_mm_cmplt_pd(onlyz,zero);
 SSE2temp=_mm_and_pd(SSE2temp, epsi);
 SSE2temp=_mm_sub_pd(onlyz, SSE2temp);
 inttmp2=_mm_cvttpd_pi32(SSE2temp);
 MIN=_mm_setr_pi32(MINX,MINY);
 izw=_mm_sub_pi32(inttmp2,MIN);
 MASK=_mm_setr_pi32(0xFFF,0xFFF);
 izw=_mm_and_si64(izw,MASK);
 SSE2temp=_mm_cvtpi32_pd(inttmp2); 
 zw_izw=_mm_sub_pd(onlyz,SSE2temp);


 jzw=_mm_add_pi32(izw,One32);
// jzw=_mm_and_si64(jzw,MASK);
 zw_jzw=_mm_sub_pd(zw_izw,One); 
// scurve begining
 sxy=_mm_mul_pd(xy_ixy,xy_ixy);  // a*a
 SSE2temp2=_mm_mul_pd(Two,xy_ixy);  // 2.0 * a
 SSE2temp=_mm_sub_pd(Tree,SSE2temp2); // 3.0 - 2.0*a
 
 sxy=_mm_mul_pd(sxy,SSE2temp);
 txy=_mm_sub_pd(One,sxy);

 szw=_mm_mul_pd(zw_izw,zw_izw);  // a*a
 SSE2temp2=_mm_mul_pd(Two,zw_izw);  // 2.0 * a
 SSE2temp=_mm_sub_pd(Tree,SSE2temp2); // 3.0 - 2.0*a

 szw=_mm_mul_pd(szw,SSE2temp);
 tzw=_mm_sub_pd(One,szw);


 SSE2temp=_mm_shuffle_pd(txy,txy,_MM_SHUFFLE2(0,1));
 sxtytxsy=_mm_mul_pd(sxy,SSE2temp);
 SSE2temp=_mm_shuffle_pd(tzw,szw,_MM_SHUFFLE2(0,0));
 s1s6=_mm_mul_pd(sxtytxsy,SSE2temp);   // sxty*tz   txsy*sz    1
 SSE2temp2=_mm_shuffle_pd(szw,tzw,_MM_SHUFFLE2(0,0));
 s5s2=_mm_mul_pd(sxtytxsy,SSE2temp2);   // sxty*tz   txsy*sz    2

 SSE2temp2=_mm_shuffle_pd(txy,sxy,_MM_SHUFFLE2(1,1));
 SSE2temp=_mm_shuffle_pd(txy,sxy,_MM_SHUFFLE2(0,0));
 txtysxsy=_mm_mul_pd(SSE2temp2,SSE2temp);

 SSE2temp=_mm_shuffle_pd(szw,tzw,_MM_SHUFFLE2(0,0));
 s4s3=_mm_mul_pd(txtysxsy,SSE2temp);   // txty*sz  sxsy*tz     3

 SSE2temp2=_mm_shuffle_pd(tzw,szw,_MM_SHUFFLE2(0,0));
 s0s7=_mm_mul_pd(txtysxsy,SSE2temp2);   // XnothingX  sxsy*tz   4

// The S steps of the Lock up process
//s0 = txty*tz;   //     4  
//s1 = sxty*tz;    //   1
//s2 = sxsy*tz;   //    3
//s3 = txsy*tz;   //    2 
//s4 = txsy*sz;   //    1 
//s5 = sxsy*sz;   //    4
//s6 = sxty*sz;  //   2
//s7 = txty*sz;  //       3


 ix= _mm_cvtsi64_si32 ( ixy );
 ixy=_mm_srli_si64(ixy,32);
 iy= _mm_cvtsi64_si32 ( ixy );
 iz= _mm_cvtsi64_si32 ( izw ); 

 jx= _mm_cvtsi64_si32 ( jxy );
 jxy=_mm_srli_si64(jxy,32);
 jy= _mm_cvtsi64_si32 ( jxy );
 jz= _mm_cvtsi64_si32 ( jzw ); 


   
 

  ixiy_hash = Hash2d(ix, iy);
  jxiy_hash = Hash2d(jx, iy);
  ixjy_hash = Hash2d(ix, jy);
  jxjy_hash = Hash2d(jx, jy);
   
  int t1,t2,t3,t4,t5,t6,t7,t8;
     __declspec(align(8)) DBL *mp1,*mp2,*mp3,*mp4,*mp5,*mp6,*mp7,*mp8;


   t1=(long) 2* (Hash1d(ixiy_hash, iz) & 0xFF);  // 0
    t2=(long) 2* (Hash1d(jxiy_hash, iz) & 0xFF); //1
     t3=(long) 2* (Hash1d(ixjy_hash, iz) & 0xFF); //2
        t4=(long) 2* (Hash1d(jxjy_hash, iz) & 0xFF); //3
    t5=(long) 2* (Hash1d(ixiy_hash, jz) & 0xFF); //4
    t6=(long) 2* (Hash1d(jxiy_hash, jz) & 0xFF); //5
     t7=(long) 2* (Hash1d(ixjy_hash, jz) & 0xFF); //6
   t8=(long) 2* (Hash1d(jxjy_hash, jz) & 0xFF); //7



  mp1 = &SSE2RTable[t1]; // FP need to check the alignement
  mp2 = &SSE2RTable[t2]; // FP need to check the alignement
  mp3 = &SSE2RTable[t3]; // FP need to check the alignement
  mp4 = &SSE2RTable[t4]; // FP need to check the alignement
  mp5 = &SSE2RTable[t5]; // FP need to check the alignement
  mp6 = &SSE2RTable[t6]; // FP need to check the alignement
  mp7 = &SSE2RTable[t7]; // FP need to check the alignement
  mp8 = &SSE2RTable[t8]; // FP need to check the alignement

  // Iteration 0 and 1
//  t=(long) 2* (Hash1d(ixiy_hash, iz) & 0xFF);  // 0
 // mp = &SSE2RTable[t1]; // FP need to check the alignement
  SIMDXmp=_mm_load_pd(mp1);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp1+2);
  SIMDXmp=_mm_shuffle_pd(SIMDXmp,SSE2temp,_MM_SHUFFLE2(0,0));

  SIMDXmp2=_mm_load_pd(mp1+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp1+6);
  SIMDXmp2=_mm_shuffle_pd(SIMDXmp2,SSE2temp,_MM_SHUFFLE2(0,0));

 // t=(long) 2* (Hash1d(jxiy_hash, iz) & 0xFF); //1
//  mp = &SSE2RTable[t2]; // FP need to check the alignement
  SIMDYmp=_mm_load_pd(mp2);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp2+2);

  SIMDYmp=_mm_shuffle_pd(SIMDYmp,SSE2temp,_MM_SHUFFLE2(0,0));

  SIMDYmp2=_mm_load_pd(mp2+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp2+6);

  SIMDYmp2=_mm_shuffle_pd(SIMDYmp2,SSE2temp,_MM_SHUFFLE2(0,0));


 // We will process Result[x] and Result[y] in parallele  
 Mp0s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Xmp[] and Ymp[]
 Mp1s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Xmp[] and Ymp[]
 Mp2s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Xmp[] and Ymp[]
 Mp3s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Xmp[] and Ymp[]
 

 Xs=_mm_shuffle_pd(xy_ixy,xy_jxy,_MM_SHUFFLE2(0,0));
 Ys=_mm_shuffle_pd(xy_ixy,xy_ixy,_MM_SHUFFLE2(1,1)); // Grouping the Xs Ys and Zs **********> Change the PARAMETER here 
 Zs=_mm_shuffle_pd(zw_izw,zw_izw,_MM_SHUFFLE2(0,0));

 
 // Processing of INCRSUMP(mp, s, x_jx, y_iy, z_iz)


 // Processing XY part
  Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)




 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)

 SIMDs=_mm_shuffle_pd(s0s7,s1s6,_MM_SHUFFLE2(0,0)); // **********> Change the PARAMETER here //PF
 ResultXY=_mm_mul_pd(SSE2temp2,SIMDs);



//    Iteration 2 and 3

 //t=(long) 2* (Hash1d(ixjy_hash, iz) & 0xFF); //2
//  mp = &SSE2RTable[t3]; // FP need to check the alignement
  SIMDXmp=_mm_load_pd(mp3);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp3+2);
  SIMDXmp=_mm_shuffle_pd(SIMDXmp,SSE2temp,_MM_SHUFFLE2(0,0));

  SIMDXmp2=_mm_load_pd(mp3+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp3+6);
  SIMDXmp2=_mm_shuffle_pd(SIMDXmp2,SSE2temp,_MM_SHUFFLE2(0,0));

 // t=(long) 2* (Hash1d(jxjy_hash, iz) & 0xFF); //3
//  mp = &SSE2RTable[t4]; // FP need to check the alignement
  SIMDYmp=_mm_load_pd(mp4);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp4+2);
  SIMDYmp=_mm_shuffle_pd(SIMDYmp,SSE2temp,_MM_SHUFFLE2(0,0));

  SIMDYmp2=_mm_load_pd(mp4+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp4+6);
  SIMDYmp2=_mm_shuffle_pd(SIMDYmp2,SSE2temp,_MM_SHUFFLE2(0,0));


 // We will process Result[x] and Result[y] in parallele  
 Mp0s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Xmp[] and Ymp[]
 Mp1s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Xmp[] and Ymp[]
 Mp2s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Xmp[] and Ymp[]
 Mp3s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Xmp[] and Ymp[]
 

 Xs=_mm_shuffle_pd(xy_ixy,xy_jxy,_MM_SHUFFLE2(0,0));
 Ys=_mm_shuffle_pd(xy_jxy,xy_jxy,_MM_SHUFFLE2(1,1)); // Grouping the Xs Ys and Zs **********> Change the PARAMETER here //PF
 Zs=_mm_shuffle_pd(zw_izw,zw_izw,_MM_SHUFFLE2(0,0));

 
 // Processing of INCRSUMP(mp, s, x_jx, y_iy, z_iz)


 // Processing XY part
   Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)

 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)

 SIMDs=_mm_shuffle_pd(s5s2,s4s3,_MM_SHUFFLE2(1,1)); // **********> Change the PARAMETER here //PF

 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultXY=_mm_add_pd(ResultXY,SSE2temp2);
 


// Iteration 4 and 5


//  t=(long) 2* (Hash1d(ixiy_hash, jz) & 0xFF); //4
 // mp = &SSE2RTable[t5]; // FP need to check the alignement
  SIMDXmp=_mm_load_pd(mp5);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp5+2);
  SIMDXmp=_mm_shuffle_pd(SIMDXmp,SSE2temp,_MM_SHUFFLE2(0,0));

  SIMDXmp2=_mm_load_pd(mp5+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp5+6);
  SIMDXmp2=_mm_shuffle_pd(SIMDXmp2,SSE2temp,_MM_SHUFFLE2(0,0));

//  t=(long) 2* (Hash1d(jxiy_hash, jz) & 0xFF); //5
//  mp = &SSE2RTable[t6]; // FP need to check the alignement
  SIMDYmp=_mm_load_pd(mp6);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp6+2);
  SIMDYmp=_mm_shuffle_pd(SIMDYmp,SSE2temp,_MM_SHUFFLE2(0,0));

  SIMDYmp2=_mm_load_pd(mp6+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp6+6);
  SIMDYmp2=_mm_shuffle_pd(SIMDYmp2,SSE2temp,_MM_SHUFFLE2(0,0));


 // We will process Result[x] and Result[y] in parallele  
 Mp0s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Xmp[] and Ymp[]
 Mp1s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Xmp[] and Ymp[]
 Mp2s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Xmp[] and Ymp[]
 Mp3s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Xmp[] and Ymp[]
 

 Xs=_mm_shuffle_pd(xy_ixy,xy_jxy,_MM_SHUFFLE2(0,0));
 Ys=_mm_shuffle_pd(xy_ixy,xy_ixy,_MM_SHUFFLE2(1,1)); // Grouping the Xs Ys and Zs **********> Change the PARAMETER here //PF
 Zs=_mm_shuffle_pd(zw_jzw,zw_jzw,_MM_SHUFFLE2(0,0));

 
 // Processing of INCRSUMP(mp, s, x_jx, y_iy, z_iz)


 // Processing XY part
   Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)




 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)

 SIMDs=_mm_shuffle_pd(s4s3,s5s2,_MM_SHUFFLE2(0,0)); // **********> Change the PARAMETER here //PF

 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultXY=_mm_add_pd(ResultXY,SSE2temp2);
 




// Iteration 6 and 7



 // t=(long) 2* (Hash1d(ixjy_hash, jz) & 0xFF); //6
//  mp = &SSE2RTable[t7]; // FP need to check the alignement
  SIMDXmp=_mm_load_pd(mp7);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp7+2);
  SIMDXmp=_mm_shuffle_pd(SIMDXmp,SSE2temp,_MM_SHUFFLE2(0,0));

  SIMDXmp2=_mm_load_pd(mp7+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp7+6);
  SIMDXmp2=_mm_shuffle_pd(SIMDXmp2,SSE2temp,_MM_SHUFFLE2(0,0));

//  t=(long) 2* (Hash1d(jxjy_hash, jz) & 0xFF); //7
 // mp = &SSE2RTable[t8]; // FP need to check the alignement
  SIMDYmp=_mm_load_pd(mp8);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp8+2);
  SIMDYmp=_mm_shuffle_pd(SIMDYmp,SSE2temp,_MM_SHUFFLE2(0,0));

  SIMDYmp2=_mm_load_pd(mp8+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp8+6);
  SIMDYmp2=_mm_shuffle_pd(SIMDYmp2,SSE2temp,_MM_SHUFFLE2(0,0));


 // We will process Result[x] and Result[y] in parallele  
 Mp0s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Xmp[] and Ymp[]
 Mp1s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Xmp[] and Ymp[]
 Mp2s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Xmp[] and Ymp[]
 Mp3s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Xmp[] and Ymp[]
 

 Xs=_mm_shuffle_pd(xy_ixy,xy_jxy,_MM_SHUFFLE2(0,0));
 Ys=_mm_shuffle_pd(xy_jxy,xy_jxy,_MM_SHUFFLE2(1,1)); // Grouping the Xs Ys and Zs **********> Change the PARAMETER here //PF
 Zs=_mm_shuffle_pd(zw_jzw,zw_jzw,_MM_SHUFFLE2(0,0));

 
 // Processing of INCRSUMP(mp, s, x_jx, y_iy, z_iz)


 // Processing XY part
   Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)




 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)

 SIMDs=_mm_shuffle_pd(s1s6,s0s7,_MM_SHUFFLE2(1,1)); // **********> Change the PARAMETER here //PF

 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultXY=_mm_add_pd(ResultXY,SSE2temp2);
 SSE2temp2=_mm_shuffle_pd(ResultXY,ResultXY,_MM_SHUFFLE2(1,1));
 ResultXY=_mm_add_pd(ResultXY,SSE2temp2);

 _mm_storel_pd (&sum, ResultXY);  // Go back to non SSE2 code.
 
  _mm_empty();  // Floating point unit safety instruction after MMX code.

 
 

 // Floating point unit safety instruction after MMX code.
  if ((noise_generator==2) && opts.Language_Version >= 350)
  {
    // details of range here:
   // Min, max: -1.05242, 0.988997
   // Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

   // We want to chage it to as close to [0,1] as possible.
    //
    sum += 1.05242;
    sum *= 0.48985582;
    //sum *= 0.5;
    //  sum += 0.5;

    if (sum < 0.0)
      sum = 0.0;
    if (sum > 1.0)
      sum = 1.0;
  }
  else
  {
    sum = sum + 0.5;                     // * range at this point -0.5 - 0.5... 
  
    if (sum < 0.0)
      sum = 0.0;
    if (sum > 1.0)
      sum = 1.0;
  }
 
  return (sum);



}

void SSE2DNoise(VECTOR result, VECTOR EPoint)
{
   __declspec(align(8)) long t;
  //__declspec(align(8)) DBL *mp;
  __declspec(align(8)) long ix, iy, iz, jx, jy, jz;
  __declspec(align(8)) int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

  __declspec(align(8)) __m64 MIN,MASK; 
  __declspec(align(8)) __m64 One32; 
   __declspec(align(8)) __m64 inttmp; 
   __declspec(align(8)) __m64 inttmp2; 
  __declspec(align(8)) __m64 jxy; __m64 ixy; 
  __declspec(align(8)) __m64 jzw; __m64 izw; 

    __declspec(align(8)) __m128d zero;
  __declspec(align(8)) __m128d One;
  __declspec(align(8)) __m128d Two;
  __declspec(align(8)) __m128d Tree;
  __declspec(align(8)) __m128d epsi;
  __declspec(align(8)) __m128d xy;
  __declspec(align(8)) __m128d onlyz;

  __declspec(align(16)) __m128d xy_ixy;
  __declspec(align(8)) __m128d zw_izw;
  __declspec(align(8)) __m128d xy_jxy;
  __declspec(align(8)) __m128d zw_jzw;
    __declspec(align(8)) __m128d sxy;
  __declspec(align(8)) __m128d szw;
  __declspec(align(8)) __m128d txy;
  __declspec(align(8)) __m128d tzw;
  __declspec(align(8)) __m128d txtysxsy;
  __declspec(align(8)) __m128d  sxtytxsy;
  __declspec(align(8)) __m128d SIMDmp;
  __declspec(align(8)) __m128d SIMDmp2;
   __declspec(align(8)) __m128d Resultxy;
   __declspec(align(8)) __m128d Resultzw;
    __declspec(align(8)) __m128d SIMDs;
  __declspec(align(8)) __m128d Half;

    __declspec(align(8)) __m128d ResultXY;
    __declspec(align(8)) __m128d ResultZW;

  
 __declspec(align(8)) __m128d s1s4;
 __declspec(align(8)) __m128d s7s2;
 __declspec(align(8)) __m128d s6s3;
 __declspec(align(8)) __m128d s0s5;
  __declspec(align(8)) __m128d Mp0s,Mp1s,Mp2s,Mp3s;

  __declspec(align(8)) __m128d SIMDXmp,SIMDXmp2,SIMDYmp,SIMDYmp2,SIMDZmp,SIMDZmp2;
 __declspec(align(8)) __m128d Xs,Ys,Zs;



  __declspec(align(8)) DBL x_ix, x_jx, y_iy, y_jy, z_iz, z_jz;
  __declspec(align(8)) DBL s;
  __declspec(align(8)) DBL sx, sy, sz, tx, ty, tz;
  __declspec(align(8)) DBL txty, sxty, txsy, sxsy;

register   __m128d SSE2temp;
register   __m128d SSE2temp2;

  int t1,t2,t3,t4,t5,t6,t7,t8;
  __declspec(align(8)) DBL *mp1,*mp2,*mp3,*mp4,*mp5,*mp6,*mp7,*mp8;
  Increase_Counter(stats[Calls_To_DNoise]);
  
  xy= _mm_loadu_pd(EPoint); // Load x and y in xy
  onlyz=_mm_load_sd(EPoint+2); // Load only z;
// Here
  zero=_mm_setzero_pd();
  SSE2temp=_mm_cmplt_pd(xy,zero);
  epsi=_mm_set1_pd(1-EPSILON);
  SSE2temp=_mm_and_pd(SSE2temp, epsi);
  MIN=_mm_setr_pi32(MINX,MINY);
  SSE2temp=_mm_sub_pd(xy, SSE2temp);
  inttmp=_mm_cvttpd_pi32(SSE2temp);

  ixy=_mm_sub_pi32(inttmp,MIN);
  MASK=_mm_setr_pi32(0xFFF,0xFFF);
  ixy=_mm_and_si64(ixy,MASK);
  SSE2temp=_mm_cvtpi32_pd(inttmp); 
  xy_ixy=_mm_sub_pd(xy,SSE2temp);
  One32=_mm_setr_pi32(1,1);
  jxy=_mm_add_pi32(ixy,One32);
//  jxy=_mm_and_si64(jxy,MASK);
  One=_mm_set1_pd(1.0);
  xy_jxy=_mm_sub_pd(xy_ixy,One);

 zero=_mm_setzero_pd(); 
 SSE2temp=_mm_cmplt_pd(onlyz,zero);
 SSE2temp=_mm_and_pd(SSE2temp, epsi);
 SSE2temp=_mm_sub_pd(onlyz, SSE2temp);
 inttmp2=_mm_cvttpd_pi32(SSE2temp);
 MIN=_mm_setr_pi32(MINX,MINY);
 izw=_mm_sub_pi32(inttmp2,MIN);
 MASK=_mm_setr_pi32(0xFFF,0xFFF);
 izw=_mm_and_si64(izw,MASK);
 SSE2temp=_mm_cvtpi32_pd(inttmp2); 
 zw_izw=_mm_sub_pd(onlyz,SSE2temp);


 jzw=_mm_add_pi32(izw,One32);
// jzw=_mm_and_si64(jzw,MASK);
 zw_jzw=_mm_sub_pd(zw_izw,One); 
// scurve begining
 sxy=_mm_mul_pd(xy_ixy,xy_ixy);  // a*a
 Two=_mm_set_pd(2.0,2.0);
 SSE2temp2=_mm_mul_pd(Two,xy_ixy);  // 2.0 * a
 Tree=_mm_set_pd(3.0,3.0);
 SSE2temp=_mm_sub_pd(Tree,SSE2temp2); // 3.0 - 2.0*a
 
 sxy=_mm_mul_pd(sxy,SSE2temp);
 txy=_mm_sub_pd(One,sxy);

 szw=_mm_mul_pd(zw_izw,zw_izw);  // a*a
 Two=_mm_set_pd(2.0,2.0);
 SSE2temp2=_mm_mul_pd(Two,zw_izw);  // 2.0 * a
 Tree=_mm_set_pd(3.0,3.0);
 SSE2temp=_mm_sub_pd(Tree,SSE2temp2); // 3.0 - 2.0*a
 szw=_mm_mul_pd(szw,SSE2temp);

 tzw=_mm_sub_pd(One,szw);

 ix= _mm_cvtsi64_si32 ( ixy );
 ixy=_mm_srli_si64(ixy,32);
 iz= _mm_cvtsi64_si32 ( izw ); 
 iy= _mm_cvtsi64_si32 ( ixy );
 

 jx= _mm_cvtsi64_si32 ( jxy );
 jxy=_mm_srli_si64(jxy,32);
 jz= _mm_cvtsi64_si32 ( jzw ); 
 jy= _mm_cvtsi64_si32 ( jxy );
 
 

SSE2temp=_mm_shuffle_pd(txy,txy,_MM_SHUFFLE2(0,1));
 sxtytxsy=_mm_mul_pd(sxy,SSE2temp);
 SSE2temp=_mm_shuffle_pd(tzw,szw,_MM_SHUFFLE2(0,0));
 s1s4=_mm_mul_pd(sxtytxsy,SSE2temp);   // sxty*tz   txsy*sz    1
 SSE2temp2=_mm_shuffle_pd(szw,tzw,_MM_SHUFFLE2(0,0));
 s6s3=_mm_mul_pd(sxtytxsy,SSE2temp2);   // sxty*tz   txsy*sz    2

 SSE2temp2=_mm_shuffle_pd(txy,sxy,_MM_SHUFFLE2(1,1));
 SSE2temp=_mm_shuffle_pd(txy,sxy,_MM_SHUFFLE2(0,0));
 txtysxsy=_mm_mul_pd(SSE2temp2,SSE2temp);

 SSE2temp=_mm_shuffle_pd(szw,tzw,_MM_SHUFFLE2(0,0));
 s7s2=_mm_mul_pd(txtysxsy,SSE2temp);   // txty*sz  sxsy*tz     3

 SSE2temp2=_mm_shuffle_pd(tzw,szw,_MM_SHUFFLE2(0,0));
 s0s5=_mm_mul_pd(txtysxsy,SSE2temp2);   // XnothingX  sxsy*tz   4

// The S steps of the Lock up process
//s0 = txty*tz;   //     4  
//s1 = sxty*tz;    //   1
//s2 = sxsy*tz;   //    3
//s3 = txsy*tz;   //    2 
//s4 = txsy*sz;   //    1 
//s5 = sxsy*sz;   //    4
//s6 = sxty*sz;  //   2
//s7 = txty*sz;  //       3


/* 
 _mm_storel_pd (&x_ix, xy_ixy);  //debugging FP
 _mm_storeh_pd (&y_iy, xy_ixy);  //debugging FP
 _mm_storel_pd (&z_iz, zw_izw);  //debugging FP
 _mm_storel_pd (&x_jx, xy_jxy);  //debugging FP
 _mm_storeh_pd (&y_jy, xy_jxy);  //debugging FP
 _mm_storel_pd (&z_jz, zw_jzw);  //debugging FP
 _mm_storel_pd (&txty, txtysxsy);  //debugging FP
 _mm_storel_pd (&sxty, sxtytxsy);  //debugging FP
 _mm_storeh_pd (&txsy, sxtytxsy);  //debugging FP
 _mm_storeh_pd (&sxsy, txtysxsy);  //debugging FP
*/



  ixiy_hash = Hash2d(ix, iy);
  t1=(long) 2* (Hash1d(ixiy_hash, iz) & 0xFF);   //it1
  t8=(long) 2* (Hash1d(ixiy_hash, jz) & 0xFF); //it8
  jxiy_hash = Hash2d(jx, iy);
  t2=(long) 2* (Hash1d(jxiy_hash, iz) & 0xFF); // it2
  t7=(long) 2* (Hash1d(jxiy_hash, jz) & 0xFF); //it7
  ixjy_hash = Hash2d(ix, jy);
  t4=(long) 2* (Hash1d(ixjy_hash, iz) & 0xFF); //it4
  t5=(long) 2* (Hash1d(ixjy_hash, jz) & 0xFF); //it5
  jxjy_hash = Hash2d(jx, jy);                       
  t3=(long) 2* (Hash1d(jxjy_hash, iz) & 0xFF); //it3
  t6=(long) 2* (Hash1d(jxjy_hash, jz) & 0xFF); //it6
    

  mp1 = &SSE2RTable[t1]; // FP need to check the alignement
  mp2 = &SSE2RTable[t2]; // FP need to check the alignement
  mp3 = &SSE2RTable[t3]; // FP need to check the alignement
  mp4 = &SSE2RTable[t4]; // FP need to check the alignement
  mp5 = &SSE2RTable[t5]; // FP need to check the alignement
  mp6 = &SSE2RTable[t6]; // FP need to check the alignement
  mp7 = &SSE2RTable[t7]; // FP need to check the alignement
  mp8 = &SSE2RTable[t8]; // FP need to check the alignement







// ******************************************************************* Iteration 1 **********************************************************************

 // t=(long) 2* (Hash1d(ixiy_hash, iz) & 0xFF);
  //mp = &SSE2RTable[t1]; // FP need to check the alignement

 SIMDXmp=_mm_load_pd(mp1);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp1+2);
  SIMDXmp=_mm_shuffle_pd(SIMDXmp,SSE2temp,_MM_SHUFFLE2(0,0));

 
  SIMDYmp=_mm_load_pd(mp1+8);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp1+10);
  SIMDYmp=_mm_shuffle_pd(SIMDYmp,SSE2temp,_MM_SHUFFLE2(0,0)); 


 Mp0s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Xmp[] and Ymp[]
 Mp1s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Xmp[] and Ymp[]
  
  SIMDXmp2=_mm_load_pd(mp1+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp1+6);
  SIMDXmp2=_mm_shuffle_pd(SIMDXmp2,SSE2temp,_MM_SHUFFLE2(0,0));



  SIMDYmp2=_mm_load_pd(mp1+12); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp1+14);
  SIMDYmp2=_mm_shuffle_pd(SIMDYmp2,SSE2temp,_MM_SHUFFLE2(0,0));
 
 Mp2s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Xmp[] and Ymp[]
 Mp3s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Xmp[] and Ymp[]
 

 SIMDZmp=_mm_load_pd(mp1+16);  // mp[0] and mp[1] 
 SSE2temp=_mm_load_pd(mp1+18);
 SIMDZmp=_mm_shuffle_pd(SIMDZmp,SSE2temp,_MM_SHUFFLE2(0,0));

 SIMDZmp2=_mm_load_pd(mp1+20); // mp[2] and mp[3]
 SSE2temp=_mm_load_pd(mp1+22);
 SIMDZmp2=_mm_shuffle_pd(SIMDZmp2,SSE2temp,_MM_SHUFFLE2(0,0));
 

 Xs=_mm_shuffle_pd(xy_ixy,xy_ixy,_MM_SHUFFLE2(0,0));
 Ys=_mm_shuffle_pd(xy_ixy,xy_ixy,_MM_SHUFFLE2(1,1)); // Grouping the Xs Ys and Zs **********> Change the PARAMETER here //PF
 Zs=_mm_shuffle_pd(zw_izw,zw_izw,_MM_SHUFFLE2(0,0));

 
 // SSE2 version of INCRSUMP(mp, s0, x_ix, y_iy, z_iz) -- lattice corner (ix, iy, iz), table entry mp1


 // Processing XY part
  Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)




 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)

 SIMDs=_mm_shuffle_pd(s0s5,s0s5,_MM_SHUFFLE2(0,0)); // **********> Change the PARAMETER here //PF
 ResultXY=_mm_mul_pd(SSE2temp2,SIMDs);


// Processing Z part
//t=(long) 2* (Hash1d(jxiy_hash, iz) & 0xFF); // it2
 Mp0s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Zmp[] 
 Mp1s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Zmp[]
 Mp2s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Zmp[]
 Mp3s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Zmp[] 
Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)


 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)

 ResultZW=_mm_mul_pd(SSE2temp2,SIMDs);





 // ******************************************************************* Iteration 2 **********************************************************************

  //t=(long) 2* (Hash1d(jxiy_hash, iz) & 0xFF);
 // mp = &SSE2RTable[t2]; // FP need to check the alignement

  SIMDXmp=_mm_load_pd(mp2);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp2+2);
  SIMDXmp=_mm_shuffle_pd(SIMDXmp,SSE2temp,_MM_SHUFFLE2(0,0));

 
  SIMDYmp=_mm_load_pd(mp2+8);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp2+10);
  SIMDYmp=_mm_shuffle_pd(SIMDYmp,SSE2temp,_MM_SHUFFLE2(0,0)); 


 Mp0s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Xmp[] and Ymp[]
 Mp1s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Xmp[] and Ymp[]
  
  SIMDXmp2=_mm_load_pd(mp2+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp2+6);
  SIMDXmp2=_mm_shuffle_pd(SIMDXmp2,SSE2temp,_MM_SHUFFLE2(0,0));



  SIMDYmp2=_mm_load_pd(mp2+12); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp2+14);
  SIMDYmp2=_mm_shuffle_pd(SIMDYmp2,SSE2temp,_MM_SHUFFLE2(0,0));
 
 Mp2s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Xmp[] and Ymp[]
 Mp3s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Xmp[] and Ymp[]
 

 SIMDZmp=_mm_load_pd(mp2+16);  // mp[0] and mp[1] 
 SSE2temp=_mm_load_pd(mp2+18);
 SIMDZmp=_mm_shuffle_pd(SIMDZmp,SSE2temp,_MM_SHUFFLE2(0,0));

 SIMDZmp2=_mm_load_pd(mp2+20); // mp[2] and mp[3]
 SSE2temp=_mm_load_pd(mp2+22);
 SIMDZmp2=_mm_shuffle_pd(SIMDZmp2,SSE2temp,_MM_SHUFFLE2(0,0));

 // Result[x] and Result[y] are processed in parallel.

 

 Xs=_mm_shuffle_pd(xy_jxy,xy_jxy,_MM_SHUFFLE2(0,0));
 Ys=_mm_shuffle_pd(xy_ixy,xy_ixy,_MM_SHUFFLE2(1,1)); // Grouping the Xs Ys and Zs **********> Change the PARAMETER here //PF
 Zs=_mm_shuffle_pd(zw_izw,zw_izw,_MM_SHUFFLE2(0,0));

 
 // SSE2 version of INCRSUMP(mp, s1, x_jx, y_iy, z_iz) -- lattice corner (jx, iy, iz), table entry mp2


 // Processing XY part
 Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)




 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)

 SIMDs=_mm_shuffle_pd(s1s4,s1s4,_MM_SHUFFLE2(0,0)); // **********> Change the PARAMETER here //PF

 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultXY=_mm_add_pd(ResultXY,SSE2temp2);
 

 
// Processing Z part
//t=(long) 2* (Hash1d(jxjy_hash, iz) & 0xFF); //it3
 Mp0s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Zmp[]
 Mp1s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Zmp[]
 Mp2s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Zmp[]
 Mp3s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Zmp[] 
Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)


 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)
 
 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultZW=_mm_add_pd(ResultZW,SSE2temp2);
  
 
 // ******************************************************************* Iteration 3 **********************************************************************

  //t=(long) 2* (Hash1d(jxjy_hash, iz) & 0xFF);
 // mp = &SSE2RTable[t3]; // FP need to check the alignement

 SIMDXmp=_mm_load_pd(mp3);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp3+2);
  SIMDXmp=_mm_shuffle_pd(SIMDXmp,SSE2temp,_MM_SHUFFLE2(0,0));

 
  SIMDYmp=_mm_load_pd(mp3+8);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp3+10);
  SIMDYmp=_mm_shuffle_pd(SIMDYmp,SSE2temp,_MM_SHUFFLE2(0,0)); 


 Mp0s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Xmp[] and Ymp[]
 Mp1s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Xmp[] and Ymp[]
  
  SIMDXmp2=_mm_load_pd(mp3+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp3+6);
  SIMDXmp2=_mm_shuffle_pd(SIMDXmp2,SSE2temp,_MM_SHUFFLE2(0,0));



  SIMDYmp2=_mm_load_pd(mp3+12); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp3+14);
  SIMDYmp2=_mm_shuffle_pd(SIMDYmp2,SSE2temp,_MM_SHUFFLE2(0,0));
 
 Mp2s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Xmp[] and Ymp[]
 Mp3s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Xmp[] and Ymp[]
 

 SIMDZmp=_mm_load_pd(mp3+16);  // mp[0] and mp[1] 
 SSE2temp=_mm_load_pd(mp3+18);
 SIMDZmp=_mm_shuffle_pd(SIMDZmp,SSE2temp,_MM_SHUFFLE2(0,0));

 SIMDZmp2=_mm_load_pd(mp3+20); // mp[2] and mp[3]
 SSE2temp=_mm_load_pd(mp3+22);
 SIMDZmp2=_mm_shuffle_pd(SIMDZmp2,SSE2temp,_MM_SHUFFLE2(0,0));
 

 Xs=_mm_shuffle_pd(xy_jxy,xy_jxy,_MM_SHUFFLE2(0,0));
 Ys=_mm_shuffle_pd(xy_jxy,xy_jxy,_MM_SHUFFLE2(1,1)); // Grouping the Xs Ys and Zs **********> Change the PARAMETER here //PF
 Zs=_mm_shuffle_pd(zw_izw,zw_izw,_MM_SHUFFLE2(0,0));

 
 // SSE2 version of INCRSUMP(mp, s2, x_jx, y_jy, z_iz) -- lattice corner (jx, jy, iz), table entry mp3


 // Processing XY part
  Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)




 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)

 SIMDs=_mm_shuffle_pd(s7s2,s7s2,_MM_SHUFFLE2(1,1)); // **********> Change the PARAMETER here //PF

 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultXY=_mm_add_pd(ResultXY,SSE2temp2);
 

 
// Processing Z part
// t=(long) 2* (Hash1d(ixjy_hash, iz) & 0xFF); //it4
 Mp0s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Zmp[]
 Mp1s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Zmp[]
 Mp2s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Zmp[]
 Mp3s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Zmp[] 
Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)


 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)
 
 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultZW=_mm_add_pd(ResultZW,SSE2temp2);
  

  // ******************************************************************* Iteration 4 **********************************************************************

 // t=(long) 2* (Hash1d(ixjy_hash, iz) & 0xFF);
//  mp = &SSE2RTable[t4]; // FP need to check the alignement

  SIMDXmp=_mm_load_pd(mp4);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp4+2);
  SIMDXmp=_mm_shuffle_pd(SIMDXmp,SSE2temp,_MM_SHUFFLE2(0,0));

 
  SIMDYmp=_mm_load_pd(mp4+8);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp4+10);
  SIMDYmp=_mm_shuffle_pd(SIMDYmp,SSE2temp,_MM_SHUFFLE2(0,0)); 


 Mp0s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Xmp[] and Ymp[]
 Mp1s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Xmp[] and Ymp[]
  
  SIMDXmp2=_mm_load_pd(mp4+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp4+6);
  SIMDXmp2=_mm_shuffle_pd(SIMDXmp2,SSE2temp,_MM_SHUFFLE2(0,0));



  SIMDYmp2=_mm_load_pd(mp4+12); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp4+14);
  SIMDYmp2=_mm_shuffle_pd(SIMDYmp2,SSE2temp,_MM_SHUFFLE2(0,0));
 
 Mp2s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Xmp[] and Ymp[]
 Mp3s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Xmp[] and Ymp[]
 

 SIMDZmp=_mm_load_pd(mp4+16);  // mp[0] and mp[1] 
 SSE2temp=_mm_load_pd(mp4+18);
 SIMDZmp=_mm_shuffle_pd(SIMDZmp,SSE2temp,_MM_SHUFFLE2(0,0));

 SIMDZmp2=_mm_load_pd(mp4+20); // mp[2] and mp[3]
 SSE2temp=_mm_load_pd(mp4+22);
 SIMDZmp2=_mm_shuffle_pd(SIMDZmp2,SSE2temp,_MM_SHUFFLE2(0,0));

 Xs=_mm_shuffle_pd(xy_ixy,xy_ixy,_MM_SHUFFLE2(0,0));
 Ys=_mm_shuffle_pd(xy_jxy,xy_jxy,_MM_SHUFFLE2(1,1)); // Grouping the Xs Ys and Zs **********> Change the PARAMETER here //PF
 Zs=_mm_shuffle_pd(zw_izw,zw_izw,_MM_SHUFFLE2(0,0));

 
 // SSE2 version of INCRSUMP(mp, s3, x_ix, y_jy, z_iz) -- lattice corner (ix, jy, iz), table entry mp4


 // Processing XY part
   Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)




 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)

 SIMDs=_mm_shuffle_pd(s6s3,s6s3,_MM_SHUFFLE2(1,1)); // **********> Change the PARAMETER here //PF

 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultXY=_mm_add_pd(ResultXY,SSE2temp2);
 

 
// Processing Z part
// t=(long) 2* (Hash1d(ixjy_hash, jz) & 0xFF); //it5
 Mp0s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Zmp[]
 Mp1s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Zmp[]
 Mp2s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Zmp[]
 Mp3s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Zmp[] 
Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)


 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)
 
 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultZW=_mm_add_pd(ResultZW,SSE2temp2);
  

  // ******************************************************************* Iteration 5 **********************************************************************

 // t=(long) 2* (Hash1d(ixjy_hash, jz) & 0xFF);
 // mp = &SSE2RTable[t5]; // FP need to check the alignement

   SIMDXmp=_mm_load_pd(mp5);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp5+2);
  SIMDXmp=_mm_shuffle_pd(SIMDXmp,SSE2temp,_MM_SHUFFLE2(0,0));

 
  SIMDYmp=_mm_load_pd(mp5+8);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp5+10);
  SIMDYmp=_mm_shuffle_pd(SIMDYmp,SSE2temp,_MM_SHUFFLE2(0,0)); 


 Mp0s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Xmp[] and Ymp[]
 Mp1s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Xmp[] and Ymp[]
  
  SIMDXmp2=_mm_load_pd(mp5+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp5+6);
  SIMDXmp2=_mm_shuffle_pd(SIMDXmp2,SSE2temp,_MM_SHUFFLE2(0,0));



  SIMDYmp2=_mm_load_pd(mp5+12); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp5+14);
  SIMDYmp2=_mm_shuffle_pd(SIMDYmp2,SSE2temp,_MM_SHUFFLE2(0,0));
 
 Mp2s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Xmp[] and Ymp[]
 Mp3s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Xmp[] and Ymp[]
 

 SIMDZmp=_mm_load_pd(mp5+16);  // mp[0] and mp[1] 
 SSE2temp=_mm_load_pd(mp5+18);
 SIMDZmp=_mm_shuffle_pd(SIMDZmp,SSE2temp,_MM_SHUFFLE2(0,0));

 SIMDZmp2=_mm_load_pd(mp5+20); // mp[2] and mp[3]
 SSE2temp=_mm_load_pd(mp5+22);
 SIMDZmp2=_mm_shuffle_pd(SIMDZmp2,SSE2temp,_MM_SHUFFLE2(0,0));
 

 Xs=_mm_shuffle_pd(xy_ixy,xy_ixy,_MM_SHUFFLE2(0,0));
 Ys=_mm_shuffle_pd(xy_jxy,xy_jxy,_MM_SHUFFLE2(1,1)); // Grouping the Xs Ys and Zs **********> Change the PARAMETER here //PF
 Zs=_mm_shuffle_pd(zw_jzw,zw_jzw,_MM_SHUFFLE2(0,0));

 
 // SSE2 version of INCRSUMP(mp, s4, x_ix, y_jy, z_jz) -- lattice corner (ix, jy, jz), table entry mp5


 // Processing XY part
   Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)




 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)

 SIMDs=_mm_shuffle_pd(s1s4,s1s4,_MM_SHUFFLE2(1,1)); // **********> Change the PARAMETER here //PF

 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultXY=_mm_add_pd(ResultXY,SSE2temp2);
 


// Processing Z part
//t=(long) 2* (Hash1d(jxjy_hash, jz) & 0xFF); //it6
 Mp0s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Zmp[]
 Mp1s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Zmp[]
 Mp2s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Zmp[]
 Mp3s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Zmp[] 
Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)


 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)
 
 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultZW=_mm_add_pd(ResultZW,SSE2temp2);
  

  // ******************************************************************* Iteration 6 **********************************************************************

  // t=(long) 2* (Hash1d(jxjy_hash, jz) & 0xFF);
 // mp = &SSE2RTable[t6]; // FP need to check the alignement

  SIMDXmp=_mm_load_pd(mp6);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp6+2);
  SIMDXmp=_mm_shuffle_pd(SIMDXmp,SSE2temp,_MM_SHUFFLE2(0,0));

 
  SIMDYmp=_mm_load_pd(mp6+8);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp6+10);
  SIMDYmp=_mm_shuffle_pd(SIMDYmp,SSE2temp,_MM_SHUFFLE2(0,0)); 


 Mp0s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Xmp[] and Ymp[]
 Mp1s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Xmp[] and Ymp[]
  
  SIMDXmp2=_mm_load_pd(mp6+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp6+6);
  SIMDXmp2=_mm_shuffle_pd(SIMDXmp2,SSE2temp,_MM_SHUFFLE2(0,0));



  SIMDYmp2=_mm_load_pd(mp6+12); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp6+14);
  SIMDYmp2=_mm_shuffle_pd(SIMDYmp2,SSE2temp,_MM_SHUFFLE2(0,0));
 
 Mp2s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Xmp[] and Ymp[]
 Mp3s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Xmp[] and Ymp[]
 

 SIMDZmp=_mm_load_pd(mp6+16);  // mp[0] and mp[1] 
 SSE2temp=_mm_load_pd(mp6+18);
 SIMDZmp=_mm_shuffle_pd(SIMDZmp,SSE2temp,_MM_SHUFFLE2(0,0));

 SIMDZmp2=_mm_load_pd(mp6+20); // mp[2] and mp[3]
 SSE2temp=_mm_load_pd(mp6+22);
 SIMDZmp2=_mm_shuffle_pd(SIMDZmp2,SSE2temp,_MM_SHUFFLE2(0,0));
 

 Xs=_mm_shuffle_pd(xy_jxy,xy_jxy,_MM_SHUFFLE2(0,0));
 Ys=_mm_shuffle_pd(xy_jxy,xy_jxy,_MM_SHUFFLE2(1,1)); // Grouping the Xs Ys and Zs **********> Change the PARAMETER here //PF
 Zs=_mm_shuffle_pd(zw_jzw,zw_jzw,_MM_SHUFFLE2(0,0));

 
 // SSE2 version of INCRSUMP(mp, s5, x_jx, y_jy, z_jz) -- lattice corner (jx, jy, jz), table entry mp6


 // Processing XY part
   Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)




 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)

 SIMDs=_mm_shuffle_pd(s0s5,s0s5,_MM_SHUFFLE2(1,1)); // **********> Change the PARAMETER here //PF

 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultXY=_mm_add_pd(ResultXY,SSE2temp2);
 

 
// Processing Z part

// t=(long) 2* (Hash1d(jxiy_hash, jz) & 0xFF); //it7
 Mp0s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Zmp[]
 Mp1s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Zmp[]
 Mp2s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Zmp[]
 Mp3s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Zmp[] 
Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)


 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)
 
 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultZW=_mm_add_pd(ResultZW,SSE2temp2);
  

  // ******************************************************************* Iteration 7 **********************************************************************

  //t=(long) 2* (Hash1d(jxiy_hash, jz) & 0xFF);
//  mp = &SSE2RTable[t7]; // FP need to check the alignement

  SIMDXmp=_mm_load_pd(mp7);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp7+2);
  SIMDXmp=_mm_shuffle_pd(SIMDXmp,SSE2temp,_MM_SHUFFLE2(0,0));

 
  SIMDYmp=_mm_load_pd(mp7+8);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp7+10);
  SIMDYmp=_mm_shuffle_pd(SIMDYmp,SSE2temp,_MM_SHUFFLE2(0,0)); 


 Mp0s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Xmp[] and Ymp[]
 Mp1s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Xmp[] and Ymp[]
  
  SIMDXmp2=_mm_load_pd(mp7+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp7+6);
  SIMDXmp2=_mm_shuffle_pd(SIMDXmp2,SSE2temp,_MM_SHUFFLE2(0,0));



  SIMDYmp2=_mm_load_pd(mp7+12); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp7+14);
  SIMDYmp2=_mm_shuffle_pd(SIMDYmp2,SSE2temp,_MM_SHUFFLE2(0,0));
 
 Mp2s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Xmp[] and Ymp[]
 Mp3s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Xmp[] and Ymp[]
 

 SIMDZmp=_mm_load_pd(mp7+16);  // mp[0] and mp[1] 
 SSE2temp=_mm_load_pd(mp7+18);
 SIMDZmp=_mm_shuffle_pd(SIMDZmp,SSE2temp,_MM_SHUFFLE2(0,0));

 SIMDZmp2=_mm_load_pd(mp7+20); // mp[2] and mp[3]
 SSE2temp=_mm_load_pd(mp7+22);
 SIMDZmp2=_mm_shuffle_pd(SIMDZmp2,SSE2temp,_MM_SHUFFLE2(0,0));

 Xs=_mm_shuffle_pd(xy_jxy,xy_jxy,_MM_SHUFFLE2(0,0));
 Ys=_mm_shuffle_pd(xy_ixy,xy_ixy,_MM_SHUFFLE2(1,1)); // Grouping the Xs Ys and Zs **********> Change the PARAMETER here //PF
 Zs=_mm_shuffle_pd(zw_jzw,zw_jzw,_MM_SHUFFLE2(0,0));

 
 // SSE2 version of INCRSUMP(mp, s6, x_jx, y_iy, z_jz) -- lattice corner (jx, iy, jz), table entry mp7


 // Processing XY part
   Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)




 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)

 SIMDs=_mm_shuffle_pd(s6s3,s6s3,_MM_SHUFFLE2(0,0)); // **********> Change the PARAMETER here //PF

 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultXY=_mm_add_pd(ResultXY,SSE2temp2);
 
 
 
// Processing Z part
 // t=(long) 2* (Hash1d(ixiy_hash, jz) & 0xFF); //it8
 Mp0s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Zmp[]
 Mp1s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Zmp[]
 Mp2s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Zmp[]
 Mp3s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Zmp[] 
Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)


 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)
 
 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultZW=_mm_add_pd(ResultZW,SSE2temp2);
 


  // ******************************************************************* Iteration 8 **********************************************************************

//  t=(long) 2* (Hash1d(ixiy_hash, jz) & 0xFF);
 // mp = &SSE2RTable[t8]; // FP need to check the alignement

   SIMDXmp=_mm_load_pd(mp8);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp8+2);
  SIMDXmp=_mm_shuffle_pd(SIMDXmp,SSE2temp,_MM_SHUFFLE2(0,0));

 
  SIMDYmp=_mm_load_pd(mp8+8);  // mp[0] and mp[1]
  SSE2temp=_mm_load_pd(mp8+10);
  SIMDYmp=_mm_shuffle_pd(SIMDYmp,SSE2temp,_MM_SHUFFLE2(0,0)); 


 Mp0s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Xmp[] and Ymp[]
 Mp1s=_mm_shuffle_pd(SIMDXmp,SIMDYmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Xmp[] and Ymp[]
  
  SIMDXmp2=_mm_load_pd(mp8+4); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp8+6);
  SIMDXmp2=_mm_shuffle_pd(SIMDXmp2,SSE2temp,_MM_SHUFFLE2(0,0));



  SIMDYmp2=_mm_load_pd(mp8+12); // mp[2] and mp[3]
  SSE2temp=_mm_load_pd(mp8+14);
  SIMDYmp2=_mm_shuffle_pd(SIMDYmp2,SSE2temp,_MM_SHUFFLE2(0,0));
 
 Mp2s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Xmp[] and Ymp[]
 Mp3s=_mm_shuffle_pd(SIMDXmp2,SIMDYmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Xmp[] and Ymp[]
 

 SIMDZmp=_mm_load_pd(mp8+16);  // mp[0] and mp[1] 
 SSE2temp=_mm_load_pd(mp8+18);
 SIMDZmp=_mm_shuffle_pd(SIMDZmp,SSE2temp,_MM_SHUFFLE2(0,0));

 SIMDZmp2=_mm_load_pd(mp8+20); // mp[2] and mp[3]
 SSE2temp=_mm_load_pd(mp8+22);
 SIMDZmp2=_mm_shuffle_pd(SIMDZmp2,SSE2temp,_MM_SHUFFLE2(0,0));
 

 Xs=_mm_shuffle_pd(xy_ixy,xy_ixy,_MM_SHUFFLE2(0,0));
 Ys=_mm_shuffle_pd(xy_ixy,xy_ixy,_MM_SHUFFLE2(1,1)); // Grouping the Xs Ys and Zs **********> Change the PARAMETER here //PF
 Zs=_mm_shuffle_pd(zw_jzw,zw_jzw,_MM_SHUFFLE2(0,0));

 
 // SSE2 version of INCRSUMP(mp, s7, x_ix, y_iy, z_jz) -- lattice corner (ix, iy, jz), table entry mp8


 // Processing XY part
   Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)




 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)

 SIMDs=_mm_shuffle_pd(s7s2,s7s2,_MM_SHUFFLE2(0,0)); // **********> Change the PARAMETER here //PF

 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultXY=_mm_add_pd(ResultXY,SSE2temp2);
 

 
// Processing Z part

 Mp0s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(0,0));  // Regroup the mp[0] part of Zmp[]
 Mp1s=_mm_shuffle_pd(SIMDZmp,SIMDZmp,_MM_SHUFFLE2(1,1)); // Regroup the mp[1] part of Zmp[]
 Mp2s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(0,0));  // Regroup the mp[2] part of Zmp[]
 Mp3s=_mm_shuffle_pd(SIMDZmp2,SIMDZmp2,_MM_SHUFFLE2(1,1)); // Regroup the mp[3] part of Zmp[] 
Half=_mm_set_pd(0.5,0.5);
 SSE2temp=_mm_mul_pd(Mp0s,Half); //                             Xmp[0] *0.5                             Ymp[0] *0.5
 SSE2temp2=_mm_mul_pd(Mp1s,Xs); //                Xmp[1] * (x)                            Ymp[1] * (x)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x)                      Ymp[0] *0.5 + Ymp[1] * (x)


 SSE2temp=_mm_mul_pd(Mp2s,Ys); //               Xmp[2] * (y)  Ymp[2] * (y)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y)             Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)
 SSE2temp=_mm_mul_pd(Mp3s,Zs); //               Xmp[3] * (z)  Ymp[3] * (z)
 SSE2temp2=_mm_add_pd(SSE2temp,SSE2temp2); //           Xmp[0] *0.5 +   Xmp[1] * (x) + Xmp[2] * (y) + Xmp[3] * (z)      Ymp[0] *0.5 + Ymp[1] * (x) + Ymp[2] * (y)+ Ymp[3] * (z)
 
 SSE2temp2=_mm_mul_pd(SSE2temp2,SIMDs);
 ResultZW=_mm_add_pd(ResultZW,SSE2temp2);
  

 
//  Store the result in No SIMD format.


 _mm_storel_pd (&result[X], ResultXY);  // Go back to non SSE2 code.
 _mm_storeh_pd (&result[Y], ResultXY); 
 _mm_storel_pd (&result[Z], ResultZW);  

  _mm_empty();  // Floating point unit safety instruction after MMX code.

}

/*
 * DNoise
 *
 * Front end for the vector noise routines. Forwards the call to
 * SSE2DNoise() when SSE2ALREADYDETECTED equals HAVESSE2, and to
 * OriDNoise() in every other case.
 *
 *   result - receives the output of whichever implementation is chosen
 *   EPoint - evaluation point, handed through unchanged
 *
 * While SSE2ALREADYDETECTED is still zero ("no CPU known"), CPUDetect()
 * is called first; CPUDetect() is expected to update the flag (1 = no
 * SSE2, 2 = SSE2). The Render_Info() notice is only issued from inside
 * that detection branch.
 */
void DNoise(VECTOR result, VECTOR EPoint)
{
  // Lazy CPU detection: only attempted while the flag reads as zero.
  if (!SSE2ALREADYDETECTED)
  {
    CPUDetect();

    if (SSE2ALREADYDETECTED == HAVESSE2)
    {
      Render_Info ("\nSSE2 detected - using SSE2 optimizations\n");
    }
  }

  // Pick the implementation that matches the detected capability.
  if (SSE2ALREADYDETECTED == HAVESSE2)
  {
    SSE2DNoise(result, EPoint);
  }
  else
  {
    OriDNoise(result, EPoint);
  }
}

/*
 * Noise
 *
 * Front end for the scalar noise routines. Returns the value computed by
 * SSE2Noise() when SSE2ALREADYDETECTED equals HAVESSE2, otherwise the
 * value computed by OriNoise().
 *
 *   EPoint - evaluation point, handed through unchanged
 *   TPat   - pattern pointer, handed through unchanged
 *
 * While SSE2ALREADYDETECTED is still zero ("no CPU known"), CPUDetect()
 * is called first; CPUDetect() is expected to update the flag (1 = no
 * SSE2, 2 = SSE2). The Render_Info() notice is only issued from inside
 * that detection branch.
 */
DBL Noise(VECTOR EPoint, TPATTERN *TPat)
{
  // Lazy CPU detection: only attempted while the flag reads as zero.
  if (!SSE2ALREADYDETECTED)
  {
    CPUDetect();

    if (SSE2ALREADYDETECTED == HAVESSE2)
    {
      Render_Info ("\nSSE2 detected - using SSE2 optimizations\n");
    }
  }

  // SSE2 path when available ...
  if (SSE2ALREADYDETECTED == HAVESSE2)
  {
    return SSE2Noise(EPoint, TPat);
  }

  // ... plain implementation otherwise.
  return OriNoise(EPoint, TPat);
}

