|
MythTV
0.26-pre
|
00001 // SSE2 versions of the expensive routines for float samples 00002 00003 #include "STTypes.h" 00004 #include "TDStretch.h" 00005 #include "FIRFilter.h" 00006 #include "inttypes.h" 00007 00008 using namespace soundtouch; 00009 00010 double TDStretchSSE3::calcCrossCorrMulti(const float *mPos, const float *cPos) const 00011 { 00012 double corr = 0; 00013 int count = overlapLength * channels; 00014 int loops = count >> 4; 00015 int i = loops << 4; 00016 const float *mp = mPos; 00017 const float *cp = cPos; 00018 00019 __asm__ volatile ( 00020 "xorpd %%xmm7, %%xmm7 \n\t" 00021 "1: \n\t" 00022 "movups (%1), %%xmm0 \n\t" 00023 "movups 16(%1), %%xmm1 \n\t" 00024 "mulps (%2), %%xmm0 \n\t" 00025 "movups 32(%1), %%xmm2 \n\t" 00026 "addps %%xmm0, %%xmm7 \n\t" 00027 "mulps 16(%2), %%xmm1 \n\t" 00028 "movups 48(%1), %%xmm3 \n\t" 00029 "mulps 32(%2), %%xmm2 \n\t" 00030 "addps %%xmm1, %%xmm7 \n\t" 00031 "mulps 48(%2), %%xmm3 \n\t" 00032 "addps %%xmm2, %%xmm7 \n\t" 00033 "add $64, %1 \n\t" 00034 "add $64, %2 \n\t" 00035 "addps %%xmm3, %%xmm7 \n\t" 00036 "sub $1, %%ecx \n\t" 00037 "jnz 1b \n\t" 00038 "haddps %%xmm7, %%xmm7 \n\t" 00039 "cvtps2pd %%xmm7, %%xmm7 \n\t" 00040 "haddpd %%xmm7, %%xmm7 \n\t" 00041 "movsd %%xmm7, %0 \n\t" 00042 :"=m"(corr),"+r"(mp), "+r"(cp) 00043 :"c"(loops) 00044 ); 00045 00046 for (; i < count; i++) 00047 corr += *mp++ * *cp++; 00048 00049 return corr; 00050 } 00051 00052 double TDStretchSSE2::calcCrossCorrMulti(const float *mPos, const float *cPos) const 00053 { 00054 double corr = 0; 00055 int count = overlapLength * channels; 00056 int loops = count >> 4; 00057 int i = loops << 4; 00058 const float *mp = mPos; 00059 const float *cp = cPos; 00060 00061 __asm__ volatile ( 00062 "xorpd %%xmm7, %%xmm7 \n\t" 00063 "1: \n\t" 00064 "movups (%1), %%xmm0 \n\t" 00065 "movups 16(%1), %%xmm1 \n\t" 00066 "mulps (%2), %%xmm0 \n\t" 00067 "movups 32(%1), %%xmm2 \n\t" 00068 "addps %%xmm0, %%xmm7 \n\t" 00069 "mulps 16(%2), %%xmm1 \n\t" 00070 "movups 48(%1), %%xmm3 \n\t" 00071 "mulps 32(%2), %%xmm2 \n\t" 00072 "addps %%xmm1, %%xmm7 \n\t" 00073 "mulps 48(%2), %%xmm3 \n\t" 00074 "addps %%xmm2, %%xmm7 \n\t" 00075 "add $64, %1 \n\t" 00076 "add $64, %2 \n\t" 00077 "addps %%xmm3, %%xmm7 \n\t" 00078 "sub $1, %%ecx \n\t" 00079 "jnz 1b \n\t" 00080 "movaps %%xmm7, %%xmm6 \n\t" 00081 "shufps $0x4e, %%xmm7, %%xmm6 \n\t" 00082 "addps %%xmm6, %%xmm7 \n\t" 00083 "cvtps2pd %%xmm7, %%xmm7 \n\t" 00084 "movapd %%xmm7, %%xmm6 \n\t" 00085 "shufpd $0x01, %%xmm7, %%xmm6 \n\t" 00086 "addpd %%xmm6, %%xmm7 \n\t" 00087 "movsd %%xmm7, %0 \n\t" 00088 :"=m"(corr),"+r"(mp), "+r"(cp) 00089 :"c"(loops) 00090 ); 00091 00092 for (; i < count; i++) 00093 corr += *mp++ * *cp++; 00094 00095 return corr; 00096 } 00097 00098 double TDStretchSSE3::calcCrossCorrStereo(const float *mPos, const float *cPos) const 00099 { 00100 double corr = 0; 00101 int count = overlapLength <<1; 00102 int loops = count >> 4; 00103 int i = loops << 4; 00104 const float *mp = mPos; 00105 const float *cp = cPos; 00106 00107 __asm__ volatile ( 00108 "xorpd %%xmm7, %%xmm7 \n\t" 00109 "1: \n\t" 00110 "movups (%1), %%xmm0 \n\t" 00111 "movups 16(%1), %%xmm1 \n\t" 00112 "mulps (%2), %%xmm0 \n\t" 00113 "movups 32(%1), %%xmm2 \n\t" 00114 "addps %%xmm0, %%xmm7 \n\t" 00115 "mulps 16(%2), %%xmm1 \n\t" 00116 "movups 48(%1), %%xmm3 \n\t" 00117 "mulps 32(%2), %%xmm2 \n\t" 00118 "addps %%xmm1, %%xmm7 \n\t" 00119 "mulps 48(%2), %%xmm3 \n\t" 00120 "addps %%xmm2, %%xmm7 \n\t" 00121 "add $64, %1 \n\t" 00122 "add $64, %2 \n\t" 00123 "addps %%xmm3, %%xmm7 \n\t" 00124 "sub $1, %%ecx \n\t" 00125 "jnz 1b \n\t" 00126 "haddps %%xmm7, %%xmm7 \n\t" 00127 "cvtps2pd %%xmm7, %%xmm7 \n\t" 00128 "haddpd %%xmm7, %%xmm7 \n\t" 00129 "movsd %%xmm7, %0 \n\t" 00130 :"=m"(corr),"+r"(mp), "+r"(cp) 00131 :"c"(loops) 00132 ); 00133 00134 for (; i < count; i += 2) 00135 corr += (mp[i] * cp[i] + mp[i + 1] * cp[i + 1]); 00136 00137 return corr; 00138 } 00139 00140 double TDStretchSSE2::calcCrossCorrStereo(const float *mPos, const float *cPos) const 00141 { 00142 double corr = 0; 00143 int count = overlapLength <<1; 00144 int loops = count >> 4; 00145 int i = loops << 4; 00146 const float *mp = mPos; 00147 const float *cp = cPos; 00148 00149 __asm__ volatile ( 00150 "xorpd %%xmm7, %%xmm7 \n\t" 00151 "1: \n\t" 00152 "movups (%1), %%xmm0 \n\t" 00153 "movups 16(%1), %%xmm1 \n\t" 00154 "mulps (%2), %%xmm0 \n\t" 00155 "movups 32(%1), %%xmm2 \n\t" 00156 "addps %%xmm0, %%xmm7 \n\t" 00157 "mulps 16(%2), %%xmm1 \n\t" 00158 "movups 48(%1), %%xmm3 \n\t" 00159 "mulps 32(%2), %%xmm2 \n\t" 00160 "addps %%xmm1, %%xmm7 \n\t" 00161 "mulps 48(%2), %%xmm3 \n\t" 00162 "addps %%xmm2, %%xmm7 \n\t" 00163 "add $64, %1 \n\t" 00164 "add $64, %2 \n\t" 00165 "addps %%xmm3, %%xmm7 \n\t" 00166 "sub $1, %%ecx \n\t" 00167 "jnz 1b \n\t" 00168 "movaps %%xmm7, %%xmm6 \n\t" 00169 "shufps $0x4e, %%xmm7, %%xmm6 \n\t" 00170 "addps %%xmm6, %%xmm7 \n\t" 00171 "cvtps2pd %%xmm7, %%xmm7 \n\t" 00172 "movapd %%xmm7, %%xmm6 \n\t" 00173 "shufpd $0x01, %%xmm7, %%xmm6 \n\t" 00174 "addpd %%xmm6, %%xmm7 \n\t" 00175 "movsd %%xmm7, %0 \n\t" 00176 :"=m"(corr),"+r"(mp), "+r"(cp) 00177 :"c"(loops) 00178 ); 00179 00180 for (; i < count; i += 2) 00181 corr += (mp[i] * cp[i] + mp[i + 1] * cp[i + 1]); 00182 00183 return corr; 00184 } 00185 00186 void TDStretchSSE2::overlapMulti(float *output, const float *input) const 00187 { 00188 00189 float *o = output; 00190 const float *i = input; 00191 const float *m = pMidBuffer; 00192 00193 if (channels > 4) 00194 __asm__ volatile ( 00195 "cvtsi2ss %%ecx, %%xmm7 \n\t" 00196 "shl $2, %4 \n\t" 00197 "punpckldq %%xmm7, %%xmm7 \n\t" 00198 "xorpd %%xmm6, %%xmm6 \n\t" 00199 "punpckldq %%xmm7, %%xmm7 \n\t" 00200 "rcpps %%xmm7, %%xmm1 \n\t" 00201 "mulps %%xmm1, %%xmm7 \n\t" 00202 "1: \n\t" 00203 "movups (%1), %%xmm2 \n\t" 00204 "movups 16(%1), %%xmm4 \n\t" 00205 "mulps %%xmm6, %%xmm2 \n\t" 00206 "movups (%2), %%xmm3 \n\t" 00207 "movups 16(%2), %%xmm5 \n\t" 00208 "mulps %%xmm7, %%xmm3 \n\t" 00209 "add %4, %1 \n\t" 00210 "mulps %%xmm6, %%xmm4 \n\t" 00211 "addps %%xmm2, %%xmm3 \n\t" 00212 "mulps %%xmm7, %%xmm5 \n\t" 00213 "movups %%xmm3, (%3) \n\t" 00214 "addps %%xmm4, %%xmm5 \n\t" 00215 "add %4, %2 \n\t" 00216 "movups %%xmm5, 16(%3) \n\t" 00217 "addps %%xmm1, %%xmm6 \n\t" 00218 "add %4, %3 \n\t" 00219 "subps %%xmm1, %%xmm7 \n\t" 00220 "sub $1, %%ecx \n\t" 00221 "jnz 1b \n\t" 00222 : 00223 :"c"(overlapLength),"r"(i),"r"(m),"r"(o),"r"((long)channels) 00224 ); 00225 else 00226 __asm__ volatile ( 00227 "cvtsi2ss %%ecx, %%xmm7 \n\t" 00228 "shl $2, %4 \n\t" 00229 "shr %%ecx \n\t" 00230 "punpckldq %%xmm7, %%xmm7 \n\t" 00231 "xorpd %%xmm6, %%xmm6 \n\t" 00232 "punpckldq %%xmm7, %%xmm7 \n\t" 00233 "rcpps %%xmm7, %%xmm1 \n\t" 00234 "mulps %%xmm1, %%xmm7 \n\t" 00235 "1: \n\t" 00236 "movups (%1), %%xmm2 \n\t" 00237 "movups 16(%1), %%xmm4 \n\t" 00238 "mulps %%xmm6, %%xmm2 \n\t" 00239 "movups (%2), %%xmm3 \n\t" 00240 "movups 16(%2), %%xmm5 \n\t" 00241 "mulps %%xmm7, %%xmm3 \n\t" 00242 "addps %%xmm1, %%xmm6 \n\t" 00243 "add %4, %1 \n\t" 00244 "addps %%xmm2, %%xmm3 \n\t" 00245 "add %4, %2 \n\t" 00246 "subps %%xmm1, %%xmm7 \n\t" 00247 "movups %%xmm3, (%3) \n\t" 00248 "add %4, %3 \n\t" 00249 "mulps %%xmm6, %%xmm4 \n\t" 00250 "add %4, %1 \n\t" 00251 "mulps %%xmm7, %%xmm5 \n\t" 00252 "addps %%xmm1, %%xmm6 \n\t" 00253 "add %4, %2 \n\t" 00254 "addps %%xmm4, %%xmm5 \n\t" 00255 "subps %%xmm1, %%xmm7 \n\t" 00256 "movups %%xmm5, (%3) \n\t" 00257 "add %4, %3 \n\t" 00258 "sub $1, %%ecx \n\t" 00259 "jnz 1b \n\t" 00260 : 00261 :"c"(overlapLength),"r"(i),"r"(m),"r"(o),"r"((long)channels) 00262 ); 00263 } 00264 00265 void TDStretchSSE2::overlapStereo(float *output, const float *input) const 00266 { 00267 float *o = output; 00268 const float *i = input; 00269 const float *m = pMidBuffer; 00270 00271 __asm__ volatile ( 00272 "cvtsi2ss %%ecx, %%xmm7 \n\t" 00273 "shr %%ecx \n\t" 00274 "xorpd %%xmm6, %%xmm6 \n\t" 00275 "punpckldq %%xmm7, %%xmm7 \n\t" 00276 "rcpps %%xmm7, %%xmm1 \n\t" 00277 "mulps %%xmm1, %%xmm7 \n\t" 00278 "1: \n\t" 00279 "movups (%1), %%xmm2 \n\t" 00280 "movups 8(%1), %%xmm4 \n\t" 00281 "mulps %%xmm6, %%xmm2 \n\t" 00282 "movups (%2), %%xmm3 \n\t" 00283 "movups 8(%2), %%xmm5 \n\t" 00284 "mulps %%xmm7, %%xmm3 \n\t" 00285 "addps %%xmm1, %%xmm6 \n\t" 00286 "addps %%xmm2, %%xmm3 \n\t" 00287 "subps %%xmm1, %%xmm7 \n\t" 00288 "movlps %%xmm3, (%3) \n\t" 00289 "add $8, %3 \n\t" 00290 "mulps %%xmm6, %%xmm4 \n\t" 00291 "add $16, %1 \n\t" 00292 "mulps %%xmm7, %%xmm5 \n\t" 00293 "addps %%xmm1, %%xmm6 \n\t" 00294 "add $16, %2 \n\t" 00295 "addps %%xmm4, %%xmm5 \n\t" 00296 "subps %%xmm1, %%xmm7 \n\t" 00297 "movlps %%xmm5, (%3) \n\t" 00298 "add $8, %3 \n\t" 00299 "sub $1, %%ecx \n\t" 00300 "jnz 1b \n\t" 00301 : 00302 :"c"(overlapLength),"r"(i),"r"(m),"r"(o) 00303 ); 00304 } 00305 00306 FIRFilterSSE2::FIRFilterSSE2() : FIRFilter() 00307 { 00308 filterCoeffsAlign = NULL; 00309 filterCoeffsUnalign = NULL; 00310 } 00311 00312 FIRFilterSSE2::~FIRFilterSSE2() 00313 { 00314 delete[] filterCoeffsUnalign; 00315 filterCoeffsAlign = NULL; 00316 filterCoeffsUnalign = NULL; 00317 } 00318 00319 00320 void FIRFilterSSE2::setCoefficients(const float *coeffs, uint newLen, uint uRDF) 00321 { 00322 uint i; 00323 FIRFilter::setCoefficients(coeffs, newLen, uRDF); 00324 00325 // Ensure that filter coeffs array is aligned to 16-byte boundary 00326 delete[] filterCoeffsUnalign; 00327 filterCoeffsUnalign = new float[2 * newLen + 16]; 00328 filterCoeffsAlign = (float *)(((ulong)filterCoeffsUnalign + 15) & -16); 00329 00330 float fdiv = (float)resultDivider; 00331 00332 for (i = 0; i < newLen; i++) 00333 { 00334 filterCoeffsAlign[2 * i + 0] = 00335 filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fdiv; 00336 } 00337 } 00338 00339 uint FIRFilterSSE2::evaluateFilterStereo(float *dest, const float *src, const uint numSamples) const 00340 { 00341 uint count = (numSamples - length) & -2; 00342 00343 for (int i = 0; i < count; i += 2) 00344 { 00345 __asm__ volatile( 00346 "xorpd %%xmm6, %%xmm6 \n\t" 00347 "xorpd %%xmm7, %%xmm7 \n\t" 00348 "1: \n\t" 00349 "movups (%1), %%xmm1 \n\t" 00350 "movups 8(%1), %%xmm2 \n\t" 00351 "mulps (%2), %%xmm1 \n\t" 00352 "movups 16(%1), %%xmm3 \n\t" 00353 "mulps (%2), %%xmm2 \n\t" 00354 "addps %%xmm1, %%xmm6 \n\t" 00355 "movups 24(%1), %%xmm4 \n\t" 00356 "addps %%xmm2, %%xmm7 \n\t" 00357 "mulps 16(%2), %%xmm3 \n\t" 00358 "movups 32(%1), %%xmm1 \n\t" 00359 "mulps 16(%2), %%xmm4 \n\t" 00360 "addps %%xmm3, %%xmm6 \n\t" 00361 "movups 40(%1), %%xmm2 \n\t" 00362 "addps %%xmm4, %%xmm7 \n\t" 00363 "mulps 32(%2), %%xmm1 \n\t" 00364 "movups 48(%1), %%xmm3 \n\t" 00365 "mulps 32(%2), %%xmm2 \n\t" 00366 "addps %%xmm1, %%xmm6 \n\t" 00367 "movups 56(%1), %%xmm4 \n\t" 00368 "addps %%xmm2, %%xmm7 \n\t" 00369 "mulps 48(%2), %%xmm3 \n\t" 00370 "add $64, %1 \n\t" 00371 "mulps 48(%2), %%xmm4 \n\t" 00372 "addps %%xmm3, %%xmm6 \n\t" 00373 "add $64, %2 \n\t" 00374 "addps %%xmm4, %%xmm7 \n\t" 00375 "sub $1, %%ecx \n\t" 00376 "jnz 1b \n\t" 00377 "movhlps %%xmm6, %%xmm0 \n\t" 00378 "movlhps %%xmm7, %%xmm0 \n\t" 00379 "shufps $0xe4, %%xmm7, %%xmm6 \n\t" 00380 "addps %%xmm0, %%xmm6 \n\t" 00381 "movups %%xmm6, (%0) \n\t" 00382 : 00383 :"r"(dest),"r"(src),"r"(filterCoeffsAlign),"c"(length>>3) 00384 ); 00385 src += 4; 00386 dest += 4; 00387 } 00388 00389 return count; 00390 }
1.7.6.1