MythTV  0.26-pre
sse_gcc.cpp
Go to the documentation of this file.
00001 // SSE2 and SSE3 versions of the expensive routines for float samples
00002 
00003 #include "STTypes.h"
00004 #include "TDStretch.h"
00005 #include "FIRFilter.h"
00006 #include "inttypes.h"
00007 
00008 using namespace soundtouch;
00009 
00010 double TDStretchSSE3::calcCrossCorrMulti(const float *mPos, const float *cPos) const
00011 {
00012     double corr = 0;
00013     int count = overlapLength * channels;
00014     int loops = count >> 4;
00015     int i = loops << 4;
00016     const float *mp = mPos;
00017     const float *cp = cPos;
00018 
00019     __asm__ volatile (
00020         "xorpd      %%xmm7, %%xmm7      \n\t"
00021         "1:                             \n\t"
00022         "movups       (%1), %%xmm0      \n\t"
00023         "movups     16(%1), %%xmm1      \n\t"
00024         "mulps      (%2),   %%xmm0      \n\t"
00025         "movups     32(%1), %%xmm2      \n\t"
00026         "addps      %%xmm0, %%xmm7      \n\t"
00027         "mulps      16(%2), %%xmm1      \n\t"
00028         "movups     48(%1), %%xmm3      \n\t"
00029         "mulps      32(%2), %%xmm2      \n\t"
00030         "addps      %%xmm1, %%xmm7      \n\t"
00031         "mulps      48(%2), %%xmm3      \n\t"
00032         "addps      %%xmm2, %%xmm7      \n\t"
00033         "add        $64,    %1          \n\t"
00034         "add        $64,    %2          \n\t"
00035         "addps      %%xmm3, %%xmm7      \n\t"
00036         "sub        $1,     %%ecx       \n\t"
00037         "jnz        1b                  \n\t"
00038         "haddps     %%xmm7, %%xmm7      \n\t"
00039         "cvtps2pd   %%xmm7, %%xmm7      \n\t"
00040         "haddpd     %%xmm7, %%xmm7      \n\t"
00041         "movsd      %%xmm7, %0          \n\t"
00042         :"=m"(corr),"+r"(mp), "+r"(cp)
00043         :"c"(loops)
00044     );
00045 
00046     for (; i < count; i++)
00047         corr += *mp++ * *cp++;
00048 
00049     return corr;
00050 }
00051 
00052 double TDStretchSSE2::calcCrossCorrMulti(const float *mPos, const float *cPos) const
00053 {
00054     double corr = 0;
00055     int count = overlapLength * channels;
00056     int loops = count >> 4;
00057     int i = loops << 4;
00058     const float *mp = mPos;
00059     const float *cp = cPos;
00060 
00061     __asm__ volatile (
00062         "xorpd      %%xmm7, %%xmm7      \n\t"
00063         "1:                             \n\t"
00064         "movups       (%1), %%xmm0      \n\t"
00065         "movups     16(%1), %%xmm1      \n\t"
00066         "mulps      (%2),   %%xmm0      \n\t"
00067         "movups     32(%1), %%xmm2      \n\t"
00068         "addps      %%xmm0, %%xmm7      \n\t"
00069         "mulps      16(%2), %%xmm1      \n\t"
00070         "movups     48(%1), %%xmm3      \n\t"
00071         "mulps      32(%2), %%xmm2      \n\t"
00072         "addps      %%xmm1, %%xmm7      \n\t"
00073         "mulps      48(%2), %%xmm3      \n\t"
00074         "addps      %%xmm2, %%xmm7      \n\t"
00075         "add        $64,    %1          \n\t"
00076         "add        $64,    %2          \n\t"
00077         "addps      %%xmm3, %%xmm7      \n\t"
00078         "sub        $1,     %%ecx       \n\t"
00079         "jnz        1b                  \n\t"
00080         "movaps     %%xmm7, %%xmm6      \n\t"
00081         "shufps     $0x4e,  %%xmm7, %%xmm6  \n\t"
00082         "addps      %%xmm6, %%xmm7      \n\t"
00083         "cvtps2pd   %%xmm7, %%xmm7      \n\t"
00084         "movapd     %%xmm7, %%xmm6      \n\t"
00085         "shufpd     $0x01,  %%xmm7, %%xmm6  \n\t"
00086         "addpd      %%xmm6, %%xmm7      \n\t"
00087         "movsd      %%xmm7, %0          \n\t"
00088         :"=m"(corr),"+r"(mp), "+r"(cp)
00089         :"c"(loops)
00090     );
00091 
00092     for (; i < count; i++)
00093         corr += *mp++ * *cp++;
00094 
00095     return corr;
00096 }
00097 
00098 double TDStretchSSE3::calcCrossCorrStereo(const float *mPos, const float *cPos) const
00099 {
00100     double corr = 0;
00101     int count = overlapLength <<1;
00102     int loops = count >> 4;
00103     int i = loops << 4;
00104     const float *mp = mPos;
00105     const float *cp = cPos;
00106 
00107     __asm__ volatile (
00108         "xorpd      %%xmm7, %%xmm7      \n\t"
00109         "1:                             \n\t"
00110         "movups       (%1), %%xmm0      \n\t"
00111         "movups     16(%1), %%xmm1      \n\t"
00112         "mulps      (%2),   %%xmm0      \n\t"
00113         "movups     32(%1), %%xmm2      \n\t"
00114         "addps      %%xmm0, %%xmm7      \n\t"
00115         "mulps      16(%2), %%xmm1      \n\t"
00116         "movups     48(%1), %%xmm3      \n\t"
00117         "mulps      32(%2), %%xmm2      \n\t"
00118         "addps      %%xmm1, %%xmm7      \n\t"
00119         "mulps      48(%2), %%xmm3      \n\t"
00120         "addps      %%xmm2, %%xmm7      \n\t"
00121         "add        $64,    %1          \n\t"
00122         "add        $64,    %2          \n\t"
00123         "addps      %%xmm3, %%xmm7      \n\t"
00124         "sub        $1,     %%ecx       \n\t"
00125         "jnz        1b                  \n\t"
00126         "haddps     %%xmm7, %%xmm7      \n\t"
00127         "cvtps2pd   %%xmm7, %%xmm7      \n\t"
00128         "haddpd     %%xmm7, %%xmm7      \n\t"
00129         "movsd      %%xmm7, %0          \n\t"
00130         :"=m"(corr),"+r"(mp), "+r"(cp)
00131         :"c"(loops)
00132     );
00133 
00134     for (; i < count; i += 2)
00135         corr += (mp[i] * cp[i] + mp[i + 1] * cp[i + 1]);
00136 
00137     return corr;
00138 }
00139 
00140 double TDStretchSSE2::calcCrossCorrStereo(const float *mPos, const float *cPos) const
00141 {
00142     double corr = 0;
00143     int count = overlapLength <<1;
00144     int loops = count >> 4;
00145     int i = loops << 4;
00146     const float *mp = mPos;
00147     const float *cp = cPos;
00148 
00149     __asm__ volatile (
00150         "xorpd      %%xmm7, %%xmm7      \n\t"
00151         "1:                             \n\t"
00152         "movups       (%1), %%xmm0      \n\t"
00153         "movups     16(%1), %%xmm1      \n\t"
00154         "mulps      (%2),   %%xmm0      \n\t"
00155         "movups     32(%1), %%xmm2      \n\t"
00156         "addps      %%xmm0, %%xmm7      \n\t"
00157         "mulps      16(%2), %%xmm1      \n\t"
00158         "movups     48(%1), %%xmm3      \n\t"
00159         "mulps      32(%2), %%xmm2      \n\t"
00160         "addps      %%xmm1, %%xmm7      \n\t"
00161         "mulps      48(%2), %%xmm3      \n\t"
00162         "addps      %%xmm2, %%xmm7      \n\t"
00163         "add        $64,    %1          \n\t"
00164         "add        $64,    %2          \n\t"
00165         "addps      %%xmm3, %%xmm7      \n\t"
00166         "sub        $1,     %%ecx       \n\t"
00167         "jnz        1b                  \n\t"
00168         "movaps     %%xmm7, %%xmm6      \n\t"
00169         "shufps     $0x4e,  %%xmm7, %%xmm6  \n\t"
00170         "addps      %%xmm6, %%xmm7      \n\t"
00171         "cvtps2pd   %%xmm7, %%xmm7      \n\t"
00172         "movapd     %%xmm7, %%xmm6      \n\t"
00173         "shufpd     $0x01,  %%xmm7, %%xmm6  \n\t"
00174         "addpd      %%xmm6, %%xmm7      \n\t"
00175         "movsd      %%xmm7, %0          \n\t"
00176         :"=m"(corr),"+r"(mp), "+r"(cp)
00177         :"c"(loops)
00178     );
00179 
00180     for (; i < count; i += 2)
00181         corr += (mp[i] * cp[i] + mp[i + 1] * cp[i + 1]);
00182 
00183     return corr;
00184 }
00185 
00186 void TDStretchSSE2::overlapMulti(float *output, const float *input) const
00187 {
00188 
00189     float *o = output;
00190     const float *i = input;
00191     const float *m = pMidBuffer;
00192 
00193     if (channels > 4)
00194         __asm__ volatile (
00195             "cvtsi2ss   %%ecx,  %%xmm7      \n\t"
00196             "shl        $2,     %4          \n\t"
00197             "punpckldq  %%xmm7, %%xmm7      \n\t"
00198             "xorpd      %%xmm6, %%xmm6      \n\t"
00199             "punpckldq  %%xmm7, %%xmm7      \n\t"
00200             "rcpps      %%xmm7, %%xmm1      \n\t"
00201             "mulps      %%xmm1, %%xmm7      \n\t"
00202             "1:                             \n\t"
00203             "movups     (%1),   %%xmm2      \n\t"
00204             "movups     16(%1), %%xmm4      \n\t"
00205             "mulps      %%xmm6, %%xmm2      \n\t"
00206             "movups     (%2),   %%xmm3      \n\t"
00207             "movups     16(%2), %%xmm5      \n\t"
00208             "mulps      %%xmm7, %%xmm3      \n\t"
00209             "add        %4,     %1          \n\t"
00210             "mulps      %%xmm6, %%xmm4      \n\t"
00211             "addps      %%xmm2, %%xmm3      \n\t"
00212             "mulps      %%xmm7, %%xmm5      \n\t"
00213             "movups     %%xmm3, (%3)        \n\t"
00214             "addps      %%xmm4, %%xmm5      \n\t"
00215             "add        %4,     %2          \n\t"
00216             "movups     %%xmm5, 16(%3)      \n\t"
00217             "addps      %%xmm1, %%xmm6      \n\t"
00218             "add        %4,     %3          \n\t"
00219             "subps      %%xmm1, %%xmm7      \n\t"
00220             "sub        $1,     %%ecx       \n\t"
00221             "jnz        1b                  \n\t"
00222             :
00223             :"c"(overlapLength),"r"(i),"r"(m),"r"(o),"r"((long)channels)
00224         );
00225     else
00226         __asm__ volatile (
00227             "cvtsi2ss   %%ecx, %%xmm7      \n\t"
00228             "shl        $2, %4              \n\t"
00229             "shr        %%ecx               \n\t"
00230             "punpckldq  %%xmm7, %%xmm7      \n\t"
00231             "xorpd      %%xmm6, %%xmm6      \n\t"
00232             "punpckldq  %%xmm7, %%xmm7      \n\t"
00233             "rcpps      %%xmm7, %%xmm1      \n\t"
00234             "mulps      %%xmm1, %%xmm7      \n\t"
00235             "1:                             \n\t"
00236             "movups     (%1),   %%xmm2      \n\t"
00237             "movups     16(%1), %%xmm4      \n\t"
00238             "mulps      %%xmm6, %%xmm2      \n\t"
00239             "movups     (%2),   %%xmm3      \n\t"
00240             "movups     16(%2), %%xmm5      \n\t"
00241             "mulps      %%xmm7, %%xmm3      \n\t"
00242             "addps      %%xmm1, %%xmm6      \n\t"
00243             "add        %4,     %1          \n\t"
00244             "addps      %%xmm2, %%xmm3      \n\t"
00245             "add        %4,     %2          \n\t"
00246             "subps      %%xmm1, %%xmm7      \n\t"
00247             "movups     %%xmm3, (%3)        \n\t"
00248             "add        %4,     %3          \n\t"
00249             "mulps      %%xmm6, %%xmm4      \n\t"
00250             "add        %4,     %1          \n\t"
00251             "mulps      %%xmm7, %%xmm5      \n\t"
00252             "addps      %%xmm1, %%xmm6      \n\t"
00253             "add        %4,     %2          \n\t"
00254             "addps      %%xmm4, %%xmm5      \n\t"
00255             "subps      %%xmm1, %%xmm7      \n\t"
00256             "movups     %%xmm5, (%3)        \n\t"
00257             "add        %4,     %3          \n\t"
00258             "sub        $1,     %%ecx       \n\t"
00259             "jnz        1b                  \n\t"
00260             :
00261             :"c"(overlapLength),"r"(i),"r"(m),"r"(o),"r"((long)channels)
00262         );
00263 }
00264 
00265 void TDStretchSSE2::overlapStereo(float *output, const float *input) const
00266 {
00267     float *o = output;
00268     const float *i = input;
00269     const float *m = pMidBuffer;
00270 
00271     __asm__ volatile (
00272         "cvtsi2ss   %%ecx, %%xmm7       \n\t"
00273         "shr        %%ecx               \n\t"
00274         "xorpd      %%xmm6, %%xmm6      \n\t"
00275         "punpckldq  %%xmm7, %%xmm7      \n\t"
00276         "rcpps      %%xmm7, %%xmm1      \n\t"
00277         "mulps      %%xmm1, %%xmm7      \n\t"
00278         "1:                             \n\t"
00279         "movups     (%1),  %%xmm2       \n\t"
00280         "movups     8(%1), %%xmm4       \n\t"
00281         "mulps      %%xmm6, %%xmm2      \n\t"
00282         "movups     (%2),  %%xmm3       \n\t"
00283         "movups     8(%2), %%xmm5       \n\t"
00284         "mulps      %%xmm7, %%xmm3      \n\t"
00285         "addps      %%xmm1, %%xmm6      \n\t"
00286         "addps      %%xmm2, %%xmm3      \n\t"
00287         "subps      %%xmm1, %%xmm7      \n\t"
00288         "movlps     %%xmm3, (%3)        \n\t"
00289         "add        $8,    %3           \n\t"
00290         "mulps      %%xmm6, %%xmm4      \n\t"
00291         "add        $16,   %1           \n\t"
00292         "mulps      %%xmm7, %%xmm5      \n\t"
00293         "addps      %%xmm1, %%xmm6      \n\t"
00294         "add        $16,   %2           \n\t"
00295         "addps      %%xmm4, %%xmm5      \n\t"
00296         "subps      %%xmm1, %%xmm7      \n\t"
00297         "movlps     %%xmm5, (%3)        \n\t"
00298         "add        $8,    %3           \n\t"
00299         "sub        $1,    %%ecx        \n\t"
00300         "jnz        1b                  \n\t"
00301         :
00302         :"c"(overlapLength),"r"(i),"r"(m),"r"(o)
00303     );
00304 }
00305 
00306 FIRFilterSSE2::FIRFilterSSE2() : FIRFilter()
00307 {
00308     filterCoeffsAlign = NULL;
00309     filterCoeffsUnalign = NULL;
00310 }
00311 
00312 FIRFilterSSE2::~FIRFilterSSE2()
00313 {
00314     delete[] filterCoeffsUnalign;
00315     filterCoeffsAlign = NULL;
00316     filterCoeffsUnalign = NULL;
00317 }
00318 
00319 
00320 void FIRFilterSSE2::setCoefficients(const float *coeffs, uint newLen, uint uRDF)
00321 {
00322     uint i;
00323     FIRFilter::setCoefficients(coeffs, newLen, uRDF);
00324 
00325     // Ensure that filter coeffs array is aligned to 16-byte boundary
00326     delete[] filterCoeffsUnalign;
00327     filterCoeffsUnalign = new float[2 * newLen + 16];
00328     filterCoeffsAlign = (float *)(((ulong)filterCoeffsUnalign + 15) & -16);
00329 
00330     float fdiv = (float)resultDivider;
00331 
00332     for (i = 0; i < newLen; i++)
00333     {
00334         filterCoeffsAlign[2 * i + 0] =
00335         filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fdiv;
00336     }
00337 }
00338 
00339 uint FIRFilterSSE2::evaluateFilterStereo(float *dest, const float *src, const uint numSamples) const
00340 {
00341     uint count = (numSamples - length) & -2;
00342 
00343     for (int i = 0; i < count; i += 2)
00344     {
00345         __asm__ volatile(
00346             "xorpd      %%xmm6, %%xmm6          \n\t"
00347             "xorpd      %%xmm7, %%xmm7          \n\t"
00348             "1:                                 \n\t"
00349             "movups     (%1),   %%xmm1          \n\t"
00350             "movups     8(%1),  %%xmm2          \n\t"
00351             "mulps      (%2),   %%xmm1          \n\t"
00352             "movups     16(%1), %%xmm3          \n\t"
00353             "mulps      (%2),   %%xmm2          \n\t"
00354             "addps      %%xmm1, %%xmm6          \n\t"
00355             "movups     24(%1), %%xmm4          \n\t"
00356             "addps      %%xmm2, %%xmm7          \n\t"
00357             "mulps      16(%2), %%xmm3          \n\t"
00358             "movups     32(%1), %%xmm1          \n\t"
00359             "mulps      16(%2), %%xmm4          \n\t"
00360             "addps      %%xmm3, %%xmm6          \n\t"
00361             "movups     40(%1), %%xmm2          \n\t"
00362             "addps      %%xmm4, %%xmm7          \n\t"
00363             "mulps      32(%2), %%xmm1          \n\t"
00364             "movups     48(%1), %%xmm3          \n\t"
00365             "mulps      32(%2), %%xmm2          \n\t"
00366             "addps      %%xmm1, %%xmm6          \n\t"
00367             "movups     56(%1), %%xmm4          \n\t"
00368             "addps      %%xmm2, %%xmm7          \n\t"
00369             "mulps      48(%2), %%xmm3          \n\t"
00370             "add        $64,    %1              \n\t"
00371             "mulps      48(%2), %%xmm4          \n\t"
00372             "addps      %%xmm3, %%xmm6          \n\t"
00373             "add        $64,    %2              \n\t"
00374             "addps      %%xmm4, %%xmm7          \n\t"
00375             "sub        $1,     %%ecx           \n\t"
00376             "jnz        1b                      \n\t"
00377             "movhlps    %%xmm6, %%xmm0          \n\t"
00378             "movlhps    %%xmm7, %%xmm0          \n\t"
00379             "shufps     $0xe4,  %%xmm7, %%xmm6  \n\t"
00380             "addps      %%xmm0, %%xmm6          \n\t"
00381             "movups     %%xmm6, (%0)            \n\t"
00382             :
00383             :"r"(dest),"r"(src),"r"(filterCoeffsAlign),"c"(length>>3)
00384         );
00385         src  += 4;
00386         dest += 4;
00387     }
00388 
00389     return count;
00390 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends