libavcodec/x86/mpegaudiodec_mmx.c
Go to the documentation of this file.
00001 /*
00002  * MMX optimized MP3 decoding functions
00003  * Copyright (c) 2010 Vitor Sessak
00004  *
00005  * This file is part of Libav.
00006  *
00007  * Libav is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU Lesser General Public
00009  * License as published by the Free Software Foundation; either
00010  * version 2.1 of the License, or (at your option) any later version.
00011  *
00012  * Libav is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015  * Lesser General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU Lesser General Public
00018  * License along with Libav; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00020  */
00021 
00022 #include "libavutil/cpu.h"
00023 #include "libavutil/x86_cpu.h"
00024 #include "libavcodec/dsputil.h"
00025 #include "libavcodec/mpegaudiodsp.h"
00026 
/*
 * IMDCT kernels that are only declared in this file; their definitions live
 * elsewhere (presumably in the yasm sources, since every use below sits
 * inside HAVE_YASM -- confirm).
 *
 * The four-argument functions are called once per block by the tail loop of
 * DECL_IMDCT_BLOCKS; the ff_four_* variants are called once per group of
 * four blocks and take an extra scratch buffer.
 */
00027 void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
00028 void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
00029 void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
00030 void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
00031 void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
00032 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
00033                                float *tmpbuf);
00034 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
00035                                float *tmpbuf);
00036 
/*
 * Interleaved window table handed to the ff_four_imdct36_float_*() kernels,
 * filled in by ff_mpadsp_init_mmx() from ff_mdct_win_float.
 *
 * Indexing is [variant][window row][4 * i + lane]:
 *   variant 0: lanes 0/2 hold row j, lanes 1/3 hold row j + 4;
 *   variant 1: lanes 0/1 hold rows 0 and 4, lanes 2/3 hold rows j and j + 4.
 * Variant 1 is selected for the first group of four blocks when
 * switch_point is set (see DECL_IMDCT_BLOCKS).
 */
00037 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
00038 
/*
 * Accumulation primitives used as the 'op' argument of SUM8():
 *   MACS: rt += ra * rb
 *   MLSS: rt -= ra * rb
 * Arguments and the whole expansion are parenthesized so the macros can be
 * used inside larger expressions.
 */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply 'op' to 'sum' for eight (w, p) element pairs spaced 64 elements
 * apart, i.e. indices 0, 64, 128, ... 448, in that order.
 *
 * 'w' and 'p' are evaluated eight times each, so they must not have side
 * effects.  The body is wrapped in do { } while (0) so that "SUM8(...);"
 * behaves as a single statement, including in an unbraced if/else.
 */
#define SUM8(op, sum, w, p)               \
do {                                      \
    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
} while (0)
00053 
00054 static void apply_window(const float *buf, const float *win1,
00055                          const float *win2, float *sum1, float *sum2, int len)
00056 {
00057     x86_reg count = - 4*len;
00058     const float *win1a = win1+len;
00059     const float *win2a = win2+len;
00060     const float *bufa  = buf+len;
00061     float *sum1a = sum1+len;
00062     float *sum2a = sum2+len;
00063 
00064 
00065 #define MULT(a, b)                                 \
00066     "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
00067     "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
00068     "mulps         %%xmm2, %%xmm1           \n\t"  \
00069     "subps         %%xmm1, %%xmm0           \n\t"  \
00070     "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
00071     "subps         %%xmm2, %%xmm4           \n\t"  \
00072 
00073     __asm__ volatile(
00074             "1:                                   \n\t"
00075             "xorps       %%xmm0, %%xmm0           \n\t"
00076             "xorps       %%xmm4, %%xmm4           \n\t"
00077 
00078             MULT(   0,   0)
00079             MULT( 256,  64)
00080             MULT( 512, 128)
00081             MULT( 768, 192)
00082             MULT(1024, 256)
00083             MULT(1280, 320)
00084             MULT(1536, 384)
00085             MULT(1792, 448)
00086 
00087             "movaps      %%xmm0, (%4,%0)          \n\t"
00088             "movaps      %%xmm4, (%5,%0)          \n\t"
00089             "add            $16,  %0              \n\t"
00090             "jl              1b                   \n\t"
00091             :"+&r"(count)
00092             :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
00093             );
00094 
00095 #undef MULT
00096 }
00097 
/**
 * SSE windowing routine; ff_mpadsp_init_mmx() installs it as
 * MPADSPContext.apply_window_float.
 *
 * Produces 32 output floats at out[0], out[incr], ... out[31 * incr] from
 * eight-tap dot products of in and win (taps are 64 elements apart).
 *
 * @param in     sample buffer.  Its first 32 floats are copied to
 *               in[512..543], so it must be writable and at least 544
 *               floats long.  It is read with movaps inside apply_window(),
 *               which requires 16-byte alignment.
 * @param win    window coefficients; only read here, also with movaps
 * @param unused never referenced by this implementation
 * @param out    destination.  When incr == 1 it is written with movaps and
 *               must therefore be 16-byte aligned.
 * @param incr   distance, in floats, between consecutive output samples
 */
00098 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
00099                              int incr)
00100 {
    /* apply_window() fills entries 0..15 of each array.  Entry 16 exists so
     * that the unaligned 4-float loads at byte offset 52 (floats 13..16) in
     * the SUMS code below stay inside the array. */
00101     LOCAL_ALIGNED_16(float, suma, [17]);
00102     LOCAL_ALIGNED_16(float, sumb, [17]);
00103     LOCAL_ALIGNED_16(float, sumc, [17]);
00104     LOCAL_ALIGNED_16(float, sumd, [17]);
00105 
00106     float sum;
00107 
    /* Duplicate the first 32 samples just past index 512.  Presumably the
     * caller treats in as a circular buffer and this keeps reads that run
     * past 512 contiguous -- confirm against the caller. */
00108     /* copy to avoid wrap */
00109     memcpy(in + 512, in, 32 * sizeof(*in));
00110 
    /* Four sets of 16 negated dot products (see apply_window()). */
00111     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
00112     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
00113 
    /* Add eight more taps to the first element only. */
00114     SUM8(MACS, suma[0], win + 32, in + 48);
00115 
    /* sumd[16] is paired with suma[0] by the first SUMS group; zeroing it
     * makes out[0] = -suma[0], the same value the scalar branch stores.
     * NOTE(review): sumc[0] and sumb[16] do not appear to be read below. */
00116     sumc[ 0] = 0;
00117     sumb[16] = 0;
00118     sumd[16] = 0;
00119 
/* SUMS(suma, sumb, sumc, sumd, out1, out2); all arguments are byte offsets.
 *   out1: four floats of sumd in reversed order (shufps 0x1b) minus four
 *         floats of suma
 *   out2: four floats of sumc in reversed order plus four floats of sumb
 * The sumd/sumc loads are unaligned (movups); the suma/sumb operands and
 * both stores need 16-byte alignment. */
00120 #define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
00121             "movups " #sumd "(%4),       %%xmm0          \n\t" \
00122             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
00123             "subps  " #suma "(%1),       %%xmm0          \n\t" \
00124             "movaps        %%xmm0," #out1 "(%0)          \n\t" \
00125 \
00126             "movups " #sumc "(%3),       %%xmm0          \n\t" \
00127             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
00128             "addps  " #sumb "(%2),       %%xmm0          \n\t" \
00129             "movaps        %%xmm0," #out2 "(%0)          \n\t"
00130 
00131     if (incr == 1) {
        /* Contiguous output: write out[0..31] four floats at a time.
         * out[16] receives a placeholder here and is overwritten by the
         * final store at the end of the function. */
00132         __asm__ volatile(
00133             SUMS( 0, 48,  4, 52,  0, 112)
00134             SUMS(16, 32, 20, 36, 16,  96)
00135             SUMS(32, 16, 36, 20, 32,  80)
00136             SUMS(48,  0, 52,  4, 48,  64)
00137 
00138             :"+&r"(out)
00139             :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
00140             :"memory"
00141             );
00142         out += 16*incr;
00143     } else {
        /* Strided output: out walks forward from sample 0 while out2 walks
         * backward from sample 32; both stop next to sample 16. */
00144         int j;
00145         float *out2 = out + 32 * incr;
00146         out[0  ]  = -suma[   0];
00147         out += incr;
00148         out2 -= incr;
00149         for(j=1;j<16;j++) {
00150             *out  = -suma[   j] + sumd[16-j];
00151             *out2 =  sumb[16-j] + sumc[   j];
00152             out  += incr;
00153             out2 -= incr;
00154         }
00155     }
00156 
    /* In both branches out now points at sample 16 (16 * incr past its
     * initial value); that sample is a single negated 8-tap sum. */
00157     sum = 0;
00158     SUM8(MLSS, sum, win + 16 + 32, in + 32);
00159     *out = sum;
00160 }
00161 
00162 
#if HAVE_YASM
/*
 * DECL_IMDCT_BLOCKS(CPU1, CPU2) defines imdct36_blocks_<CPU1>(), which
 * processes 'count' blocks:
 *   - every complete group of four blocks goes through
 *     ff_four_imdct36_float_<CPU2>() with an interleaved window taken from
 *     mdct_win_sse and a 1024-float aligned scratch buffer;
 *   - the remaining (count & 3) blocks go through ff_imdct36_float_<CPU1>()
 *     one at a time with a window row from ff_mdct_win_float.
 *
 * Pointer stepping: a group advances in and buf by 4 * 18 floats and out by
 * 4; a single block advances in by 18 and buf and out by 1.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                        \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,       \
                               int count, int switch_point, int block_type)  \
{                                                                            \
    const int leftover = count & 3;                                          \
    const int grouped  = count - leftover;                                   \
    int blk = 0;                                                             \
                                                                             \
    /* complete groups of four blocks */                                     \
    while (blk < grouped) {                                                  \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                             \
        /* table variant 1 is used for the first group if switch_point */    \
        const int variant = switch_point && blk < 4;                         \
        float *win = mdct_win_sse[variant][block_type];                      \
                                                                             \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);           \
                                                                             \
        blk += 4;                                                            \
        in  += 4 * 18;                                                       \
        buf += 4 * 18;                                                       \
        out += 4;                                                            \
    }                                                                        \
                                                                             \
    /* remaining blocks, one call each */                                    \
    for (; blk < count; blk++, in += 18, buf++, out++) {                     \
        /* row 0 for the first two blocks if switch_point, else block_type */\
        int row = block_type;                                                \
        float *win;                                                          \
                                                                             \
        if (switch_point && blk < 2)                                         \
            row = 0;                                                         \
        /* odd-numbered blocks use the row four entries further on */        \
        if (blk & 1)                                                         \
            row += 4;                                                        \
        win = ff_mdct_win_float[row];                                        \
                                                                             \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                        \
    }                                                                        \
}

DECL_IMDCT_BLOCKS(sse,sse)
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
DECL_IMDCT_BLOCKS(avx,avx)
#endif /* HAVE_YASM */
00202 
00203 void ff_mpadsp_init_mmx(MPADSPContext *s)
00204 {
00205     int mm_flags = av_get_cpu_flags();
00206 
00207     int i, j;
00208     for (j = 0; j < 4; j++) {
00209         for (i = 0; i < 40; i ++) {
00210             mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
00211             mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
00212             mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
00213             mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00214             mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
00215             mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
00216             mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
00217             mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00218         }
00219     }
00220 
00221     if (mm_flags & AV_CPU_FLAG_SSE2) {
00222         s->apply_window_float = apply_window_mp3;
00223     }
00224 #if HAVE_YASM
00225     if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
00226         s->imdct36_blocks_float = imdct36_blocks_avx;
00227 #if HAVE_SSE
00228     } else if (mm_flags & AV_CPU_FLAG_SSSE3) {
00229         s->imdct36_blocks_float = imdct36_blocks_ssse3;
00230     } else if (mm_flags & AV_CPU_FLAG_SSE3) {
00231         s->imdct36_blocks_float = imdct36_blocks_sse3;
00232     } else if (mm_flags & AV_CPU_FLAG_SSE2) {
00233         s->imdct36_blocks_float = imdct36_blocks_sse2;
00234     } else if (mm_flags & AV_CPU_FLAG_SSE) {
00235         s->imdct36_blocks_float = imdct36_blocks_sse;
00236 #endif /* HAVE_SSE */
00237     }
00238 #endif /* HAVE_YASM */
00239 }