libavcodec/x86/h264dsp_mmx.c
/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/h264dsp.h"
#include "dsputil_mmx.h"

DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1  ) = 0x0103010301030103ULL;

/***********************************/
/* IDCT */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT (uint8_t *dst, int16_t *block, int stride);

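/* Each invocation declares the prototype of one asm implementation, e.g.
 * IDCT_ADD_FUNC(8, 10, sse2) expands to
 * void ff_h264_idct8_add_10_sse2(uint8_t *dst, int16_t *block, int stride); */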
IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmx2)
IDCT_ADD_FUNC(_dc, 10, mmx2)
IDCT_ADD_FUNC(8_dc, 8, mmx2)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
#if HAVE_AVX
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)
#endif


#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
                              (uint8_t *dst, const int *block_offset, \
                              DCTELEM *block, int stride, const uint8_t nnzc[6*8]);

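/* REP variants process a group of blocks per call, e.g.
 * IDCT_ADD_REP_FUNC(, 16, 8, mmx) expands to
 * void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
 *                               DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
 * which adds up to 16 4x4 blocks, using nnzc to skip all-zero blocks. */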
IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmx2)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx2)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx2)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
#if HAVE_AVX
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
#endif


#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
                              (uint8_t **dst, const int *block_offset, \
                              DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
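/* Same as IDCT_ADD_REP_FUNC, but dst is an array of plane pointers; only
 * used for the chroma idct_add8 functions below. */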
IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmx2)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
#if HAVE_AVX
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
#endif

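/* Dequantization plus inverse transform of the 4x4 luma DC block of an
 * intra16x16 macroblock; note that the MMX version is only selected when
 * CMOV is also available (see ff_h264dsp_init_x86 below). */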
void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);

/***********************************/
/* deblocking */

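/* Boundary-strength computation: for each 4-sample edge segment, bS is 2 if
 * either side has nonzero coefficients, 1 if the two sides use different
 * reference frames or their motion vectors differ by 4 or more in quarter-pel
 * units (the field case lowers the vertical threshold to 2 via ff_pb_3_1),
 * and 0 otherwise. The macro below runs this over all edges of one direction;
 * mm5/mm6/mm7 are preloaded by the caller and stay live across iterations. */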
#define h264_loop_filter_strength_iteration_mmx2(bS, nnz, ref, mv, bidir, edges, step, mask_mv, dir, d_idx, mask_dir) \
    do { \
        x86_reg b_idx; \
        mask_mv <<= 3; \
        for( b_idx=0; b_idx<edges; b_idx+=step ) { \
            if (!mask_dir) \
            __asm__ volatile( \
                    "pxor %%mm0, %%mm0 \n\t" \
                    :: \
            ); \
            if(!(mask_mv & b_idx)) { \
                if(bidir) { \
                    __asm__ volatile( \
                        "movd         %a3(%0,%2), %%mm2 \n" \
                        "punpckldq    %a4(%0,%2), %%mm2 \n" /* { ref0[bn], ref1[bn] } */ \
                        "pshufw $0x44, 12(%0,%2), %%mm0 \n" /* { ref0[b], ref0[b] } */ \
                        "pshufw $0x44, 52(%0,%2), %%mm1 \n" /* { ref1[b], ref1[b] } */ \
                        "pshufw $0x4E, %%mm2, %%mm3 \n" \
                        "psubb         %%mm2, %%mm0 \n" /* { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } */ \
                        "psubb         %%mm3, %%mm1 \n" /* { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } */ \
 \
                        "por           %%mm1, %%mm0 \n" \
                        "movq   %a5(%1,%2,4), %%mm1 \n" \
                        "movq   %a6(%1,%2,4), %%mm2 \n" \
                        "movq          %%mm1, %%mm3 \n" \
                        "movq          %%mm2, %%mm4 \n" \
                        "psubw   48(%1,%2,4), %%mm1 \n" \
                        "psubw   56(%1,%2,4), %%mm2 \n" \
                        "psubw  208(%1,%2,4), %%mm3 \n" \
                        "psubw  216(%1,%2,4), %%mm4 \n" \
                        "packsswb      %%mm2, %%mm1 \n" \
                        "packsswb      %%mm4, %%mm3 \n" \
                        "paddb         %%mm6, %%mm1 \n" \
                        "paddb         %%mm6, %%mm3 \n" \
                        "psubusb       %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
                        "psubusb       %%mm5, %%mm3 \n" \
                        "packsswb      %%mm3, %%mm1 \n" \
 \
                        "por           %%mm1, %%mm0 \n" \
                        "movq   %a7(%1,%2,4), %%mm1 \n" \
                        "movq   %a8(%1,%2,4), %%mm2 \n" \
                        "movq          %%mm1, %%mm3 \n" \
                        "movq          %%mm2, %%mm4 \n" \
                        "psubw   48(%1,%2,4), %%mm1 \n" \
                        "psubw   56(%1,%2,4), %%mm2 \n" \
                        "psubw  208(%1,%2,4), %%mm3 \n" \
                        "psubw  216(%1,%2,4), %%mm4 \n" \
                        "packsswb      %%mm2, %%mm1 \n" \
                        "packsswb      %%mm4, %%mm3 \n" \
                        "paddb         %%mm6, %%mm1 \n" \
                        "paddb         %%mm6, %%mm3 \n" \
                        "psubusb       %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
                        "psubusb       %%mm5, %%mm3 \n" \
                        "packsswb      %%mm3, %%mm1 \n" \
 \
                        "pshufw $0x4E, %%mm1, %%mm1 \n" \
                        "por           %%mm1, %%mm0 \n" \
                        "pshufw $0x4E, %%mm0, %%mm1 \n" \
                        "pminub        %%mm1, %%mm0 \n" \
                        ::"r"(ref), \
                          "r"(mv), \
                          "r"(b_idx), \
                          "i"(d_idx+12), \
                          "i"(d_idx+52), \
                          "i"(d_idx*4+48), \
                          "i"(d_idx*4+56), \
                          "i"(d_idx*4+208), \
                          "i"(d_idx*4+216) \
                    ); \
                } else { \
                    __asm__ volatile( \
                        "movd   12(%0,%2), %%mm0 \n" \
                        "psubb %a3(%0,%2), %%mm0 \n" /* ref[b] != ref[bn] */ \
                        "movq   48(%1,%2,4), %%mm1 \n" \
                        "movq   56(%1,%2,4), %%mm2 \n" \
                        "psubw %a4(%1,%2,4), %%mm1 \n" \
                        "psubw %a5(%1,%2,4), %%mm2 \n" \
                        "packsswb   %%mm2, %%mm1 \n" \
                        "paddb      %%mm6, %%mm1 \n" \
                        "psubusb    %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
                        "packsswb   %%mm1, %%mm1 \n" \
                        "por        %%mm1, %%mm0 \n" \
                        ::"r"(ref), \
                          "r"(mv), \
                          "r"(b_idx), \
                          "i"(d_idx+12), \
                          "i"(d_idx*4+48), \
                          "i"(d_idx*4+56) \
                    ); \
                } \
            } \
            __asm__ volatile( \
                "movd 12(%0,%1), %%mm1 \n" \
                "por %a2(%0,%1), %%mm1 \n" /* nnz[b] || nnz[bn] */ \
                ::"r"(nnz), \
                  "r"(b_idx), \
                  "i"(d_idx+12) \
            ); \
            __asm__ volatile( \
                "pminub    %%mm7, %%mm1 \n" \
                "pminub    %%mm7, %%mm0 \n" \
                "psllw        $1, %%mm1 \n" \
                "pxor      %%mm2, %%mm2 \n" \
                "pmaxub    %%mm0, %%mm1 \n" \
                "punpcklbw %%mm2, %%mm1 \n" \
                "movq      %%mm1, %a1(%0,%2) \n" \
                ::"r"(bS), \
                  "i"(32*dir), \
                  "r"(b_idx) \
                :"memory" \
            ); \
        } \
    } while (0)

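/* Entry point: preloads mm7 = ff_pb_1 (to clamp nnz values to 0/1), mm6 = the
 * packed mvd offset (3 per component, or 3/1 for field pairs) and mm5 = 2*mm6
 * (the unsigned-saturation limit), then runs the iteration macro once per
 * direction. */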
static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                            int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
    __asm__ volatile(
        "movq %0, %%mm7 \n"
        "movq %1, %%mm6 \n"
        ::"m"(ff_pb_1), "m"(ff_pb_3)
    );
    if(field)
        __asm__ volatile(
            "movq %0, %%mm6 \n"
            ::"m"(ff_pb_3_1)
        );
    __asm__ volatile(
        "movq  %%mm6, %%mm5 \n"
        "paddb %%mm5, %%mm5 \n"
    :);

    // could do a special case for dir==0 && edges==1, but it only reduces the
    // average filter time by 1.2%
    step  <<= 3;
    edges <<= 3;
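    // pass 1: compare with the neighbour one cache row up (d_idx -8; the
    // caches have a stride of 8); pass 2: compare with the left neighbour
    // (d_idx -1), whose results land in bS[0] in transposed order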
    h264_loop_filter_strength_iteration_mmx2(bS, nnz, ref, mv, bidir, edges, step, mask_mv1, 1, -8,  0);
    h264_loop_filter_strength_iteration_mmx2(bS, nnz, ref, mv, bidir,    32,    8, mask_mv0, 0, -1, -1);

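    // transpose bS[0] back into row order (see above)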
    __asm__ volatile(
        "movq   (%0), %%mm0 \n\t"
        "movq  8(%0), %%mm1 \n\t"
        "movq 16(%0), %%mm2 \n\t"
        "movq 24(%0), %%mm3 \n\t"
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
        "movq %%mm0,   (%0) \n\t"
        "movq %%mm3,  8(%0) \n\t"
        "movq %%mm4, 16(%0) \n\t"
        "movq %%mm2, 24(%0) \n\t"
        ::"r"(bS[0])
        :"memory"
    );
}

#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
                                                                int alpha, int beta, int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
                                                                int alpha, int beta);

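/* LF_FUNC declares an inter deblocking filter (per-segment tc0 thresholds),
 * LF_IFUNC the intra variant (alpha/beta only). For example,
 * LF_FUNC(v, luma, 8, sse2) expands to
 * void ff_deblock_v_luma_8_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); */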
#define LF_FUNCS(type, depth)\
LF_FUNC (h,  chroma,       depth, mmxext)\
LF_IFUNC(h,  chroma_intra, depth, mmxext)\
LF_FUNC (v,  chroma,       depth, mmxext)\
LF_IFUNC(v,  chroma_intra, depth, mmxext)\
LF_FUNC (h,  luma,         depth, mmxext)\
LF_IFUNC(h,  luma_intra,   depth, mmxext)\
LF_FUNC (h,  luma,         depth, sse2)\
LF_IFUNC(h,  luma_intra,   depth, sse2)\
LF_FUNC (v,  luma,         depth, sse2)\
LF_IFUNC(v,  luma_intra,   depth, sse2)\
LF_FUNC (h,  chroma,       depth, sse2)\
LF_IFUNC(h,  chroma_intra, depth, sse2)\
LF_FUNC (v,  chroma,       depth, sse2)\
LF_IFUNC(v,  chroma_intra, depth, sse2)\
LF_FUNC (h,  luma,         depth,  avx)\
LF_IFUNC(h,  luma_intra,   depth,  avx)\
LF_FUNC (v,  luma,         depth,  avx)\
LF_IFUNC(v,  luma_intra,   depth,  avx)\
LF_FUNC (h,  chroma,       depth,  avx)\
LF_IFUNC(h,  chroma_intra, depth,  avx)\
LF_FUNC (v,  chroma,       depth,  avx)\
LF_IFUNC(v,  chroma_intra, depth,  avx)

LF_FUNCS( uint8_t,  8)
LF_FUNCS(uint16_t, 10)

#if ARCH_X86_32
LF_FUNC (v8, luma,             8, mmxext)
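/* The v8 variant filters only 8 pixels per call (one 64-bit MMX register
 * wide), so a 16-pixel luma edge takes two calls. tc0 < 0 means "do not
 * filter this 4-pixel segment"; (tc0[x] & tc0[y]) >= 0 is false only when
 * both values are negative, so a half is skipped only if both of its
 * segments are unfiltered. */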
static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    if((tc0[0] & tc0[1]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix+0, stride, alpha, beta, tc0);
    if((tc0[2] & tc0[3]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix+8, stride, alpha, beta, tc0+2);
}
LF_IFUNC(v8, luma_intra,        8, mmxext)
static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, int alpha, int beta)
{
    ff_deblock_v8_luma_intra_8_mmxext(pix+0, stride, alpha, beta);
    ff_deblock_v8_luma_intra_8_mmxext(pix+8, stride, alpha, beta);
}
#endif /* ARCH_X86_32 */

LF_FUNC (v,  luma,            10, mmxext)
LF_IFUNC(v,  luma_intra,      10, mmxext)

/***********************************/
/* weighted prediction */

#define H264_WEIGHT(W, OPT) \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
    int stride, int height, int log2_denom, int weight, int offset);

#define H264_BIWEIGHT(W, OPT) \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
    uint8_t *src, int stride, int height, int log2_denom, int weightd, \
    int weights, int offset);

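/* W is the block width in pixels; e.g. H264_WEIGHT(16, mmx2) declares
 * void ff_h264_weight_16_mmx2(uint8_t *dst, int stride, int height,
 *                             int log2_denom, int weight, int offset);
 * weight_* applies explicit weighted prediction to dst in place, biweight_*
 * forms the weighted average of the two predictions in dst and src. */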
#define H264_BIWEIGHT_MMX(W) \
H264_WEIGHT  (W, mmx2) \
H264_BIWEIGHT(W, mmx2)

#define H264_BIWEIGHT_MMX_SSE(W) \
H264_BIWEIGHT_MMX(W) \
H264_WEIGHT      (W, sse2) \
H264_BIWEIGHT    (W, sse2) \
H264_BIWEIGHT    (W, ssse3)

H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE( 8)
H264_BIWEIGHT_MMX    ( 4)

#define H264_WEIGHT_10(W, DEPTH, OPT) \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
    int stride, int height, int log2_denom, int weight, int offset);

#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
     int weightd, int weights, int offset);

#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
H264_WEIGHT_10  (W, DEPTH, sse2) \
H264_WEIGHT_10  (W, DEPTH, sse4) \
H264_BIWEIGHT_10(W, DEPTH, sse2) \
H264_BIWEIGHT_10(W, DEPTH, sse4)

H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE( 8, 10)
H264_BIWEIGHT_10_SSE( 4, 10)

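/* Runtime dispatch: overwrite the C defaults in *c with the best asm version
 * the host CPU supports; each successive flag test may replace pointers set
 * by the previous, less specialized one. */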
void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
{
    int mm_flags = av_get_cpu_flags();

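    /* chroma_format_idc, as coded in the SPS: 0 = monochrome, 1 = 4:2:0,
     * 2 = 4:2:2, 3 = 4:4:4; the MMX2 strength function only handles the
     * 4:2:0 (and monochrome) layout */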
    if (chroma_format_idc <= 1 && mm_flags & AV_CPU_FLAG_MMX2) {
        c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
    }

    if (bit_depth == 8) {
#if HAVE_YASM
    if (mm_flags & AV_CPU_FLAG_MMX) {
        c->h264_idct_dc_add         =
        c->h264_idct_add            = ff_h264_idct_add_8_mmx;
        c->h264_idct8_dc_add        =
        c->h264_idct8_add           = ff_h264_idct8_add_8_mmx;

        c->h264_idct_add16          = ff_h264_idct_add16_8_mmx;
        c->h264_idct8_add4          = ff_h264_idct8_add4_8_mmx;
        if (chroma_format_idc <= 1)
            c->h264_idct_add8       = ff_h264_idct_add8_8_mmx;
        c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_mmx;
        if (mm_flags & AV_CPU_FLAG_CMOV)
            c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;

        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->h264_idct_dc_add    = ff_h264_idct_dc_add_8_mmx2;
            c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_8_mmx2;
            c->h264_idct_add16     = ff_h264_idct_add16_8_mmx2;
            c->h264_idct8_add4     = ff_h264_idct8_add4_8_mmx2;
            if (chroma_format_idc <= 1)
                c->h264_idct_add8  = ff_h264_idct_add8_8_mmx2;
            c->h264_idct_add16intra= ff_h264_idct_add16intra_8_mmx2;

            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmxext;
            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmxext;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext;
                c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmxext;
            }
#if ARCH_X86_32
            c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmxext;
            c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmxext;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif
            c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
            c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
            c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;

            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;

            if (mm_flags&AV_CPU_FLAG_SSE2) {
                c->h264_idct8_add           = ff_h264_idct8_add_8_sse2;

                c->h264_idct_add16          = ff_h264_idct_add16_8_sse2;
                c->h264_idct8_add4          = ff_h264_idct8_add4_8_sse2;
                if (chroma_format_idc <= 1)
                    c->h264_idct_add8       = ff_h264_idct_add8_8_sse2;
                c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_sse2;
                c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;

                c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
                c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;

                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;

#if HAVE_ALIGNED_STACK
                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
                c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
#endif
            }
            if (mm_flags&AV_CPU_FLAG_SSSE3) {
                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
            }
            if (mm_flags&AV_CPU_FLAG_AVX) {
#if HAVE_ALIGNED_STACK
                c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
                c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
#endif
            }
        }
    }
#endif
    } else if (bit_depth == 10) {
#if HAVE_YASM
    if (mm_flags & AV_CPU_FLAG_MMX) {
        if (mm_flags & AV_CPU_FLAG_MMX2) {
#if ARCH_X86_32
            c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmxext;
            c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmxext;
            c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmxext;
            c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmxext;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
#endif
            c->h264_idct_dc_add= ff_h264_idct_dc_add_10_mmx2;
            if (mm_flags&AV_CPU_FLAG_SSE2) {
                c->h264_idct_add       = ff_h264_idct_add_10_sse2;
                c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_10_sse2;

                c->h264_idct_add16     = ff_h264_idct_add16_10_sse2;
                if (chroma_format_idc <= 1)
                    c->h264_idct_add8  = ff_h264_idct_add8_10_sse2;
                c->h264_idct_add16intra= ff_h264_idct_add16intra_10_sse2;
#if HAVE_ALIGNED_STACK
                c->h264_idct8_add      = ff_h264_idct8_add_10_sse2;
                c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
#endif

                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;

                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;

                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
#if HAVE_ALIGNED_STACK
                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif
            }
            if (mm_flags&AV_CPU_FLAG_SSE4) {
                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;

                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
            }
#if HAVE_AVX
            if (mm_flags&AV_CPU_FLAG_AVX) {
                c->h264_idct_dc_add    =
                c->h264_idct_add       = ff_h264_idct_add_10_avx;
                c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_10_avx;

                c->h264_idct_add16     = ff_h264_idct_add16_10_avx;
                if (chroma_format_idc <= 1)
                    c->h264_idct_add8  = ff_h264_idct_add8_10_avx;
                c->h264_idct_add16intra= ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
                c->h264_idct8_add      = ff_h264_idct8_add_10_avx;
                c->h264_idct8_add4     = ff_h264_idct8_add4_10_avx;
#endif

                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx;
                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx;
#if HAVE_ALIGNED_STACK
                c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
                c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
#endif
            }
#endif /* HAVE_AVX */
        }
    }
#endif
    }
}