MythTV  0.26-pre
yuv2rgb.cpp
Go to the documentation of this file.
00001 /*
00002  * yuv2rgb_mmx.c
00003  * Copyright (C) 2000-2001 Silicon Integrated System Corp.
00004  * All Rights Reserved.
00005  *
00006  * Author: Olie Lho <ollie@sis.com.tw>
00007  *
00008  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
00009  * See http://libmpeg2.sourceforge.net/ for updates.
00010  *
00011  * mpeg2dec is free software; you can redistribute it and/or modify
00012  * it under the terms of the GNU General Public License as published by
00013  * the Free Software Foundation; either version 2 of the License, or
00014  * (at your option) any later version.
00015  *
00016  * mpeg2dec is distributed in the hope that it will be useful,
00017  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00018  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00019  * GNU General Public License for more details.
00020  *
00021  * You should have received a copy of the GNU General Public License
00022  * along with this program; if not, write to the Free Software
00023  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00024  */
00025 
00026 #include <cstdio>
00027 #include <cstdlib>
00028 #include <algorithm>
00029 #include <inttypes.h>
00030 #include <limits.h>
00031 #include "mythconfig.h"
00032 #include "mythtvexp.h"      // for MUNUSED
00033 
00034 #if HAVE_MMX
00035 extern "C" {
00036 #include "ffmpeg-mmx.h"
00037 }
00038 #define CPU_MMXEXT 0
00039 #define CPU_MMX 1
00040 #endif
00041 
00042 #if HAVE_ALTIVEC
00043 extern "C" {
00044 #include "libavutil/cpu.h"
00045 }
00046 int has_altivec(void); 
00047 #if HAVE_ALTIVEC_H
00048 #include <altivec.h>
00049 #else
00050 #include <Accelerate/Accelerate.h>
00051 #endif
00052 #endif
00053 #include "yuv2rgb.h"
00054 
00055 #if HAVE_ALTIVEC
00056 int has_altivec(void)
00057 {
00058     int cpu_flags = av_get_cpu_flags();
00059     if (cpu_flags & AV_CPU_FLAG_ALTIVEC)
00060         return(1);
00061 
00062     return(0);
00063 }
00064 #endif
00065 
/*
 * Forward declaration of the plain C YUV420 -> 32-bit RGB converter that is
 * defined further down in this file.  It is declared here so that the
 * MUNUSED marker (from mythtvexp.h) can be attached to it; per the trailing
 * comment this is done to silence a compiler warning
 * (presumably "defined but not used" in some build configurations).
 */
00071 static void yuv420_argb32_non_mmx(unsigned char *image, unsigned char *py,
00072                            unsigned char *pu, unsigned char *pv,
00073                            int h_size, int v_size, int rgb_stride,
00074                            int y_stride, int uv_stride, int alphaones)
00075    MUNUSED; /* <- suppress compiler warning */
00076 
00077 /* CPU_MMXEXT/CPU_MMX adaptation layer */
00078 
/*
 * movntq(src, dest): store the MMX register `src` to the memory operand
 * `dest`.
 *
 * The macro reads a variable named `cpu` from the scope it is expanded in
 * (it is not a macro parameter): when cpu == CPU_MMXEXT the store is done
 * with movntq_r2m, otherwise with movq_r2m.
 *
 * NOTE(review): this definition sits outside the `#if HAVE_MMX` guard even
 * though CPU_MMXEXT is only defined inside it; that is harmless as long as
 * the macro is only expanded from MMX-only code.
 */
00079 #define movntq(src,dest)        \
00080 do {                            \
00081     if (cpu == CPU_MMXEXT)      \
00082         movntq_r2m (src, dest); \
00083     else                        \
00084         movq_r2m (src, dest);   \
00085 } while (0)
00086 
00087 #if HAVE_MMX
/*
 * Convert one group of 8 horizontally adjacent pixels from planar YUV to
 * planar R, G and B bytes held in MMX registers.
 *
 * Input:  py -> 8 luma bytes, pu / pv -> 4 chroma bytes each (every chroma
 *         sample is shared by two neighbouring pixels).
 * Output: nothing is written to memory.  On return
 *           mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 (one byte per pixel),
 *         which is exactly what mmx_unpack_16rgb() / mmx_unpack_32rgb()
 *         expect to find.  mm3..mm7 are used as scratch.
 *
 * The statement order matters: each step relies on register contents left
 * behind by the previous ones.
 */
00088 static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv)
00089 {
      /* Fixed-point constants, replicated across the four 16-bit lanes. */
00090     static mmx_t mmx_80w = {0x0080008000800080LL};
00091     static mmx_t mmx_U_green = {0xf37df37df37df37dLL};
00092     static mmx_t mmx_U_blue = {0x4093409340934093LL};
00093     static mmx_t mmx_V_red = {0x3312331233123312LL};
00094     static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcLL};
      /* Despite the "w" suffix this one holds 0x10 in every *byte*; it is
       * used with the byte-wise psubusb below. */
00095     static mmx_t mmx_10w = {0x1010101010101010LL};
00096     static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffLL};
00097     static mmx_t mmx_Y_coeff = {0x253f253f253f253fLL};
00098 
00099     movd_m2r (*pu, mm0);                // mm0 = 00 00 00 00 u3 u2 u1 u0
00100     movd_m2r (*pv, mm1);                // mm1 = 00 00 00 00 v3 v2 v1 v0
00101     movq_m2r (*py, mm6);                // mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
00102     pxor_r2r (mm4, mm4);                // mm4 = 0
00103     /* XXX might do cache preload for image here */
00104 
00105     /*
00106      * Do the multiply part of the conversion for even and odd pixels
00107      * register usage:
00108      * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
00109      * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels
00110      * mm6 -> Y even, mm7 -> Y odd
00111      */
00112 
00113     punpcklbw_r2r (mm4, mm0);           // mm0 = u3 u2 u1 u0 (widened to 16 bit)
00114     punpcklbw_r2r (mm4, mm1);           // mm1 = v3 v2 v1 v0 (widened to 16 bit)
00115     psubsw_m2r (mmx_80w, mm0);          // u -= 128
00116     psubsw_m2r (mmx_80w, mm1);          // v -= 128
00117     psllw_i2r (3, mm0);                 // promote precision
00118     psllw_i2r (3, mm1);                 // promote precision
00119     movq_r2r (mm0, mm2);                // mm2 = u3 u2 u1 u0
00120     movq_r2r (mm1, mm3);                // mm3 = v3 v2 v1 v0
00121     pmulhw_m2r (mmx_U_green, mm2);      // mm2 = u * u_green
00122     pmulhw_m2r (mmx_V_green, mm3);      // mm3 = v * v_green
00123     pmulhw_m2r (mmx_U_blue, mm0);       // mm0 = chroma_b
00124     pmulhw_m2r (mmx_V_red, mm1);        // mm1 = chroma_r
00125     paddsw_r2r (mm3, mm2);              // mm2 = chroma_g
00126 
00127     psubusb_m2r (mmx_10w, mm6);         // Y -= 16 (unsigned saturating, per byte)
00128     movq_r2r (mm6, mm7);                // mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
00129     pand_m2r (mmx_00ffw, mm6);          // mm6 =    Y6    Y4    Y2    Y0
00130     psrlw_i2r (8, mm7);                 // mm7 =    Y7    Y5    Y3    Y1
00131     psllw_i2r (3, mm6);                 // promote precision
00132     psllw_i2r (3, mm7);                 // promote precision
00133     pmulhw_m2r (mmx_Y_coeff, mm6);      // mm6 = luma_rgb even
00134     pmulhw_m2r (mmx_Y_coeff, mm7);      // mm7 = luma_rgb odd
00135 
00136     /*
00137      * Do the addition part of the conversion for even and odd pixels
00138      * register usage:
00139      * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
00140      * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels
00141      * mm6 -> Y even, mm7 -> Y odd
00142      */
00143 
00144     movq_r2r (mm0, mm3);                // mm3 = chroma_b
00145     movq_r2r (mm1, mm4);                // mm4 = chroma_r
00146     movq_r2r (mm2, mm5);                // mm5 = chroma_g
00147     paddsw_r2r (mm6, mm0);              // mm0 = B6 B4 B2 B0
00148     paddsw_r2r (mm7, mm3);              // mm3 = B7 B5 B3 B1
00149     paddsw_r2r (mm6, mm1);              // mm1 = R6 R4 R2 R0
00150     paddsw_r2r (mm7, mm4);              // mm4 = R7 R5 R3 R1
00151     paddsw_r2r (mm6, mm2);              // mm2 = G6 G4 G2 G0
00152     paddsw_r2r (mm7, mm5);              // mm5 = G7 G5 G3 G1
00153     packuswb_r2r (mm0, mm0);            // saturate to 0-255
00154     packuswb_r2r (mm1, mm1);            // saturate to 0-255
00155     packuswb_r2r (mm2, mm2);            // saturate to 0-255
00156     packuswb_r2r (mm3, mm3);            // saturate to 0-255
00157     packuswb_r2r (mm4, mm4);            // saturate to 0-255
00158     packuswb_r2r (mm5, mm5);            // saturate to 0-255
      /* Re-interleave the even and odd results into pixel order. */
00159     punpcklbw_r2r (mm3, mm0);           // mm0 = B7 B6 B5 B4 B3 B2 B1 B0
00160     punpcklbw_r2r (mm4, mm1);           // mm1 = R7 R6 R5 R4 R3 R2 R1 R0
00161     punpcklbw_r2r (mm5, mm2);           // mm2 = G7 G6 G5 G4 G3 G2 G1 G0
00162 }
00163 
/*
 * Pack the 8 pixels left in mm0 (B), mm1 (R) and mm2 (G) by mmx_yuv2rgb()
 * into 16-bit 5-6-5 pixels (red in the top 5 bits, green in the middle 6,
 * blue in the low 5) and store them: pixels 0-3 go to image[0..7],
 * pixels 4-7 to image[8..15].
 *
 * `cpu` is consumed by the movntq() macro to pick the store instruction.
 */
00164 static inline void mmx_unpack_16rgb (uint8_t * image, int cpu)
00165 {
00166     static mmx_t mmx_bluemask = {0xf8f8f8f8f8f8f8f8LL};
00167     static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfcLL};
00168     static mmx_t mmx_redmask = {0xf8f8f8f8f8f8f8f8LL};
00169 
00170     /*
00171      * convert RGB plane to RGB 16 bits
00172      * on entry: mm0 -> B, mm1 -> R, mm2 -> G (one byte per pixel)
00173      * mm4 -> 0 (used to zero-extend green to 16 bit)
00174      * mm5 -> copy of shifted B, mm7 -> copy of masked G, for pixels 4-7
00175      */
00176 
00177     pand_m2r (mmx_bluemask, mm0);       // mm0 = b7b6b5b4b3______
00178     pand_m2r (mmx_greenmask, mm2);      // mm2 = g7g6g5g4g3g2____
00179     pand_m2r (mmx_redmask, mm1);        // mm1 = r7r6r5r4r3______
      /* The low 3 bits of every byte were just cleared, so a 64-bit shift
       * cannot leak bits from one byte into its neighbour. */
00180     psrlq_i2r (3, mm0);                 // mm0 = ______b7b6b5b4b3
00181     pxor_r2r (mm4, mm4);                // mm4 = 0
00182     movq_r2r (mm0, mm5);                // mm5 = ______b7b6b5b4b3
00183     movq_r2r (mm2, mm7);                // mm7 = g7g6g5g4g3g2____
00184 
      /* pixels 0-3 */
00185     punpcklbw_r2r (mm4, mm2);           // mm2 = green 0-3, zero-extended to words
00186     punpcklbw_r2r (mm1, mm0);           // mm0 = words with red in high byte, blue in low byte
00187     psllq_i2r (3, mm2);                 // move green into bits 5-10 of each word
00188     por_r2r (mm2, mm0);                 // mm0 = four finished 16-bit pixels
00189     movntq (mm0, *image);
00190 
      /* pixels 4-7: same steps on the high halves */
00191     punpckhbw_r2r (mm4, mm7);
00192     punpckhbw_r2r (mm1, mm5);
00193     psllq_i2r (3, mm7);
00194     por_r2r (mm7, mm5);
00195     movntq (mm5, *(image+8));
00196 }
00197 
/*
 * Pack the 8 pixels left in mm0 (B), mm1 (R) and mm2 (G) by mmx_yuv2rgb()
 * into 32-bit pixels and store 32 bytes at `image`.
 *
 * Each stored pixel is the byte sequence B, G, R, A in memory.  The A byte
 * is 0xff when `alphaones` is non-zero and 0x00 otherwise.
 * Two pixels are written per 8-byte store: pixels 0-1 at image+0,
 * 2-3 at image+8, 4-5 at image+16 and 6-7 at image+24.
 *
 * `cpu` is consumed by the movntq() macro to pick the store instruction.
 */
00198 static inline void mmx_unpack_32rgb (uint8_t * image, int cpu, int alphaones)
00199 {
00200     /*
00201      * convert RGB plane to RGB packed format,
00202      * mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> alpha fill (0 or all ones),
00203      * mm4 -> GB, mm5 -> AR pixel 4-7,
00204      * mm6 -> GB, mm7 -> AR pixel 0-3
00205      */
00206 
00207     if (alphaones)
00208     {
00209         static mmx_t mmx_1s = {0xffffffffffffffffLL};
00210         movq_m2r (mmx_1s, mm3);
00211     }
00212     else
00213         pxor_r2r (mm3, mm3);
00214 
00215     movq_r2r (mm0, mm6);
00216     movq_r2r (mm1, mm7);
00217     movq_r2r (mm0, mm4);
00218     movq_r2r (mm1, mm5);
00219     punpcklbw_r2r (mm2, mm6);           // mm6 = G:B words, pixels 0-3
00220     punpcklbw_r2r (mm3, mm7);           // mm7 = A:R words, pixels 0-3
00221     punpcklwd_r2r (mm7, mm6);           // mm6 = pixels 0-1
00222     movntq (mm6, *image);
00223     movq_r2r (mm0, mm6);                // rebuild G:B for pixels 0-3 ...
00224     punpcklbw_r2r (mm2, mm6);
00225     punpckhwd_r2r (mm7, mm6);           // ... and take pixels 2-3
00226     movntq (mm6, *(image+8));
00227     punpckhbw_r2r (mm2, mm4);           // mm4 = G:B words, pixels 4-7
00228     punpckhbw_r2r (mm3, mm5);           // mm5 = A:R words, pixels 4-7
00229     punpcklwd_r2r (mm5, mm4);           // mm4 = pixels 4-5
00230     movntq (mm4, *(image+16));
00231     movq_r2r (mm0, mm4);                // rebuild G:B for pixels 4-7 ...
00232     punpckhbw_r2r (mm2, mm4);
00233     punpckhwd_r2r (mm5, mm4);           // ... and take pixels 6-7
00234     movntq (mm4, *(image+24));
00235 }
00236 
00237 static inline void yuv420_rgb16 (uint8_t * image,
00238                                  uint8_t * py, uint8_t * pu, uint8_t * pv,
00239                                  int width, int height,
00240                                  int rgb_stride, int y_stride, int uv_stride,
00241                                  int cpu, int alphaones)
00242 {
00243     (void)alphaones;
00244     int i;
00245 
00246     rgb_stride -= 2 * width;
00247     y_stride -= width;
00248     uv_stride -= width >> 1;
00249     width >>= 3;
00250 
00251     do {
00252         i = width;
00253         do {
00254             mmx_yuv2rgb (py, pu, pv);
00255             mmx_unpack_16rgb (image, cpu);
00256             py += 8;
00257             pu += 4;
00258             pv += 4;
00259             image += 16;
00260         } while (--i);
00261 
00262         py += y_stride;
00263         image += rgb_stride;
00264         if (height & 1) {
00265             pu += uv_stride;
00266             pv += uv_stride;
00267         } else {
00268             pu -= 4 * width;
00269             pv -= 4 * width;
00270         }
00271     } while (--height);
00272 
00273         emms();
00274 }
00275 
00276 static inline void yuv420_argb32 (uint8_t * image, uint8_t * py,
00277                                   uint8_t * pu, uint8_t * pv,
00278                                   int width, int height,
00279                                   int rgb_stride, int y_stride, int uv_stride,
00280                                   int cpu, int alphaones)
00281 {
00282     int i;
00283 
00284     rgb_stride -= 4 * width;
00285     y_stride -= width;
00286     uv_stride -= width >> 1;
00287     width >>= 3;
00288 
00289     do {
00290         i = width;
00291         do {
00292             mmx_yuv2rgb (py, pu, pv);
00293             mmx_unpack_32rgb (image, cpu, alphaones);
00294             py += 8;
00295             pu += 4;
00296             pv += 4;
00297             image += 32;
00298         } while (--i);
00299 
00300         py += y_stride;
00301         image += rgb_stride;
00302         if (height & 1) {
00303             pu += uv_stride;
00304             pv += uv_stride;
00305         } else {
00306             pu -= 4 * width;
00307             pv -= 4 * width;
00308         }
00309     } while (--height);
00310 
00311         emms();
00312 }
00313 
00314 static void mmxext_rgb16 (uint8_t * image,
00315                           uint8_t * py, uint8_t * pu, uint8_t * pv,
00316                           int width, int height,
00317                           int rgb_stride, int y_stride, int uv_stride,
00318                           int alphaones)
00319 {
00320     yuv420_rgb16 (image, py, pu, pv, width, height,
00321                   rgb_stride, y_stride, uv_stride, CPU_MMXEXT, alphaones);
00322 }
00323 
00324 static void mmxext_argb32 (uint8_t * image,
00325                            uint8_t * py, uint8_t * pu, uint8_t * pv,
00326                            int width, int height,
00327                            int rgb_stride, int y_stride, int uv_stride,
00328                            int alphaones)
00329 {
00330     yuv420_argb32 (image, py, pu, pv, width, height,
00331                    rgb_stride, y_stride, uv_stride, CPU_MMXEXT, alphaones);
00332 }
00333 
00334 static void mmx_rgb16 (uint8_t * image,
00335                        uint8_t * py, uint8_t * pu, uint8_t * pv,
00336                        int width, int height,
00337                        int rgb_stride, int y_stride, int uv_stride,
00338                        int alphaones)
00339 {
00340     yuv420_rgb16 (image, py, pu, pv, width, height,
00341                   rgb_stride, y_stride, uv_stride, CPU_MMX, alphaones);
00342 }
00343 
00344 static void mmx_argb32 (uint8_t * image,
00345                         uint8_t * py, uint8_t * pu, uint8_t * pv,
00346                         int width, int height,
00347                         int rgb_stride, int y_stride, int uv_stride,
00348                         int alphaones)
00349 {
00350     yuv420_argb32 (image, py, pu, pv, width, height,
00351                    rgb_stride, y_stride, uv_stride, CPU_MMX, alphaones);
00352 }
00353 #endif
00354 
00364 yuv2rgb_fun yuv2rgb_init_mmxext (int bpp, int mode)
00365 {
00366 #if HAVE_MMX
00367     if ((bpp == 16) && (mode == MODE_RGB))
00368         return mmxext_rgb16;
00369     else if ((bpp == 32) && (mode == MODE_RGB))
00370         return mmxext_argb32;
00371 #endif
00372 
00373     (void)bpp;
00374     (void)mode;
00375 
00376     return NULL; /* Fallback to C */
00377 }
00378 
00388 yuv2rgb_fun yuv2rgb_init_mmx (int bpp, int mode)
00389 {
00390 #if HAVE_MMX
00391     if ((bpp == 16) && (mode == MODE_RGB))
00392         return mmx_rgb16;
00393     else if ((bpp == 32) && (mode == MODE_RGB))
00394         return mmx_argb32;
00395 #endif
00396     if ((bpp == 32) && (mode == MODE_RGB))
00397         return yuv420_argb32_non_mmx;
00398 
00399     return NULL;
00400 }
00401 
/* Number of fractional bits used by the fixed-point YUV -> RGB maths. */
#define SCALE_BITS 10

/* Conversion coefficients: written with 16 fractional bits and reduced to
 * SCALE_BITS fractional bits. */
#define C_Y  (76309 >> (16 - SCALE_BITS))
#define C_RV (117504 >> (16 - SCALE_BITS))
#define C_BU (138453 >> (16 - SCALE_BITS))
#define C_GU (13954 >> (16 - SCALE_BITS))
#define C_GV (34903 >> (16 - SCALE_BITS))

#if defined(__FreeBSD__)
// HACK: this is actually only needed on AMD64 at the moment,
//       but it doesn't hurt the other architectures.
#undef  UCHAR_MAX
#define UCHAR_MAX  (int)__UCHAR_MAX
#endif

/*
 * RGBOUT(r, g, b, y1): compute one output pixel from luma sample y1 and
 * store the clamped 0..UCHAR_MAX results into the lvalues r, g and b.
 * Uses the variables y, r_add, g_add and b_add of the expanding scope;
 * the *_add values carry the chroma contribution plus the rounding term.
 */
#define RGBOUT(r, g, b, y1)\
{\
    y = (y1 - 16) * C_Y;\
    r = std::min(UCHAR_MAX, std::max(0, (y + r_add) >> SCALE_BITS));\
    g = std::min(UCHAR_MAX, std::max(0, (y + g_add) >> SCALE_BITS));\
    b = std::min(UCHAR_MAX, std::max(0, (y + b_add) >> SCALE_BITS));\
}

/*
 * Plain C conversion of a planar YUV 4:2:0 picture to 32 bits per pixel.
 *
 * image      destination, rgb_stride bytes per row
 * py         luma plane, y_stride bytes per row
 * pu, pv     chroma planes (one sample per 2x2 pixels), uv_stride bytes
 *            per row
 * h_size     width in pixels; pixels are handled in horizontal pairs, so
 *            the last column of an odd width is not converted
 * v_size     height in rows
 * alphaones  non-zero: the alpha byte of every pixel is 0xff, otherwise 0
 *
 * Fixes over the previous version:
 *  - rgb_stride, y_stride and uv_stride are honoured (they used to be
 *    ignored and tightly packed buffers were assumed), which makes this
 *    routine agree with the MMX converters that share its signature.
 *    For tightly packed buffers the output is unchanged.
 *  - for an odd v_size the non-existent second row of the last row pair
 *    is no longer read or written.
 */
static void yuv420_argb32_non_mmx(unsigned char *image, unsigned char *py,
                           unsigned char *pu, unsigned char *pv,
                           int h_size, int v_size, int rgb_stride,
                           int y_stride, int uv_stride, int alphaones)
{
    unsigned char *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr, *d1, *d2;
    int w, y, cb, cr, r_add, g_add, b_add;
    int row;
    const unsigned char alpha = alphaones ? 0xff : 0;
    const int width2 = h_size / 2;

// byte indices
#if HAVE_BIGENDIAN
#define R_OI  1
#define G_OI  2
#define B_OI  3
#define A_OI  0
#else
#define R_OI  2
#define G_OI  1
#define B_OI  0
#define A_OI  3
#endif

    for (row = 0; row < v_size; row += 2) {
        /* the last pair of an odd-height picture has only one row */
        const int have_row2 = (row + 1 < v_size);

        d1 = image + row * rgb_stride;
        d2 = have_row2 ? d1 + rgb_stride : d1;
        y1_ptr = py + row * y_stride;
        y2_ptr = have_row2 ? y1_ptr + y_stride : y1_ptr;
        cb_ptr = pu + (row / 2) * uv_stride;
        cr_ptr = pv + (row / 2) * uv_stride;

        for (w = width2; w > 0; w--) {
            /* chroma terms are shared by the 2x2 pixel group */
            cb = cb_ptr[0] - 128;
            cr = cr_ptr[0] - 128;
            r_add = C_RV * cr + (1 << (SCALE_BITS - 1));
            g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1));
            b_add = C_BU * cb + (1 << (SCALE_BITS - 1));

            /* upper two pixels */
            RGBOUT(d1[R_OI],   d1[G_OI],   d1[B_OI],   y1_ptr[0]);
            RGBOUT(d1[R_OI+4], d1[G_OI+4], d1[B_OI+4], y1_ptr[1]);
            d1[A_OI] = d1[A_OI+4] = alpha;

            /* lower two pixels */
            if (have_row2) {
                RGBOUT(d2[R_OI],   d2[G_OI],   d2[B_OI],   y2_ptr[0]);
                RGBOUT(d2[R_OI+4], d2[G_OI+4], d2[B_OI+4], y2_ptr[1]);
                d2[A_OI] = d2[A_OI+4] = alpha;
                d2 += 8;
                y2_ptr += 2;
            }

            d1 += 8;
            y1_ptr += 2;
            cb_ptr++;
            cr_ptr++;
        }
    }
}
00490 
/* Fixed-point helpers for the RGB -> YUV maths: FIX() converts a real
 * coefficient to SCALEBITS fractional bits, ONE_HALF is the rounding term. */
#define SCALEBITS 8
#define ONE_HALF  (1 << (SCALEBITS - 1))
#define FIX(x)          ((int) ((x) * (1L<<SCALEBITS) + 0.5))

// byte indices of the components inside one 4-byte source pixel
#if HAVE_BIGENDIAN
#define R_II  3
#define G_II  2
#define B_II  1
#define A_II  0
#else
#define R_II  0
#define G_II  1
#define B_II  2
#define A_II  3
#endif

/* Read one source pixel, store its luma and alpha, and add its colour
 * components to the running sums used for the chroma average. */
static inline void rgb32_take_pixel(const unsigned char *px,
                                    unsigned char *lum_out,
                                    unsigned char *alpha_out,
                                    int *r_sum, int *g_sum, int *b_sum)
{
    const int r = px[R_II];
    const int g = px[G_II];
    const int b = px[B_II];

    *r_sum += r;
    *g_sum += g;
    *b_sum += b;

    lum_out[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
                  FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
    alpha_out[0] = px[A_II];
}

/* Store one chroma pair computed from component sums over 2^log2_count
 * source pixels (log2_count is 0, 1 or 2). */
static inline void rgb32_put_chroma(unsigned char *cb, unsigned char *cr,
                                    int r_sum, int g_sum, int b_sum,
                                    int log2_count)
{
    const int bias  = (ONE_HALF << log2_count) - 1;
    const int shift = SCALEBITS + log2_count;

    cr[0] = ((- FIX(0.16874) * r_sum - FIX(0.33126) * g_sum +
            FIX(0.50000) * b_sum + bias) >> shift) + 128;
    cb[0] = ((FIX(0.50000) * r_sum - FIX(0.41869) * g_sum -
            FIX(0.08131) * b_sum + bias) >> shift) + 128;
}

/*
 * Convert a packed 4-bytes-per-pixel picture to planar luma, chroma and
 * alpha.
 *
 * lum, alpha  output planes; their row pitch is width rounded up to an
 *             even number
 * cb, cr      output chroma planes, one sample per 2x2 block, written
 *             consecutively.  The sample stored through cr uses the
 *             (-0.16874, -0.33126, +0.5) weights and the one stored
 *             through cb the (+0.5, -0.41869, -0.08131) weights.
 * src         source pixels, srcwidth pixels per row
 * width, height  size of the area to convert
 *
 * For odd widths/heights the missing column/row of the last 2x2 block is
 * filled with luma 16 and alpha 0 in the output planes, and the chroma
 * sample is averaged over the pixels that do exist.
 */
void rgb32_to_yuv420p(unsigned char *lum, unsigned char *cb, unsigned char *cr,
                      unsigned char *alpha, unsigned char *src,
                      int width, int height, int srcwidth)
{
    const int plane_pitch = (width + 1) & ~1;
    const int src_pitch   = srcwidth * 4;
    const unsigned char *in = src;
    int row, col;

    /* complete row pairs */
    for (row = 0; row + 1 < height; row += 2) {
        /* complete 2x2 blocks */
        for (col = 0; col + 1 < width; col += 2) {
            int rs = 0, gs = 0, bs = 0;

            rgb32_take_pixel(in, lum, alpha, &rs, &gs, &bs);
            rgb32_take_pixel(in + 4, lum + 1, alpha + 1, &rs, &gs, &bs);
            rgb32_take_pixel(in + src_pitch,
                             lum + plane_pitch, alpha + plane_pitch,
                             &rs, &gs, &bs);
            rgb32_take_pixel(in + src_pitch + 4,
                             lum + plane_pitch + 1, alpha + plane_pitch + 1,
                             &rs, &gs, &bs);
            rgb32_put_chroma(cb, cr, rs, gs, bs, 2);

            cb++;
            cr++;
            in += 2 * 4;
            lum += 2;
            alpha += 2;
        }
        /* last column of an odd width: 1x2 block plus padding */
        if (width & 1) {
            int rs = 0, gs = 0, bs = 0;

            rgb32_take_pixel(in, lum, alpha, &rs, &gs, &bs);
            lum[1] = 16;
            alpha[1] = 0;

            rgb32_take_pixel(in + src_pitch,
                             lum + plane_pitch, alpha + plane_pitch,
                             &rs, &gs, &bs);
            lum[plane_pitch + 1] = 16;
            alpha[plane_pitch + 1] = 0;

            rgb32_put_chroma(cb, cr, rs, gs, bs, 1);

            cb++;
            cr++;
            in += 4;
            lum += 2;
            alpha += 2;
        }
        /* skip the second row of the pair that was handled above */
        in += src_pitch * 2 - width * 4;
        lum += plane_pitch;
        alpha += plane_pitch;
    }

    /* last row of an odd height */
    if (height & 1) {
        /* 2x1 blocks; the row below is padding */
        for (col = 0; col + 1 < width; col += 2) {
            int rs = 0, gs = 0, bs = 0;

            rgb32_take_pixel(in, lum, alpha, &rs, &gs, &bs);
            rgb32_take_pixel(in + 4, lum + 1, alpha + 1, &rs, &gs, &bs);

            lum[plane_pitch] = 16;
            alpha[plane_pitch] = 0;
            lum[plane_pitch + 1] = 16;
            alpha[plane_pitch + 1] = 0;

            rgb32_put_chroma(cb, cr, rs, gs, bs, 1);

            cb++;
            cr++;
            in += 2 * 4;
            lum += 2;
            alpha += 2;
        }
        /* bottom-right corner of an odd x odd picture: a single pixel */
        if (width & 1) {
            int rs = 0, gs = 0, bs = 0;

            rgb32_take_pixel(in, lum, alpha, &rs, &gs, &bs);
            lum[1] = 16;
            alpha[1] = 0;

            lum[plane_pitch] = 16;
            alpha[plane_pitch] = 0;
            lum[plane_pitch + 1] = 16;
            alpha[plane_pitch + 1] = 0;

            rgb32_put_chroma(cb, cr, rs, gs, bs, 0);

            cb++;
            cr++;
            in += 4;
            lum += 2;
            alpha += 2;
        }
    }
}
00712 
00713 /* I420 to 2VUY colorspace conversion routines.
00714  *
00715  * In the early days of the OS X port of MythTV, Paul Jara noticed that
00716  * QuickTime spent a lot of time converting from YUV420 to YUV422.
00717  * He found some sample code on the Ars Technica forum by a
00718  * Frenchman called Titer which used Altivec to speed this up.
00719  * Jeremiah Morris took that code and added it into MythTV.
00720  *
00721  * All was well until the Intel Macs came along,
00722  * which seem to crash when fed YUV420 from MythTV.
00723  *
00724  * Fortunately, Mino Taoyama has provided an MMX optimised version too.
00725  */
00726 
/*
 * Plain C conversion of planar I420 to packed 2VUY.
 *
 * image / vuy_stride   destination and its row pitch in bytes
 * py / y_stride        luma plane and its row pitch
 * pu / u_stride, pv / v_stride   chroma planes and their row pitches
 * h_size, v_size       picture size in pixels
 *
 * Every output group of four bytes is U, Y0, V, Y1.  Two picture rows share
 * one chroma row.  Work is done in 2x2 pixel units, so the last column of
 * an odd width and the last row of an odd height are not written.
 */
static void non_vec_i420_2vuy(
    uint8_t *image, int vuy_stride,
    const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
    int y_stride, int u_stride, int v_stride,
    int h_size, int v_size)
{
    const int row_pairs = v_size >> 1;
    const int col_pairs = h_size >> 1;

    for (int pair = 0; pair < row_pairs; pair++)
    {
        uint8_t       *out_top = image + 2*pair * vuy_stride;
        uint8_t       *out_bot = out_top + vuy_stride;
        const uint8_t *y_top   = py + 2*pair * y_stride;
        const uint8_t *y_bot   = y_top + y_stride;
        const uint8_t *u_row   = pu + pair * u_stride;
        const uint8_t *v_row   = pv + pair * v_stride;

        for (int n = 0; n < col_pairs; n++)
        {
            /* the chroma sample is shared by both rows */
            out_top[0] = *u_row;
            out_bot[0] = *u_row;
            out_top[1] = y_top[0];
            out_bot[1] = y_bot[0];
            out_top[2] = *v_row;
            out_bot[2] = *v_row;
            out_top[3] = y_top[1];
            out_bot[3] = y_bot[1];

            out_top += 4;
            out_bot += 4;
            y_top   += 2;
            y_bot   += 2;
            u_row++;
            v_row++;
        }
    }
}
00774 
00775 #if HAVE_MMX
00776 
/*
 * MMX conversion of planar I420 to packed 2VUY (same parameters and output
 * layout as non_vec_i420_2vuy()).
 *
 * When h_size is not a multiple of 16 or v_size is odd the work is handed
 * to non_vec_i420_2vuy() instead.  Otherwise each inner iteration converts
 * 16 pixels of two adjacent rows: it loads 16 luma bytes per row and
 * 8 bytes from each chroma row, and stores 32 bytes per output row.
 *
 * NOTE(review): the stores use movntq_r2m unconditionally (not the
 * movntq() selector macro); presumably the caller only picks this routine
 * on CPUs that support that instruction - confirm.
 */
00788 static void mmx_i420_2vuy(
00789     uint8_t *image, int vuy_stride,
00790     const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
00791     int y_stride, int u_stride, int v_stride,
00792     int h_size, int v_size)
00793 {
00794     uint8_t *pi1, *pi2;
      /* These initial values are overwritten at the top of every row pair. */
00795     const uint8_t *py1 = py;
00796     const uint8_t *py2 = py;
00797     const uint8_t *pu1 = pu;
00798     const uint8_t *pv1 = pv;
00799 
00800     int x,y;
00801 
00802     if ((h_size % 16) || (v_size % 2))
00803     {
00804         non_vec_i420_2vuy(image, vuy_stride,
00805                           py, pu, pv, y_stride, u_stride, v_stride,
00806                           h_size, v_size);
00807         return;
00808     }
00809 
00810     emms();
00811 
00812     for (y = 0; y < (v_size>>1); y++)
00813     {
          /* pi1/py1: upper row of the pair, pi2/py2: lower row;
           * both rows share the chroma row pu1/pv1 */
00814         pi1 = image + 2*y * vuy_stride;
00815         pi2 = image + 2*y * vuy_stride + vuy_stride;
00816         py1 = py + 2*y * y_stride;
00817         py2 = py + 2*y * y_stride + y_stride;
00818         pu1 = pu + y * u_stride;
00819         pv1 = pv + y * v_stride;
00820 
00821         for (x = 0; x < h_size / 16; x++)
00822         {
00823             movq_m2r (*py1, mm0);     // y data, upper row, pixels 0-7
00824             movq_m2r (*py2, mm1);     // y data, lower row, pixels 0-7
00825             movq_m2r (*pu1, mm2);     // u data, 8 samples
00826             movq_m2r (*pv1, mm3);     // v data, 8 samples
00827 
00828             movq_r2r (mm2, mm4);      // Copy U
00829 
00830             punpcklbw_r2r (mm3, mm2); // Combine low U & V  mm2 = uv low  (samples 0-3)
00831             punpckhbw_r2r (mm3, mm4); // Combine high U & V mm4 = uv high (samples 4-7)
00832 
              /* pixels 0-7, upper row */
00833             movq_r2r (mm2, mm5);      // Copy low UV  mm5 = uv low
00834             movq_r2r (mm2, mm6);      // Copy low UV  mm6 = uv low
00835             punpcklbw_r2r (mm0, mm5); // mm5 = lower half of uv low merged with y1 bytes 0-3
00836             punpckhbw_r2r (mm0, mm6); // mm6 = upper half of uv low merged with y1 bytes 4-7
00837 
00838             movntq_r2m (mm5, *(pi1));
00839             movntq_r2m (mm6, *(pi1+8));
00840 
              /* pixels 0-7, lower row */
00841             movq_r2r (mm2, mm5);      // Copy low UV mm5 = uv low
00842             movq_r2r (mm2, mm6);      // Copy low UV mm6 = uv low
00843             punpcklbw_r2r (mm1, mm5); // mm5 = lower half of uv low merged with y2 bytes 0-3
00844             punpckhbw_r2r (mm1, mm6); // mm6 = upper half of uv low merged with y2 bytes 4-7
00845 
00846             movntq_r2m (mm5, *(pi2));
00847             movntq_r2m (mm6, *(pi2+8));
00848 
00849 
00850             movq_m2r (*(py1+8), mm0); // y data, upper row, pixels 8-15
00851             movq_m2r (*(py2+8), mm1); // y data, lower row, pixels 8-15
00852 
              /* pixels 8-15, upper row */
00853             movq_r2r (mm4, mm5);      // Copy high UV mm5 = uv high
00854             movq_r2r (mm4, mm6);      // Copy high UV mm6 = uv high
00855             punpcklbw_r2r (mm0, mm5); // mm5 = lower half of uv high merged with y1 bytes 8-11
00856             punpckhbw_r2r (mm0, mm6); // mm6 = upper half of uv high merged with y1 bytes 12-15
00857 
00858             movntq_r2m (mm5, *(pi1+16));
00859             movntq_r2m (mm6, *(pi1+24));
00860 
              /* pixels 8-15, lower row */
00861             movq_r2r (mm4, mm5);      // Copy high UV mm5 = uv high
00862             movq_r2r (mm4, mm6);      // Copy high UV mm6 = uv high
00863             punpcklbw_r2r (mm1, mm5); // mm5 = lower half of uv high merged with y2 bytes 8-11
00864             punpckhbw_r2r (mm1, mm6); // mm6 = upper half of uv high merged with y2 bytes 12-15
00865 
00866             movntq_r2m (mm5, *(pi2+16));
00867             movntq_r2m (mm6, *(pi2+24));
00868 
00869             pi1 += 32;
00870             pi2 += 32;
00871             py1 += 16;
00872             py2 += 16;
00873             pu1 += 8;
00874             pv1 += 8;
00875         }
00876     }
00877 
00878     emms();
00879 }
00880 
00881 #endif // HAVE_MMX
00882 
00883 #if HAVE_ALTIVEC
00884 
00885 // Altivec code adapted from VLC's i420_yuv2.c (thanks to Titer and Paul Jara)
00886 
/*
 * Helper macros for altivec_i420_2vuy().  They are not self-contained:
 * each one reads and updates local variables of the function it is
 * expanded in (pi1, pi2, py1, py2, pu, pv, h_size and the *_vec vectors).
 */

/*
 * VEC_NEXT_LINES(): step to the next pair of rows.  The previous lower row
 * pointers become the upper row pointers, and the lower row pointers move
 * on by one row, computed from h_size (2 * h_size output bytes, h_size luma
 * bytes) rather than from the stride parameters.
 */
00887 #define VEC_NEXT_LINES()                                                    \
00888     pi1  = pi2;                                                             \
00889     pi2 += h_size * 2;                                                      \
00890     py1  = py2;                                                             \
00891     py2 += h_size;

/* VEC_LOAD_UV(): load one vector each from pu and pv into u_vec / v_vec
 * and advance both pointers by 16 bytes. */
00893 #define VEC_LOAD_UV()                                                       \
00894     u_vec = vec_ld(0, pu); pu += 16;                                        \
00895     v_vec = vec_ld(0, pv); pv += 16;

/*
 * VEC_MERGE(a): `a` is the merge operation applied to (u_vec, v_vec) to
 * build uv_vec; the visible callers pass vec_mergeh or vec_mergel.  The
 * result is then merged with one luma vector from each of the two rows
 * (py1, py2) and stored as two vectors per row through pi1 and pi2.
 * Every pointer is advanced by 16 bytes per vector handled.
 */
00897 #define VEC_MERGE(a)                                                        \
00898     uv_vec = a(u_vec, v_vec);                                               \
00899     y_vec = vec_ld(0, py1); py1 += 16;                                      \
00900     vec_st(vec_mergeh(uv_vec, y_vec), 0, pi1); pi1 += 16;                   \
00901     vec_st(vec_mergel(uv_vec, y_vec), 0, pi1); pi1 += 16;                   \
00902     y_vec = vec_ld(0, py2); py2 += 16;                                      \
00903     vec_st(vec_mergeh(uv_vec, y_vec), 0, pi2); pi2 += 16;                   \
00904     vec_st(vec_mergel(uv_vec, y_vec), 0, pi2); pi2 += 16;
00905 
00918 static void altivec_i420_2vuy(
00919     uint8_t *image, int vuy_stride,
00920     const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
00921     int y_stride, int u_stride, int v_stride,
00922     int h_size, int v_size)
00923 {
00924     uint8_t *pi1, *pi2 = image;
00925     const uint8_t *py1;
00926     const uint8_t *py2 = py;
00927 
00928     int x, y;
00929 
00930     vector unsigned char u_vec;
00931     vector unsigned char v_vec;
00932     vector unsigned char uv_vec;
00933     vector unsigned char y_vec;
00934 
00935     int vuy_extra = vuy_stride - (h_size<<1);
00936     int y_extra   = y_stride   - (h_size);
00937     int u_extra   = u_stride   - (h_size>>1);
00938     int v_extra   = v_stride   - (h_size>>1);
00939 
00940     if (vuy_extra || y_extra || u_extra || v_extra)
00941     {
00942         // Fall back to C version
00943         non_vec_i420_2vuy(image, vuy_stride,
00944                           py, pu, pv,
00945                           y_stride, u_stride, v_stride,
00946                           h_size, v_size);
00947         return;
00948     }
00949 
00950     if (!((h_size % 32) || (v_size % 2)))
00951     {
00952         // Width is a multiple of 32, process 2 lines at a time
00953         for (y = v_size / 2; y--; )
00954         {
00955             VEC_NEXT_LINES();
00956             for (x = h_size / 32; x--; )
00957             {
00958                 VEC_LOAD_UV();
00959                 VEC_MERGE(vec_mergeh);
00960                 VEC_MERGE(vec_mergel);
00961             }
00962         }
00963 
00964     }
00965     else if (!((h_size % 16) || (v_size % 4)))
00966     {
00967         // Width is a multiple of 16, process 4 lines at a time
00968         for (y = v_size / 4; y--; )
00969         {
00970             // Lines 1-2, pixels 0 to (width - 16)
00971             VEC_NEXT_LINES();
00972             for (x = h_size / 32; x--; )
00973             {
00974                 VEC_LOAD_UV();
00975                 VEC_MERGE(vec_mergeh);
00976                 VEC_MERGE(vec_mergel);
00977             }
00978 
00979             // Lines 1-2, pixels (width - 16) to width
00980             VEC_LOAD_UV();
00981             VEC_MERGE(vec_mergeh);
00982 
00983             // Lines 3-4, pixels 0-16
00984             VEC_NEXT_LINES();
00985             VEC_MERGE(vec_mergel);
00986 
00987             // Lines 3-4, pixels 16 to width
00988             for (x = h_size / 32; x--; )
00989             {
00990                 VEC_LOAD_UV();
00991                 VEC_MERGE(vec_mergeh);
00992                 VEC_MERGE(vec_mergel);
00993             }
00994         }
00995     }
00996     else
00997     {
00998         // Fall back to C version
00999         non_vec_i420_2vuy(image, vuy_stride,
01000                           py, pu, pv,
01001                           y_stride, u_stride, v_stride,
01002                           h_size, v_size);
01003     }
01004 }
01005 
01006 #endif // HAVE_ALTIVEC
01007 
01008 
01022 conv_i420_2vuy_fun get_i420_2vuy_conv(void)
01023 {
01024 #if HAVE_ALTIVEC
01025     if (has_altivec())
01026         return altivec_i420_2vuy;
01027 #endif
01028 #if HAVE_MMX
01029         return mmx_i420_2vuy;
01030 #else
01031         return non_vec_i420_2vuy; /* Fallback to C */
01032 #endif
01033 }
01034 
/**
 * \brief Plain C conversion from packed 2vuy to planar I420.
 *
 * The packed input is read as repeating U Y V Y groups (4 bytes per two
 * pixels). Two input rows are consumed per iteration: their luma bytes are
 * copied to two luma rows, and their U and V bytes are averaged vertically
 * (sum shifted right by one, i.e. rounded down) into one chroma row.
 *
 * Only (h_size >> 1) pixel pairs and (v_size >> 1) row pairs are processed,
 * so a trailing odd column or row is left untouched.
 *
 * \param py         destination luma plane
 * \param pu         destination U plane
 * \param pv         destination V plane
 * \param y_stride   bytes per row of \a py
 * \param u_stride   bytes per row of \a pu
 * \param v_stride   bytes per row of \a pv
 * \param image      source packed buffer
 * \param vuy_stride bytes per row of \a image
 * \param h_size     picture width in pixels
 * \param v_size     picture height in pixels
 */
static void non_vec_2vuy_i420(
    uint8_t *py, uint8_t *pu, uint8_t *pv,
    int y_stride, int u_stride, int v_stride,
    const uint8_t *image, int vuy_stride,
    int h_size, int v_size)
{
    const int row_pairs = v_size >> 1;
    const int col_pairs = h_size >> 1;
    int row, col;

    for (row = 0; row < row_pairs; row++)
    {
        // Two packed source rows feed two luma rows and one chroma row
        const uint8_t *src_top = image + 2 * row * vuy_stride;
        const uint8_t *src_bot = image + 2 * row * vuy_stride + vuy_stride;
        uint8_t *luma_top = py + 2 * row * y_stride;
        uint8_t *luma_bot = py + 2 * row * y_stride + y_stride;
        uint8_t *u_row    = pu + row * u_stride;
        uint8_t *v_row    = pv + row * v_stride;

        for (col = 0; col < col_pairs; col++)
        {
            const int in  = 4 * col;   // start of the U Y V Y group
            const int out = 2 * col;   // first of the two luma samples

            u_row[col]        = (src_top[in + 0] + src_bot[in + 0]) >> 1;
            luma_top[out + 0] =  src_top[in + 1];
            luma_bot[out + 0] =  src_bot[in + 1];
            v_row[col]        = (src_top[in + 2] + src_bot[in + 2]) >> 1;
            luma_top[out + 1] =  src_top[in + 3];
            luma_bot[out + 1] =  src_bot[in + 3];
        }
    }
}
01075 
01076 #if HAVE_ALTIVEC
01077 
01078 // Altivec code adapted from VLC's i420_yuv2.c (thanks to Titer and Paul Jara)
01079 
/*
 * Helper macros for altivec_2vuy_i420().
 *
 * They rely on local variables of the expanding function (pi1, pi2, py1,
 * py2, pu, pv, eight_vec, pa_vec, pb_vec, uv1_vec, uv2_vec, uva_vec,
 * uvb_vec). Each macro is wrapped in do { } while (0) so that it expands to
 * a single statement and is safe after an unbraced if/else.
 */

/*
 * Read 32 packed bytes from `ptr`, viewing them as 16-bit elements.
 * The packed low bytes of the elements are stored as 16 luma bytes at `y`;
 * the packed high bytes (elements shifted right by eight_vec) are left in
 * `uv` as interleaved chroma. Both `ptr` and `y` are advanced.
 * NOTE(review): "low byte = luma" assumes the big-endian element layout of
 * the original AltiVec targets -- confirm before reusing elsewhere.
 */
#define VEC_READ_LINE(ptr, y, uv)                                           \
    do {                                                                    \
        pa_vec = vec_ld(0, (ptr)); (ptr) += 16;                             \
        pb_vec = vec_ld(0, (ptr)); (ptr) += 16;                             \
        vec_st(vec_pack((vector unsigned short)pa_vec,                      \
                        (vector unsigned short)pb_vec),                     \
               0, (y)); (y) += 16;                                          \
        (uv) = vec_pack(vec_sr((vector unsigned short)pa_vec, eight_vec),   \
                        vec_sr((vector unsigned short)pb_vec, eight_vec));  \
    } while (0)

/*
 * Process 16 pixels of both current rows: split off their luma and leave
 * the vec_avg of the two rows' interleaved chroma in `a`.
 * NOTE(review): vec_avg is documented as a rounding-up average, while
 * non_vec_2vuy_i420() rounds down -- results may differ by one.
 */
#define VEC_SPLIT(a)                                                        \
    do {                                                                    \
        VEC_READ_LINE(pi1, py1, uv1_vec);                                   \
        VEC_READ_LINE(pi2, py2, uv2_vec);                                   \
        (a) = vec_avg(uv1_vec, uv2_vec);                                    \
    } while (0)

/*
 * De-interleave the chroma gathered in uva_vec/uvb_vec: the packed low
 * bytes go to the V plane and the packed high bytes to the U plane,
 * 16 bytes each; both chroma cursors are advanced.
 */
#define VEC_STORE_UV()                                                      \
    do {                                                                    \
        vec_st(vec_pack((vector unsigned short)uva_vec,                     \
                        (vector unsigned short)uvb_vec),                    \
               0, pv); pv += 16;                                            \
        vec_st(vec_pack(vec_sr((vector unsigned short)uva_vec, eight_vec),  \
                        vec_sr((vector unsigned short)uvb_vec, eight_vec)), \
               0, pu); pu += 16;                                            \
    } while (0)
01101 
01102 
01112 static void altivec_2vuy_i420(
01113     uint8_t *py, uint8_t *pu, uint8_t *pv,
01114     int y_stride, int u_stride, int v_stride,
01115     const uint8_t *image, int vuy_stride,
01116     int h_size, int v_size)
01117 {
01118     const uint8_t *pi1;
01119     const uint8_t *pi2 = image;
01120     uint8_t *py1, *py2 = py;
01121 
01122     int x, y;
01123 
01124     vector unsigned short eight_vec = vec_splat_u16(8);
01125     vector unsigned char pa_vec, pb_vec,
01126                          uv1_vec, uv2_vec,
01127                          uva_vec, uvb_vec;
01128 
01129     int vuy_extra = vuy_stride - (h_size<<1);
01130     int y_extra   = y_stride   - (h_size);
01131     int u_extra   = u_stride   - (h_size>>1);
01132     int v_extra   = v_stride   - (h_size>>1);
01133 
01134     if (vuy_extra || y_extra || u_extra || v_extra)
01135     {
01136         // Fall back to C version
01137         non_vec_2vuy_i420(py, pu, pv,
01138                           y_stride, u_stride, v_stride,
01139                           image, vuy_stride,
01140                           h_size, v_size);
01141         return;
01142     }
01143 
01144     if (!((h_size % 32) || (v_size % 2)))
01145     {
01146         // Width is a multiple of 32, process 2 lines at a time
01147         for (y = v_size / 2; y--; )
01148         {
01149             VEC_NEXT_LINES();
01150             for (x = h_size / 32; x--; )
01151             {
01152                 VEC_SPLIT(uva_vec);
01153                 VEC_SPLIT(uvb_vec);
01154                 VEC_STORE_UV();
01155             }
01156         }
01157     }
01158     else if (!((h_size % 16) || (v_size % 4)))
01159     {
01160         // Width is a multiple of 16, process 4 lines at a time
01161         for (y = v_size / 4; y--; )
01162         {
01163             // Lines 1-2, pixels 0 to (width - 16)
01164             VEC_NEXT_LINES();
01165             for (x = h_size / 32; x--; )
01166             {
01167                 VEC_SPLIT(uva_vec);
01168                 VEC_SPLIT(uvb_vec);
01169                 VEC_STORE_UV();
01170             }
01171 
01172             // Lines 1-2, pixels (width - 16) to width
01173             VEC_SPLIT(uva_vec);
01174 
01175             // Lines 3-4, pixels 0-16
01176             VEC_NEXT_LINES();
01177             VEC_SPLIT(uvb_vec);
01178             VEC_STORE_UV();
01179 
01180             // Lines 3-4, pixels 16 to width
01181             for (x = h_size / 32; x--; )
01182             {
01183                 VEC_SPLIT(uva_vec);
01184                 VEC_SPLIT(uvb_vec);
01185                 VEC_STORE_UV();
01186             }
01187         }
01188     }
01189     else
01190     {
01191         // Fall back to C version
01192         non_vec_2vuy_i420(py, pu, pv,
01193                           y_stride, u_stride, v_stride,
01194                           image, vuy_stride,
01195                           h_size, v_size);
01196     }
01197 }
01198 
01199 #endif // HAVE_ALTIVEC
01200 
01201 
01215 conv_2vuy_i420_fun get_2vuy_i420_conv(void)
01216 {
01217 #if HAVE_ALTIVEC
01218     if (has_altivec())
01219         return altivec_2vuy_i420;
01220 #endif
01221     return non_vec_2vuy_i420; /* Fallback to C */
01222 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends