|
MythTV
0.26-pre
|
00001 /* 00002 * yuv2rgb_mmx.c 00003 * Copyright (C) 2000-2001 Silicon Integrated System Corp. 00004 * All Rights Reserved. 00005 * 00006 * Author: Olie Lho <ollie@sis.com.tw> 00007 * 00008 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. 00009 * See http://libmpeg2.sourceforge.net/ for updates. 00010 * 00011 * mpeg2dec is free software; you can redistribute it and/or modify 00012 * it under the terms of the GNU General Public License as published by 00013 * the Free Software Foundation; either version 2 of the License, or 00014 * (at your option) any later version. 00015 * 00016 * mpeg2dec is distributed in the hope that it will be useful, 00017 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00018 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00019 * GNU General Public License for more details. 00020 * 00021 * You should have received a copy of the GNU General Public License 00022 * along with this program; if not, write to the Free Software 00023 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 00024 */ 00025 00026 #include <cstdio> 00027 #include <cstdlib> 00028 #include <algorithm> 00029 #include <inttypes.h> 00030 #include <limits.h> 00031 #include "mythconfig.h" 00032 #include "mythtvexp.h" // for MUNUSED 00033 00034 #if HAVE_MMX 00035 extern "C" { 00036 #include "ffmpeg-mmx.h" 00037 } 00038 #define CPU_MMXEXT 0 00039 #define CPU_MMX 1 00040 #endif 00041 00042 #if HAVE_ALTIVEC 00043 extern "C" { 00044 #include "libavutil/cpu.h" 00045 } 00046 int has_altivec(void); 00047 #if HAVE_ALTIVEC_H 00048 #include <altivec.h> 00049 #else 00050 #include <Accelerate/Accelerate.h> 00051 #endif 00052 #endif 00053 #include "yuv2rgb.h" 00054 00055 #if HAVE_ALTIVEC 00056 int has_altivec(void) 00057 { 00058 int cpu_flags = av_get_cpu_flags(); 00059 if (cpu_flags & AV_CPU_FLAG_ALTIVEC) 00060 return(1); 00061 00062 return(0); 00063 } 00064 #endif 00065 00071 static void yuv420_argb32_non_mmx(unsigned char *image, unsigned char *py, 00072 unsigned char *pu, unsigned char *pv, 00073 int h_size, int v_size, int rgb_stride, 00074 int y_stride, int uv_stride, int alphaones) 00075 MUNUSED; /* <- suppress compiler warning */ 00076 00077 /* CPU_MMXEXT/CPU_MMX adaptation layer */ 00078 00079 #define movntq(src,dest) \ 00080 do { \ 00081 if (cpu == CPU_MMXEXT) \ 00082 movntq_r2m (src, dest); \ 00083 else \ 00084 movq_r2m (src, dest); \ 00085 } while (0) 00086 00087 #if HAVE_MMX 00088 static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv) 00089 { 00090 static mmx_t mmx_80w = {0x0080008000800080LL}; 00091 static mmx_t mmx_U_green = {0xf37df37df37df37dLL}; 00092 static mmx_t mmx_U_blue = {0x4093409340934093LL}; 00093 static mmx_t mmx_V_red = {0x3312331233123312LL}; 00094 static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcLL}; 00095 static mmx_t mmx_10w = {0x1010101010101010LL}; 00096 static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffLL}; 00097 static mmx_t mmx_Y_coeff = {0x253f253f253f253fLL}; 00098 00099 movd_m2r (*pu, mm0); // mm0 = 00 00 00 00 u3 u2 u1 u0 00100 movd_m2r (*pv, mm1); // mm1 = 00 00 00 00 v3 v2 v1 v0 00101 movq_m2r (*py, mm6); // mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 00102 pxor_r2r (mm4, mm4); // mm4 = 0 00103 /* XXX might do cache preload for image here */ 00104 00105 /* 00106 * Do the multiply part of the conversion for even and odd pixels 00107 * register usage: 00108 * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels 00109 * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels 00110 * mm6 -> Y even, mm7 -> Y odd 00111 */ 00112 00113 punpcklbw_r2r (mm4, mm0); // mm0 = u3 u2 u1 u0 00114 punpcklbw_r2r (mm4, mm1); // mm1 = v3 v2 v1 v0 00115 psubsw_m2r (mmx_80w, mm0); // u -= 128 00116 psubsw_m2r (mmx_80w, mm1); // v -= 128 00117 psllw_i2r (3, mm0); // promote precision 00118 psllw_i2r (3, mm1); // promote precision 00119 movq_r2r (mm0, mm2); // mm2 = u3 u2 u1 u0 00120 movq_r2r (mm1, mm3); // mm3 = v3 v2 v1 v0 00121 pmulhw_m2r (mmx_U_green, mm2); // mm2 = u * u_green 00122 pmulhw_m2r (mmx_V_green, mm3); // mm3 = v * v_green 00123 pmulhw_m2r (mmx_U_blue, mm0); // mm0 = chroma_b 00124 pmulhw_m2r (mmx_V_red, mm1); // mm1 = chroma_r 00125 paddsw_r2r (mm3, mm2); // mm2 = chroma_g 00126 00127 psubusb_m2r (mmx_10w, mm6); // Y -= 16 00128 movq_r2r (mm6, mm7); // mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 00129 pand_m2r (mmx_00ffw, mm6); // mm6 = Y6 Y4 Y2 Y0 00130 psrlw_i2r (8, mm7); // mm7 = Y7 Y5 Y3 Y1 00131 psllw_i2r (3, mm6); // promote precision 00132 psllw_i2r (3, mm7); // promote precision 00133 pmulhw_m2r (mmx_Y_coeff, mm6); // mm6 = luma_rgb even 00134 pmulhw_m2r (mmx_Y_coeff, mm7); // mm7 = luma_rgb odd 00135 00136 /* 00137 * Do the addition part of the conversion for even and odd pixels 00138 * register usage: 00139 * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels 00140 * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels 00141 * mm6 -> Y even, mm7 -> Y odd 00142 */ 00143 00144 movq_r2r (mm0, mm3); // mm3 = chroma_b 00145 movq_r2r (mm1, mm4); // mm4 = chroma_r 00146 movq_r2r (mm2, mm5); // mm5 = chroma_g 00147 paddsw_r2r (mm6, mm0); // mm0 = B6 B4 B2 B0 00148 paddsw_r2r (mm7, mm3); // mm3 = B7 B5 B3 B1 00149 paddsw_r2r (mm6, mm1); // mm1 = R6 R4 R2 R0 00150 paddsw_r2r (mm7, mm4); // mm4 = R7 R5 R3 R1 00151 paddsw_r2r (mm6, mm2); // mm2 = G6 G4 G2 G0 00152 paddsw_r2r (mm7, mm5); // mm5 = G7 G5 G3 G1 00153 packuswb_r2r (mm0, mm0); // saturate to 0-255 00154 packuswb_r2r (mm1, mm1); // saturate to 0-255 00155 packuswb_r2r (mm2, mm2); // saturate to 0-255 00156 packuswb_r2r (mm3, mm3); // saturate to 0-255 00157 packuswb_r2r (mm4, mm4); // saturate to 0-255 00158 packuswb_r2r (mm5, mm5); // saturate to 0-255 00159 punpcklbw_r2r (mm3, mm0); // mm0 = B7 B6 B5 B4 B3 B2 B1 B0 00160 punpcklbw_r2r (mm4, mm1); // mm1 = R7 R6 R5 R4 R3 R2 R1 R0 00161 punpcklbw_r2r (mm5, mm2); // mm2 = G7 G6 G5 G4 G3 G2 G1 G0 00162 } 00163 00164 static inline void mmx_unpack_16rgb (uint8_t * image, int cpu) 00165 { 00166 static mmx_t mmx_bluemask = {0xf8f8f8f8f8f8f8f8LL}; 00167 static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfcLL}; 00168 static mmx_t mmx_redmask = {0xf8f8f8f8f8f8f8f8LL}; 00169 00170 /* 00171 * convert RGB plane to RGB 16 bits 00172 * mm0 -> B, mm1 -> R, mm2 -> G 00173 * mm4 -> GB, mm5 -> AR pixel 4-7 00174 * mm6 -> GB, mm7 -> AR pixel 0-3 00175 */ 00176 00177 pand_m2r (mmx_bluemask, mm0); // mm0 = b7b6b5b4b3______ 00178 pand_m2r (mmx_greenmask, mm2); // mm2 = g7g6g5g4g3g2____ 00179 pand_m2r (mmx_redmask, mm1); // mm1 = r7r6r5r4r3______ 00180 psrlq_i2r (3, mm0); // mm0 = ______b7b6b5b4b3 00181 pxor_r2r (mm4, mm4); // mm4 = 0 00182 movq_r2r (mm0, mm5); // mm5 = ______b7b6b5b4b3 00183 movq_r2r (mm2, mm7); // mm7 = g7g6g5g4g3g2____ 00184 00185 punpcklbw_r2r (mm4, mm2); 00186 punpcklbw_r2r (mm1, mm0); 00187 psllq_i2r (3, mm2); 00188 por_r2r (mm2, mm0); 00189 movntq (mm0, *image); 00190 00191 punpckhbw_r2r (mm4, mm7); 00192 punpckhbw_r2r (mm1, mm5); 00193 psllq_i2r (3, mm7); 00194 por_r2r (mm7, mm5); 00195 movntq (mm5, *(image+8)); 00196 } 00197 00198 static inline void mmx_unpack_32rgb (uint8_t * image, int cpu, int alphaones) 00199 { 00200 /* 00201 * convert RGB plane to RGB packed format, 00202 * mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0, 00203 * mm4 -> GB, mm5 -> AR pixel 4-7, 00204 * mm6 -> GB, mm7 -> AR pixel 0-3 00205 */ 00206 00207 if (alphaones) 00208 { 00209 static mmx_t mmx_1s = {0xffffffffffffffffLL}; 00210 movq_m2r (mmx_1s, mm3); 00211 } 00212 else 00213 pxor_r2r (mm3, mm3); 00214 00215 movq_r2r (mm0, mm6); 00216 movq_r2r (mm1, mm7); 00217 movq_r2r (mm0, mm4); 00218 movq_r2r (mm1, mm5); 00219 punpcklbw_r2r (mm2, mm6); 00220 punpcklbw_r2r (mm3, mm7); 00221 punpcklwd_r2r (mm7, mm6); 00222 movntq (mm6, *image); 00223 movq_r2r (mm0, mm6); 00224 punpcklbw_r2r (mm2, mm6); 00225 punpckhwd_r2r (mm7, mm6); 00226 movntq (mm6, *(image+8)); 00227 punpckhbw_r2r (mm2, mm4); 00228 punpckhbw_r2r (mm3, mm5); 00229 punpcklwd_r2r (mm5, mm4); 00230 movntq (mm4, *(image+16)); 00231 movq_r2r (mm0, mm4); 00232 punpckhbw_r2r (mm2, mm4); 00233 punpckhwd_r2r (mm5, mm4); 00234 movntq (mm4, *(image+24)); 00235 } 00236 00237 static inline void yuv420_rgb16 (uint8_t * image, 00238 uint8_t * py, uint8_t * pu, uint8_t * pv, 00239 int width, int height, 00240 int rgb_stride, int y_stride, int uv_stride, 00241 int cpu, int alphaones) 00242 { 00243 (void)alphaones; 00244 int i; 00245 00246 rgb_stride -= 2 * width; 00247 y_stride -= width; 00248 uv_stride -= width >> 1; 00249 width >>= 3; 00250 00251 do { 00252 i = width; 00253 do { 00254 mmx_yuv2rgb (py, pu, pv); 00255 mmx_unpack_16rgb (image, cpu); 00256 py += 8; 00257 pu += 4; 00258 pv += 4; 00259 image += 16; 00260 } while (--i); 00261 00262 py += y_stride; 00263 image += rgb_stride; 00264 if (height & 1) { 00265 pu += uv_stride; 00266 pv += uv_stride; 00267 } else { 00268 pu -= 4 * width; 00269 pv -= 4 * width; 00270 } 00271 } while (--height); 00272 00273 emms(); 00274 } 00275 00276 static inline void yuv420_argb32 (uint8_t * image, uint8_t * py, 00277 uint8_t * pu, uint8_t * pv, 00278 int width, int height, 00279 int rgb_stride, int y_stride, int uv_stride, 00280 int cpu, int alphaones) 00281 { 00282 int i; 00283 00284 rgb_stride -= 4 * width; 00285 y_stride -= width; 00286 uv_stride -= width >> 1; 00287 width >>= 3; 00288 00289 do { 00290 i = width; 00291 do { 00292 mmx_yuv2rgb (py, pu, pv); 00293 mmx_unpack_32rgb (image, cpu, alphaones); 00294 py += 8; 00295 pu += 4; 00296 pv += 4; 00297 image += 32; 00298 } while (--i); 00299 00300 py += y_stride; 00301 image += rgb_stride; 00302 if (height & 1) { 00303 pu += uv_stride; 00304 pv += uv_stride; 00305 } else { 00306 pu -= 4 * width; 00307 pv -= 4 * width; 00308 } 00309 } while (--height); 00310 00311 emms(); 00312 } 00313 00314 static void mmxext_rgb16 (uint8_t * image, 00315 uint8_t * py, uint8_t * pu, uint8_t * pv, 00316 int width, int height, 00317 int rgb_stride, int y_stride, int uv_stride, 00318 int alphaones) 00319 { 00320 yuv420_rgb16 (image, py, pu, pv, width, height, 00321 rgb_stride, y_stride, uv_stride, CPU_MMXEXT, alphaones); 00322 } 00323 00324 static void mmxext_argb32 (uint8_t * image, 00325 uint8_t * py, uint8_t * pu, uint8_t * pv, 00326 int width, int height, 00327 int rgb_stride, int y_stride, int uv_stride, 00328 int alphaones) 00329 { 00330 yuv420_argb32 (image, py, pu, pv, width, height, 00331 rgb_stride, y_stride, uv_stride, CPU_MMXEXT, alphaones); 00332 } 00333 00334 static void mmx_rgb16 (uint8_t * image, 00335 uint8_t * py, uint8_t * pu, uint8_t * pv, 00336 int width, int height, 00337 int rgb_stride, int y_stride, int uv_stride, 00338 int alphaones) 00339 { 00340 yuv420_rgb16 (image, py, pu, pv, width, height, 00341 rgb_stride, y_stride, uv_stride, CPU_MMX, alphaones); 00342 } 00343 00344 static void mmx_argb32 (uint8_t * image, 00345 uint8_t * py, uint8_t * pu, uint8_t * pv, 00346 int width, int height, 00347 int rgb_stride, int y_stride, int uv_stride, 00348 int alphaones) 00349 { 00350 yuv420_argb32 (image, py, pu, pv, width, height, 00351 rgb_stride, y_stride, uv_stride, CPU_MMX, alphaones); 00352 } 00353 #endif 00354 00364 yuv2rgb_fun yuv2rgb_init_mmxext (int bpp, int mode) 00365 { 00366 #if HAVE_MMX 00367 if ((bpp == 16) && (mode == MODE_RGB)) 00368 return mmxext_rgb16; 00369 else if ((bpp == 32) && (mode == MODE_RGB)) 00370 return mmxext_argb32; 00371 #endif 00372 00373 (void)bpp; 00374 (void)mode; 00375 00376 return NULL; /* Fallback to C */ 00377 } 00378 00388 yuv2rgb_fun yuv2rgb_init_mmx (int bpp, int mode) 00389 { 00390 #if HAVE_MMX 00391 if ((bpp == 16) && (mode == MODE_RGB)) 00392 return mmx_rgb16; 00393 else if ((bpp == 32) && (mode == MODE_RGB)) 00394 return mmx_argb32; 00395 #endif 00396 if ((bpp == 32) && (mode == MODE_RGB)) 00397 return yuv420_argb32_non_mmx; 00398 00399 return NULL; 00400 } 00401 00402 #define SCALE_BITS 10 00403 00404 #define C_Y (76309 >> (16 - SCALE_BITS)) 00405 #define C_RV (117504 >> (16 - SCALE_BITS)) 00406 #define C_BU (138453 >> (16 - SCALE_BITS)) 00407 #define C_GU (13954 >> (16 - SCALE_BITS)) 00408 #define C_GV (34903 >> (16 - SCALE_BITS)) 00409 00410 #if defined(__FreeBSD__) 00411 // HACK: this is actually only needed on AMD64 at the moment, 00412 // but is doesn't hurt the other architectures. 00413 #undef UCHAR_MAX 00414 #define UCHAR_MAX (int)__UCHAR_MAX 00415 #endif 00416 00417 #define RGBOUT(r, g, b, y1)\ 00418 {\ 00419 y = (y1 - 16) * C_Y;\ 00420 r = std::min(UCHAR_MAX, std::max(0, (y + r_add) >> SCALE_BITS));\ 00421 g = std::min(UCHAR_MAX, std::max(0, (y + g_add) >> SCALE_BITS));\ 00422 b = std::min(UCHAR_MAX, std::max(0, (y + b_add) >> SCALE_BITS));\ 00423 } 00424 00425 static void yuv420_argb32_non_mmx(unsigned char *image, unsigned char *py, 00426 unsigned char *pu, unsigned char *pv, 00427 int h_size, int v_size, int rgb_stride, 00428 int y_stride, int uv_stride, int alphaones) 00429 { 00430 unsigned char *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr, *d, *d1, *d2; 00431 int w, y, cb, cr, r_add, g_add, b_add, width2; 00432 int dstwidth; 00433 00434 // byte indices 00435 #if HAVE_BIGENDIAN 00436 #define R_OI 1 00437 #define G_OI 2 00438 #define B_OI 3 00439 #define A_OI 0 00440 #else 00441 #define R_OI 2 00442 #define G_OI 1 00443 #define B_OI 0 00444 #define A_OI 3 00445 #endif 00446 00447 // squelch a warning 00448 (void) rgb_stride; (void) y_stride; (void) uv_stride; 00449 00450 d = image; 00451 y1_ptr = py; 00452 cb_ptr = pu; 00453 cr_ptr = pv; 00454 dstwidth = h_size * 4; 00455 width2 = h_size / 2; 00456 00457 for(;v_size > 0; v_size -= 2) { 00458 d1 = d; 00459 d2 = d + h_size * 4; 00460 y2_ptr = y1_ptr + h_size; 00461 for(w = width2; w > 0; w--) { 00462 cb = cb_ptr[0] - 128; 00463 cr = cr_ptr[0] - 128; 00464 r_add = C_RV * cr + (1 << (SCALE_BITS - 1)); 00465 g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1)); 00466 b_add = C_BU * cb + (1 << (SCALE_BITS - 1)); 00467 00468 /* output 4 pixels */ 00469 RGBOUT(d1[R_OI], d1[G_OI], d1[B_OI], y1_ptr[0]); 00470 RGBOUT(d1[R_OI+4], d1[G_OI+4], d1[B_OI+4], y1_ptr[1]); 00471 RGBOUT(d2[R_OI], d2[G_OI], d2[B_OI], y2_ptr[0]); 00472 RGBOUT(d2[R_OI+4], d2[G_OI+4], d2[B_OI+4], y2_ptr[1]); 00473 00474 if (alphaones) 00475 d1[A_OI] = d1[A_OI+4] = d2[A_OI] = d2[A_OI+4] = 0xff; 00476 else 00477 d1[A_OI] = d1[A_OI+4] = d2[A_OI] = d2[A_OI+4] = 0; 00478 00479 d1 += 8; 00480 d2 += 8; 00481 y1_ptr += 2; 00482 y2_ptr += 2; 00483 cb_ptr++; 00484 cr_ptr++; 00485 } 00486 d += 2 * dstwidth; 00487 y1_ptr += h_size; 00488 } 00489 } 00490 00491 #define SCALEBITS 8 00492 #define ONE_HALF (1 << (SCALEBITS - 1)) 00493 #define FIX(x) ((int) ((x) * (1L<<SCALEBITS) + 0.5)) 00494 00499 void rgb32_to_yuv420p(unsigned char *lum, unsigned char *cb, unsigned char *cr, 00500 unsigned char *alpha, unsigned char *src, 00501 int width, int height, int srcwidth) 00502 { 00503 int wrap, wrap4, x, y; 00504 int r, g, b, r1, g1, b1; 00505 unsigned char *p; 00506 00507 // byte indices 00508 #if HAVE_BIGENDIAN 00509 #define R_II 3 00510 #define G_II 2 00511 #define B_II 1 00512 #define A_II 0 00513 #else 00514 #define R_II 0 00515 #define G_II 1 00516 #define B_II 2 00517 #define A_II 3 00518 #endif 00519 00520 wrap = (width + 1) & ~1; 00521 wrap4 = srcwidth * 4; 00522 p = src; 00523 for(y=0;y+1<height;y+=2) { 00524 for(x=0;x+1<width;x+=2) { 00525 r = p[R_II]; 00526 g = p[G_II]; 00527 b = p[B_II]; 00528 r1 = r; 00529 g1 = g; 00530 b1 = b; 00531 lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + 00532 FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; 00533 alpha[0] = p[A_II]; 00534 00535 r = p[R_II+4]; 00536 g = p[G_II+4]; 00537 b = p[B_II+4]; 00538 r1 += r; 00539 g1 += g; 00540 b1 += b; 00541 lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + 00542 FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; 00543 alpha[1] = p[A_II+4]; 00544 00545 p += wrap4; 00546 lum += wrap; 00547 alpha += wrap; 00548 00549 r = p[R_II]; 00550 g = p[G_II]; 00551 b = p[B_II]; 00552 r1 += r; 00553 g1 += g; 00554 b1 += b; 00555 lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + 00556 FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; 00557 alpha[0] = p[A_II]; 00558 00559 r = p[R_II+4]; 00560 g = p[G_II+4]; 00561 b = p[B_II+4]; 00562 r1 += r; 00563 g1 += g; 00564 b1 += b; 00565 lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + 00566 FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; 00567 alpha[1] = p[A_II+4]; 00568 00569 cr[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + 00570 FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 00571 128; 00572 cb[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - 00573 FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) + 00574 128; 00575 00576 cb++; 00577 cr++; 00578 p += -wrap4 + 2 * 4; 00579 lum += -wrap + 2; 00580 alpha += -wrap + 2; 00581 } 00582 if (width & 1) { 00583 r = p[R_II]; 00584 g = p[G_II]; 00585 b = p[B_II]; 00586 r1 = r; 00587 g1 = g; 00588 b1 = b; 00589 lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + 00590 FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; 00591 alpha[0] = p[A_II]; 00592 00593 lum[1] = 16; 00594 alpha[1] = 0; 00595 00596 p += wrap4; 00597 lum += wrap; 00598 alpha += wrap; 00599 00600 r = p[R_II]; 00601 g = p[G_II]; 00602 b = p[B_II]; 00603 r1 += r; 00604 g1 += g; 00605 b1 += b; 00606 lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + 00607 FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; 00608 alpha[0] = p[A_II]; 00609 00610 lum[1] = 16; 00611 alpha[1] = 0; 00612 00613 cr[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + 00614 FIX(0.50000) * b1 + 2 * ONE_HALF - 1) >> (SCALEBITS + 1)) + 00615 128; 00616 cb[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - 00617 FIX(0.08131) * b1 + 2 * ONE_HALF - 1) >> (SCALEBITS + 1)) + 00618 128; 00619 00620 cb++; 00621 cr++; 00622 p += -wrap4 + 4; 00623 lum += -wrap + 2; 00624 alpha += -wrap + 2; 00625 } 00626 p += wrap4 * 2 - width * 4; 00627 lum += wrap; 00628 alpha += wrap; 00629 } 00630 if (height & 1) { 00631 for(x=0;x+1<width;x+=2) { 00632 r = p[R_II]; 00633 g = p[G_II]; 00634 b = p[B_II]; 00635 r1 = r; 00636 g1 = g; 00637 b1 = b; 00638 lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + 00639 FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; 00640 alpha[0] = p[A_II]; 00641 00642 r = p[R_II+4]; 00643 g = p[G_II+4]; 00644 b = p[B_II+4]; 00645 r1 += r; 00646 g1 += g; 00647 b1 += b; 00648 lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g + 00649 FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; 00650 alpha[1] = p[A_II+4]; 00651 00652 lum += wrap; 00653 alpha += wrap; 00654 00655 lum[0] = 16; 00656 alpha[0] = 0; 00657 00658 lum[1] = 16; 00659 alpha[1] = 0; 00660 00661 cr[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + 00662 FIX(0.50000) * b1 + 2 * ONE_HALF - 1) >> (SCALEBITS + 1)) + 00663 128; 00664 cb[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - 00665 FIX(0.08131) * b1 + 2 * ONE_HALF - 1) >> (SCALEBITS + 1)) + 00666 128; 00667 00668 cb++; 00669 cr++; 00670 p += 2 * 4; 00671 lum += -wrap + 2; 00672 alpha += -wrap + 2; 00673 } 00674 if (width & 1) { 00675 r = p[R_II]; 00676 g = p[G_II]; 00677 b = p[B_II]; 00678 r1 = r; 00679 g1 = g; 00680 b1 = b; 00681 lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g + 00682 FIX(0.11400) * b + ONE_HALF) >> SCALEBITS; 00683 alpha[0] = p[A_II]; 00684 00685 lum[1] = 16; 00686 alpha[1] = 0; 00687 00688 lum += wrap; 00689 alpha += wrap; 00690 00691 lum[0] = 16; 00692 alpha[0] = 0; 00693 00694 lum[1] = 16; 00695 alpha[1] = 0; 00696 00697 cr[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 + 00698 FIX(0.50000) * b1 + ONE_HALF - 1) >> SCALEBITS) + 00699 128; 00700 cb[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 - 00701 FIX(0.08131) * b1 + ONE_HALF - 1) >> SCALEBITS) + 00702 128; 00703 00704 cb++; 00705 cr++; 00706 p += 4; 00707 lum += -wrap + 2; 00708 alpha += -wrap + 2; 00709 } 00710 } 00711 } 00712 00713 /* I420 to 2VUY colorspace conversion routines. 00714 * 00715 * In the early days of the OS X port of MythTV, Paul Jara noticed that 00716 * QuickTime spent a lot of time converting from YUV420 to YUV422. 00717 * He found some sample code on the Ars Technica forum by a 00718 * Frenchman called Titer which used Altivec to speed this up. 00719 * Jeremiah Morris took that code and added it into MythTV. 00720 * 00721 * All was well until the Intel Macs came along, 00722 * which seem to crash when fed YUV420 from MythTV. 00723 * 00724 * Fortunately, Mino Taoyama has provided an MMX optimised version too. 00725 */ 00726 00739 static void non_vec_i420_2vuy( 00740 uint8_t *image, int vuy_stride, 00741 const uint8_t *py, const uint8_t *pu, const uint8_t *pv, 00742 int y_stride, int u_stride, int v_stride, 00743 int h_size, int v_size) 00744 { 00745 uint8_t *pi1, *pi2; 00746 const uint8_t *py1; 00747 const uint8_t *py2; 00748 const uint8_t *pu1; 00749 const uint8_t *pv1; 00750 int x, y; 00751 00752 for (y = 0; y < (v_size>>1); y++) 00753 { 00754 pi1 = image + 2*y * vuy_stride; 00755 pi2 = image + 2*y * vuy_stride + vuy_stride; 00756 py1 = py + 2*y * y_stride; 00757 py2 = py + 2*y * y_stride + y_stride; 00758 pu1 = pu + y * u_stride; 00759 pv1 = pv + y * v_stride; 00760 00761 for (x = 0; x < (h_size>>1); x++) 00762 { 00763 pi1[4*x+0] = pu1[1*x+0]; 00764 pi2[4*x+0] = pu1[1*x+0]; 00765 pi1[4*x+1] = py1[2*x+0]; 00766 pi2[4*x+1] = py2[2*x+0]; 00767 pi1[4*x+2] = pv1[1*x+0]; 00768 pi2[4*x+2] = pv1[1*x+0]; 00769 pi1[4*x+3] = py1[2*x+1]; 00770 pi2[4*x+3] = py2[2*x+1]; 00771 } 00772 } 00773 } 00774 00775 #if HAVE_MMX 00776 00788 static void mmx_i420_2vuy( 00789 uint8_t *image, int vuy_stride, 00790 const uint8_t *py, const uint8_t *pu, const uint8_t *pv, 00791 int y_stride, int u_stride, int v_stride, 00792 int h_size, int v_size) 00793 { 00794 uint8_t *pi1, *pi2; 00795 const uint8_t *py1 = py; 00796 const uint8_t *py2 = py; 00797 const uint8_t *pu1 = pu; 00798 const uint8_t *pv1 = pv; 00799 00800 int x,y; 00801 00802 if ((h_size % 16) || (v_size % 2)) 00803 { 00804 non_vec_i420_2vuy(image, vuy_stride, 00805 py, pu, pv, y_stride, u_stride, v_stride, 00806 h_size, v_size); 00807 return; 00808 } 00809 00810 emms(); 00811 00812 for (y = 0; y < (v_size>>1); y++) 00813 { 00814 pi1 = image + 2*y * vuy_stride; 00815 pi2 = image + 2*y * vuy_stride + vuy_stride; 00816 py1 = py + 2*y * y_stride; 00817 py2 = py + 2*y * y_stride + y_stride; 00818 pu1 = pu + y * u_stride; 00819 pv1 = pv + y * v_stride; 00820 00821 for (x = 0; x < h_size / 16; x++) 00822 { 00823 movq_m2r (*py1, mm0); // y data 00824 movq_m2r (*py2, mm1); // y data 00825 movq_m2r (*pu1, mm2); // u data 00826 movq_m2r (*pv1, mm3); // v data 00827 00828 movq_r2r (mm2, mm4); // Copy U 00829 00830 punpcklbw_r2r (mm3, mm2); // Combine low U & V mm2 = uv low 00831 punpckhbw_r2r (mm3, mm4); // Combine high U & V mm4 = uv high 00832 00833 movq_r2r (mm2, mm5); // Copy low UV mm5 = uv low 00834 movq_r2r (mm2, mm6); // Copy low UV mm6 = uv low 00835 punpcklbw_r2r (mm0, mm5); // mm5 = y1 low uv low 00836 punpckhbw_r2r (mm0, mm6); // mm6 = y1 high uv high 00837 00838 movntq_r2m (mm5, *(pi1)); 00839 movntq_r2m (mm6, *(pi1+8)); 00840 00841 movq_r2r (mm2, mm5); // Copy low UV mm5 = uv low 00842 movq_r2r (mm2, mm6); // Copy low UV mm6 = uv low 00843 punpcklbw_r2r (mm1, mm5); // mm5 = y2 low uv low 00844 punpckhbw_r2r (mm1, mm6); // mm6 = y2 high uv high 00845 00846 movntq_r2m (mm5, *(pi2)); 00847 movntq_r2m (mm6, *(pi2+8)); 00848 00849 00850 movq_m2r (*(py1+8), mm0); // y data 00851 movq_m2r (*(py2+8), mm1); // y data 00852 00853 movq_r2r (mm4, mm5); // Copy high UV mm5 = uv high 00854 movq_r2r (mm4, mm6); // Copy high UV mm6 = uv high 00855 punpcklbw_r2r (mm0, mm5); // mm5 = y1 low uv high 00856 punpckhbw_r2r (mm0, mm6); // mm6 = y1 high uv high 00857 00858 movntq_r2m (mm5, *(pi1+16)); 00859 movntq_r2m (mm6, *(pi1+24)); 00860 00861 movq_r2r (mm4, mm5); // Copy high UV mm5 = uv high 00862 movq_r2r (mm4, mm6); // Copy high UV mm6 = uv high 00863 punpcklbw_r2r (mm1, mm5); // mm5 = y2 low uv low 00864 punpckhbw_r2r (mm1, mm6); // mm6 = y2 high uv high 00865 00866 movntq_r2m (mm5, *(pi2+16)); 00867 movntq_r2m (mm6, *(pi2+24)); 00868 00869 pi1 += 32; 00870 pi2 += 32; 00871 py1 += 16; 00872 py2 += 16; 00873 pu1 += 8; 00874 pv1 += 8; 00875 } 00876 } 00877 00878 emms(); 00879 } 00880 00881 #endif // HAVE_MMX 00882 00883 #if HAVE_ALTIVEC 00884 00885 // Altivec code adapted from VLC's i420_yuv2.c (thanks to Titer and Paul Jara) 00886 00887 #define VEC_NEXT_LINES() \ 00888 pi1 = pi2; \ 00889 pi2 += h_size * 2; \ 00890 py1 = py2; \ 00891 py2 += h_size; 00892 00893 #define VEC_LOAD_UV() \ 00894 u_vec = vec_ld(0, pu); pu += 16; \ 00895 v_vec = vec_ld(0, pv); pv += 16; 00896 00897 #define VEC_MERGE(a) \ 00898 uv_vec = a(u_vec, v_vec); \ 00899 y_vec = vec_ld(0, py1); py1 += 16; \ 00900 vec_st(vec_mergeh(uv_vec, y_vec), 0, pi1); pi1 += 16; \ 00901 vec_st(vec_mergel(uv_vec, y_vec), 0, pi1); pi1 += 16; \ 00902 y_vec = vec_ld(0, py2); py2 += 16; \ 00903 vec_st(vec_mergeh(uv_vec, y_vec), 0, pi2); pi2 += 16; \ 00904 vec_st(vec_mergel(uv_vec, y_vec), 0, pi2); pi2 += 16; 00905 00918 static void altivec_i420_2vuy( 00919 uint8_t *image, int vuy_stride, 00920 const uint8_t *py, const uint8_t *pu, const uint8_t *pv, 00921 int y_stride, int u_stride, int v_stride, 00922 int h_size, int v_size) 00923 { 00924 uint8_t *pi1, *pi2 = image; 00925 const uint8_t *py1; 00926 const uint8_t *py2 = py; 00927 00928 int x, y; 00929 00930 vector unsigned char u_vec; 00931 vector unsigned char v_vec; 00932 vector unsigned char uv_vec; 00933 vector unsigned char y_vec; 00934 00935 int vuy_extra = vuy_stride - (h_size<<1); 00936 int y_extra = y_stride - (h_size); 00937 int u_extra = u_stride - (h_size>>1); 00938 int v_extra = v_stride - (h_size>>1); 00939 00940 if (vuy_extra || y_extra || u_extra || v_extra) 00941 { 00942 // Fall back to C version 00943 non_vec_i420_2vuy(image, vuy_stride, 00944 py, pu, pv, 00945 y_stride, u_stride, v_stride, 00946 h_size, v_size); 00947 return; 00948 } 00949 00950 if (!((h_size % 32) || (v_size % 2))) 00951 { 00952 // Width is a multiple of 32, process 2 lines at a time 00953 for (y = v_size / 2; y--; ) 00954 { 00955 VEC_NEXT_LINES(); 00956 for (x = h_size / 32; x--; ) 00957 { 00958 VEC_LOAD_UV(); 00959 VEC_MERGE(vec_mergeh); 00960 VEC_MERGE(vec_mergel); 00961 } 00962 } 00963 00964 } 00965 else if (!((h_size % 16) || (v_size % 4))) 00966 { 00967 // Width is a multiple of 16, process 4 lines at a time 00968 for (y = v_size / 4; y--; ) 00969 { 00970 // Lines 1-2, pixels 0 to (width - 16) 00971 VEC_NEXT_LINES(); 00972 for (x = h_size / 32; x--; ) 00973 { 00974 VEC_LOAD_UV(); 00975 VEC_MERGE(vec_mergeh); 00976 VEC_MERGE(vec_mergel); 00977 } 00978 00979 // Lines 1-2, pixels (width - 16) to width 00980 VEC_LOAD_UV(); 00981 VEC_MERGE(vec_mergeh); 00982 00983 // Lines 3-4, pixels 0-16 00984 VEC_NEXT_LINES(); 00985 VEC_MERGE(vec_mergel); 00986 00987 // Lines 3-4, pixels 16 to width 00988 for (x = h_size / 32; x--; ) 00989 { 00990 VEC_LOAD_UV(); 00991 VEC_MERGE(vec_mergeh); 00992 VEC_MERGE(vec_mergel); 00993 } 00994 } 00995 } 00996 else 00997 { 00998 // Fall back to C version 00999 non_vec_i420_2vuy(image, vuy_stride, 01000 py, pu, pv, 01001 y_stride, u_stride, v_stride, 01002 h_size, v_size); 01003 } 01004 } 01005 01006 #endif // HAVE_ALTIVEC 01007 01008 01022 conv_i420_2vuy_fun get_i420_2vuy_conv(void) 01023 { 01024 #if HAVE_ALTIVEC 01025 if (has_altivec()) 01026 return altivec_i420_2vuy; 01027 #endif 01028 #if HAVE_MMX 01029 return mmx_i420_2vuy; 01030 #else 01031 return non_vec_i420_2vuy; /* Fallback to C */ 01032 #endif 01033 } 01034 01044 static void non_vec_2vuy_i420( 01045 uint8_t *py, uint8_t *pu, uint8_t *pv, 01046 int y_stride, int u_stride, int v_stride, 01047 const uint8_t *image, int vuy_stride, 01048 int h_size, int v_size) 01049 { 01050 const uint8_t *pi1; 01051 const uint8_t *pi2; 01052 uint8_t *py1, *py2, *pu1, *pv1; 01053 int x, y; 01054 01055 for (y = 0; y < (v_size>>1); y++) 01056 { 01057 pi1 = image + 2*y * vuy_stride; 01058 pi2 = image + 2*y * vuy_stride + vuy_stride; 01059 py1 = py + 2*y * y_stride; 01060 py2 = py + 2*y * y_stride + y_stride; 01061 pu1 = pu + y * u_stride; 01062 pv1 = pv + y * v_stride; 01063 01064 for (x = 0; x < (h_size>>1); x++) 01065 { 01066 pu1[1*x+0] = (pi1[4*x+0] + pi2[4*x+0]) >> 1; 01067 py1[2*x+0] = pi1[4*x+1]; 01068 py2[2*x+0] = pi2[4*x+1]; 01069 pv1[1*x+0] = (pi1[4*x+2] + pi2[4*x+2]) >> 1; 01070 py1[2*x+1] = pi1[4*x+3]; 01071 py2[2*x+1] = pi2[4*x+3]; 01072 } 01073 } 01074 } 01075 01076 #if HAVE_ALTIVEC 01077 01078 // Altivec code adapted from VLC's i420_yuv2.c (thanks to Titer and Paul Jara) 01079 01080 #define VEC_READ_LINE(ptr, y, uv) \ 01081 pa_vec = vec_ld(0, ptr); ptr += 16; \ 01082 pb_vec = vec_ld(0, ptr); ptr += 16; \ 01083 vec_st(vec_pack((vector unsigned short)pa_vec, \ 01084 (vector unsigned short)pb_vec), \ 01085 0, y); y += 16; \ 01086 uv = vec_pack(vec_sr((vector unsigned short)pa_vec, eight_vec), \ 01087 vec_sr((vector unsigned short)pb_vec, eight_vec)); 01088 01089 #define VEC_SPLIT(a) \ 01090 VEC_READ_LINE(pi1, py1, uv1_vec); \ 01091 VEC_READ_LINE(pi2, py2, uv2_vec); \ 01092 a = vec_avg(uv1_vec, uv2_vec); 01093 01094 #define VEC_STORE_UV() \ 01095 vec_st(vec_pack((vector unsigned short)uva_vec, \ 01096 (vector unsigned short)uvb_vec), \ 01097 0, pv); pv += 16; \ 01098 vec_st(vec_pack(vec_sr((vector unsigned short)uva_vec, eight_vec), \ 01099 vec_sr((vector unsigned short)uvb_vec, eight_vec)), \ 01100 0, pu); pu += 16; 01101 01102 01112 static void altivec_2vuy_i420( 01113 uint8_t *py, uint8_t *pu, uint8_t *pv, 01114 int y_stride, int u_stride, int v_stride, 01115 const uint8_t *image, int vuy_stride, 01116 int h_size, int v_size) 01117 { 01118 const uint8_t *pi1; 01119 const uint8_t *pi2 = image; 01120 uint8_t *py1, *py2 = py; 01121 01122 int x, y; 01123 01124 vector unsigned short eight_vec = vec_splat_u16(8); 01125 vector unsigned char pa_vec, pb_vec, 01126 uv1_vec, uv2_vec, 01127 uva_vec, uvb_vec; 01128 01129 int vuy_extra = vuy_stride - (h_size<<1); 01130 int y_extra = y_stride - (h_size); 01131 int u_extra = u_stride - (h_size>>1); 01132 int v_extra = v_stride - (h_size>>1); 01133 01134 if (vuy_extra || y_extra || u_extra || v_extra) 01135 { 01136 // Fall back to C version 01137 non_vec_2vuy_i420(py, pu, pv, 01138 y_stride, u_stride, v_stride, 01139 image, vuy_stride, 01140 h_size, v_size); 01141 return; 01142 } 01143 01144 if (!((h_size % 32) || (v_size % 2))) 01145 { 01146 // Width is a multiple of 32, process 2 lines at a time 01147 for (y = v_size / 2; y--; ) 01148 { 01149 VEC_NEXT_LINES(); 01150 for (x = h_size / 32; x--; ) 01151 { 01152 VEC_SPLIT(uva_vec); 01153 VEC_SPLIT(uvb_vec); 01154 VEC_STORE_UV(); 01155 } 01156 } 01157 } 01158 else if (!((h_size % 16) || (v_size % 4))) 01159 { 01160 // Width is a multiple of 16, process 4 lines at a time 01161 for (y = v_size / 4; y--; ) 01162 { 01163 // Lines 1-2, pixels 0 to (width - 16) 01164 VEC_NEXT_LINES(); 01165 for (x = h_size / 32; x--; ) 01166 { 01167 VEC_SPLIT(uva_vec); 01168 VEC_SPLIT(uvb_vec); 01169 VEC_STORE_UV(); 01170 } 01171 01172 // Lines 1-2, pixels (width - 16) to width 01173 VEC_SPLIT(uva_vec); 01174 01175 // Lines 3-4, pixels 0-16 01176 VEC_NEXT_LINES(); 01177 VEC_SPLIT(uvb_vec); 01178 VEC_STORE_UV(); 01179 01180 // Lines 3-4, pixels 16 to width 01181 for (x = h_size / 32; x--; ) 01182 { 01183 VEC_SPLIT(uva_vec); 01184 VEC_SPLIT(uvb_vec); 01185 VEC_STORE_UV(); 01186 } 01187 } 01188 } 01189 else 01190 { 01191 // Fall back to C version 01192 non_vec_2vuy_i420(py, pu, pv, 01193 y_stride, u_stride, v_stride, 01194 image, vuy_stride, 01195 h_size, v_size); 01196 } 01197 } 01198 01199 #endif // HAVE_ALTIVEC 01200 01201 01215 conv_2vuy_i420_fun get_2vuy_i420_conv(void) 01216 { 01217 #if HAVE_ALTIVEC 01218 if (has_altivec()) 01219 return altivec_2vuy_i420; 01220 #endif 01221 return non_vec_2vuy_i420; /* Fallback to C */ 01222 }
1.7.6.1