How to convert RGB565 to YUV420SP faster on Android?

I need to display a JPEG image and convert it to YUV420SP. First I use SkBitmap to decode the JPEG and display it, then I use the code below to convert RGB565 to YUV420SP on Android. However, converting a 640×480 RGB565 image takes about 75 ms. Does anyone know a faster way to convert RGB565 to YUV420SP on Android, or a faster way to convert a JPEG file directly to YUV420SP on Android?

// Convert from RGB to YUV420 int RGB2YUV_YR[256], RGB2YUV_YG[256], RGB2YUV_YB[256]; int RGB2YUV_UR[256], RGB2YUV_UG[256], RGB2YUV_UBVR[256]; int RGB2YUV_VG[256], RGB2YUV_VB[256]; // // Table used for RGB to YUV420 conversion // void InitLookupTable() { static bool hasInited = false; if(hasInited) return ; hasInited = true; int i; for (i = 0; i < 256; i++) RGB2YUV_YR[i] = (float) 65.481 * (i << 8); for (i = 0; i < 256; i++) RGB2YUV_YG[i] = (float) 128.553 * (i << 8); for (i = 0; i < 256; i++) RGB2YUV_YB[i] = (float) 24.966 * (i << 8); for (i = 0; i < 256; i++) RGB2YUV_UR[i] = (float) 37.797 * (i << 8); for (i = 0; i < 256; i++) RGB2YUV_UG[i] = (float) 74.203 * (i << 8); for (i = 0; i < 256; i++) RGB2YUV_VG[i] = (float) 93.786 * (i << 8); for (i = 0; i < 256; i++) RGB2YUV_VB[i] = (float) 18.214 * (i << 8); for (i = 0; i < 256; i++) RGB2YUV_UBVR[i] = (float) 112 * (i << 8); } int ConvertRGB5652YUV420SP(int w, int h, unsigned char *bmp, unsigned char *yuv) { unsigned char *u, *v, *y, *uu, *vv; unsigned char *pu1, *pu2, *pu3, *pu4; unsigned char *pv1, *pv2, *pv3, *pv4; unsigned char rValue = 0, gValue = 0, bValue = 0; uint16_t* bmpPtr; int i, j; printf("ConvertRGB5652YUV420SP begin,w=%d,h=%d,bmp=%p,yuv=%p\n", w, h, bmp, yuv); struct timeval tpstart,tpend; gettimeofday(&tpstart,NULL); InitLookupTable(); gettimeofday(&tpend,NULL); float timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec; timeuse/=1000; printf("InitLookupTable used time=%f\n", timeuse); gettimeofday(&tpstart,NULL); uu = new unsigned char[w * h]; vv = new unsigned char[w * h]; if (uu == NULL || vv == NULL || yuv == NULL) return 0; y = yuv; u = uu; v = vv; // Get r,g,b pointers from bmp image data.... bmpPtr = (uint16_t*)bmp; //Get YUV values for rgb values... 
for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { uint16_t color = *bmpPtr; unsigned int r = (color>>11) & 0x1f; unsigned int g = (color>> 5) & 0x3f; unsigned int b = (color ) & 0x1f; rValue = (r<<3) | (r>>2); gValue = (g<<2) | (g>>4); bValue = (b<<3) | (b>>2); *y++ = (RGB2YUV_YR[rValue] + RGB2YUV_YG[gValue] + RGB2YUV_YB[bValue] + 1048576) >> 16; *u++ = (-RGB2YUV_UR[rValue] - RGB2YUV_UG[gValue] + RGB2YUV_UBVR[bValue] + 8388608) >> 16; *v++ = (RGB2YUV_UBVR[rValue] - RGB2YUV_VG[gValue] - RGB2YUV_VB[bValue] + 8388608) >> 16; bmpPtr++; } } gettimeofday(&tpend,NULL); timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec; timeuse/=1000; printf("Get YUV values used time=%f\n", timeuse); gettimeofday(&tpstart,NULL); // Now sample the U & V to obtain YUV 4:2:0 format // Get the right pointers... u = yuv + w * h; v = u + 1; // For U pu1 = uu; pu2 = pu1 + 1; pu3 = pu1 + w; pu4 = pu3 + 1; // For V pv1 = vv; pv2 = pv1 + 1; pv3 = pv1 + w; pv4 = pv3 + 1; // Do sampling.... for (i = 0; i < h; i += 2) { for (j = 0; j < w; j += 2) { *u = (*pu1 + *pu2 + *pu3 + *pu4) >> 2; u += 2; *v = (*pv1 + *pv2 + *pv3 + *pv4) >> 2; v += 2; pu1 += 2; pu2 += 2; pu3 += 2; pu4 += 2; pv1 += 2; pv2 += 2; pv3 += 2; pv4 += 2; } pu1 += w; pu2 += w; pu3 += w; pu4 += w; pv1 += w; pv2 += w; pv3 += w; pv4 += w; } gettimeofday(&tpend,NULL); timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec; timeuse/=1000; printf("Do sampling used time=%f\n", timeuse); gettimeofday(&tpstart,NULL); delete uu; delete vv; return 1; } int main(int argc, char **argv) { unsigned char bmp[640*480*2] = {0}; unsigned char yuv[(640*480*3)/2] = {0}; struct timeval tpstart,tpend; gettimeofday(&tpstart,NULL); ConvertRGB5652YUV420SP(640, 480, bmp, yuv); gettimeofday(&tpend,NULL); float timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec; timeuse/=1000; printf("ConvertARGB2YUV420SP used time=%f\n", timeuse); return 0; } 

output to android (armv6):

 ConvertRGB5652YUV420SP begin,w=640,h=480,bmp=0xbe7314fc,yuv=0xbe7c74fc InitLookupTable used time=0.383000 Get YUV values used time=61.394001 Do sampling used time=11.918000 ConvertARGB2YUV420SP used time=74.596001 

cpu info:

 $ cat /proc/cpuinfo cat /proc/cpuinfo Processor : ARMv6-compatible processor rev 5 (v6l) BogoMIPS : 791.34 Features : swp half thumb fastmult vfp edsp java CPU implementer : 0x41 CPU architecture: 6TEJ CPU variant : 0x1 CPU part : 0xb36 CPU revision : 5 Hardware : IMAPX200 Revision : 0000 Serial : 0000000000000000 
+4
source share
2 answers

On ARMv7, use NEON. It will do the job in less than 1 ms. (VGA)

If you are stuck with ARMv6, optimize it in ARM assembly (about 8 ms for a VGA frame).

Use fixed point arithmetic instead of lookup tables. Get rid of them.

make two masks:

  • 0x001f001f: mask1
  • 0x003f003f: mask2

then load two pixels at the same time into a 32-bit register (which is much faster than 16-bit)

 @ Split two RGB565 pixels, packed into one 32-bit register ("pixel"),
 @ into their colour channels. mask1 = 0x001f001f, mask2 = 0x003f003f.
 @ Each result register holds one channel value per 16-bit half.
 and red, mask1, pixel, lsr #11   @ red:   (pixel >> 11) & mask1
 and grn, mask2, pixel, lsr #5    @ green: (pixel >> 5)  & mask2
 and blu, mask1, pixel            @ blue:  pixel & mask1

you now have three registers, each of which contains two values: one in the lower and one in the upper 16 bits.

The SMULxy instructions (signed 16-bit × 16-bit multiplies that can take either the top or the bottom halfword of each operand) will work wonders here.

Good luck.

PS: your lookup tables are not that good either. Why do they all have a length of 256? You can reduce them to 32 entries (for r and b) and 64 entries (for g), which will improve the cache hit rate. That alone will probably be enough to reach the 40 ms target without resorting to assembly. Yes, cache misses are HUGE.

+7
source

I found a faster way in Skia; it takes about 40 ms.

 #include "SkColorPriv.h"
 #include "SkBitmap.h"
 #include "SkCanvas.h"
 #include "SkStream.h"

 using namespace android;

 // taken from jcolor.c in libjpeg
 // Negative coefficients are parenthesised so the macros stay safe in any
 // expression context.
 #if 0 // 16bit - precise but slow
 #define CYR 19595      // 0.299
 #define CYG 38470      // 0.587
 #define CYB 7471       // 0.114
 #define CUR (-11059)   // -0.16874
 #define CUG (-21709)   // -0.33126
 #define CUB 32768      // 0.5
 #define CVR 32768      // 0.5
 #define CVG (-27439)   // -0.41869
 #define CVB (-5329)    // -0.08131
 #define CSHIFT 16
 #else // 8bit - fast, slightly less precise
 #define CYR 77         // 0.299
 #define CYG 150        // 0.587
 #define CYB 29         // 0.114
 #define CUR (-43)      // -0.16874
 #define CUG (-85)      // -0.33126
 #define CUB 128        // 0.5
 #define CVR 128        // 0.5
 #define CVG (-107)     // -0.41869
 #define CVB (-21)      // -0.08131
 #define CSHIFT 8
 #endif

 // Converts one 32-bit packed colour to Y, U, V and stores them in dst[0..2].
 // U and V are biased by +128.
 static void rgb2yuv_32(uint8_t dst[], SkPMColor c)
 {
     int r = SkGetPackedR32(c);
     int g = SkGetPackedG32(c);
     int b = SkGetPackedB32(c);

     int y = ( CYR*r + CYG*g + CYB*b ) >> CSHIFT;
     int u = ( CUR*r + CUG*g + CUB*b ) >> CSHIFT;
     int v = ( CVR*r + CVG*g + CVB*b ) >> CSHIFT;

     dst[0] = SkToU8(y);
     dst[1] = SkToU8(u + 128);
     dst[2] = SkToU8(v + 128);
 }

 // Same conversion as rgb2yuv_32, but each component is written only when
 // its destination pointer (py / pu / pv) is non-NULL.
 static void rgb2yuv_32_x(uint8_t *py, uint8_t *pu, uint8_t *pv, SkPMColor c)
 {
     int r = SkGetPackedR32(c);
     int g = SkGetPackedG32(c);
     int b = SkGetPackedB32(c);

     if (py != NULL) {
         int y = ( CYR*r + CYG*g + CYB*b ) >> CSHIFT;
         *py = SkToU8(y);
     }
     if (pu != NULL) {
         int u = ( CUR*r + CUG*g + CUB*b ) >> CSHIFT;
         *pu = SkToU8(u + 128);
     }
     if (pv != NULL) {
         int v = ( CVR*r + CVG*g + CVB*b ) >> CSHIFT;
         *pv = SkToU8(v + 128);
     }
 }

 // Converts one 4444 packed colour to Y, U, V in dst[0..2]. The shift is
 // reduced by 4 to scale the 4-bit channels up to the 8-bit output range.
 static void rgb2yuv_4444(uint8_t dst[], U16CPU c)
 {
     int r = SkGetPackedR4444(c);
     int g = SkGetPackedG4444(c);
     int b = SkGetPackedB4444(c);

     int y = ( CYR*r + CYG*g + CYB*b ) >> (CSHIFT - 4);
     int u = ( CUR*r + CUG*g + CUB*b ) >> (CSHIFT - 4);
     int v = ( CVR*r + CVG*g + CVB*b ) >> (CSHIFT - 4);

     dst[0] = SkToU8(y);
     dst[1] = SkToU8(u + 128);
     dst[2] = SkToU8(v + 128);
 }

 // Same conversion as rgb2yuv_4444, writing only the components whose
 // destination pointer is non-NULL.
 static void rgb2yuv_4444_x(uint8_t *py, uint8_t *pu, uint8_t *pv, U16CPU c)
 {
     int r = SkGetPackedR4444(c);
     int g = SkGetPackedG4444(c);
     int b = SkGetPackedB4444(c);

     if (py != NULL) {
         int y = ( CYR*r + CYG*g + CYB*b ) >> (CSHIFT - 4);
         *py = SkToU8(y);
     }
     if (pu != NULL) {
         int u = ( CUR*r + CUG*g + CUB*b ) >> (CSHIFT - 4);
         *pu = SkToU8(u + 128);
     }
     if (pv != NULL) {
         int v = ( CVR*r + CVG*g + CVB*b ) >> (CSHIFT - 4);
         *pv = SkToU8(v + 128);
     }
 }

 // Converts one 16-bit packed colour to Y, U, V in dst[0..2]. Red and blue
 // are doubled to match green's scale, and the shift is reduced by 2 to
 // reach the 8-bit output range.
 static void rgb2yuv_16(uint8_t dst[], U16CPU c)
 {
     int r = SkGetPackedR16(c);
     int g = SkGetPackedG16(c);
     int b = SkGetPackedB16(c);

     int y = ( 2*CYR*r + CYG*g + 2*CYB*b ) >> (CSHIFT - 2);
     int u = ( 2*CUR*r + CUG*g + 2*CUB*b ) >> (CSHIFT - 2);
     int v = ( 2*CVR*r + CVG*g + 2*CVB*b ) >> (CSHIFT - 2);

     dst[0] = SkToU8(y);
     dst[1] = SkToU8(u + 128);
     dst[2] = SkToU8(v + 128);
 }

 // Same conversion as rgb2yuv_16, writing only the components whose
 // destination pointer is non-NULL. This lets the caller skip the chroma
 // arithmetic for pixels that contribute luma only.
 static void rgb2yuv_16_x(uint8_t *py, uint8_t *pu, uint8_t *pv, U16CPU c)
 {
     int r = SkGetPackedR16(c);
     int g = SkGetPackedG16(c);
     int b = SkGetPackedB16(c);

     if (py != NULL) {
         int y = ( 2*CYR*r + CYG*g + 2*CYB*b ) >> (CSHIFT - 2);
         *py = SkToU8(y);
     }
     if (pu != NULL) {
         int u = ( 2*CUR*r + CUG*g + 2*CUB*b ) >> (CSHIFT - 2);
         *pu = SkToU8(u + 128);
     }
     if (pv != NULL) {
         int v = ( 2*CVR*r + CVG*g + 2*CVB*b ) >> (CSHIFT - 2);
         *pv = SkToU8(v + 128);
     }
 }

 // Converts an RGB565 SkBitmap to YUV420SP in dst.
 //
 // Output layout: width*height luma bytes, followed by interleaved chroma
 // byte pairs (Cb first, then Cr), one pair per 2x2 pixel block. Chroma is
 // not averaged: Cb is taken from the even-row/even-column pixel of each
 // block and Cr from the odd-row/even-column pixel below it.
 //
 // Returns 0 on success, -1 if bmp or dst is NULL, the bitmap is not
 // kRGB_565_Config, or the bitmap exposes no pixel memory.
 //
 // NOTE(review): width and height are presumably even; odd sizes receive no
 // special handling here - confirm against callers.
 int ConvertRGB5652YUV420SPBySkia(SkBitmap* bmp, unsigned char* dst)
 {
     if (!bmp || !dst || bmp->getConfig() != SkBitmap::kRGB_565_Config)
         return -1;

     const int width = bmp->width();
     const int height = bmp->height();
     const int src_rowbytes = bmp->rowBytes();
     const int stride = width;

     const uint8_t *src = (const uint8_t *)bmp->getPixels();
     if (src == NULL)
         return -1;

     // Plain pointer arithmetic: casting pointers through unsigned int
     // truncates addresses on 64-bit targets.
     uint8_t *y_base = (uint8_t *)dst;
     uint8_t *cb_base = y_base + (size_t)stride * height;

     uint8_t *cb = NULL;
     uint8_t *cr = NULL;

     for (int i = 0; i < height; i++) {
         const uint16_t *rgb = (const uint16_t *)(src + (size_t)i * src_rowbytes);
         uint8_t *y = y_base + (size_t)i * stride;
         const bool odd_row = (i & 0x1) != 0;

         if (!odd_row) {
             // Each pair of source rows shares one chroma row of `stride`
             // bytes; Cr sits one byte after Cb in every pair.
             cb = cb_base + (size_t)(i >> 1) * stride;
             cr = cb + 1;
         }

         for (int j = 0; j < width; j++) {
             if (j & 0x01) {
                 // odd column: only y
                 rgb2yuv_16_x(y++, NULL, NULL, *rgb++);
             } else if (odd_row) {
                 // odd row, even column: y and cr
                 rgb2yuv_16_x(y++, NULL, cr, *rgb++);
                 cr += 2;
             } else {
                 // even row, even column: y and cb
                 rgb2yuv_16_x(y++, cb, NULL, *rgb++);
                 cb += 2;
             }
         }
     }

     return 0;
 }
+2
source

Source: https://habr.com/ru/post/1379843/


All Articles