本文共 5232 字,大约阅读时间需要 17 分钟。
转自:http://blog.csdn.net/xiaoyi247/article/details/7913360
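1.SAD即绝对误差和(sum of absolute differences),应用非常广泛,是一种测量两个图像块差异的最简单的办法。顾名思义,其公式为:$$\mathrm{SAD}(A,B)=\sum_{i=0}^{H-1}\sum_{j=0}^{W-1}\left|A(i,j)-B(i,j)\right|$$ 其中 A、B 为两个大小同为 W×H 的图像块,A(i,j)、B(i,j) 为对应位置的像素值。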
2.为什么要优化SAD
SAD在视频编解码中用处非常广泛,
1. intra预测中选择最佳划分,最佳预测方向
2. inter预测中选择最佳划分,运动搜索中对搜索结果的评判。
3. 作为rdo公式的失真项
那为什么要优化sad呢?试想一下,每个宏块在进行intra预测的时候,会有16x16、8x8、4x4三类划分,4x4和8x8分别都有9种预测模式,每一种预测模式都要计算其sad。如果把每一种分块的模式都走完,sad的调用次数将是非常可观的。实际上sad或其类似算法satd、ssd是整个编码算法中调用次数最多的算法。
3.SIMD中的SAD
还好在intel的多媒体指令中,提供了sad的专用指令psadbw,可见sad的重要性。该指令既可以作用于64位的MMX寄存器(由SSE/MMXEXT引入),也可以作用于128位的XMM寄存器(由SSE2引入)。一条指令可以做8个像素(MMX寄存器)或16个像素(XMM寄存器)的sad,大大提高了sad的运算效率。
4.SAD在x264中的实现代码可以在sad-a.asm中找到,下面对一些代码进行简单注释
;=============================================================================
; SAD MMX
;
; Syntax: NASM with x264's x86inc macro layer (cglobal, RET, INIT_XMM, SWAP,
; r0..r3, m0..m7 are provided by that layer, not defined in this listing).
;
; Register roles, following the C prototype
;     int x264_pixel_sad_WxH(uint8_t *, int, uint8_t *, int)
; and the "4 arguments" declared on each cglobal line:
;     r0 = pointer into the first pixel block     r1 = byte stride of r0
;     r2 = pointer into the second pixel block    r3 = byte stride of r2
; Every SAD_INC_* macro accumulates into mm0/m0 and advances r0 and r2.
;=============================================================================

; SAD of two rows of 16 pixels; the result is added to mm0.
; Each 16-pixel row is handled as two 8-byte halves (offsets +0 and +8).
; Clobbers mm1-mm4; advances r0 and r2 by two rows.
%macro SAD_INC_2x16P 0
    movq    mm1,    [r0]            ; row 0, pixels 0..7
    movq    mm2,    [r0+8]          ; row 0, pixels 8..15
    movq    mm3,    [r0+r1]         ; row 1, pixels 0..7
    movq    mm4,    [r0+r1+8]       ; row 1, pixels 8..15
    psadbw  mm1,    [r2]
    psadbw  mm2,    [r2+8]
    psadbw  mm3,    [r2+r3]
    psadbw  mm4,    [r2+r3+8]
    lea     r0,     [r0+2*r1]       ; r0 += 2 rows
    paddw   mm1,    mm2
    paddw   mm3,    mm4
    lea     r2,     [r2+2*r3]       ; r2 += 2 rows
    paddw   mm0,    mm1
    paddw   mm0,    mm3
%endmacro

; SAD of two rows of 8 pixels; the result is added to mm0.
; Clobbers mm1-mm2; advances r0 and r2 by two rows.
%macro SAD_INC_2x8P 0
    movq    mm1,    [r0]            ; 8 pixels of the current r0 row -> mm1
    movq    mm2,    [r0+r1]         ; 8 pixels of the next r0 row -> mm2
    psadbw  mm1,    [r2]            ; SAD against the current r2 row; sum lands in the low 16 bits of mm1
    psadbw  mm2,    [r2+r3]         ; SAD against the next r2 row; sum lands in mm2
    lea     r0,     [r0+2*r1]       ; r0 += 2 rows
    paddw   mm0,    mm1             ; accumulate both partial sums into mm0
    paddw   mm0,    mm2
    lea     r2,     [r2+2*r3]       ; r2 += 2 rows
%endmacro

; SAD of two rows of 4 pixels; the result is added to mm0.
; The two 4-pixel rows are packed into one 64-bit register so that a single
; psadbw covers both rows. Clobbers mm1-mm2; advances r0 and r2 by two rows.
%macro SAD_INC_2x4P 0
    movd        mm1,    [r0]        ; 4 pixels from r0 -> low 32 bits of mm1
    movd        mm2,    [r2]        ; 4 pixels from r2 -> low 32 bits of mm2
    punpckldq   mm1,    [r0+r1]     ; next r0 row -> high 32 bits of mm1
    punpckldq   mm2,    [r2+r3]     ; next r2 row -> high 32 bits of mm2
    psadbw      mm1,    mm2         ; byte-wise SAD; sum lands in the low 16 bits of mm1
    paddw       mm0,    mm1         ; accumulate into mm0
    lea         r0,     [r0+2*r1]   ; step both pointers to the next row pair
    lea         r2,     [r2+2*r3]
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; SAD width, height
; Emits the function x264_pixel_sad_<width>x<height>_mmxext.
;   %1 = block width  (selects the SAD_INC_2x<width>P helper: 16, 8 or 4)
;   %2 = block height (must be even: rows are consumed two at a time)
; The function returns the accumulated sum from mm0 in eax.
%macro SAD 2
cglobal x264_pixel_sad_%1x%2_mmxext, 4,4
    pxor    mm0,    mm0             ; clear the accumulator
%rep %2/2                           ; each helper call handles two rows, hence /2
    SAD_INC_2x%1P
%endrep
    movd    eax,    mm0
    RET
%endmacro

; Instantiate the MMX SAD functions: 16x16 16x8 8x16 8x8 8x4 4x8 4x4.
SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
SAD  8,  4
SAD  4,  8
SAD  4,  4

;=============================================================================
; SAD XMM
;=============================================================================

; Common epilogue for the XMM versions: psadbw on a 128-bit register leaves
; one partial sum in each 64-bit half, so fold the high half of m0 onto the
; low half and return the total in eax. Clobbers m1.
%macro SAD_END_SSE2 0
    movhlps m1,     m0              ; m1.low64 = m0.high64
    paddw   m0,     m1
    movd    eax,    m0
    RET
%endmacro

; SAD_W16 suffix
; Emits x264_pixel_sad_16x16_<suffix> and x264_pixel_sad_16x8_<suffix>.
;   %1 = function-name suffix naming the instruction-set variant
;        (sse2, sse3, sse2_aligned; see the instantiations after the macro).
; Rows of the r2 block are fetched with "movdqu", which the instantiation
; site may %define to another load instruction. Rows of the r0 block are used
; directly as psadbw memory operands.
; NOTE(review): a 128-bit psadbw memory operand must be 16-byte aligned, so
; r0 and r1 presumably keep every r0 row 16-byte aligned; confirm at callers.
%macro SAD_W16 1
;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; 16x16 SAD, fully unrolled. Loads from r2 are issued ahead of the psadbw
; that consumes them; the instruction order below is kept exactly as written.
cglobal x264_pixel_sad_16x16_%1, 4,4,8  ; 4 arguments, 4 GPRs, 8 XMM registers
    movdqu  m0,     [r2]            ; r2 rows 0 and 1 -> m0, m1
    movdqu  m1,     [r2+r3]
    lea     r2,     [r2+2*r3]
    movdqu  m2,     [r2]            ; r2 rows 2 and 3 -> m2, m3
    movdqu  m3,     [r2+r3]
    lea     r2,     [r2+2*r3]
    psadbw  m0,     [r0]            ; m0 = SAD(row 0)
    psadbw  m1,     [r0+r1]         ; m1 = SAD(row 1)
    lea     r0,     [r0+2*r1]       ; r0 -> row 2
    movdqu  m4,     [r2]            ; load r2 row 4
    paddw   m0,     m1
    psadbw  m2,     [r0]            ; m2 = SAD(row 2)
    psadbw  m3,     [r0+r1]         ; m3 = SAD(row 3)
    lea     r0,     [r0+2*r1]       ; r0 -> row 4
    movdqu  m5,     [r2+r3]         ; load r2 row 5
    lea     r2,     [r2+2*r3]       ; r2 -> row 6
    paddw   m2,     m3
    movdqu  m6,     [r2]            ; load r2 rows 6 and 7
    movdqu  m7,     [r2+r3]
    lea     r2,     [r2+2*r3]       ; r2 -> row 8
    paddw   m0,     m2
    psadbw  m4,     [r0]            ; rows 4 and 5
    psadbw  m5,     [r0+r1]
    lea     r0,     [r0+2*r1]       ; r0 -> row 6
    movdqu  m1,     [r2]            ; load r2 row 8
    paddw   m4,     m5
    psadbw  m6,     [r0]            ; rows 6 and 7
    psadbw  m7,     [r0+r1]
    lea     r0,     [r0+2*r1]       ; r0 -> row 8
    movdqu  m2,     [r2+r3]         ; load r2 row 9
    lea     r2,     [r2+2*r3]       ; r2 -> row 10
    paddw   m6,     m7
    movdqu  m3,     [r2]            ; load r2 row 10
    paddw   m0,     m4
    movdqu  m4,     [r2+r3]         ; load r2 row 11
    lea     r2,     [r2+2*r3]       ; r2 -> row 12
    paddw   m0,     m6
    psadbw  m1,     [r0]            ; rows 8 and 9
    psadbw  m2,     [r0+r1]
    lea     r0,     [r0+2*r1]       ; r0 -> row 10
    movdqu  m5,     [r2]            ; load r2 row 12
    paddw   m1,     m2
    psadbw  m3,     [r0]            ; rows 10 and 11
    psadbw  m4,     [r0+r1]
    lea     r0,     [r0+2*r1]       ; r0 -> row 12
    movdqu  m6,     [r2+r3]         ; load r2 row 13
    lea     r2,     [r2+2*r3]       ; r2 -> row 14
    paddw   m3,     m4
    movdqu  m7,     [r2]            ; load r2 row 14
    paddw   m0,     m1
    movdqu  m1,     [r2+r3]         ; load r2 row 15
    paddw   m0,     m3
    psadbw  m5,     [r0]            ; rows 12 and 13
    psadbw  m6,     [r0+r1]
    lea     r0,     [r0+2*r1]       ; r0 -> row 14
    paddw   m5,     m6
    psadbw  m7,     [r0]            ; rows 14 and 15
    psadbw  m1,     [r0+r1]
    paddw   m7,     m1
    paddw   m0,     m5
    paddw   m0,     m7
    SAD_END_SSE2

;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; 16x8 SAD: two groups of four rows each, summed into m0.
cglobal x264_pixel_sad_16x8_%1, 4,4
    ; rows 0..3
    movdqu  m0,     [r2]
    movdqu  m2,     [r2+r3]
    lea     r2,     [r2+2*r3]
    movdqu  m3,     [r2]
    movdqu  m4,     [r2+r3]
    psadbw  m0,     [r0]
    psadbw  m2,     [r0+r1]
    lea     r0,     [r0+2*r1]
    psadbw  m3,     [r0]
    psadbw  m4,     [r0+r1]
    lea     r0,     [r0+2*r1]
    lea     r2,     [r2+2*r3]
    paddw   m0,     m2
    paddw   m3,     m4
    paddw   m0,     m3
    ; rows 4..7 (m1 takes the place of m0 so the running sum in m0 survives)
    movdqu  m1,     [r2]
    movdqu  m2,     [r2+r3]
    lea     r2,     [r2+2*r3]
    movdqu  m3,     [r2]
    movdqu  m4,     [r2+r3]
    psadbw  m1,     [r0]
    psadbw  m2,     [r0+r1]
    lea     r0,     [r0+2*r1]
    psadbw  m3,     [r0]
    psadbw  m4,     [r0+r1]
    lea     r0,     [r0+2*r1]
    lea     r2,     [r2+2*r3]
    paddw   m1,     m2
    paddw   m3,     m4
    paddw   m0,     m1
    paddw   m0,     m3
    SAD_END_SSE2
%endmacro

; Instantiate the 16-wide functions three times. The same macro body is
; reused; only the instruction behind the "movdqu" token changes.
INIT_XMM
SAD_W16 sse2                        ; plain movdqu loads
%define movdqu lddqu
SAD_W16 sse3                        ; movdqu now expands to lddqu
%define movdqu movdqa
SAD_W16 sse2_aligned                ; movdqu now expands to movdqa
%undef movdqu

; SAD_INC_4x8P_SSE accumulate
; SAD of 4 rows of 8 pixels (the "4x8" in the name is rows x width).
; Rows n and n+2 share one XMM register (movq fills the low half, movhps the
; high half), so two psadbw instructions cover all four rows.
;   %1 = 0: start a new sum (m1's result becomes the accumulator via SWAP)
;   %1 != 0: add to the running sum already in m0
; Clobbers m1-m4; advances r0 and r2 by four rows.
%macro SAD_INC_4x8P_SSE 1
    movq    m1,     [r0]            ; r0 row 0 -> m1.low
    movq    m2,     [r0+r1]         ; r0 row 1 -> m2.low
    lea     r0,     [r0+2*r1]
    movq    m3,     [r2]            ; r2 row 0 -> m3.low
    movq    m4,     [r2+r3]         ; r2 row 1 -> m4.low
    lea     r2,     [r2+2*r3]
    movhps  m1,     [r0]            ; r0 row 2 -> m1.high
    movhps  m2,     [r0+r1]         ; r0 row 3 -> m2.high
    movhps  m3,     [r2]            ; r2 row 2 -> m3.high
    movhps  m4,     [r2+r3]         ; r2 row 3 -> m4.high
    lea     r0,     [r0+2*r1]
    psadbw  m1,     m3              ; rows 0 and 2
    psadbw  m2,     m4              ; rows 1 and 3
    lea     r2,     [r2+2*r3]
%if %1
    paddw   m0,     m1
%else
    SWAP    m0,     m1              ; NOTE(review): SWAP is defined outside this listing; presumably a register rename, not a move
%endif
    paddw   m0,     m2
%endmacro