1、可能是最快的alpha blend算法汇编源代码(Intel官方提供)。Intel官方网站有一个名为ablend_565的快速汇编算法,理论上是把一块32bit RGBA图像渲染到16bit的buffer上。我的机器是PIII 800,函数在system memory中运行,对640*480的画面做256级alpha blending可以达到100fps,我想可以满足绝大部分的要求了。在这里,我提供了这个算法的应用示例,希望可以对大家有所帮助。ablend_565函数的源代码可以直接编译使用,无需其他库函数,感谢Intel提供这么好的东西。 首先,我提供一些本人编写的把
2、32bit tga文件读入pRGBABuffer的函数,文件尺寸保存在width和height中。/-/ Name: LoadTgaFile( TCHAR* strPathname, DWORD* pRGBABuffer, long* width, long* height )/ Desc: 读取32bit tga文件到DWORD缓冲里,返回其尺寸/ Time: 2002.06.22 00:36/ Author: RealRender/ Para:/ Return:/ Note: 这段代码来自directx 7.0 sample中的d3dtextr.cpp,我把他提取了出来/ 方便使用/-BOOL
3、 LoadTgaFile( TCHAR* strPathname, DWORD* pRGBABuffer, long* width, long* height ) FILE* file = fopen( strPathname, rb ); if( NULL = file ) return false; struct TargaHeader BYTE IDLength; BYTE ColormapType; BYTE ImageType; BYTE ColormapSpecification5; WORD XOrigin; WORD YOrigin; WORD ImageWidth; WORD
4、 ImageHeight; BYTE PixelDepth; BYTE ImageDescriptor; tga; fread( &tga, sizeof(TargaHeader), 1, file ); / Only true color, non-mapped images are supported if( ( 0 != tga.ColormapType ) | ( tga.ImageType != 10 & tga.ImageType != 2 ) ) fclose( file ); return false; / Skip the ID field. The first byte o
5、f the header is the length of this field if( tga.IDLength ) fseek( file, tga.IDLength, SEEK_CUR ); DWORD m_dwWidth = tga.ImageWidth; DWORD m_dwHeight = tga.ImageHeight; DWORD m_dwBPP = tga.PixelDepth; DWORD *m_pRGBAData = new DWORDm_dwWidth*m_dwHeight; if( m_pRGBAData = NULL ) fclose(file); return f
6、alse; for( DWORD y=0; ym_dwHeight; y+ ) DWORD dwOffset = y*m_dwWidth; if( 0 = ( tga.ImageDescriptor & 0x0010 ) ) dwOffset = (m_dwHeight-y-1)*m_dwWidth; for( DWORD x=0; xm_dwWidth; x ) if( tga.ImageType = 10 ) BYTE PacketInfo = getc( file ); WORD PacketType = 0x80 & PacketInfo; WORD PixelCount = ( 0x
7、007f & PacketInfo ) + 1; if( PacketType ) DWORD b = getc( file ); DWORD g = getc( file ); DWORD r = getc( file ); DWORD a = 0xff; if( m_dwBPP = 32 ) a = getc( file ); while( PixelCount- ) m_pRGBADatadwOffset+x = (r24L)+(g16L)+(b8L)+(a); x+; else while( PixelCount- ) BYTE b = getc( file ); BYTE g = g
8、etc( file ); BYTE r = getc( file ); BYTE a = 0xff; if( m_dwBPP = 32 ) a = getc( file ); m_pRGBADatadwOffset+x = (r24L)+(g16L)+(b8L)+(a); x+; else BYTE b = getc( file ); BYTE g = getc( file ); BYTE r = getc( file ); BYTE a = 0xff; if( m_dwBPP = 32 ) a = getc( file ); m_pRGBADatadwOffset+x = (r24L)+(g
9、16L)+(b8L)+(a); x+; fclose( file ); / Check for alpha content for( DWORD i=0; i(m_dwWidth*m_dwHeight); i+ ) if( m_pRGBADatai & 0x000000ff != 0xff ) /m_bHasAlpha = TRUE; break; *pRGBABuffer = m_pRGBAData; *width = m_dwWidth; *height = m_dwHeight; return true;把32bit buffer分割为rgb和alpha的代码。注意,分割后的pBitma
10、p一定要是8字节对齐,这是优化的一个重要条件。所以,我的算法中有如下写法:BYTE* p = new BYTE[lSize*2+8]; BYTE* pOrig = p; p += (DWORD)p%8; WORD* color = (WORD*)p; 这是不规范的写法,目的是把指针强行调整为8字节对齐(严格来说,要保证对齐应写成 p += (8 - (DWORD)p%8) % 8;原写法仅当p%8为0或4时才能得到对齐的地址)。实际使用的时候,要记住需要释放的原始指针不是p,而是pOrig。在这里,我没有释放分配的内存,请谅解。/-/ Name: SplitRGBA( DWORD* pRGBABuffer, LPBYTE* pAlpha, LPWORD* pBitmap, long lWidth, long lHeight )/ Desc:/ Time
11、: 2002.06.22 00:36/ Author: RealRender/ Para:/ Return:/ Note: 把从32bit的缓冲建立16bit的565缓冲和8bit的alpha通道/-void SplitRGBA( DWORD* pRGBABuffer, LPBYTE* pAlpha, LPWORD* pBitmap, long lWidth, long lHeight ) long lSize = lWidth*lHeight; BYTE* alpha = new BYTElSize; BYTE* p = new BYTElSize*2+8; / 强行转换为8字节对齐 p +
12、= (DWORD)p%8; WORD* color = (WORD*)p; DWORD dwPixel; DWORD r, g, b, a; for( int i = 0; i 24)&0x000000ff); g = (dwPixel16)&0x000000ff); b = (dwPixel 8)&0x000000ff); a = (dwPixel 0)&0x000000ff); alphai = a; / 888i转化为565 colori = RGBTo16( r, g, b ); *pAlpha = alpha; *pBitmap = color;/这个是intel官方提供的函数,函数
13、的描述,用我的话来说就是把一个带有256级alpha通道的565颜色数据绘制到16位目标页面。函数说明:unsigned char *lpAlpha, / 256 级alpha通道unsigned int iAlpPitch, / alpha通道的pitchunsigned char *lpSrc, / 原色彩缓冲unsigned int iSrcX, /unsigned int iSrcY, / 原色彩位置unsigned int iSrcPitch, / 原色彩pitchunsigned char *lpDst, / 目标缓冲unsigned int iDstX,unsigned int
14、iDstY, / 目标位置unsigned int iDstW,unsigned int iDstH, / 目标缓冲的尺寸unsigned int iDstPitch / 目标缓冲的pitchvoid ablend_565(unsigned char *lpAlpha,unsigned int iAlpPitch,unsigned char *lpSrc,unsigned int iSrcX, unsigned int iSrcY,unsigned int iSrcPitch, unsigned char *lpDst,unsigned int iDstX, unsigned int iDst
15、Y,unsigned int iDstW, unsigned int iDstH,unsigned int iDstPitch)/Mask for isolating the red,green, and blue componentsstatic _int64 MASKB=0x001F001F001F001F;static _int64 MASKG=0x07E007E007E007E0;static _int64 MASKSHIFTG=0x03F003F003F003F0;static _int64 MASKR=0xF800F800F800F800;/constants used by th
16、e integer alpha blending equationstatic _int64 SIXTEEN=0x0010001000100010;static _int64 FIVETWELVE=0x0200020002000200;static _int64 SIXONES=0x003F003F003F003F;unsigned char *lpLinearDstBp=(iDstX1)+(iDstY*iDstPitch)+lpDst; /base pointer for linear destinationunsigned char *lpLinearSrcBp=(iSrcX6)6;/re
17、d/i=a*sr+(31-a)*dr;/i=(i+16)+(i+16)5)5;movq mm5,edi; /g2: mm5=dst3 dst2 dst1 dst0psrlw mm1,2; /mm1=a?2 nuke out lower 2 bitsmovq mm7,MASKSHIFTG; /g3: mm7=1 bit shifted green maskpsrlw mm4,1; /g3a: move src green down by 1 so that we wont overflowmovq mm0,mm1; /mm0=00a3 00a2 00a1 00a0psrlw mm5,1; /g3
18、b: move dst green down by 1 so that we wont overflowpsrlw mm1,1; /mm1=a?1 nuke out lower 1 bitspand mm4,mm7; /g5: mm4=sg3 sg2 sg1 sg0movq mm2,SIXONES;/g4: mm2=63pand mm5,mm7; /g7: mm5=dg3 dg2 dg1 dg0movq mm3,esi; /b1: mm3=src3 src2 src1 src0psubsb mm2,mm0; /g6: mm2=63-a3 63-a2 63-a1 63-a0movq mm7,MA
19、SKB; /b2: mm7=BLUE MASKpmullw mm4,mm0; /g8: mm4=sg?*a?movq mm0,edi; /b3: mm0=dst3 dst2 dst1 dst0pmullw mm5,mm2; /g9: mm5=dg?*(1-a?)movq mm2,mm7; /b4: mm2=fiveonespand mm3,mm7; /b4: mm3=sb3 sb2 sb1 sb0pmullw mm3,mm1; /b6: mm3=sb?*a?pand mm0,mm7; /b5: mm0=db3 db2 db1 db0movq mm7,esi; /r1: mm7=src3 src
20、2 src1 src0paddw mm4,mm5; /g10: mm4=sg?*a?+dg?*(1-a?)pand mm7,MASKR; /r2: mm7=sr3 sr2 sr1 sr0psubsb mm2,mm1; /b5a: mm2=31-a3 31-a2 31-a1 31-a0paddw mm4,FIVETWELVE; /g11: mm4=(mm4+512) greenpmullw mm0,mm2; /b7: mm0=db?*(1-a?)movq mm5,mm4; /g12: mm5=mm4 greenpsrlw mm7,11; /r4: shift src red down to po
21、sition 0psrlw mm4,6; /g13: mm4=mm46paddw mm4,mm5; /g14: mm4=mm4+mm5 greenpaddw mm0,mm3; /b8: mm0=sb?*a?+db?*(1-a?)movq mm5,edi; /r3: mm5=dst3 dst2 dst1 dst0paddw mm0,SIXTEEN; /b9: mm0=(mm0+16) bluepand mm5,MASKR; /r5: mm5=dr3 dr2 dr1 dr0psrlw mm4,5; /g15: mm4=0?g0 0?g0 0?g0 0?g0 greenmovq mm3,mm0; /
22、b10: mm3=mm0 bluepsrlw mm0,5; /b11: mm0=mm05 bluepsrlw mm5,11; /r6: shift dst red down to position 0paddw mm0,mm3; /b12: mm0=mm3+mm0 bluepsrlw mm0,5; /b13: mm0=000b 000b 000b 000b bluepmullw mm7,mm1; /mm7=sr?*a?pand mm4,MASKG; /g16: mm4=00g0 00g0 00g0 00g0 greenpmullw mm5,mm2; /r7: mm5=dr?*(31-a?)po
23、r mm0,mm4; /mm0=00gb 00gb 00gb 00gbadd eax,4; /move to next 4 alphasadd esi,8; /move to next 4 pixels in srcadd edi,8; /move to next 4 pixels in dstmovd mm1,eax; /mm1=00 00 00 00 a3 a2 a1 a0paddw mm5,mm7; /r8: mm5=sr?*a?+dr?*(31-a?)paddw mm5,SIXTEEN; /r9: mm5=(mm5+16) redpxor mm2,mm2; /mm2=0;movq mm7,mm5; /r10: mm7=mm5 redpsrlw mm5,5; /r11: mm5=mm55 redmovq mm4,esi; /g1: mm4=src3 src2 src1 src0paddw mm5,mm7; /r12: mm5=mm7+mm5 redpunpcklbw mm1,mm2; /mm1=00a3 00a2 00a1 00a0psrlw mm5,5; /r13: mm5=mm55 redpsllw mm5,11; /r14: mm5=mm52 nuke out
copyright@ 2008-2022 冰豆网网站版权所有
经营许可证编号:鄂ICP备2022015515号-1