可能是最快的算法alpha blend汇编源代码.docx

资源描述

可能是最快的算法alpha blend汇编源代码.docx

《可能是最快的算法alpha blend汇编源代码.docx》由会员分享，可在线阅读，更多相关《可能是最快的算法alpha blend汇编源代码.docx（13页珍藏版）》请在冰豆网上搜索。

可能是最快的算法alpha blend汇编源代码.docx

可能是最快的算法alphablend汇编源代码

可能是最快的算法alphablend汇编源代码，Intel官方提供

Intel官方网站有一个ablend_565的快速汇编算法，理论上是把一块32bit RGBA图像渲染到16bit的buffer上。我的机器是PIII 800，函数在system memory中进行，640*480的256级alpha blending可以达到100fps，我想可以满足绝大部分的要求了。在这里，我提供了这个算法的应用，希望可以对大家有所帮助。

ablend_565函数，源代码可以直接编译使用，无需其他库函数，感谢intel提供这么好的东西。

首先，我提供一个本人编写的、把32bit tga文件读入pRGBABuffer的函数，

图像尺寸保存在width、height中。

//-----------------------------------------------------------------------

//Name:

LoadTgaFile（TCHAR*strPathname,DWORD**pRGBABuffer,long*width,long*height）

//Desc:

读取32bittga文件到DWORD缓冲里，返回其尺寸

//Time:

2002.06.2200:

//Author:

RealRender

//Para:

//Return:

//Note:

这段代码来自directx7.0sample中的d3dtextr.cpp，我把他提取了出来

//方便使用

//-----------------------------------------------------------------------

BOOLLoadTgaFile（TCHAR*strPathname,DWORD**pRGBABuffer,long*width,long*height）

{

FILE*file=fopen（strPathname,"rb"）;

if（NULL==file）

returnfalse;

structTargaHeader

{

BYTEIDLength;

BYTEColormapType;

BYTEImageType;

BYTEColormapSpecification[5];

WORDXOrigin;

WORDYOrigin;

WORDImageWidth;

WORDImageHeight;

BYTEPixelDepth;

BYTEImageDescriptor;

}tga;

fread（&tga,sizeof（TargaHeader）,1,file）;

//Onlytruecolor,non-mappedimagesaresupported

if（（0!

=tga.ColormapType）||

（tga.ImageType!

=10&&tga.ImageType!

=2））

{

fclose（file）;

returnfalse;

}

//SkiptheIDfield.Thefirstbyteoftheheaderisthelengthofthisfield

if（tga.IDLength）

fseek（file,tga.IDLength,SEEK_CUR）;

DWORDm_dwWidth=tga.ImageWidth;

DWORDm_dwHeight=tga.ImageHeight;

DWORDm_dwBPP=tga.PixelDepth;

DWORD*m_pRGBAData=newDWORD[m_dwWidth*m_dwHeight];

if（m_pRGBAData==NULL）

{

fclose（file）;

returnfalse;

}

for（DWORDy=0;y

{

DWORDdwOffset=y*m_dwWidth;

if（0==（tga.ImageDescriptor&0x0010））

dwOffset=（m_dwHeight-y-1）*m_dwWidth;

for（DWORDx=0;x

{

if（tga.ImageType==10）

{

BYTEPacketInfo=getc（file）;

WORDPacketType=0x80&PacketInfo;

WORDPixelCount=（0x007f&PacketInfo）+1;

if（PacketType）

{

DWORDb=getc（file）;

DWORDg=getc（file）;

DWORDr=getc（file）;

DWORDa=0xff;

if（m_dwBPP==32）

a=getc（file）;

while（PixelCount--）

{

m_pRGBAData[dwOffset+x]=（r<<24L）+（g<<16L）+（b<<8L）+（a）;

x++;

}

else

{

while（PixelCount--）

{

BYTEb=getc（file）;

BYTEg=getc（file）;

BYTEr=getc（file）;

BYTEa=0xff;

if（m_dwBPP==32）

a=getc（file）;

m_pRGBAData[dwOffset+x]=（r<<24L）+（g<<16L）+（b<<8L）+（a）;

x++;

}

else

{

BYTEb=getc（file）;

BYTEg=getc（file）;

BYTEr=getc（file）;

BYTEa=0xff;

if（m_dwBPP==32）

a=getc（file）;

m_pRGBAData[dwOffset+x]=（r<<24L）+（g<<16L）+（b<<8L）+（a）;

x++;

}

fclose（file）;

//Checkforalphacontent

for（DWORDi=0;i<（m_dwWidth*m_dwHeight）;i++）

{

if（m_pRGBAData[i]&0x000000ff!

=0xff）

{

//m_bHasAlpha=TRUE;

break;

}

*pRGBABuffer=m_pRGBAData;

*width=m_dwWidth;

*height=m_dwHeight;

returntrue;

}

把32bitbuffer分割为rgb和alpha的代码。

注意，分割后的pBitmap一定要是8字节对齐，这是优化的一个重要条件，所以，我的算法中：

BYTE*p=newBYTE[lSize*2+8];

BYTE*pOrig=p;

p+=（DWORD）p%8;

WORD*color=（WORD*）p;

这是不规范的写法，把指针强行调整为8字节对齐（注意：`p+=(DWORD)p%8` 只有在余数为0或4时才能得到8字节对齐的地址，严格的写法应为 `p+=(8-(DWORD)p%8)%8`）。实际使用的时候，要记住释放的原始指针不是p，而是pOrig；在这里，我没有释放分配的内存，请谅解。

//-----------------------------------------------------------------------

//Name:

SplitRGBA（DWORD*pRGBABuffer,LPBYTE*pAlpha,LPWORD*pBitmap,longlWidth,longlHeight）

//Desc:

//Time:

2002.06.2200:

//Author:

RealRender

//Para:

//Return:

//Note:

把从32bit的缓冲建立16bit的565缓冲和8bit的alpha通道

//-----------------------------------------------------------------------

voidSplitRGBA（DWORD*pRGBABuffer,LPBYTE*pAlpha,LPWORD*pBitmap,longlWidth,longlHeight）

{

longlSize=lWidth*lHeight;

BYTE*alpha=newBYTE[lSize];

BYTE*p=newBYTE[lSize*2+8];

//强行转换为8字节对齐

p+=（DWORD）p%8;

WORD*color=（WORD*）p;

DWORDdwPixel;

DWORDr,g,b,a;

for（inti=0;i

{

dwPixel=pRGBABuffer[i];

r=（（dwPixel>>24）&0x000000ff）;

g=（（dwPixel>>16）&0x000000ff）;

b=（（dwPixel>>8）&0x000000ff）;

a=（（dwPixel>>0）&0x000000ff）;

alpha[i]=a;

//888i转化为565

color[i]=RGBTo16（r,g,b）;

}

*pAlpha=alpha;

*pBitmap=color;

}

这个是intel官方提供的函数，函数的描述，用我的话来说就是把一个带有256级alpha通道的565颜色数据绘制到16位目标页面。

函数说明：

unsignedchar*lpAlpha,//256级alpha通道

unsignedintiAlpPitch,//alpha通道的pitch

unsignedchar*lpSrc,//原色彩缓冲

unsignedintiSrcX,//

unsignedintiSrcY,//原色彩位置

unsignedintiSrcPitch,//原色彩pitch

unsignedchar*lpDst,//目标缓冲

unsignedintiDstX,

unsignedintiDstY,//目标位置

unsignedintiDstW,

unsignedintiDstH,//目标缓冲的尺寸

unsignedintiDstPitch//目标缓冲的pitch

voidablend_565（unsignedchar*lpAlpha,unsignedintiAlpPitch,

unsignedchar*lpSrc,unsignedintiSrcX,unsignedintiSrcY,

unsignedintiSrcPitch,unsignedchar*lpDst,

unsignedintiDstX,unsignedintiDstY,

unsignedintiDstW,unsignedintiDstH,

unsignedintiDstPitch）

{

//Maskforisolatingthered,green,andbluecomponents

static__int64MASKB=0x001F001F001F001F;

static__int64MASKG=0x07E007E007E007E0;

static__int64MASKSHIFTG=0x03F003F003F003F0;

static__int64MASKR=0xF800F800F800F800;

//constantsusedbytheintegeralphablendingequation

static__int64SIXTEEN=0x0010001000100010;

static__int64FIVETWELVE=0x0200020002000200;

static__int64SIXONES=0x003F003F003F003F;

unsignedchar*lpLinearDstBp=（iDstX<<1）+（iDstY*iDstPitch）+lpDst;//basepointerforlineardestination

unsignedchar*lpLinearSrcBp=（iSrcX<<1）+（iSrcY*iSrcPitch）+lpSrc;//basepointerforlinearsource

unsignedchar*lpLinearAlpBp=iSrcX+（iSrcY*iAlpPitch）+lpAlpha;//basepointerforlinearalpha

_asm{

movesi,lpLinearSrcBp;//src

movedi,lpLinearDstBp;//dst

moveax,lpLinearAlpBp;//alpha

movecx,iDstH;//ecx=numberoflinestocopy

movebx,iDstW;//ebx=spanwidthtocopy

testesi,6;//checkifsourceaddressisqwordaligned

//sinceaddrcominginisalwayswordaligned（16bit）

jnzdone;//ifnotqwordalignedwedon'tdoanything

primeloop:

movdmm1,[eax];//mm1=00000000a3a2a1a0

pxormm2,mm2;//mm2=0;

movqmm4,[esi];//g1:

mm4=src3src2src1src0

punpcklbwmm1,mm2;//mm1=00a300a200a100a0

loopqword:

movedx,[eax];

testebx,0xFFFFFFFC;//checkifonly3pixelsleft

jzcheckback;//3orlesspixelsleft

//earlyouttests

cmpedx,0xffffffff;//testforalphavalueof1

jecopyback;//if1'scopythesourcepixelstothedestination

testedx,0xffffffff;//testforalphavalueof0

jzleavefront;//ifsogotothenext4pixels

//thealphablendstarts

//green

//i=a*sg+（63-a）*dg;

//i=（i+32）+（（i+32）>>6）>>6;

//red

//i=a*sr+（31-a）*dr;

//i=（i+16）+（（i+16）>>5）>>5;

movqmm5,[edi];//g2:

mm5=dst3dst2dst1dst0

psrlwmm1,2;//mm1=a?

>>2nukeoutlower2bits

movqmm7,MASKSHIFTG;//g3:

mm7=1bitshiftedgreenmask

psrlwmm4,1;//g3a:

movesrcgreendownby1sothatwewon'toverflow

movqmm0,mm1;//mm0=00a300a200a100a0

psrlwmm5,1;//g3b:

movedstgreendownby1sothatwewon'toverflow

psrlwmm1,1;//mm1=a?

>>1nukeoutlower1bits

pandmm4,mm7;//g5:

mm4=sg3sg2sg1sg0

movqmm2,SIXONES;//g4:

mm2=63

pandmm5,mm7;//g7:

mm5=dg3dg2dg1dg0

movqmm3,[esi];//b1:

mm3=src3src2src1src0

psubsbmm2,mm0;//g6:

mm2=63-a363-a263-a163-a0

movqmm7,MASKB;//b2:

mm7=BLUEMASK

pmullwmm4,mm0;//g8:

mm4=sg?

*a?

movqmm0,[edi];//b3:

mm0=dst3dst2dst1dst0

pmullwmm5,mm2;//g9:

mm5=dg?

*（1-a?

）

movqmm2,mm7;//b4:

mm2=fiveones

pandmm3,mm7;//b4:

mm3=sb3sb2sb1sb0

pmullwmm3,mm1;//b6:

mm3=sb?

*a?

pandmm0,mm7;//b5:

mm0=db3db2db1db0

movqmm7,[esi];//r1:

mm7=src3src2src1src0

paddwmm4,mm5;//g10:

mm4=sg?

*a?

+dg?

*（1-a?

）

pandmm7,MASKR;//r2:

mm7=sr3sr2sr1sr0　

psubsbmm2,mm1;//b5a:

mm2=31-a331-a231-a131-a0

paddwmm4,FIVETWELVE;//g11:

mm4=（mm4+512）green

pmullwmm0,mm2;//b7:

mm0=db?

*（1-a?

）

movqmm5,mm4;//g12:

mm5=mm4green

psrlwmm7,11;//r4:

shiftsrcreddowntoposition0

psrlwmm4,6;//g13:

mm4=mm4>>6

paddwmm4,mm5;//g14:

mm4=mm4+mm5green

paddwmm0,mm3;//b8:

mm0=sb?

*a?

+db?

*（1-a?

）

movqmm5,[edi];//r3:

mm5=dst3dst2dst1dst0

paddwmm0,SIXTEEN;//b9:

mm0=（mm0+16）blue

pandmm5,MASKR;//r5:

mm5=dr3dr2dr1dr0

psrlwmm4,5;//g15:

mm4=0?

g00?

g0green

movqmm3,mm0;//b10:

mm3=mm0blue

psrlwmm0,5;//b11:

mm0=mm0>>5blue

psrlwmm5,11;//r6:

shiftdstreddowntoposition0

paddwmm0,mm3;//b12:

mm0=mm3+mm0blue

psrlwmm0,5;//b13:

mm0=000b000b000b000bblue

pmullwmm7,mm1;//mm7=sr?

*a?

pandmm4,MASKG;//g16:

mm4=00g000g000g000g0green

pmullwmm5,mm2;//r7:

mm5=dr?

*（31-a?

）

pormm0,mm4;//mm0=00gb00gb00gb00gb

addeax,4;//movetonext4alphas

addesi,8;//movetonext4pixelsinsrc

addedi,8;//movetonext4pixelsindst

movdmm1,[eax];//mm1=00000000a3a2a1a0

paddwmm5,mm7;//r8:

mm5=sr?

*a?

+dr?

*（31-a?

）

paddwmm5,SIXTEEN;//r9:

mm5=（mm5+16）red

pxormm2,mm2;//mm2=0;

movqmm7,mm5;//r10:

mm7=mm5red

psrlwmm5,5;//r11:

mm5=mm5>>5red

movqmm4,[esi];//g1:

mm4=src3src2src1src0

paddwmm5,mm7;//r12:

mm5=mm7+mm5red

punpcklbwmm1,mm2;//mm1=00a300a200a100a0

psrlwmm5,5;//r13:

mm5=mm5>>5red

psllwmm5,11;//r14:

mm5=mm5<<10red

pormm0,mm5;//mm0=0rgb0rgb0rgb0rgb

subebx,4;//polishedoff4pixels

movq[edi-8],mm0;//dst=0rgb0rgb0rgb0rgb

jmploopqword;//gobacktostart

copyback:

movq[edi],mm4;//copysourcetodestination

leavefront:

addedi,8;//advancedestinationby4pixels

addeax,4;//advancealphaby4

addesi,8;//advancesourceby4pixels

subebx,4;//decreasepixelcountby4

jmpprimeloop;

checkback:

testebx,0xFF;//checkif0pixelsleft

jznextline;//donewiththisspan

//backalign:

//workoutbackendpixels

movqmm5,[edi];//g2:

mm5=dst3dst2dst1dst0

psrlwmm1,2;//mm1=a?

>>2nukeout

展开阅读全文