可能是最快的算法alpha blend汇编源代码.docx
《可能是最快的算法alpha blend汇编源代码.docx》由会员分享,可在线阅读,更多相关《可能是最快的算法alpha blend汇编源代码.docx(13页珍藏版)》请在冰豆网上搜索。
可能是最快的算法alphablend汇编源代码
可能是最快的算法alphablend汇编源代码,Intel官方提供
Intel官方网站有一个ablend_565的快速汇编算法,理论上是把一块32bit RGBA图像渲染到16bit的buffer上。我的机器是PIII 800,函数在system memory中进行,640*480的256级alpha blending可以达到100fps,我想可以满足绝大部分的要求了。在这里,我提供了这个算法的应用,希望可以对大家有所帮助。
ablend_565函数,源代码可以直接编译使用,无需其他库函数,感谢intel提供这么好的东西。
首先,我提供一些本人编写的把32bittga文件读入pRGBABuffer的函数
文件尺寸保存在width,height
//-----------------------------------------------------------------------
//Name:
LoadTgaFile(TCHAR*strPathname,DWORD**pRGBABuffer,long*width,long*height)
//Desc:
读取32bittga文件到DWORD缓冲里,返回其尺寸
//Time:
2002.06.2200:
36
//Author:
RealRender
//Para:
//Return:
//Note:
这段代码来自directx7.0sample中的d3dtextr.cpp,我把他提取了出来
//方便使用
//-----------------------------------------------------------------------
BOOLLoadTgaFile(TCHAR*strPathname,DWORD**pRGBABuffer,long*width,long*height)
{
FILE*file=fopen(strPathname,"rb");
if(NULL==file)
returnfalse;
structTargaHeader
{
BYTEIDLength;
BYTEColormapType;
BYTEImageType;
BYTEColormapSpecification[5];
WORDXOrigin;
WORDYOrigin;
WORDImageWidth;
WORDImageHeight;
BYTEPixelDepth;
BYTEImageDescriptor;
}tga;
fread(&tga,sizeof(TargaHeader),1,file);
//Onlytruecolor,non-mappedimagesaresupported
if((0!
=tga.ColormapType)||
(tga.ImageType!
=10&&tga.ImageType!
=2))
{
fclose(file);
returnfalse;
}
//SkiptheIDfield.Thefirstbyteoftheheaderisthelengthofthisfield
if(tga.IDLength)
fseek(file,tga.IDLength,SEEK_CUR);
DWORDm_dwWidth=tga.ImageWidth;
DWORDm_dwHeight=tga.ImageHeight;
DWORDm_dwBPP=tga.PixelDepth;
DWORD*m_pRGBAData=newDWORD[m_dwWidth*m_dwHeight];
if(m_pRGBAData==NULL)
{
fclose(file);
returnfalse;
}
for(DWORDy=0;y{
DWORDdwOffset=y*m_dwWidth;
if(0==(tga.ImageDescriptor&0x0010))
dwOffset=(m_dwHeight-y-1)*m_dwWidth;
for(DWORDx=0;x{
if(tga.ImageType==10)
{
BYTEPacketInfo=getc(file);
WORDPacketType=0x80&PacketInfo;
WORDPixelCount=(0x007f&PacketInfo)+1;
if(PacketType)
{
DWORDb=getc(file);
DWORDg=getc(file);
DWORDr=getc(file);
DWORDa=0xff;
if(m_dwBPP==32)
a=getc(file);
while(PixelCount--)
{
m_pRGBAData[dwOffset+x]=(r<<24L)+(g<<16L)+(b<<8L)+(a);
x++;
}
}
else
{
while(PixelCount--)
{
BYTEb=getc(file);
BYTEg=getc(file);
BYTEr=getc(file);
BYTEa=0xff;
if(m_dwBPP==32)
a=getc(file);
m_pRGBAData[dwOffset+x]=(r<<24L)+(g<<16L)+(b<<8L)+(a);
x++;
}
}
}
else
{
BYTEb=getc(file);
BYTEg=getc(file);
BYTEr=getc(file);
BYTEa=0xff;
if(m_dwBPP==32)
a=getc(file);
m_pRGBAData[dwOffset+x]=(r<<24L)+(g<<16L)+(b<<8L)+(a);
x++;
}
}
}
fclose(file);
//Checkforalphacontent
for(DWORDi=0;i<(m_dwWidth*m_dwHeight);i++)
{
if(m_pRGBAData[i]&0x000000ff!
=0xff)
{
//m_bHasAlpha=TRUE;
break;
}
}
*pRGBABuffer=m_pRGBAData;
*width=m_dwWidth;
*height=m_dwHeight;
returntrue;
}
把32bitbuffer分割为rgb和alpha的代码。
注意,分割后的pBitmap一定要是8字节对齐,这是优化的一个重要条件,所以,我的算法中:
BYTE*p=newBYTE[lSize*2+8];
BYTE*pOrig=p;
p+=(DWORD)p%8;
WORD*color=(WORD*)p;
这是不规范的写法,目的是把指针强行调整为8字节对齐。注意:p+=(DWORD)p%8 只有在余数为0或4时才能得到8字节对齐的地址,严格的写法应该是 p+=(8-(DWORD)p%8)%8。实际使用的时候,要记住释放的原始指针不是p,而是pOrig。在这里,我没有释放分配的内存,请谅解。
//-----------------------------------------------------------------------
//Name:
SplitRGBA(DWORD*pRGBABuffer,LPBYTE*pAlpha,LPWORD*pBitmap,longlWidth,longlHeight)
//Desc:
//Time:
2002.06.2200:
36
//Author:
RealRender
//Para:
//Return:
//Note:
把从32bit的缓冲建立16bit的565缓冲和8bit的alpha通道
//-----------------------------------------------------------------------
voidSplitRGBA(DWORD*pRGBABuffer,LPBYTE*pAlpha,LPWORD*pBitmap,longlWidth,longlHeight)
{
longlSize=lWidth*lHeight;
BYTE*alpha=newBYTE[lSize];
BYTE*p=newBYTE[lSize*2+8];
//强行转换为8字节对齐
p+=(DWORD)p%8;
WORD*color=(WORD*)p;
DWORDdwPixel;
DWORDr,g,b,a;
for(inti=0;i{
dwPixel=pRGBABuffer[i];
r=((dwPixel>>24)&0x000000ff);
g=((dwPixel>>16)&0x000000ff);
b=((dwPixel>>8)&0x000000ff);
a=((dwPixel>>0)&0x000000ff);
alpha[i]=a;
//888i转化为565
color[i]=RGBTo16(r,g,b);
}
*pAlpha=alpha;
*pBitmap=color;
}
//
这个是Intel官方提供的函数。函数的描述,用我的话来说,就是把一个带有256级alpha通道的565颜色数据绘制到16位目标页面。
函数说明:
unsigned char* lpAlpha,    // 256级alpha通道
unsigned int iAlpPitch,    // alpha通道的pitch
unsigned char* lpSrc,      // 源色彩缓冲
unsigned int iSrcX,        //
unsigned int iSrcY,        // 源色彩位置
unsigned int iSrcPitch,    // 源色彩pitch
unsigned char* lpDst,      // 目标缓冲
unsigned int iDstX,
unsigned int iDstY,        // 目标位置
unsigned int iDstW,
unsigned int iDstH,        // 目标缓冲的尺寸
unsigned int iDstPitch     // 目标缓冲的pitch
voidablend_565(unsignedchar*lpAlpha,unsignedintiAlpPitch,
unsignedchar*lpSrc,unsignedintiSrcX,unsignedintiSrcY,
unsignedintiSrcPitch,unsignedchar*lpDst,
unsignedintiDstX,unsignedintiDstY,
unsignedintiDstW,unsignedintiDstH,
unsignedintiDstPitch)
{
//Maskforisolatingthered,green,andbluecomponents
static__int64MASKB=0x001F001F001F001F;
static__int64MASKG=0x07E007E007E007E0;
static__int64MASKSHIFTG=0x03F003F003F003F0;
static__int64MASKR=0xF800F800F800F800;
//constantsusedbytheintegeralphablendingequation
static__int64SIXTEEN=0x0010001000100010;
static__int64FIVETWELVE=0x0200020002000200;
static__int64SIXONES=0x003F003F003F003F;
unsignedchar*lpLinearDstBp=(iDstX<<1)+(iDstY*iDstPitch)+lpDst;//basepointerforlineardestination
unsignedchar*lpLinearSrcBp=(iSrcX<<1)+(iSrcY*iSrcPitch)+lpSrc;//basepointerforlinearsource
unsignedchar*lpLinearAlpBp=iSrcX+(iSrcY*iAlpPitch)+lpAlpha;//basepointerforlinearalpha
_asm{
movesi,lpLinearSrcBp;//src
movedi,lpLinearDstBp;//dst
moveax,lpLinearAlpBp;//alpha
movecx,iDstH;//ecx=numberoflinestocopy
movebx,iDstW;//ebx=spanwidthtocopy
testesi,6;//checkifsourceaddressisqwordaligned
//sinceaddrcominginisalwayswordaligned(16bit)
jnzdone;//ifnotqwordalignedwedon'tdoanything
primeloop:
movdmm1,[eax];//mm1=00000000a3a2a1a0
pxormm2,mm2;//mm2=0;
movqmm4,[esi];//g1:
mm4=src3src2src1src0
punpcklbwmm1,mm2;//mm1=00a300a200a100a0
loopqword:
movedx,[eax];
testebx,0xFFFFFFFC;//checkifonly3pixelsleft
jzcheckback;//3orlesspixelsleft
//earlyouttests
cmpedx,0xffffffff;//testforalphavalueof1
jecopyback;//if1'scopythesourcepixelstothedestination
testedx,0xffffffff;//testforalphavalueof0
jzleavefront;//ifsogotothenext4pixels
//thealphablendstarts
//green
//i=a*sg+(63-a)*dg;
//i=(i+32)+((i+32)>>6)>>6;
//red
//i=a*sr+(31-a)*dr;
//i=(i+16)+((i+16)>>5)>>5;
movqmm5,[edi];//g2:
mm5=dst3dst2dst1dst0
psrlwmm1,2;//mm1=a?
>>2nukeoutlower2bits
movqmm7,MASKSHIFTG;//g3:
mm7=1bitshiftedgreenmask
psrlwmm4,1;//g3a:
movesrcgreendownby1sothatwewon'toverflow
movqmm0,mm1;//mm0=00a300a200a100a0
psrlwmm5,1;//g3b:
movedstgreendownby1sothatwewon'toverflow
psrlwmm1,1;//mm1=a?
>>1nukeoutlower1bits
pandmm4,mm7;//g5:
mm4=sg3sg2sg1sg0
movqmm2,SIXONES;//g4:
mm2=63
pandmm5,mm7;//g7:
mm5=dg3dg2dg1dg0
movqmm3,[esi];//b1:
mm3=src3src2src1src0
psubsbmm2,mm0;//g6:
mm2=63-a363-a263-a163-a0
movqmm7,MASKB;//b2:
mm7=BLUEMASK
pmullwmm4,mm0;//g8:
mm4=sg?
*a?
movqmm0,[edi];//b3:
mm0=dst3dst2dst1dst0
pmullwmm5,mm2;//g9:
mm5=dg?
*(1-a?
)
movqmm2,mm7;//b4:
mm2=fiveones
pandmm3,mm7;//b4:
mm3=sb3sb2sb1sb0
pmullwmm3,mm1;//b6:
mm3=sb?
*a?
pandmm0,mm7;//b5:
mm0=db3db2db1db0
movqmm7,[esi];//r1:
mm7=src3src2src1src0
paddwmm4,mm5;//g10:
mm4=sg?
*a?
+dg?
*(1-a?
)
pandmm7,MASKR;//r2:
mm7=sr3sr2sr1sr0
psubsbmm2,mm1;//b5a:
mm2=31-a331-a231-a131-a0
paddwmm4,FIVETWELVE;//g11:
mm4=(mm4+512)green
pmullwmm0,mm2;//b7:
mm0=db?
*(1-a?
)
movqmm5,mm4;//g12:
mm5=mm4green
psrlwmm7,11;//r4:
shiftsrcreddowntoposition0
psrlwmm4,6;//g13:
mm4=mm4>>6
paddwmm4,mm5;//g14:
mm4=mm4+mm5green
paddwmm0,mm3;//b8:
mm0=sb?
*a?
+db?
*(1-a?
)
movqmm5,[edi];//r3:
mm5=dst3dst2dst1dst0
paddwmm0,SIXTEEN;//b9:
mm0=(mm0+16)blue
pandmm5,MASKR;//r5:
mm5=dr3dr2dr1dr0
psrlwmm4,5;//g15:
mm4=0?
g00?
g00?
g00?
g0green
movqmm3,mm0;//b10:
mm3=mm0blue
psrlwmm0,5;//b11:
mm0=mm0>>5blue
psrlwmm5,11;//r6:
shiftdstreddowntoposition0
paddwmm0,mm3;//b12:
mm0=mm3+mm0blue
psrlwmm0,5;//b13:
mm0=000b000b000b000bblue
pmullwmm7,mm1;//mm7=sr?
*a?
pandmm4,MASKG;//g16:
mm4=00g000g000g000g0green
pmullwmm5,mm2;//r7:
mm5=dr?
*(31-a?
)
pormm0,mm4;//mm0=00gb00gb00gb00gb
addeax,4;//movetonext4alphas
addesi,8;//movetonext4pixelsinsrc
addedi,8;//movetonext4pixelsindst
movdmm1,[eax];//mm1=00000000a3a2a1a0
paddwmm5,mm7;//r8:
mm5=sr?
*a?
+dr?
*(31-a?
)
paddwmm5,SIXTEEN;//r9:
mm5=(mm5+16)red
pxormm2,mm2;//mm2=0;
movqmm7,mm5;//r10:
mm7=mm5red
psrlwmm5,5;//r11:
mm5=mm5>>5red
movqmm4,[esi];//g1:
mm4=src3src2src1src0
paddwmm5,mm7;//r12:
mm5=mm7+mm5red
punpcklbwmm1,mm2;//mm1=00a300a200a100a0
psrlwmm5,5;//r13:
mm5=mm5>>5red
psllwmm5,11;//r14:
mm5=mm5<<10red
pormm0,mm5;//mm0=0rgb0rgb0rgb0rgb
subebx,4;//polishedoff4pixels
movq[edi-8],mm0;//dst=0rgb0rgb0rgb0rgb
jmploopqword;//gobacktostart
copyback:
movq[edi],mm4;//copysourcetodestination
leavefront:
addedi,8;//advancedestinationby4pixels
addeax,4;//advancealphaby4
addesi,8;//advancesourceby4pixels
subebx,4;//decreasepixelcountby4
jmpprimeloop;
checkback:
testebx,0xFF;//checkif0pixelsleft
jznextline;//donewiththisspan
//backalign:
//workoutbackendpixels
movqmm5,[edi];//g2:
mm5=dst3dst2dst1dst0
psrlwmm1,2;//mm1=a?
>>2nukeout