It's erasing 307200 bytes (76800 dwords); is the problem with how you're using memset?
Anyway, this would probably beat the library function.
__asm{
    ; Fill 640*480 = 307200 bytes starting at pvBmpBits with the byte 0xFE,
    ; using non-temporal 16-byte stores.
    ; Requirements: SSE2. pvBmpBits must be 16-byte aligned, because movntdq
    ; raises a fault on an unaligned memory operand.
    ; Clobbers: eax, ecx, xmm0, flags.

    mov eax, pvBmpBits              ; eax = write cursor

    ; Build the fill pattern in xmm0 without touching memory.
    pcmpeqw xmm0, xmm0              ; every word = 0xFFFF
    psllw xmm0, 1                   ; every word = 0xFFFE (signed -2)
    packsswb xmm0, xmm0             ; signed-saturating pack: every byte = 0xFE

    mov ecx, 640*480/128            ; 2400 iterations, 128 bytes stored per pass

lbl_loop:
    ; Eight 16-byte streaming stores = 128 bytes per iteration.
    movntdq 0[eax], xmm0
    movntdq 16[eax], xmm0
    movntdq 32[eax], xmm0
    movntdq 48[eax], xmm0
    movntdq 64[eax], xmm0
    movntdq 80[eax], xmm0
    movntdq 96[eax], xmm0
    movntdq 112[eax], xmm0
    add eax, 128                    ; advance cursor past the bytes just written
    dec ecx
    jnz lbl_loop                    ; loop until all 2400 chunks are done

    ; Non-temporal stores are weakly ordered and may still be sitting in
    ; write-combining buffers here. Fence so the whole fill is globally
    ; visible before any later store, e.g. a flag telling another thread
    ; that the buffer is ready.
    sfence
}
It could be done on a separate thread because we don't need the surface again until the next dd->Lock call.