comment ~
    Copyright (C) 2008 Rouslan Dimitrov

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
~

; Assembler setup (MASM): accept the P6 instruction set, use the 32-bit
; flat memory model, enable the SSE instructions (movaps, movntps,
; prefetchnta) and open the code segment.
.686
.model flat
.xmm
.code

; Both the source and the destination pointer must be 16-byte aligned:
; the copy uses movaps/movntps, which fault on unaligned addresses.
; NOTE: According to MSDN, a __fastcall callee should preserve ecx,
; but that does not seem to be the case in VS2005.

; Give the procedure the alternate, __fastcall-decorated name
; @MemCopy@12 (MemCopy, 12 bytes of arguments).
alias <@MemCopy@12> = <memcpy>

;-----------------------------------------------------------------------
; MemCopy(dst, src, byteCount)  -- linked as @MemCopy@12
; ABI:   32-bit __fastcall; the callee pops its single stack argument
;        (ret 4).  Relies on DF = 0 for rep movsb, as the calling
;        convention is expected to guarantee.
; In:    ecx          = destination pointer (16-byte aligned)
;        edx          = source pointer      (16-byte aligned)
;        [esp+4]      = byteCount (offset as seen at entry)
; Out:   no result is produced; eax is always 0 on return because the
;        line counter has run down (TODO confirm the C prototype
;        declares the routine as returning void).
; Clobb: eax, ecx, xmm0-xmm3, flags.  esi/edi are saved by "uses".
; Notes: Copies upwards in three phases: whole 64-byte lines, then
;        16-byte blocks, then single bytes.  The 64- and 16-byte
;        phases write with non-temporal stores (movntps).  Those
;        stores are weakly ordered, so an sfence is issued before
;        returning; without it a flag written by the caller after
;        the copy could become visible to another processor before
;        the copied data does.
;-----------------------------------------------------------------------
memcpy proc uses esi edi
		; "uses" has pushed esi and edi, so byteCount now sits at
		; [esp+12]: two saved registers plus the return address.
		mov		esi, edx				; esi = source cursor
		mov		edi, ecx				; edi = destination cursor
		prefetchnta	[esi]				; non-temporal prefetch hint for the first source line

		mov		eax, [esp+12]			; eax = byteCount
		mov		ecx, eax
		and		ecx, 63					; ecx = bytes left after the whole 64-byte lines
		shr		eax, 6					; eax = number of whole 64-byte lines
		jz		$tail

$lines:									; 64 byte granularity, eax = lines remaining (> 0)
		prefetchnta	[esi + 64]			; hint for the next line; a prefetch does not fault,
										; so running one line past the end is harmless

		movaps	xmm0, [esi+0]			; load one cache line
		movaps	xmm1, [esi+16]
		movaps	xmm2, [esi+32]
		movaps	xmm3, [esi+48]

		movntps	[edi+0], xmm0			; store it, bypassing the caches
		movntps	[edi+16], xmm1
		movntps	[edi+32], xmm2
		movntps	[edi+48], xmm3

		add		esi, 64
		add		edi, 64
		dec		eax
		jnz		$lines

$tail:									; ecx = 0..63 bytes still to copy
		test	ecx, ecx
		jz		$done

$tail16:								; 16 byte granularity
		test	ecx, -16				; fewer than 16 bytes left?
		jz		$tail1

		movaps	xmm0, [esi]
		movntps	[edi], xmm0

		add		esi, 16
		add		edi, 16
		sub		ecx, 16
		jnz		$tail16
		; ecx == 0 here: fall through, rep movsb then copies nothing

$tail1:									; 1 byte granularity, ecx = 0..15
		rep	movsb

$done:
		sfence							; make the non-temporal stores globally visible
										; before any store the caller issues afterwards
		ret		4
memcpy endp

end