comment ~
    Copyright (C) 2008 Rouslan Dimitrov

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
~

; Angle constants (radians) used for range reduction in sincos / sincos4.
PI				equ	3.141592653589793f
TWO_PI			equ 6.283185307179586f
ONE_OVER_TWO_PI equ 0.159154943091895f
HALF_PI			equ	1.570796326794897f
; 2147483648 = 80000000h: mask selecting the sign bit of a 32-bit float.
; Used below with andps/xorps to extract or flip signs.
SIGN_BIT		equ	2147483648

; Four packed single-precision floats (16 bytes), matching one XMM register.
; The field names are used below to address individual lanes of the
; coefficient tables (e.g. c5.x for lane 0, c5.z for lane 2).
vec4	struct
x		real4	?
y		real4	?
z		real4	?
w		real4	?
vec4	ends

; Assembler setup: 32-bit flat-model code with the XMM (SSE) instruction
; set enabled, which the movss/movaps/mulps/... instructions below require.
.686
.model flat
.xmm
.data
; Read-only, paragraph (16-byte) aligned constant pool.
; Several entries are used directly as memory operands of packed SSE
; instructions (e.g. "mulps xmm2, [c5]", "andps xmm0, [c_sign_bit4]"),
; which need 16-byte aligned addresses, so the ORDER AND SIZE of the
; definitions below matter: do not insert data that breaks the 16-byte
; stride in front of any vec4 / 4-dword entry.
constants	segment	readonly para

; Polynomial coefficients, evaluated in x^2 by Horner's scheme (c5 first).
; Lanes x,y hold the sine series   (-1/11!, 1/9!, -1/7!, 1/5!, -1/3!, 1),
; lanes z,w hold the cosine series (-1/10!, 1/8!, -1/6!, 1/4!, -1/2!, 1).
; The sine lanes are multiplied by x once more after the last step.
c5	vec4	{-2.5052108e-8f, -2.5052108e-8f, -2.7557319e-7f, -2.7557319e-7f }
c4	vec4	{ 2.7557319e-6f,  2.7557319e-6f,  2.4801587e-5f,  2.4801587e-5f }
c3	vec4	{-1.9841270e-4f, -1.9841270e-4f, -1.3888889e-3f, -1.3888889e-3f }
c2	vec4	{ 8.3333333e-3f,  8.3333333e-3f,  4.1666667e-2f,  4.1666667e-2f }
c1	vec4	{-1.6666667e-1f, -1.6666667e-1f, -5.0000000e-1f, -5.0000000e-1f }
c0	vec4	{ 1.0000000e+0f,  1.0000000e+0f,  1.0000000e+0f,  1.0000000e+0f }


; Scalar constants for the single-angle routine (sincos).
; Exactly four real4 values = 16 bytes, so the packed constants that
; follow stay 16-byte aligned.
c_one_over_two_pi	real4	ONE_OVER_TWO_PI
c_pi				real4	PI
c_two_pi			real4	TWO_PI
c_half_pi			real4	HALF_PI

; Packed (4-lane) versions of the same constants for sincos4.
c_pi4				vec4	{ PI, PI, PI, PI }
c_two_pi4			vec4	{ TWO_PI, TWO_PI, TWO_PI, TWO_PI }
c_one_over_two_pi4	vec4	{ ONE_OVER_TWO_PI, ONE_OVER_TWO_PI, ONE_OVER_TWO_PI, ONE_OVER_TWO_PI }
c_half_pi4			vec4	{ HALF_PI, HALF_PI, HALF_PI, HALF_PI }
; Sign-bit mask in every lane; sincos also reads its first dword as a scalar.
c_sign_bit4			dword	4 dup (SIGN_BIT)
; Sign-bit mask for lanes 2,3 only: sincos4 uses it so that only the
; cosine lanes can be negated after range reduction.
c_cos_sign_bit4		dword	0, 0, SIGN_BIT, SIGN_BIT

constants ends

.code
; Short aliases for the MSVC-decorated C++ names the procedures are
; exported under. The decorations appear to correspond to (TODO confirm
; against the C++ header):
;   vec4 __fastcall SinCos     (float &)
;   vec4 __fastcall SinCos_360 (float &)
;   vec4 __fastcall SinCos4    (vec4 &)
;   vec4 __fastcall SinCos4_360(vec4 &)
; All four procedures read their argument through edx, write the vec4
; result through ecx and return that pointer in eax.
; NOTE: keep comments off the equate lines themselves.
sincos		equ ?SinCos@@YI?AVvec4@@AAM@Z
sincos_360	equ ?SinCos_360@@YI?AVvec4@@AAM@Z
sincos4		equ	?SinCos4@@YI?AVvec4@@AAV1@@Z
sincos4_360	equ	?SinCos4_360@@YI?AVvec4@@AAV1@@Z

;-----------------------------------------------------------------------
; sincos_360 - sine and cosine of one angle, WITHOUT range reduction.
; In:    edx -> float angle (radians)
;        ecx -> vec4 result buffer (stored with movaps: must be
;               16-byte aligned)
; Out:   [ecx] = { sin(a), cos(a), 0, 0 },  eax = ecx
; Clobb: xmm0-xmm3
; Note:  the polynomials are evaluated directly on the input, so the
;        caller is presumably expected to pass an already-reduced angle
;        (verify the intended range against the callers).
;-----------------------------------------------------------------------
sincos_360 proc near
	mov			eax, ecx				; return value = result pointer

	movss		xmm0, dword ptr [edx]	; xmm0 = a   (upper lanes zeroed)
	movss		xmm1, xmm0
	mulss		xmm1, xmm0				; xmm1 = a^2

	; Two independent Horner chains in a^2:
	;   xmm3 accumulates the cosine series (z lane of the tables),
	;   xmm2 accumulates the sine series   (x lane of the tables).
	movss		xmm3, [c5.z]
	movss		xmm2, [c5.x]
	mulss		xmm3, xmm1
	mulss		xmm2, xmm1
	addss		xmm3, [c4.z]
	addss		xmm2, [c4.x]
	mulss		xmm3, xmm1
	mulss		xmm2, xmm1
	addss		xmm3, [c3.z]
	addss		xmm2, [c3.x]
	mulss		xmm3, xmm1
	mulss		xmm2, xmm1
	addss		xmm3, [c2.z]
	addss		xmm2, [c2.x]
	mulss		xmm3, xmm1
	mulss		xmm2, xmm1
	addss		xmm3, [c1.z]
	addss		xmm2, [c1.x]
	mulss		xmm3, xmm1
	mulss		xmm2, xmm1
	addss		xmm3, [c0.x]			; + 1.0 -> cos(a)
	addss		xmm2, [c0.x]			; + 1.0
	mulss		xmm2, xmm0				; * a   -> sin(a)

	unpcklps	xmm2, xmm3				; xmm2 = { sin, cos, 0, 0 }
	movaps		[eax], xmm2
	ret
sincos_360 endp

;-----------------------------------------------------------------------
; sincos - sine and cosine of one arbitrary angle.
; In:    edx -> float angle (radians)
;        ecx -> vec4 result buffer (stored with movaps: must be
;               16-byte aligned)
; Out:   [ecx] = { sin(a), cos(a), 0, 0 },  eax = ecx
; Clobb: eax, xmm0-xmm4, xmm7
; Steps: 1) r = a - 2*pi*round(a / (2*pi))        -> r in [-pi, pi]
;        2) fold r into [-pi/2, pi/2] using sin(pi - x) = sin(x);
;           remember that cos changes sign when the fold happens
;        3) evaluate the sin/cos polynomials on the folded angle
; Note:  step 1 relies on the MXCSR rounding mode being round-to-nearest
;        (the default). Quotients that do not fit a 32-bit integer are
;        not handled.
;-----------------------------------------------------------------------
sincos proc
	; Disabled x87 alternative, kept for reference:
	;fld		dword ptr [edx]
	;fsincos
	;fstp	dword ptr [ecx]
	;fstp	dword ptr [ecx+4]
	;mov eax, ecx
	;ret
	
	movss		xmm0, dword ptr [edx]			; reduce angle to [-pi; pi]
	movss		xmm1, xmm0
	mulss		xmm1, [c_one_over_two_pi]
	; Round to NEAREST (cvtss2si), not truncate (cvttss2si): truncation
	; leaves a remainder in (-2*pi; 2*pi), for which "pi - fabs(a)" below
	; goes negative. The sign could then not be restored by the orps, so
	; sin came out with the wrong sign for negative inputs, and the
	; polynomials were evaluated far outside [-pi/2; pi/2].
	cvtss2si	eax, xmm1
	cvtsi2ss	xmm1, eax
	mulss		xmm1, [c_two_pi]
	subss		xmm0, xmm1
	
	movss		xmm2, xmm0			; map to [-pi/2; pi/2]
	andps		xmm0, [c_sign_bit4]	; sign(a)
	xorps		xmm2, xmm0			; fabs(a)
	movss		xmm1, [c_pi]
	subss		xmm1, xmm2			; pi - fabs(a)
	movss		xmm3, xmm2
	cmpnltss	xmm3, [c_half_pi]	; mask: fabs(a) >= pi/2
	movss		xmm7, [c_sign_bit4]
	andps		xmm7, xmm3			; sign bit if cos must be negated, else 0
	movss		xmm4, xmm2
	cmpltss		xmm4, [c_half_pi]	; mask: fabs(a) < pi/2
	andps		xmm3, xmm1
	andps		xmm4, xmm2
	addss		xmm3, xmm4			; (fabs(a) >= pi/2) ? pi - fabs(a) : fabs(a)
	orps		xmm0, xmm3			; re-apply sign(a)
	
	movss	xmm1, xmm0				; Taylor: xmm1 = a^2
	mulss	xmm1, xmm0
	movss	xmm2, [c5.x]			; xmm2 = sine series, xmm3 = cosine series
	movss	xmm3, [c5.z]
	mulss	xmm2, xmm1
	mulss	xmm3, xmm1
	addss	xmm2, [c4.x]
	addss	xmm3, [c4.z]
	mulss	xmm2, xmm1
	mulss	xmm3, xmm1
	addss	xmm2, [c3.x]
	addss	xmm3, [c3.z]
	mulss	xmm2, xmm1
	mulss	xmm3, xmm1
	addss	xmm2, [c2.x]
	addss	xmm3, [c2.z]
	mulss	xmm2, xmm1
	mulss	xmm3, xmm1
	addss	xmm2, [c1.x]
	addss	xmm3, [c1.z]
	mulss	xmm2, xmm1
	mulss	xmm3, xmm1
	addss	xmm2, [c0.x]
	addss	xmm3, [c0.x]
	mulss	xmm2, xmm0				; sin = a * P(a^2)
	xorps	xmm3, xmm7				; flip cosines if needed
	unpcklps	xmm2, xmm3			; xmm2 = { sin, cos, 0, 0 }
	movaps	[ecx], xmm2
	mov eax, ecx					; return the result pointer
	ret
sincos endp

;-----------------------------------------------------------------------
; sincos4_360 - packed sine/cosine, WITHOUT range reduction.
; In:    edx -> vec4 of angles in radians (16-byte aligned)
;        ecx -> vec4 result buffer        (16-byte aligned)
; Out:   [ecx] = { sin(in.x), sin(in.y), cos(in.z), cos(in.w) }
;        eax   = ecx
; Clobb: xmm0-xmm3
; Note:  lanes x,y use the sine coefficients and lanes z,w the cosine
;        coefficients, so a caller wanting sin and cos of the same two
;        angles presumably passes { a, b, a, b } (verify with callers).
;        The polynomials are evaluated directly on the input values.
;-----------------------------------------------------------------------
sincos4_360 proc near
	mov			eax, ecx			; return value = result pointer

	movaps		xmm0, [edx]			; xmm0 = angles
	movaps		xmm1, xmm0
	mulps		xmm1, xmm0			; xmm1 = angles^2

	movaps		xmm2, [c5]			; Horner evaluation in angles^2
	mulps		xmm2, xmm1
	addps		xmm2, [c4]
	mulps		xmm2, xmm1
	addps		xmm2, [c3]
	mulps		xmm2, xmm1
	addps		xmm2, [c2]
	mulps		xmm2, xmm1
	addps		xmm2, [c1]
	mulps		xmm2, xmm1

	movaps		xmm3, [c0]			; all lanes 1.0
	movlhps		xmm0, xmm3			; xmm0 = { x, y, 1, 1 }
	addps		xmm2, xmm3
	mulps		xmm2, xmm0			; sine lanes * angle, cosine lanes * 1

	movaps		[eax], xmm2
	ret
sincos4_360 endp

;-----------------------------------------------------------------------
; sincos4 - packed sine/cosine of arbitrary angles.
; In:    edx -> vec4 of angles in radians (16-byte aligned)
;        ecx -> vec4 result buffer        (16-byte aligned)
; Out:   [ecx] = { sin(in.x), sin(in.y), cos(in.z), cos(in.w) }
;        eax   = ecx
; Clobb: xmm0-xmm4, xmm7, mm1, mm2 (MMX state is cleared with emms)
; Note:  lanes x,y use the sine coefficients and lanes z,w the cosine
;        coefficients, so a caller wanting sin and cos of the same two
;        angles presumably passes { a, b, a, b } (verify with callers).
;        Range reduction uses cvtps2pi, which rounds according to MXCSR
;        (round-to-nearest by default), giving a remainder in [-pi; pi].
;-----------------------------------------------------------------------
sincos4	proc near
	movaps		xmm0, [edx]			; reduce angles to [-pi; pi]
	movaps		xmm1, xmm0
	mulps		xmm1, [c_one_over_two_pi4]
	cvtps2pi	mm1, xmm1			; round lanes 0,1 to integers ...
	cvtpi2ps	xmm1, mm1			; ... and back to float
	movhlps		xmm2, xmm1			; bring lanes 2,3 down
	cvtps2pi	mm2, xmm2			; round lanes 2,3
	cvtpi2ps	xmm2, mm2
	; The conversions above switch the FPU into MMX mode. Leave it again
	; before returning, otherwise x87 code in the caller faults / gets NaNs
	; because every x87 register is still tagged as in use.
	emms
	movlhps		xmm1, xmm2			; xmm1 = round(a / (2*pi)), all lanes
	mulps		xmm1, [c_two_pi4]
	subps		xmm0, xmm1
	
	movaps		xmm2, xmm0			; map to [-pi/2; pi/2]
	andps		xmm0, [c_sign_bit4]	; sign(a)
	xorps		xmm2, xmm0			; fabs(a)
	movaps		xmm1, [c_pi4]
	subps		xmm1, xmm2			; pi - fabs(a)
	movaps		xmm3, xmm2
	cmpnltps	xmm3, [c_half_pi4]	; mask: fabs(a) >= pi/2
	movaps		xmm7, [c_cos_sign_bit4]
	andps		xmm7, xmm3			; sign bit for cosine lanes that were folded
	movaps		xmm4, xmm2
	cmpltps		xmm4, [c_half_pi4]	; mask: fabs(a) < pi/2
	andps		xmm3, xmm1
	andps		xmm4, xmm2
	addps		xmm3, xmm4			; (fabs(a) >= pi/2) ? pi - fabs(a) : fabs(a)
	orps		xmm0, xmm3			; re-apply sign(a)
	
	movaps	xmm2, xmm0				; compute sin and cos
	mulps	xmm2, xmm0				; a^2
	movaps	xmm1, xmm2
	mulps	xmm2, [c5]				; Horner evaluation in a^2
	addps	xmm2, [c4]
	mulps	xmm2, xmm1
	addps	xmm2, [c3]
	mulps	xmm2, xmm1
	addps	xmm2, [c2]
	mulps	xmm2, xmm1
	addps	xmm2, [c1]
	mulps	xmm2, xmm1
	movaps	xmm3, [c0]
	addps	xmm2, xmm3
	movlhps	xmm0, xmm3				; xmm0 = { x, y, 1, 1 }
	mulps	xmm2, xmm0				; sine lanes * angle, cosine lanes * 1
	
	xorps	xmm2, xmm7	; flip cosines if needed
	
	movaps	[ecx], xmm2
	mov eax, ecx		; VC expects eax to point to the result,
	ret					; which is ecx due to the calling convention.
sincos4 endp

end