/*
    Copyright (C) 2008 Rouslan Dimitrov

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

// math.cpp and math.h should be compiled with SSE optimizations enabled
// Matrices are column-major

#pragma once

#include <xmmintrin.h>
#include "memalloc.h"

const unsigned int	FABS_MASK = 0x7fffffff;
const unsigned int	INV_MASK = 0x80000000;
const float			EPSILON = 0.001f;
const float			PI = 3.141592653589793f;
const float			HALF_PI = 0.5f * PI;

#define BIT(i) (1<<i)

#define CPUCAP_NOSSE	0
#define CPUCAP_SSE1		BIT(0)
#define CPUCAP_SSE2		BIT(1)
#define CPUCAP_SSE3		BIT(2)

enum {
	P_ERR = -1,
	P_FRONT,
	P_BACK,
	P_ON,
	P_CROSS
};

extern int g_cpuCaps;

inline float fabs(float f) {
	int i = *reinterpret_cast<int*>(&f);
	i &= FABS_MASK;
	return *reinterpret_cast<float*>(&i);
}

int __fastcall MathInit();
void __fastcall	MatrixVectorBatchMultiply(float *dest, float *m, float *v, int byte_size); // defined in matrix.asm
// More function declarations are at the end of this file

#define sPlane vec4

class vec4 : public CMemObj
{ 
	public: 
			vec4(){}; 
			vec4(float x);
			vec4(float x1, float y1, float z1, float w1=1.0f):x(x1),y(y1),z(z1),w(w1){}; 
	//float*	operator&();
	vec4	operator+(vec4& v);
	vec4&	operator+=(vec4& v);
	vec4	operator-(vec4& v);
	vec4&	operator-=(vec4& v);
	int		operator==(vec4& v);
	vec4	operator*(float f);
	vec4	operator*(vec4& v); // elementwise multiplication
	friend
	vec4	operator*(float f, vec4& v);
	vec4&	operator*=(float f);
	vec4	operator/(vec4& v);
	vec4&	operator=(vec4& v);
	float&	operator[](int i);
	vec4	cross3D(vec4& v);	// Clears w
	float	dot(vec4& v);
	float	magnitude();
	void	normalize();
	void	clear();
	int		equal(vec4& v, float epsilon = EPSILON);
	// Component-wise min/max, useful for bounds
	vec4	min(vec4 &v);
	void	minSelf(vec4 &v);
	vec4	max(vec4 &v);
	void	maxSelf(vec4 &v);
	// Plane related
	vec4&	normalFast();	// note only x,y,z are valid
	vec4	normal();
	float	dist();
	int		side(vec4 &p);
	int		side2(vec4 &p); // returns only FRONT and BACK (if point is ON then FRONT is returned)
	vec4	lineIntersect(vec4 &start, vec4 &end);
	int		Hash();

	union {
		struct {
			 float x;
			 float y;
			 float z;
			 float w;
		};
		__m128 dataSSE;
		float data[4];

	};
};

inline vec4 operator*(float f, vec4& v) {
	return v*f;
}

inline vec4::vec4(float x) {
	dataSSE = _mm_set1_ps(x);
}

inline vec4 vec4::operator+(vec4& v) {
	vec4 r;
	r.dataSSE = _mm_add_ps(dataSSE, v.dataSSE);
	return r;
}

inline vec4& vec4::operator+=(vec4& v) {
	dataSSE = _mm_add_ps(dataSSE, v.dataSSE);
	return *this;
}

inline vec4 vec4::operator-(vec4& v) {
	vec4 r;
 	r.dataSSE = _mm_sub_ps(dataSSE, v.dataSSE);
	return r;
}

inline vec4& vec4::operator-=(vec4& v) {
	dataSSE = _mm_sub_ps(dataSSE, v.dataSSE);
	return *this;
}

inline int vec4::operator==(vec4& v) {
	return equal(v);
}

inline vec4 vec4::operator*(float f) {
	vec4 r;
	__m128 f4 = _mm_load_ss(&f);
	f4 = _mm_shuffle_ps(f4, f4, 0x00);
	r.dataSSE = _mm_mul_ps(dataSSE, f4);
	return r;
}

inline vec4 vec4::operator*(vec4& v) {
	vec4 r;
	r.dataSSE = _mm_mul_ps(dataSSE, v.dataSSE);
	return r;
}

inline vec4 vec4::operator/(vec4& v) {
	vec4 r;
	r.dataSSE = _mm_div_ps(dataSSE, v.dataSSE);
	return r;
}

inline vec4& vec4::operator*=(float f) {
	__m128 f4 = _mm_load_ss(&f);
	f4 = _mm_shuffle_ps(f4, f4, 0x00);
	dataSSE = _mm_mul_ps(dataSSE, f4);
	return *this;
}

inline vec4& vec4::operator=(vec4& v) {
	_mm_store_ps( data, _mm_load_ps((float*)&v) );
	return *this;
}

inline float& vec4::operator[](int i) {
	return data[i];
}

// Hash function for planes simply returns D
inline int vec4::Hash() {
	return (int)w;
}

inline vec4& vec4::normalFast() {
	return *this;
}

inline vec4 vec4::normal() {
	vec4 r = *this;
	r.w = 0;
	return r;
}

inline float vec4::dist() {
	return w;
}

inline int vec4::side(vec4 &p) {
	float d = this->dot(p);
	if(fabs(d) < EPSILON)
		return P_ON;
	//int r = _mm_movemask_ps(_mm_load_ss(&d));
	unsigned int r = (*reinterpret_cast<unsigned int*>((float*)&d))>>31;
	return r;
}

inline int vec4::side2(vec4 &p) {
	float d = this->dot(p) + EPSILON;
	unsigned int r = (*reinterpret_cast<unsigned int*>((float*)&d))>>31;
	return r;
}

inline vec4 vec4::lineIntersect(vec4 &start, vec4 &end) {
	float d_s = this->dot(start);
	float d_e = this->dot(end);
	return start + (d_s)/(d_s - d_e) * (end - start);
}

inline void vec4::clear() {
	dataSSE = _mm_setzero_ps();
	//dataSSE = _mm_xor_ps(dataSSE, dataSSE);
}

inline int vec4::equal(vec4& v, float epsilon) {
	__m128 dif = _mm_sub_ps(dataSSE, v.dataSSE);
	__m128 fabs_mask4 = _mm_set1_ps(*reinterpret_cast<float*>((unsigned int*)&FABS_MASK));
	__m128 epsilon4 = _mm_set1_ps(epsilon);
	dif = _mm_and_ps(dif, fabs_mask4);
	dif = _mm_sub_ps(epsilon4, dif);

	return !_mm_movemask_ps(dif);
}

inline float vec4::dot(vec4& v) {
#if 0
	__m128 r1 = _mm_mul_ps(dataSSE, v.dataSSE);
	__m128 r2 = _mm_shuffle_ps(r1, r1, 0x4E);
	r1 = _mm_add_ps(r1, r2);
	r2 = _mm_shuffle_ps(r1, r1, 0x11);
	r1 = _mm_add_ps(r1, r2);
	return r1.m128_f32[0];
#else
	__m128 sq = _mm_mul_ps(this->dataSSE, v.dataSSE);
	return sq.m128_f32[0] + sq.m128_f32[1]
		 + sq.m128_f32[2] + sq.m128_f32[3];
#endif
}

// elementwise min & max
inline vec4 min4(vec4 &a, vec4 &b) {
	vec4 r;
	__m128 mask = _mm_cmplt_ps(a.dataSSE, b.dataSSE);
	r.dataSSE	= _mm_and_ps(mask, a.dataSSE);
	__m128 t	= _mm_andnot_ps(mask, b.dataSSE);
	r.dataSSE	= _mm_or_ps(r.dataSSE, t);
	return r;
}

inline vec4 max4(vec4 &a, vec4 &b) {
	vec4 r;
	__m128 mask = _mm_cmpgt_ps(a.dataSSE, b.dataSSE);
	r.dataSSE	= _mm_and_ps(mask, a.dataSSE);
	__m128 t	= _mm_andnot_ps(mask, b.dataSSE);
	r.dataSSE	= _mm_or_ps(r.dataSSE, t);
	return r;
}

inline vec4 vec4::min(vec4 &v) {
	return min4(*this, v);
}

inline void vec4::minSelf(vec4 &v) {
	*this = this->min(v);
}

inline vec4 vec4::max(vec4 &v) {
	return max4(*this, v);
}

inline void vec4::maxSelf(vec4 &v) {
	*this = this->max(v);
}


class mat4 : public CMemObj
{
public:
	float*			operator&();				
	vec4			operator*(vec4& v);
	mat4&			operator=(float* m);
	mat4&			operator=(mat4& m);
	float&			operator[](int i);
	void			VectorBatchMultiply(vec4 *v, const int num);
	void			MultiplySelf(mat4 &m);
	void			TransposeSelf();
	void			InverseSelf();
	mat4			InverseModelView();

	union
	{
		float data[16];
		__m128 dataSSE[4];
	};
};

inline float* mat4::operator&() {
	return data;
}

inline float& mat4::operator[](int i) {
	return data[i];
}

inline void mat4::VectorBatchMultiply(vec4 *v, const int num) {
	MatrixVectorBatchMultiply(v->data, data, v->data, num<<4);
}

inline void mat4::MultiplySelf(mat4 &m) {
	MatrixVectorBatchMultiply(data, data, m.data, 64);
}

inline mat4& mat4::operator=(float* m) {
	_mm_store_ps(&data[0], _mm_load_ps(&m[0]));
	_mm_store_ps(&data[4], _mm_load_ps(&m[4]));
	_mm_store_ps(&data[8], _mm_load_ps(&m[8]));
	_mm_store_ps(&data[12], _mm_load_ps(&m[12]));
	
	return *this;
}

inline mat4& mat4::operator=(mat4& m) {
	*this = m.data;	
	return *this;
}

// SinCos performs ( sin(angle), cos(angle), ..., ... )
vec4 __fastcall SinCos(float& angle);  // defined in sincos.asm
// much faster version for small angles
vec4 __fastcall SinCos_360(float& angle);

// SinCos4 performs ( sin(angles.x), sin(angles.y), cos(angles.z), cos(angles.w) )
vec4 __fastcall SinCos4(vec4& angles);
vec4 __fastcall SinCos4_360(vec4& angles);

// Slightly faster than SinCos (legacy, but only 4 lines!)
#define SinCosFPU(s, c, a)	\
	__asm fld	a			\
	__asm fsincos			\
	__asm fstp	[s]			\
	__asm fstp	[c]

inline vec4 fabs4(vec4 *v) {
	__m128 mask = _mm_set1_ps(*reinterpret_cast<float*>((unsigned int*)&FABS_MASK));
	vec4 r;
	r.dataSSE = _mm_and_ps(v->dataSSE, mask);
	return r;
}

inline vec4 reverse4(vec4 *v) {
	__m128 mask = _mm_set1_ps(*reinterpret_cast<float*>((unsigned int*)&INV_MASK));
	vec4 r;
	r.dataSSE = _mm_xor_ps(v->dataSSE, mask);
	return r;
}

#if 0
inline int abs(int x) {
	int s = x>>31;
	return x^s - s;
}
#endif