initial

2025-06-04 03:22:50 +02:00
parent f234f23848
commit f12416cffd
14243 changed files with 6446499 additions and 26 deletions
--- a/public/mathlib/IceKey.H
+++ b/public/mathlib/IceKey.H
@@ -0,0 +1,66 @@
+// Purpose: Header file for the C++ ICE encryption class.
+//			Taken from public domain code, as written by Matthew Kwan - July 1996
+//			http://www.darkside.com.au/ice/
+
+#ifndef _IceKey_H
+#define _IceKey_H
+
+/*
+The IceKey class is used for encrypting and decrypting 64-bit blocks of data 
+with the ICE (Information Concealment Engine) encryption algorithm. 
+
+The constructor creates a new IceKey object that can be used to encrypt and decrypt data. 
+The level of encryption determines the size of the key, and hence its speed. 
+Level 0 uses the Thin-ICE variant, which is an 8-round cipher taking an 8-byte key. 
+This is the fastest option, and is generally considered to be at least as secure as DES, 
+although it is not yet certain whether it is as secure as its key size. 
+
+For levels n greater than zero, a 16n-round cipher is used, taking 8n-byte keys. 
+Although not as fast as level 0, these are very very secure. 
+
+Before an IceKey can be used to encrypt data, its key schedule must be set with the set() member function. 
+The length of the key required is determined by the level, as described above. 
+
+The member functions encrypt() and decrypt() encrypt and decrypt respectively data 
+in blocks of eight characters, using the specified key. 
+
+Two functions keySize() and blockSize() are provided 
+which return the key and block size respectively, measured in bytes. 
+The key size is determined by the level, while the block size is always 8. 
+
+The destructor zeroes out and frees up all memory associated with the key. 
+*/
+
+class IceSubkey;
+
+class IceKey {
+    public:
+	IceKey (int n);		// n is the encryption level described in the notes above
+	~IceKey ();		// per the notes above: zeroes out and frees the key memory
+	// Set the key schedule from 'key'; required before encrypt()/decrypt().
+	void		set (const unsigned char *key);
+	// Encrypt one block, reading 'plaintext' and writing 'ciphertext'.
+	void		encrypt (const unsigned char *plaintext,
+					unsigned char *ciphertext) const;
+	// Decrypt one block, reading 'ciphertext' and writing 'plaintext'.
+	void		decrypt (const unsigned char *ciphertext,
+					unsigned char *plaintext) const;
+	// Key size in bytes; determined by the level.
+	int		keySize () const;
+	// Block size in bytes; always 8 according to the notes above.
+	int		blockSize () const;
+
+    private:
+	void		scheduleBuild (unsigned short *k, int n,	// NOTE(review): defined elsewhere; presumably
+							const int *keyrot);	// builds part of the key schedule - confirm
+
+	int		_size;		// NOTE(review): presumably derived from the level - confirm in the .cpp
+	int		_rounds;	// NOTE(review): presumably the number of cipher rounds - confirm
+	IceSubkey	*_keysched;	// key schedule storage; IceSubkey is only forward-declared here
+};
+
+// Valve-written routine to decode a buffer of nSize bytes with pKey (NOTE(review): presumably in place - confirm)
+void DecodeICE( unsigned char *pBuffer, int nSize, const unsigned char *pKey );
+
+
+#endif // IceKey_H
--- a/public/mathlib/aabb.h
+++ b/public/mathlib/aabb.h
@@ -0,0 +1,218 @@
+//========= Copyright © Valve Corporation, All rights reserved. ============//
+#ifndef MATHLIB_AABB_HDR
+#define MATHLIB_AABB_HDR
+
+#include "mathlib/vector4d.h"
+#include "mathlib/vector.h"
+#include "mathlib/mathlib.h"
+
+
+/// Axis-aligned 3d bounding box.
+schema struct AABB_t
+{
+public:
+	Vector				m_vMinBounds; 	// minimum corner
+	Vector				m_vMaxBounds;	// maximum corner
+
+	FORCEINLINE AABB_t() {}	// does not assign the bounds
+
+	FORCEINLINE AABB_t( const Vector &vMins, const Vector &vMaxs )
+	{
+		m_vMinBounds = vMins;
+		m_vMaxBounds = vMaxs;
+	}
+	/// midpoint between the min and max corners
+	FORCEINLINE Vector GetCenter() const { return ( m_vMaxBounds + m_vMinBounds ) / 2.0f; }
+
+	/// radius of bounding sphere centered at GetCenter()
+	FORCEINLINE float GetBoundingRadius( void ) const
+	{
+		return ( m_vMinBounds - GetCenter() ).Length();
+	}
+
+	/// surface area of the box, as computed by BoxSurfaceArea()
+	FORCEINLINE float GetSurfaceArea( void ) const { return BoxSurfaceArea( m_vMinBounds, m_vMaxBounds ); }
+
+	/// Calculate the volume. Does not contain special handling for inside-out volumes - if an odd
+	/// number of axes are inside-out, this will return a negative volume, but if an even number,
+	/// it will return a positive one.
+	FORCEINLINE float GetVolume( void ) const { return ComputeVolume( m_vMinBounds, m_vMaxBounds ); }
+
+	/// distance from the box to vPoint, as computed by CalcDistanceToAABB()
+	FORCEINLINE float GetMinDistToPoint( const Vector &vPoint ) const
+	{
+		return CalcDistanceToAABB( m_vMinBounds, m_vMaxBounds, vPoint );
+	}
+	/// largest of the per-axis gaps between vPoint and the box; 0 when the point is inside or on it
+	FORCEINLINE float GetMinAxialDistanceToPoint( const Vector &vPoint ) const
+	{
+		float flXInterval = MAX( 0, MAX( vPoint.x - m_vMaxBounds.x, m_vMinBounds.x - vPoint.x ) );
+		float flYInterval = MAX( 0, MAX( vPoint.y - m_vMaxBounds.y, m_vMinBounds.y - vPoint.y ) );
+		float flZInterval = MAX( 0, MAX( vPoint.z - m_vMaxBounds.z, m_vMinBounds.z - vPoint.z ) );
+		return MAX( flXInterval, MAX( flYInterval, flZInterval ) );
+	}
+
+	/// expand the aabb_t to contain a point
+	FORCEINLINE void operator |=( const Vector &vPoint )
+	{
+		AddPointToBounds( vPoint, m_vMinBounds, m_vMaxBounds );
+	}
+
+	/// expand the bounds to enclose another aabb_t
+	FORCEINLINE void operator |=( const AABB_t &other )
+	{
+		VectorMin( other.m_vMinBounds, m_vMinBounds, m_vMinBounds );
+		VectorMax( other.m_vMaxBounds, m_vMaxBounds, m_vMaxBounds );
+	}
+
+	/// set the bounds to the intersection of this and another aabb_t (inside-out if they do not overlap)
+	FORCEINLINE void operator &=( const AABB_t &other )
+	{
+		VectorMax( other.m_vMinBounds, m_vMinBounds, m_vMinBounds );
+		VectorMin( other.m_vMaxBounds, m_vMaxBounds, m_vMaxBounds );
+	}
+	/// write the six face planes to pPlanes[0..5] as ( n.x, n.y, n.z, d ), n.p + d = 0 on the face, n pointing out of the box
+	void CreatePlanesFrom( Vector4D *pPlanes ) const
+	{
+		// X
+		pPlanes[0] = Vector4D( 1, 0, 0, -m_vMaxBounds.x  );
+		pPlanes[1] = Vector4D( -1, 0, 0, m_vMinBounds.x );
+
+		// Y
+		pPlanes[2] = Vector4D( 0, 1, 0, -m_vMaxBounds.y );
+		pPlanes[3] = Vector4D( 0, -1, 0, m_vMinBounds.y );
+
+		// Z
+		pPlanes[4] = Vector4D( 0, 0, 1, -m_vMaxBounds.z );
+		pPlanes[5] = Vector4D( 0, 0, -1, m_vMinBounds.z );
+	}
+
+	/// Set the aabb to be invalid (max < min )
+	void MakeInvalid( void )
+	{
+		m_vMinBounds.Init( FLT_MAX, FLT_MAX, FLT_MAX );
+		m_vMaxBounds.Init( -FLT_MAX, -FLT_MAX, -FLT_MAX );
+	}
+
+	// Returns if the bounds are invalid (min > max on any axis), this is different
+	// than empty, the bounds are still considered valid if min == max.
+	FORCEINLINE bool IsInvalid() const
+	{
+		return ( ( m_vMinBounds.x > m_vMaxBounds.x ) ||
+				 ( m_vMinBounds.y > m_vMaxBounds.y ) ||
+				 ( m_vMinBounds.z > m_vMaxBounds.z ) );
+	}
+
+	/// Return if the bounding box has either 0 or negative volume (i.e. if min >= max for any
+	/// coord ). Note that this treats bounds set to a single point as empty.
+	FORCEINLINE bool IsEmpty( void ) const
+	{
+		return (
+			( m_vMinBounds.x >= m_vMaxBounds.x ) ||
+			( m_vMinBounds.y >= m_vMaxBounds.y ) ||
+			( m_vMinBounds.z >= m_vMaxBounds.z )
+			);
+	}
+
+	/// true if the intersection with bBox is not empty per IsEmpty(); boxes that merely touch do not overlap
+	FORCEINLINE bool Overlaps( AABB_t bBox ) const
+	{
+		bBox &= *this;
+		return (! bBox.IsEmpty() );
+	}
+	/// true if vPnt lies inside the box or on its boundary
+	FORCEINLINE bool ContainsPoint( Vector const &vPnt ) const
+	{
+		return (
+			( vPnt.x >= m_vMinBounds.x ) &&
+			( vPnt.y >= m_vMinBounds.y ) &&
+			( vPnt.z >= m_vMinBounds.z ) &&
+			( vPnt.x <= m_vMaxBounds.x ) &&
+			( vPnt.y <= m_vMaxBounds.y ) &&
+			( vPnt.z <= m_vMaxBounds.z )
+			);
+	}
+
+	/// true if box lies entirely within this box (shared faces are allowed)
+	FORCEINLINE bool Contains( const AABB_t &box ) const
+	{
+		return (
+			( box.m_vMinBounds.x >= m_vMinBounds.x ) &&
+			( box.m_vMinBounds.y >= m_vMinBounds.y ) &&
+			( box.m_vMinBounds.z >= m_vMinBounds.z ) &&
+			( box.m_vMaxBounds.x <= m_vMaxBounds.x ) &&
+			( box.m_vMaxBounds.y <= m_vMaxBounds.y ) &&
+			( box.m_vMaxBounds.z <= m_vMaxBounds.z )
+		);
+	}
+
+	/// set the aabb_t to a zero volume point in space.
+	FORCEINLINE void SetToPoint( Vector const &vPnt )
+	{
+		m_vMinBounds = vPnt;
+		m_vMaxBounds = vPnt;
+	}
+	/// smallest component of ( max - min ), via Vector::SmallestComponentValue()
+	FORCEINLINE float LengthOfSmallestDimension() const
+	{
+		Vector vDelta = m_vMaxBounds - m_vMinBounds;
+		return vDelta.SmallestComponentValue();
+	}
+	/// extent of the box along each axis ( max - min )
+	FORCEINLINE const Vector GetSize() const
+	{
+		return m_vMaxBounds - m_vMinBounds;
+	}
+	/// grow about the center until each axis is at least vMinSize; never shrinks (assumes VectorMax is per-component)
+	FORCEINLINE void EnsureMinSize( const Vector &vMinSize )
+	{
+		Vector vHalfExpand = VectorMax( Vector( 0,0,0 ), vMinSize - GetSize() ) * 0.5f;
+		m_vMaxBounds += vHalfExpand;
+		m_vMinBounds -= vHalfExpand;
+	}
+	/// translate the box by vDelta
+	FORCEINLINE void Move( const Vector &vDelta )
+	{
+		m_vMinBounds += vDelta;
+		m_vMaxBounds += vDelta;
+	}
+	/// move every face outward by flRadius (a negative value pulls them inward)
+	FORCEINLINE void Expand( float flRadius )
+	{
+		m_vMinBounds -= Vector( flRadius, flRadius, flRadius );
+		m_vMaxBounds += Vector( flRadius, flRadius, flRadius );
+	}
+};
+
+/// Build a new box by calling Snap() on each corner of aabb with the same flSnap.
+inline const AABB_t Snap( const AABB_t& aabb, float flSnap )
+{
+	return AABB_t( Snap( aabb.m_vMinBounds, flSnap ), Snap( aabb.m_vMaxBounds, flSnap ) );
+}
+
+/// Combine two boxes: VectorMin of the min corners and VectorMax of the max corners.
+inline AABB_t operator+( const AABB_t& aabb1, const AABB_t& aabb2 )
+{
+	return AABB_t( VectorMin( aabb1.m_vMinBounds, aabb2.m_vMinBounds ),
+				   VectorMax( aabb1.m_vMaxBounds, aabb2.m_vMaxBounds ) );
+}
+
+FORCEINLINE void TransformAABB( const matrix3x4_t &matTransform, AABB_t const &boundsIn, AABB_t *pBoundsOut )
+{
+	// Forward to the min/max overload, writing straight into *pBoundsOut (which must not be null).
+	TransformAABB( matTransform, boundsIn.m_vMinBounds, boundsIn.m_vMaxBounds, pBoundsOut->m_vMinBounds, pBoundsOut->m_vMaxBounds );
+}
+
+
+/// Bounding box of the first nCount points of pPos. With nCount <= 0 the loop
+/// never runs and the box is returned in its MakeInvalid() state.
+inline AABB_t GetAabb( const VectorAligned *pPos, int nCount )
+{
+	AABB_t aabb;
+	aabb.MakeInvalid();
+	for ( int i = 0; i < nCount; ++i )
+		aabb |= pPos[ i ];
+	return aabb;
+}
+
+#endif
--- a/public/mathlib/anorms.h
+++ b/public/mathlib/anorms.h
@@ -0,0 +1,25 @@
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=============================================================================//
+
+#ifndef ANORMS_H
+#define ANORMS_H
+#ifdef _WIN32
+#pragma once
+#endif
+
+
+#include "mathlib/vector.h"
+
+
+#define NUMVERTEXNORMALS	162	// number of entries in g_anorms[]
+// the angle between consecutive g_anorms[] vectors is ~14.55 degrees;
+// the inner cone angle is half of that (7.275 degrees), passed through DEG2RAD
+#define VERTEXNORMAL_CONE_INNER_ANGLE	DEG2RAD(7.275)
+
+extern Vector g_anorms[NUMVERTEXNORMALS];
+
+
+#endif // ANORMS_H
--- a/public/mathlib/beziercurve.h
+++ b/public/mathlib/beziercurve.h
@@ -0,0 +1,757 @@
+//====== Copyright © 1996-2009, Valve Corporation, All rights reserved. =======
+//
+// A template implementation of a bezier curve class and associated helper
+// functions.
+//
+//=============================================================================
+
+#ifndef BEZIERCURVE_H
+#define BEZIERCURVE_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+
+const float ONE_THIRD  = 1.0f / 3.0f;	// used below to place interior control points along a straight
+const float TWO_THIRDS = 2.0f / 3.0f;	// segment, at 1/3 and 2/3 of the way between its end points
+
+//-----------------------------------------------------------------------------
+// Generic order N Bezier curve evaluation. Evaluates the bezier curve at the
+// specified 0 to 1 parameter and returns the result.
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE, int ORDER > 
+struct BezierEvaluateImpl
+{
+	// Generic fallback: evaluates a curve of any order by repeated linear 
+	// interpolation (de Casteljau). It is general rather than efficient, so 
+	// any order that is used frequently should get its own specialization.
+	//   pControlPoints - the ORDER + 1 control points; left unmodified
+	//   t              - curve parameter, nominally 0 to 1
+	//   result         - receives the evaluated point
+	static void BezierEvaluate( const POINT_TYPE *pControlPoints, float t, POINT_TYPE &result )
+	{
+		// Work on a scratch copy so the caller's control points are preserved.
+		POINT_TYPE scratch[ ORDER + 1 ];
+		for ( int iPoint = 0; iPoint <= ORDER; ++iPoint ) 
+		{
+			scratch[ iPoint ] = pControlPoints[ iPoint ];
+		}
+
+		// Each pass blends neighbours in place; after ORDER passes only scratch[ 0 ] is left.
+		const float flOneMinusT = 1.0f - t;
+		for ( int nLast = ORDER - 1; nLast >= 0; --nLast ) 
+		{
+			for ( int j = 0; j <= nLast; ++j ) 
+			{
+				scratch[ j ] = ( flOneMinusT * scratch[ j ] ) + ( t * scratch[ j + 1 ] );
+			}
+		}
+		result = scratch[ 0 ];
+	}
+};
+
+
+//-----------------------------------------------------------------------------
+// Partial specialization for linear evaluation.
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE >
+struct BezierEvaluateImpl< POINT_TYPE, 1 >
+{
+	// Order 1: a straight blend from points[ 0 ] (at t = 0) to points[ 1 ] (at t = 1).
+	static void BezierEvaluate( const POINT_TYPE *points, float t, POINT_TYPE &result )
+	{
+		result = ( ( 1.0f - t ) * points[ 0 ] ) + ( t * points[ 1 ] );
+	}
+};
+
+
+//-----------------------------------------------------------------------------
+// Partial specialization for quadratic bezier curve evaluation
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE >
+struct BezierEvaluateImpl< POINT_TYPE, 2 >
+{
+	// Order 2: the weights are the Bernstein terms u^2, 2ut and t^2, where u = 1 - t.
+	static void BezierEvaluate( const POINT_TYPE *points, float t, POINT_TYPE &result )
+	{
+		const float flU = 1.0f - t;
+		const float flMid = 2.0f * flU * t;
+		result = ( ( flU * flU ) * points[ 0 ] ) + ( flMid * points[ 1 ] ) + ( ( t * t ) * points[ 2 ] );
+	}
+};
+
+//-----------------------------------------------------------------------------
+// Partial specialization for cubic Bezier curve evaluation.
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE >
+struct BezierEvaluateImpl< POINT_TYPE, 3 >
+{
+	// Order 3: the weights are the Bernstein terms u^3, 3u^2t, 3ut^2 and t^3,
+	// where u = 1 - t.
+	static void BezierEvaluate( const POINT_TYPE *points, float t, POINT_TYPE &result )
+	{
+		const float flU = 1.0f - t;
+		const float flUU = flU * flU;
+		const float flTT = t * t;
+		result = ( ( flUU * flU ) * points[ 0 ] ) + ( ( 3.0f * flUU * t ) * points[ 1 ] ) + ( ( 3.0f * flU * flTT ) * points[ 2 ] ) + ( ( flTT * t ) * points[ 3 ] );
+	}
+};
+
+
+//-----------------------------------------------------------------------------
+// Evaluate the order ORDER bezier curve given by pControlPoints at t. Goes via 
+// BezierEvaluateImpl so that specific orders can be partially specialized.
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE, int ORDER > 
+void BezierEvaluate( const POINT_TYPE *pControlPoints, float t, POINT_TYPE &result )
+{
+	typedef BezierEvaluateImpl< POINT_TYPE, ORDER > Evaluator;
+	Evaluator::BezierEvaluate( pControlPoints, t, result );
+}
+
+
+//-----------------------------------------------------------------------------
+// Generic order N Bezier curve tangent evaluation. Evaluates the derivative of 
+// the Bezier curve at the specified 0 to 1 parameter and returns the result.
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE, int ORDER > 
+struct BezierTangentImpl
+{
+	// The derivative of an order N curve is the order N - 1 curve whose control 
+	// points are N times the differences of consecutive control points.
+	static void BezierTangent( const POINT_TYPE *pControlPoints, float t, POINT_TYPE &result )
+	{
+		POINT_TYPE deltas[ ORDER ];
+		for ( int k = 1; k <= ORDER; ++k )
+			deltas[ k - 1 ] = ( pControlPoints[ k ] - pControlPoints[ k - 1 ] ) * static_cast< float >( ORDER );
+		BezierEvaluate< POINT_TYPE, ORDER - 1 >( deltas, t, result );
+	}
+};
+
+
+//-----------------------------------------------------------------------------
+// Partial specialization for linear Bezier curve tangent evaluation
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE >
+struct BezierTangentImpl< POINT_TYPE, 1 >
+{
+	// The tangent of a straight segment is constant, so t is intentionally unused.
+	static void BezierTangent( const POINT_TYPE *pControlPoints, float t, POINT_TYPE &result )
+	{
+		result = ( pControlPoints[ 1 ] - pControlPoints[ 0 ] );
+	}
+};
+
+
+//-----------------------------------------------------------------------------
+// Partial specialization for quadratic Bezier curve tangent evaluation
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE >
+struct BezierTangentImpl< POINT_TYPE, 2 >
+{
+	// B'(t) = 2 * ( P0 - 2 P1 + P2 ) * t + ( -2 P0 + 2 P1 )
+	static void BezierTangent( const POINT_TYPE *pControlPoints, float t, POINT_TYPE &result )
+	{
+		POINT_TYPE vQuadratic = pControlPoints[ 0 ] + ( -2.0f * pControlPoints[ 1 ] ) + pControlPoints[ 2 ];
+		result = ( 2.0f * vQuadratic * t ) + ( ( -2.0f * pControlPoints[ 0 ] ) + ( 2.0f * pControlPoints[ 1 ] ) );
+	}
+};
+
+
+//-----------------------------------------------------------------------------
+// Partial specialization for cubic Bezier curve tangent evaluation
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE >
+struct BezierTangentImpl< POINT_TYPE, 3 >
+{
+	// B'(t) = 3 a t^2 + 2 b t + c, where a, b and c are the cubic, quadratic and linear factors of the curve.
+	static void BezierTangent( const POINT_TYPE *pControlPoints, float t, POINT_TYPE &result )
+	{
+		POINT_TYPE vCubic = (-1.0f * pControlPoints[ 0 ] ) + ( 3.0f * pControlPoints[ 1 ] ) + (-3.0f * pControlPoints[ 2 ] ) + pControlPoints[ 3 ];
+		POINT_TYPE vQuadratic = ( 3.0f * pControlPoints[ 0 ] ) + (-6.0f * pControlPoints[ 1 ] ) + ( 3.0f * pControlPoints[ 2 ] );
+		result = ( 3.0f * vCubic * t * t ) + ( 2.0f * vQuadratic * t ) + ( (-3.0f * pControlPoints[ 0 ] ) + ( 3.0f * pControlPoints[ 1 ] ) );
+	}
+};
+
+
+//-----------------------------------------------------------------------------
+// Evaluate the derivative of the bezier curve to get its tangent at the 
+// parameter t. Goes via BezierTangentImpl so orders can be specialized.
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE, int ORDER > 
+void BezierTangent( const POINT_TYPE *pControlPoints, float t, POINT_TYPE &result )
+{
+	typedef BezierTangentImpl< POINT_TYPE, ORDER > TangentEvaluator;
+	TangentEvaluator::BezierTangent( pControlPoints, t, result );
+}
+
+
+
+//-----------------------------------------------------------------------------
+// The CBezierCurve represents an order N bezier curve defined by control 
+// points of an arbitrary dimension. The class has template parameters for both 
+// the order ( ORDER ) and the point type ( POINT_TYPE ). In general the point
+// type is expected to be vector, vector2d, or vector4d, but may work with other
+// types if the appropriate operators are provided. 
+// The curve stores ORDER + 1 control points by value.
+//-----------------------------------------------------------------------------
+template < class POINT_TYPE, int ORDER >
+class CBezierCurve
+{
+protected:
+	// Number of control points needed for a curve of this order
+	static const int NUM_POINTS = ORDER + 1;
+
+public:
+
+	// Default constructor, performs no initialization
+	CBezierCurve() {}
+
+	// Copy constructor
+	CBezierCurve( const CBezierCurve &source );
+
+	// Array constructor, initialize the bezier from an array of control points 
+	explicit CBezierCurve( const POINT_TYPE controlPoints[ NUM_POINTS ] );
+
+	// Set all of the control points of the curve
+	void SetControlPoints( const POINT_TYPE controlPoints[ NUM_POINTS ] );
+
+	// Evaluate the curve at specified 0 to 1 parameter, returning the point on the curve
+	void Evaluate( float flParam, POINT_TYPE &point ) const;
+
+	// Compute the tangent vector at the specified parameter of the curve
+	void ComputeTangent( float flParam, POINT_TYPE &tangent ) const;
+
+	// Get the specified control point (index is not range checked)
+	const POINT_TYPE &ControlPoint( int index ) const { return m_ControlPoints[ index ]; }
+
+protected:
+	// Control points, in order from the start of the curve to the end
+	POINT_TYPE m_ControlPoints[ NUM_POINTS ];
+};
+
+
+//-----------------------------------------------------------------------------
+// Copy constructor, copies each control point (raw arrays cannot be assigned)
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE, int ORDER > 
+CBezierCurve< POINT_TYPE, ORDER >::CBezierCurve( const CBezierCurve &source )
+{
+	SetControlPoints( source.m_ControlPoints );
+}
+
+
+//-----------------------------------------------------------------------------
+// Array constructor, copies NUM_POINTS control points via SetControlPoints
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE, int ORDER > 
+CBezierCurve< POINT_TYPE, ORDER >::CBezierCurve( const POINT_TYPE controlPoints[ NUM_POINTS ] )
+{
+	SetControlPoints( controlPoints );
+}
+
+//-----------------------------------------------------------------------------
+// Replace every control point with the matching entry of controlPoints
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE, int ORDER > 
+void CBezierCurve< POINT_TYPE, ORDER >::SetControlPoints( const POINT_TYPE controlPoints[ NUM_POINTS ] )
+{
+	const POINT_TYPE *pSource = controlPoints;
+	POINT_TYPE *pDest = m_ControlPoints;
+	for ( int nRemaining = NUM_POINTS; nRemaining > 0; --nRemaining )
+		*pDest++ = *pSource++;
+}
+
+
+//-----------------------------------------------------------------------------
+// Evaluate this curve at the parameter t, writing the point to result. 
+// Forwards the stored control points to BezierEvaluate for this ORDER.
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE, int ORDER > 
+void CBezierCurve< POINT_TYPE, ORDER >::Evaluate( float t, POINT_TYPE &result ) const
+{
+	BezierEvaluate< POINT_TYPE, ORDER >( m_ControlPoints, t, result );
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the tangent vector at parameter t by forwarding to BezierTangent
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE, int ORDER > 
+void CBezierCurve< POINT_TYPE, ORDER>::ComputeTangent( float t, POINT_TYPE &tangent ) const
+{
+	BezierTangent< POINT_TYPE, ORDER >( m_ControlPoints, t, tangent );
+}
+
+
+//-----------------------------------------------------------------------------
+// The CCubicBezierCurve class represents a third order specialization of the
+// generic CBezierCurve class and provides additional functionality which is
+// implemented specifically for the cubic form of the bezier curve. 
+//-----------------------------------------------------------------------------
+template < class POINT_TYPE >
+class CCubicBezierCurve : public CBezierCurve< POINT_TYPE, 3 >
+{
+
+public:
+
+	// Default constructor, performs no initialization
+	CCubicBezierCurve() {}
+
+	// Array constructor, initialize the bezier from an array of four control points 
+	explicit CCubicBezierCurve( const POINT_TYPE controlPoints[ 4 ] );
+
+	// Compute numPoints points along the curve into pPoints; does nothing if numPoints is less than 2
+	void ComputePoints( POINT_TYPE *pPoints, int numPoints ) const;
+
+	// Fit the curve to nPoints data points, writing each point's parameter to pParams; true if flMaxError was met
+	bool FitToPoints( const POINT_TYPE *pPoints, float *pParams, int nPoints, int nMaxSteps, float flMaxError );
+
+private:
+
+	// Compute the bezier factor values used for evaluation that are independent of the parameter value
+	void ComputeFactors( POINT_TYPE &a, POINT_TYPE &b, POINT_TYPE &c, POINT_TYPE &d ) const;
+
+	// Perform a single step of the iterative point fitting process
+	bool FitToPointsStep( const POINT_TYPE *pPoints, float *pParams, int nPoints, bool bReparameterize );
+
+	// Calculate the control points of the curve that best fit the sample points with the specified parameters
+	void ComputeControlPointsForSamples( const POINT_TYPE *pPoints, float *pParams, int nNumPoints );
+
+	// Re-parameterize the provided set of points, finding parameter values which provide 
+	// points on the curve closer to the sample points than the current parameter values.
+	void ReparameterizePoints( const POINT_TYPE *pPoints, float *pParams, int nNumPoints ) const;
+
+	// Compute the maximum squared distance between the specified points and the curve
+	float ComputeMaxError( const POINT_TYPE *pPoints, const float *pParams, int nNumPoints ) const;
+
+	// Compute the sum of the squared distance between the specified points and the curve
+	float ComputeErrorSum( const POINT_TYPE *pPoints, const float *pParams, int nNumPoints ) const;
+
+};
+
+
+//-----------------------------------------------------------------------------
+// Array constructor, copies the four control points from controlPoints
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE > 
+CCubicBezierCurve< POINT_TYPE >::CCubicBezierCurve( const POINT_TYPE controlPoints[ 4 ] )
+{
+	// m_ControlPoints lives in the dependent base class, so it has to be
+	// reached through this->.
+	for ( int iPoint = 0; iPoint < 4; ++iPoint )
+		this->m_ControlPoints[ iPoint ] = controlPoints[ iPoint ];
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the specified number of points along the curve
+//-----------------------------------------------------------------------------
+template< class POINT_TYPE > 
+void CCubicBezierCurve< POINT_TYPE >::ComputePoints( POINT_TYPE *pPoints, int numPoints ) const
+{
+	// A single sample cannot span the curve, so fewer than two is a no-op.
+	if ( numPoints < 2 )
+		return;
+
+	// Factors of the cubic that are shared by every sample.
+	POINT_TYPE vCubic, vQuadratic, vLinear, vConstant;
+	ComputeFactors( vCubic, vQuadratic, vLinear, vConstant );
+
+	// Samples are evenly spaced in parameter space, starting at 0 and advancing
+	// by 1 / ( numPoints - 1 ) so that the last one lands at (approximately) 1.
+	const float flParamStep = 1.0f / ( numPoints - 1 );
+	float flParam = 0;
+	for ( POINT_TYPE *pOut = pPoints; pOut != pPoints + numPoints; ++pOut )
+	{
+		*pOut = ( vCubic * flParam * flParam * flParam ) + ( vQuadratic * flParam * flParam ) + ( vLinear * flParam ) + vConstant;
+		flParam = flParam + flParamStep;
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Fit the curve to a set of data points
+//-----------------------------------------------------------------------------
+template < class POINT_TYPE >
+bool CCubicBezierCurve< POINT_TYPE >::FitToPoints( const POINT_TYPE *pPoints, float *pParams, int nPoints, int nMaxSteps, float flMaxError )
+{
+	if ( !pPoints || !pParams || ( nPoints < 2 ) )
+		return false;
+
+	// Error thresholds are relative to the squared distance between the first 
+	// and last sample points; flMaxError is treated as a fraction of that length.
+	const POINT_TYPE &vFirst = pPoints[ 0 ];
+	const POINT_TYPE &vLast = pPoints[ nPoints - 1 ];
+	const float flChordSq = vFirst.DistToSqr( vLast );
+
+	// Initial fit using chord length parameterization.
+	FitToPointsStep( pPoints, pParams, nPoints, false );
+	float flWorstSq = ComputeMaxError( pPoints, pParams, nPoints );
+
+	// If the first fit is too far off for iteration to be worthwhile, give up 
+	// and leave the curve as a straight line between the end points.
+	if ( flWorstSq > ( flChordSq * 0.1f ) )
+	{
+		POINT_TYPE vChord = vLast - vFirst;
+		this->m_ControlPoints[ 0 ] = vFirst;
+		this->m_ControlPoints[ 1 ] = vFirst + ( vChord * ONE_THIRD  );
+		this->m_ControlPoints[ 2 ] = vFirst + ( vChord * TWO_THIRDS );
+		this->m_ControlPoints[ 3 ] = vLast;
+		return false;
+	}
+
+	// Alternate re-parameterizing the samples and re-fitting by least squares 
+	// until the error is acceptable or the step budget is used up.
+	const float flAcceptableSq = flChordSq * ( flMaxError * flMaxError );
+	for ( int iStep = 0; ( iStep < nMaxSteps ) && ( flWorstSq > flAcceptableSq ); ++iStep )
+	{
+		FitToPointsStep( pPoints, pParams, nPoints, true );
+		flWorstSq = ComputeMaxError( pPoints, pParams, nPoints );
+	}
+
+	// Success only if the final error is within the requested tolerance.
+	return ( flWorstSq <= flAcceptableSq );
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the factors of the curve such that the point at parameter t is
+// ( a * t^3 ) + ( b * t^2 ) + ( c * t ) + d. They do not depend on t.
+//-----------------------------------------------------------------------------
+template < class POINT_TYPE >
+void CCubicBezierCurve< POINT_TYPE >::ComputeFactors( POINT_TYPE &a, POINT_TYPE &b, POINT_TYPE &c, POINT_TYPE &d ) const
+{
+	const POINT_TYPE ( &p )[ 4 ] = this->m_ControlPoints;
+
+	a = (-1.0f * p[ 0 ] ) + ( 3.0f * p[ 1 ] ) + (-3.0f * p[ 2 ] ) + p[ 3 ];
+	b = ( 3.0f * p[ 0 ] ) + (-6.0f * p[ 1 ] ) + ( 3.0f * p[ 2 ] );
+	c = (-3.0f * p[ 0 ] ) + ( 3.0f * p[ 1 ] );
+	d = ( 1.0f * p[ 0 ] );
+}
+
+
+//-----------------------------------------------------------------------------
+// Perform a single step of the iterative point fitting process: choose a 
+// parameter for every sample, then least squares fit the control points.
+//-----------------------------------------------------------------------------
+template < class POINT_TYPE >
+bool CCubicBezierCurve< POINT_TYPE >::FitToPointsStep( const POINT_TYPE *pPoints, float *pParams, int nNumPoints, bool bReparameterize )
+{
+	POINT_TYPE *pControlPoints = this->m_ControlPoints;
+
+	if ( ( pPoints == NULL ) || ( pParams == NULL ) || ( nNumPoints < 2 ) )
+		return false;
+
+	// Two samples are matched exactly by a straight line.
+	if ( nNumPoints == 2)
+	{
+		pControlPoints[ 0 ] = pPoints[ 0 ];
+		pControlPoints[ 1 ] = pPoints[ 0 ] + ( pPoints[ 1 ] - pPoints[ 0 ] ) * ONE_THIRD;
+		pControlPoints[ 2 ] = pPoints[ 0 ] + ( pPoints[ 1 ] - pPoints[ 0 ] ) * TWO_THIRDS;
+		pControlPoints[ 3 ] = pPoints[ 1 ];
+		pParams[ 0 ] = 0.0f;
+		pParams[ 1 ] = 1.0f;
+		return true;
+	}
+
+	if ( bReparameterize )
+	{
+		ReparameterizePoints( pPoints, pParams, nNumPoints );
+	}
+	else
+	{
+		// Chord length parameterization
+		float length = 0;
+		pParams[ 0 ] = 0;
+		for ( int i = 1; i < nNumPoints; ++i )
+		{
+			length += pPoints[ i - 1 ].DistTo( pPoints[ i ] );
+			pParams[ i ] = length;
+		}
+		// If every sample coincides the total length is zero and normalizing
+		// would divide by zero, so space the parameters uniformly instead.
+		const bool bZeroLength = !( length > 0.0f );
+		for ( int i = 0; i < nNumPoints; ++i )
+		{
+			pParams[ i ] = bZeroLength ? ( (float)i / (float)( nNumPoints - 1 ) ) : ( pParams[ i ] / length );
+		}
+	}
+
+	ComputeControlPointsForSamples( pPoints, pParams, nNumPoints );
+	return true;
+}
+
+
+//-----------------------------------------------------------------------------
+// Calculate the control points of the curve that best fit the sample points 
+// with the specified parameters.
+//-----------------------------------------------------------------------------
+template < class POINT_TYPE >
+void CCubicBezierCurve< POINT_TYPE >::ComputeControlPointsForSamples( const POINT_TYPE *pPoints, float *pParams, int nPoints )
+{
+	POINT_TYPE *pControlPoints = this->m_ControlPoints;
+
+	// Set end control points to the first and last sample points
+	pControlPoints[ 0 ] = pPoints[ 0 ];
+	pControlPoints[ 3 ] = pPoints[ nPoints - 1 ];
+
+	// Use the least squares method to calculate the two interior control points
+	float a1  = 0; 
+	float a2  = 0;
+	float a12 = 0;
+	POINT_TYPE c1;
+	POINT_TYPE c2;
+	// Init() is expected to zero the accumulators; the asserts below check that.
+	c1.Init();
+	c2.Init();
+
+	Assert( c1.IsZero() );
+	Assert( c2.IsZero() );
+
+	for ( int i = 0; i < nPoints; ++i )
+	{
+		const POINT_TYPE &p = pPoints[ i ];
+		// Powers of the sample's parameter t and of u = 1 - t
+		float t = pParams[ i ];
+		float t2 = t * t;
+		float t3 = t * t2;
+		float t4 = t * t3;
+		float u = 1 - t;
+		float u2 = u * u;
+		float u3 = u * u2;
+		float u4 = u * u3;
+		// Sums that form the 2x2 system solved below (scaled by 9 after the loop)
+		a1  += t2 * u4;
+		a2  += t4 * u2;
+		a12 += t3 * u3;
+		// What is left of the sample once the fixed end point terms are removed
+		POINT_TYPE vP = p - (u3 * pControlPoints[ 0 ]) - ( t3 * pControlPoints[ 3 ] );
+		c1 += ( ( 3 * t * u2 ) * vP );
+		c2 += ( ( 3 * t2 * u ) * vP );      
+	}
+	// The interior weights are 3tu^2 and 3t^2u, so their products carry a factor of 9
+	a1 = 9.0f * a1;
+	a2 = 9.0f * a2;
+	a12 = 9.0f * a12;
+	// flFactor is the determinant of the system; when it is nearly zero use a straight line instead
+	const float flFactorTolerance = 0.000001f;
+	float flFactor = ( a1 * a2 - a12 * a12 );
+	if ( fabs( flFactor ) < flFactorTolerance )
+	{
+		POINT_TYPE vSegment = pControlPoints[ 3 ] - pControlPoints[ 0 ];
+		pControlPoints[ 1 ] = pControlPoints[ 0 ] + vSegment * ONE_THIRD;
+		pControlPoints[ 2 ] = pControlPoints[ 0 ] + vSegment * TWO_THIRDS;
+	}
+	else
+	{
+		pControlPoints[ 1 ] = ( a2 * c1 - a12 * c2 ) / flFactor;
+		pControlPoints[ 2 ] = ( a1 * c2 - a12 * c1 ) / flFactor;
+	}
+
+	// The subsequent re-parameterization relies on NewtonRaphson root finding which will
+	// fail if the tangents have an x delta of 0 or less, so ensure this does not happen.
+	float flMinStep = ( pControlPoints[ 3 ].x - pControlPoints[ 0 ].x ) * 0.0001f;
+	float flMinX = pControlPoints[ 0 ].x + flMinStep;
+	float flMaxX = pControlPoints[ 3 ].x - flMinStep;
+	pControlPoints[ 1 ].x = MAX( flMinX, MIN( flMaxX, pControlPoints[ 1 ].x ) );
+	pControlPoints[ 2 ].x = MAX( flMinX, MIN( flMaxX, pControlPoints[ 2 ].x ) );
+}
+
+
+//-----------------------------------------------------------------------------
+// Find a better parameter for each specified point using the NewtonRaphson method, or with simple iteration if the NewtonRaphson method fails.
+// pPoints holds nNumPoints sample points; pParams holds one curve parameter per sample and is overwritten in place with the refined value.
+//-----------------------------------------------------------------------------
+template < class POINT_TYPE >
+void CCubicBezierCurve< POINT_TYPE >::ReparameterizePoints( const POINT_TYPE *pPoints, float *pParams, int nNumPoints ) const
+{
+	const POINT_TYPE *pControlPoints = this->m_ControlPoints;
+
+	const float flTolerance = 0.0001f; // squared-distance threshold above which the iterative fallback is attempted
+
+	POINT_TYPE der1[ 3 ]; // scaled control point differences: control points of the first derivative (a quadratic)
+	der1[ 0 ] = ( pControlPoints[ 1 ] - pControlPoints[ 0 ] ) * 3.0f;
+	der1[ 1 ] = ( pControlPoints[ 2 ] - pControlPoints[ 1 ] ) * 3.0f;
+	der1[ 2 ] = ( pControlPoints[ 3 ] - pControlPoints[ 2 ] ) * 3.0f;
+
+	POINT_TYPE der2[ 2 ]; // control points of the second derivative (a line)
+	der2[ 0 ] = ( der1[ 1 ] - der1[ 0 ] ) * 2.0f;
+	der2[ 1 ] = ( der1[ 2 ] - der1[ 1 ] ) * 2.0f;
+
+	// Compute the basis values that can be re-used for all of the point calculations: b3* are the cubic's polynomial coefficients, b2* the first derivative's, b1* the second derivative's.
+	POINT_TYPE b3a, b3b, b3c, b3d;
+	ComputeFactors( b3a, b3b, b3c, b3d );
+
+	POINT_TYPE b2a = (  1.0f * der1[ 0 ] ) + ( -2.0f * der1[ 1 ] ) + ( 1.0f * der1[ 2 ] );
+	POINT_TYPE b2b = ( -2.0f * der1[ 0 ] ) + (  2.0f * der1[ 1 ] );
+	POINT_TYPE b2c = (  1.0f * der1[ 0 ] );
+
+	POINT_TYPE b1a = der2[ 1 ] - der2[ 0 ];
+	POINT_TYPE b1b = der2[ 0 ];
+
+	float flPrevParam = 0; // original (pre-refinement) parameter of the previous sample
+	for ( int iPoint = 0; iPoint < nNumPoints; ++iPoint )
+	{
+		float t = pParams[ iPoint ];
+		const POINT_TYPE &point = pPoints[ iPoint ];
+
+		POINT_TYPE curvePoint = ( b3a * t * t * t ) + ( b3b * t * t ) + ( b3c * t ) + b3d;
+		POINT_TYPE der1Point  = ( b2a * t * t ) + ( b2b * t ) + b2c;
+		POINT_TYPE der2Point  = ( b1a * t ) + b1b;
+
+		// Attempt to find a better parameter for the point 
+		// using the NewtonRaphson root finding method, applied to f(t) = ( C(t) - P ) . C'(t).
+		POINT_TYPE vDelta = curvePoint - point;
+		float flNumerator = vDelta.Dot( der1Point ); // f(t)
+		float flDenominator = vDelta.Dot( der2Point ) + der1Point.Dot( der1Point ); // f'(t)
+		float flRootParam = ( flDenominator == 0.0f ) ? t :  t - ( flNumerator / flDenominator ); // keep t when the Newton step is undefined
+
+		// We are not interested in any solutions outside the 0 to 1 range, so 
+		// clamp the result. This may give a result that is farther than the 
+		// original parameter, in which case the original parameter will be used.
+		flRootParam = MAX( 0.0f, MIN( 1.0f, flRootParam ) );
+
+		// Evaluate the parameter returned by the root finding, to 
+		// determine if it is actually a better parameter for the point.
+		float rp = flRootParam;
+		POINT_TYPE rootCurvePoint = ( b3a * rp * rp * rp ) + ( b3b * rp * rp ) + ( b3c * rp ) + b3d;
+		float flDist = point.DistToSqr( curvePoint );
+		float flDistRoot = point.DistToSqr( rootCurvePoint );
+		
+		// If the parameter returned by the root finding method gives a point on the 
+		// curve that is closer to the sample point than the current parameter make 
+		// the new parameter the value found by the root finding method.
+		float flNewParam = t;
+		if ( flDistRoot <= flDist )
+		{
+			flNewParam = flRootParam;
+		}
+		else if ( flDist > flTolerance )
+		{
+			// If the root finding method failed, try to find a better parameter iteratively. This is 
+			// basically a brute force method, but with a couple of observations which actually make it
+			// reasonable. First the direction to iterate from the current parameter can be deduced 
+			// from the dot product of the vector from the point and the tangent of the curve. Second
+			// the range of iteration can be restricted such that values before the last parameter 
+			// are not considered.
+			POINT_TYPE stepPoint;
+			float flStepParam = t;
+			float flStepDist = 0;
+			float flBestStepParam = t;
+			float flBestStepDist = flDist;
+
+			const int nMaxSteps = 10;
+			const float flBaseStepSize = MAX( t - flPrevParam, 0.001f ) / ( float )nMaxSteps; // all nMaxSteps together span the gap to the previous sample's parameter (at least 0.001)
+			float flStepSize = 0;
+			int nStep = 0;
+
+			// The numerator of the root finding method is the dot product between the vector from the 
+			// sample point to the point on the curve and the tangent of the curve. The tangent of the
+			// curve tells us which way the curve is going and we want to move the parameter along the
+			// curve in the way which is moving closer to the point, so if the dot product of the 
+			// tangent and the vector from the point on the curve to the sample point is positive then
+			// moving in a positive direction along the curve will bring us closer to the sample point. 
+			// However, the numerator value used the vector from the sample point to the curve point, 
+			// so negative value implies a positive movement.
+			if ( flNumerator < 0 )
+			{
+				flStepSize = flBaseStepSize;
+			}
+			else
+			{
+				flStepSize = -flBaseStepSize;
+			}
+
+			// Starting with the current parameter, move the parameter by the calculated step interval
+			// and evaluate the result. Continue as long as the result is closer than the previous 
+			// best result and the specified maximum number of steps has not been reached.
+			while ( nStep < nMaxSteps )
+			{
+				flStepParam = MAX( 0.0f, MIN( 1.0f, flStepParam + flStepSize ) );
+
+				float sp = flStepParam;
+				stepPoint = ( b3a * sp * sp * sp ) + ( b3b * sp * sp ) + ( b3c * sp ) + b3d;
+				flStepDist = point.DistToSqr( stepPoint );
+				
+				if ( flStepDist >= flBestStepDist ) // no improvement (also true once the clamp pins the parameter): stop
+					break;
+
+				flBestStepParam = flStepParam;
+				flBestStepDist = flStepDist;
+				++nStep;
+			}
+
+			flNewParam = flBestStepParam;
+		}
+
+		// Update the parameter to the new value which provides 
+		// a closer point on the curve to the sample point.
+		Assert( flNewParam >= 0.0f );
+		Assert( flNewParam <= 1.0f );
+		pParams[ iPoint ] = flNewParam;
+
+		// Save the old parameter so it may be used by the next point
+		// to determine the iteration range if the root finding fails
+		flPrevParam = t;
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Returns the largest squared distance between any of the nNumPoints sample
+// points and the curve evaluated at that sample's parameter (0 if no points).
+//-----------------------------------------------------------------------------
+template < class POINT_TYPE >
+float CCubicBezierCurve< POINT_TYPE >::ComputeMaxError( const POINT_TYPE *pPoints, const float *pParams, int nNumPoints ) const
+{
+	// Polynomial coefficients of the curve: C(u) = k3*u^3 + k2*u^2 + k1*u + k0
+	POINT_TYPE k3, k2, k1, k0;
+	ComputeFactors( k3, k2, k1, k0 );
+
+	float flWorstDistSqr = 0.0f;
+	for ( int i = 0; i < nNumPoints; ++i )
+	{
+		const float u = pParams[ i ];
+		POINT_TYPE vOnCurve = ( k3 * u * u * u ) + ( k2 * u * u ) + ( k1 * u ) + k0;
+
+		const float flDistSqr = pPoints[ i ].DistToSqr( vOnCurve );
+		if ( flDistSqr > flWorstDistSqr )
+		{
+			flWorstDistSqr = flDistSqr;
+		}
+	}
+
+	return flWorstDistSqr;
+}
+
+
+
+//-----------------------------------------------------------------------------
+// Returns the total of the squared distances between each of the nNumPoints
+// sample points and the curve evaluated at that sample's parameter.
+//-----------------------------------------------------------------------------
+template < class POINT_TYPE >
+float CCubicBezierCurve< POINT_TYPE >::ComputeErrorSum( const POINT_TYPE *pPoints, const float *pParams, int nNumPoints ) const
+{
+	// Polynomial coefficients of the curve: C(u) = k3*u^3 + k2*u^2 + k1*u + k0
+	POINT_TYPE k3, k2, k1, k0;
+	ComputeFactors( k3, k2, k1, k0 );
+
+	float flTotalDistSqr = 0.0f;
+	for ( int i = 0; i < nNumPoints; ++i )
+	{
+		const float u = pParams[ i ];
+		POINT_TYPE vOnCurve = ( k3 * u * u * u ) + ( k2 * u * u ) + ( k1 * u ) + k0;
+
+		flTotalDistSqr += pPoints[ i ].DistToSqr( vOnCurve );
+	}
+
+	return flTotalDistSqr;
+}
+
+
+#endif 
--- a/public/mathlib/box_buoyancy.h
+++ b/public/mathlib/box_buoyancy.h
@@ -0,0 +1,24 @@
+//========= Copyright (c) Valve Corporation, All rights reserved. ==========
+
+#ifndef MATHLIB_BOX_BUOYANCY_H
+#define MATHLIB_BOX_BUOYANCY_H
+
+#include "ssemath.h"
+#include "mathlib/vector4d.h"
+
+
+// returns the volume of the part of the box submerged in water
+// box is defined as mutually orthogonal half-sizes. The half-sizes MUST be orthogonal!
+// the water plane is z=0, and the box center's z coordinate is taken from the f4Origin parameter
+// (only z is used in f4Origin)
+extern fltx4 GetBoxBuoyancy4x3( const fltx4& f4a, const fltx4& f4b, const fltx4&f4c, const fltx4&f4Origin );
+
+extern Vector4D GetBoxBuoyancy( const Vector& a, const Vector& b, const Vector& c, const Vector& vecOrigin );
+
+// this takes a,b,c half-sizes and the center position of the box in the columns of the 3x4 matrix
+extern fltx4 GetBoxBuoyancy3x4( const FourVectors &box );
+
+extern void TestBuoyancy();
+
+
+#endif
--- a/public/mathlib/bumpvects.h
+++ b/public/mathlib/bumpvects.h
@@ -0,0 +1,37 @@
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $Workfile:     $
+// $Date:         $
+// $NoKeywords: $
+//=============================================================================//
+
+#ifndef BUMPVECTS_H
+#define BUMPVECTS_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include "mathlib/mathlib.h"
+
+#define OO_SQRT_2 0.70710676908493042f
+#define OO_SQRT_3 0.57735025882720947f
+#define OO_SQRT_6 0.40824821591377258f
+// sqrt( 2 / 3 )
+#define OO_SQRT_2_OVER_3 0.81649661064147949f
+
+#define NUM_BUMP_VECTS 3
+
+const TableVector g_localBumpBasis[NUM_BUMP_VECTS] = // local bump basis: every row has unit length and shares the same z component (OO_SQRT_3)
+{
+	{	OO_SQRT_2_OVER_3, 0.0f, OO_SQRT_3 }, // ( sqrt(2/3), 0, 1/sqrt(3) )
+	{  -OO_SQRT_6, OO_SQRT_2, OO_SQRT_3 }, // ( -1/sqrt(6), 1/sqrt(2), 1/sqrt(3) )
+	{  -OO_SQRT_6, -OO_SQRT_2, OO_SQRT_3 } // ( -1/sqrt(6), -1/sqrt(2), 1/sqrt(3) )
+};
+
+void GetBumpNormals( const Vector& sVect, const Vector& tVect, const Vector& flatNormal, 
+					 const Vector& phongNormal, Vector bumpNormals[NUM_BUMP_VECTS] );
+
+#endif // BUMPVECTS_H
--- a/public/mathlib/camera.h
+++ b/public/mathlib/camera.h
@@ -0,0 +1,660 @@
+//====== Copyright © 1996-2005, Valve Corporation, All rights reserved. =======//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef CAMERA_H
+#define CAMERA_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include <math.h>
+#include <float.h>
+
+// For vec_t, put this somewhere else?
+#include "tier0/basetypes.h"
+#include "mathlib/vector.h"
+
+#include "tier0/dbg.h"
+#include "mathlib/vector2d.h"
+#include "mathlib/math_pfns.h"
+#include "mathlib/vmatrix.h"
+#include "mathlib/ssemath.h"
+#include "datamap.h"
+#include "mathlib/aabb.h"
+
+#include "tier0/memalloc.h"
+// declarations for camera and frustum
+
+extern VMatrix g_matViewToCameraMatrix, g_matCameraToViewMatrix;
+
+struct ALIGN16 Camera_t // view placement plus either a perspective or an orthographic projection description
+{
+	void Init( const Vector &origin, const QAngle &angles, float flNear, float flFar, float flFOV, float flAspect ); // perspective setup; also sets m_flWidth/m_flHeight to -1
+	void InitOrtho( const Vector &origin, const QAngle &angles, float flNear, float flFar, float flWidth, float flHeight ); // orthographic setup
+
+	void InitViewParameters( const Vector &vOrigin, const QAngle &vAngles ); // stores m_origin / m_angles only
+
+	void InitOrthoProjection( float flZNear, float flZFar, float flWidth, float flHeight ); // sets m_flFOVX to -1 and stores near/far/width/height
+	bool IsOrthographic() const; // true when m_flFOVX == -1
+	void InitPerspectiveProjection( float flZNear, float flZFar, float flFOVX, float flAspect ); // stores near/far/FOV/aspect; width/height are not touched
+	// generates 8 vertices of the frustum
+	// vertex order is near plane (UL, UR, LL, LR), far plane (UL, UR, LL, LR)
+	void ComputeGeometry( Vector *pVertsOut8 ) const;
+	void ComputeGeometry( Vector *pVertsOut8, const Vector &vForward, const Vector &vLeft, const Vector &vUp ) const;
+
+	inline bool operator ==( const Camera_t &other ) const; // exact member-by-member comparison of all eight fields
+	inline bool operator !=( const Camera_t &other ) const;
+
+	Vector m_origin; // view origin
+	QAngle m_angles; // view angles
+
+	// FOV for X/width. 
+	// This should be set to -1 to get an ortho projection,
+	// in which case it'll use m_flWidth and m_flHeight.
+	float m_flFOVX;
+	float m_flAspect; // For Perspective
+
+	float m_flZNear; // near clip distance
+	float m_flZFar; // far clip distance
+
+	float m_flWidth;	// For ortho.
+	float m_flHeight; // For ortho; Init() sets both width and height to -1 for perspective cameras
+} ALIGN16_POST;
+
+// Sets up a perspective camera: stores the view origin/angles and the perspective
+// projection values, and marks the ortho-only width/height as unused (-1).
+inline void Camera_t::Init( const Vector &origin, const QAngle &angles, float flNear, float flFar, float flFOV, float flAspect )
+{
+	// Width/height only apply to orthographic cameras
+	m_flHeight = -1;
+	m_flWidth = -1;
+
+	InitPerspectiveProjection( flNear, flFar, flFOV, flAspect );
+	InitViewParameters( origin, angles );
+}
+
+// Sets up an orthographic camera from a view origin/angles, clip distances and view size.
+inline void Camera_t::InitOrtho( const Vector &origin, const QAngle &angles, float flNear, float flFar, float flWidth, float flHeight )
+{
+	InitOrthoProjection( flNear, flFar, flWidth, flHeight );
+	InitViewParameters( origin, angles );
+}
+
+
+// Stores the view placement (origin and angles); projection fields are left alone.
+inline void Camera_t::InitViewParameters( const Vector &vOrigin, const QAngle &vAngles )
+{
+	this->m_angles = vAngles;
+	this->m_origin = vOrigin;
+}
+
+// Configures an orthographic projection. m_flFOVX is set to -1, which is the value
+// IsOrthographic() tests for, and the clip distances and view size are stored.
+// m_flAspect is derived from the view size as well, so that every projection field
+// holds a defined value afterwards (operator== reads m_flAspect).
+inline void Camera_t::InitOrthoProjection( float flZNear, float flZFar, float flWidth, float flHeight )
+{
+	m_flFOVX = -1;
+	m_flZNear = flZNear;
+	m_flZFar = flZFar;
+	m_flWidth = flWidth;
+	m_flHeight = flHeight;
+
+	// Same width / height ratio CFrustum stores for ortho cameras; a zero height
+	// falls back to a neutral 1.0 instead of dividing by zero.
+	m_flAspect = ( flHeight != 0.0f ) ? ( flWidth / flHeight ) : 1.0f;
+}
+
+// True when the camera uses an orthographic projection, which is flagged by m_flFOVX being -1.
+inline bool Camera_t::IsOrthographic() const
+{
+	const bool bOrtho = ( m_flFOVX == -1 );
+	return bOrtho;
+}
+
+// Stores the perspective projection values: clip distances, horizontal FOV and aspect.
+// The ortho-only width/height members are not modified.
+inline void Camera_t::InitPerspectiveProjection( float flZNear, float flZFar, float flFOVX, float flAspect )
+{
+	// Clip range
+	m_flZNear = flZNear;
+	m_flZFar = flZFar;
+
+	// Lens
+	m_flFOVX = flFOVX;
+	m_flAspect = flAspect;
+}
+
+// Exact equality: every member must compare equal (no tolerance on the floats).
+inline bool Camera_t::operator ==( const Camera_t &other ) const
+{
+	// View placement
+	if ( !( m_origin == other.m_origin ) )
+		return false;
+	if ( !( m_angles == other.m_angles ) )
+		return false;
+
+	// Perspective values
+	if ( m_flFOVX != other.m_flFOVX || m_flAspect != other.m_flAspect )
+		return false;
+
+	// Clip range
+	if ( m_flZNear != other.m_flZNear || m_flZFar != other.m_flZFar )
+		return false;
+
+	// Ortho view size
+	return ( m_flWidth == other.m_flWidth ) && ( m_flHeight == other.m_flHeight );
+}
+
+
+// Logical negation of operator ==.
+inline bool Camera_t::operator !=( const Camera_t &other ) const
+{
+	const bool bEqual = ( *this == other );
+	return !bEqual;
+}
+
+
+///\name Functions to set up a VMatrix from various input view specifications
+//@{
+/// Builds an ortho matrix that scales X by 2/(x2-x1) and Y by 2/(y2-y1) with no X/Y
+/// offset, so bounds centered on the origin map into [-1,1]. The depth row is
+/// ( 0, 0, 1/(flNear-flFar), flNear/(flNear-flFar) ), intended to map flNear/flFar into [0,1].
+inline VMatrix OrthoMatrixRH( float x1, float y1, float x2, float y2, float flNear, float flFar )
+{
+	const float flScaleX = 2.0f / ( x2 - x1 );
+	const float flScaleY = 2.0f / ( y2 - y1 );
+	const float flDepthRange = flNear - flFar;
+
+	return VMatrix(
+		flScaleX,	0.0f,		0.0f,					0.0f,
+		0.0f,		flScaleY,	0.0f,					0.0f,
+		0.0f,		0.0f,		1.0f / flDepthRange,	flNear / flDepthRange,
+		0.0f,		0.0f,		0.0f,					1.0f );
+}
+
+/// Off-center variant of OrthoMatrixRH: same X/Y scale and depth row, plus the X/Y
+/// offsets -(scale * x1) - 1 and -(scale * y1) - 1 in the fourth column.
+inline VMatrix OrthoMatrixOffCenterRH( float x1, float y1, float x2, float y2, float flNear, float flFar )
+{
+	const float flScaleX = 2.0f / ( x2 - x1 );
+	const float flScaleY = 2.0f / ( y2 - y1 );
+	const float flDepthRange = flNear - flFar;
+
+	const float flOffsetX = -( flScaleX * x1 ) - 1;
+	const float flOffsetY = -( flScaleY * y1 ) - 1;
+
+	return VMatrix(
+		flScaleX,	0.0f,		0.0f,					flOffsetX,
+		0.0f,		flScaleY,	0.0f,					flOffsetY,
+		0.0f,		0.0f,		1.0f / flDepthRange,	flNear / flDepthRange,
+		0.0f,		0.0f,		0.0f,					1.0f );
+}
+
+/// Left-handed off-center ortho matrix, for concatenation onto the viewproj matrix for
+/// app-tiling in source2. Maps the X/Y bounds into [-1,1] and flNear/flFar into [0,1].
+/// Unlike the RH builders, the offsets are stored in the fourth row and the depth
+/// range is flFar - flNear.
+inline VMatrix OrthoMatrixOffCenterLH( float x1, float y1, float x2, float y2, float flNear, float flFar )
+{
+	const float flScaleX = 2.0f / ( x2 - x1 );
+	const float flScaleY = 2.0f / ( y2 - y1 );
+	const float flDepthRange = flFar - flNear;
+
+	const float flOffsetX = ( x1 + x2 )/( x1 - x2 );
+	const float flOffsetY = ( y1 + y2 ) / ( y1 - y2 );
+	const float flOffsetZ = -flNear / flDepthRange;
+
+	return VMatrix(
+		flScaleX,	0.0f,		0.0f,					0.0f,
+		0.0f,		flScaleY,	0.0f,					0.0f,
+		0.0f,		0.0f,		1.0f / flDepthRange,	0.0f,
+		flOffsetX,	flOffsetY,	flOffsetZ,				1.0f );
+}
+
+/// Hammer wireframe widget version that inverts depth and shifts the xy coordinates.
+/// Identical to OrthoMatrixOffCenterRH except that the depth offset is negated
+/// ( -flNear / (flNear-flFar) ).
+inline VMatrix OrthoMatrixHammerRH( float x1, float y1, float x2, float y2, float flNear, float flFar )
+{
+	const float flScaleX = 2.0f / ( x2 - x1 );
+	const float flScaleY = 2.0f / ( y2 - y1 );
+	const float flDepthRange = flNear - flFar;
+
+	const float flOffsetX = -( flScaleX * x1 ) - 1;
+	const float flOffsetY = -( flScaleY * y1 ) - 1;
+
+	return VMatrix(
+		flScaleX,	0.0f,		0.0f,					flOffsetX,
+		0.0f,		flScaleY,	0.0f,					flOffsetY,
+		0.0f,		0.0f,		1.0f / flDepthRange,	-flNear / flDepthRange,
+		0.0f,		0.0f,		0.0f,					1.0f );
+}
+
+
+
+/// Convenience overload: ortho matrix for a view region of the given width and height
+/// centered at 0, i.e. bounds ( -w/2, -h/2 ) to ( w/2, h/2 ).
+inline VMatrix OrthoMatrixRH( float flWidth, float flHeight, float flNear, float flFar )
+{
+	const float flHalfWidth = flWidth/2;
+	const float flHalfHeight = flHeight/2;
+	return OrthoMatrixRH( -flHalfWidth, -flHalfHeight, flHalfWidth, flHalfHeight, flNear, flFar );
+}
+
+/// calculate a view matrix given an origin, forward vector, and up vector
+VMatrix ViewMatrixRH( Vector &vEye, Vector &vAt, Vector &vUp );
+
+/// calculate a VMatrix from a camera_t
+void ComputeViewMatrix( VMatrix *pWorldToView, const Camera_t& camera );
+/// calculate a matrix3x4_t corresponding to a camera_t
+void ComputeViewMatrix( matrix3x4_t *pWorldToView, const Camera_t& camera );
+void ComputeViewMatrix( matrix3x4_t *pWorldToView, matrix3x4_t *pWorldToCamera, const Camera_t &camera );
+void ComputeViewMatrix( matrix3x4_t *pWorldToView, matrix3x4_t *pCameraToWorld, 
+	Vector const &vecOrigin,
+	Vector const &vecForward, Vector const &vecLeft, Vector const &vecUp );
+
+void ComputeViewMatrix( VMatrix *pViewMatrix, const Vector &origin, const QAngle &angles );
+void ComputeViewMatrix( VMatrix *pViewMatrix, const matrix3x4_t &matGameCustom );
+
+void ComputeProjectionMatrix( VMatrix *pCameraToProjection, const Camera_t& camera, int width, int height );
+void ComputeProjectionMatrix( VMatrix *pCameraToProjection, float flZNear, float flZFar, float flFOVX, float flAspectRatio );
+void ComputeProjectionMatrix( VMatrix *pCameraToProjection, float flZNear, float flZFar, float flFOVX, float flAspectRatio, 
+	float flClipSpaceBottomLeftX, float flClipSpaceBottomLeftY,
+	float flClipSpaceTopRightX, float flClipSpaceTopRightY );
+
+//@}
+
+
+void CalcFarPlaneCameraRelativePoints( Vector *p4PointsOut, Vector &vForward, Vector &vUp, Vector &vLeft, float flFarPlane, 
+	float flFovX, float flFovY,
+	float flClipSpaceBottomLeftX = -1.0f, float flClipSpaceBottomLeftY = -1.0f,
+	float flClipSpaceTopRightX = 1.0f, float flClipSpaceTopRightY = 1.0f );
+
+/// transform a point from 3d to 2d, given screen width + height
+void ComputeScreenSpacePosition( Vector2D *pScreenPosition, const Vector &vecWorldPosition, 
+	const Camera_t &camera, int width, int height );
+
+
+
+// Functions to build frustum information given params
+void MatricesFromCamera( VMatrix &mWorldToView, VMatrix &mProjection, const Camera_t &camera,
+	float flClipSpaceBottomLeftX = -1.0f, float flClipSpaceBottomLeftY = -1.0f,
+	float flClipSpaceTopRightX = 1.0f, float flClipSpaceTopRightY = 1.0f );
+void FrustumFromViewProj( Frustum_t *pFrustum, const VMatrix &mViewProj, const Vector &origin, bool bD3DClippingRange = true );
+void FrustumFromMatrices( Frustum_t *pFrustum, const VMatrix &mWorldToView, const VMatrix &mProjection, const Vector &origin, bool bD3DClippingRange = true );
+// TODO: desired api.
+//void MatrixFromFrustum( VMatrix *pViewProj, const Frustum_t &frustum );
+VMatrix ViewProjFromVectors( const Vector &origin, float flNear, float flFar, float flFOV, float flAspect,
+	Vector const &vecForward, Vector const &vecLeft, Vector const &vecUp );
+
+enum EBoxOverlapFlags // bit flags; CFrustum::CheckBoxAgainstNearAndFarPlanes is documented as returning a mask of BOXCHECK_FLAGS_xxx
+{
+	BOXCHECK_FLAGS_OVERLAPS_NEAR = 1, // bit 0 - presumably set when the box overlaps the near clip plane (confirm in the implementation)
+	BOXCHECK_FLAGS_OVERLAPS_FAR = 2, // bit 1 - presumably set when the box overlaps the far clip plane (confirm in the implementation)
+};
+
+
+
+
+/// Class holding a camera, frustum planes, and transformation matrices, and methods to calculate
+/// them and keep them in sync.
+///
+/// The camera setters and SetClipSpaceBounds raise m_bDirty. The direct matrix/frustum
+/// setters (SetView, SetProj, SetViewProj, SetInvViewProj, SetFrustumStruct) only store
+/// the value they are given; they neither touch m_bDirty nor update any other member.
+class CFrustum
+{
+public:
+	CFrustum();
+	~CFrustum(){}
+
+	//--------------------------------------------------------------------------------------------------
+	// Camera fxns
+	//--------------------------------------------------------------------------------------------------
+
+	// Copies an existing camera description. The clip space bounds are left as they were.
+	void InitCamera( const Camera_t &Other )
+	{
+		m_camera = Other;
+		m_bDirty = true;
+	}
+
+	// For an off-center projection:
+	// flClipSpaceXXXX coordinates are in clip space, where ( -1,-1 ) is the bottom left corner of the screen
+	// and ( 1,1 ) is the top right corner of the screen.
+	void InitCamera( const Vector &origin, const QAngle &angles, float flNear, float flFar, float flFOV, float flAspect,
+		float flClipSpaceBottomLeftX = -1.0f,	float flClipSpaceBottomLeftY = -1.0f, float flClipSpaceTopRightX = 1.0f, float flClipSpaceTopRightY = 1.0f )
+	{
+		m_camera.Init( origin, angles, flNear, flFar, flFOV, flAspect );
+
+		m_flClipSpaceBottomLeftX = flClipSpaceBottomLeftX;
+		m_flClipSpaceBottomLeftY = flClipSpaceBottomLeftY;
+		m_flClipSpaceTopRightX = flClipSpaceTopRightX;
+		m_flClipSpaceTopRightY = flClipSpaceTopRightY;
+		m_bDirty = true;
+	}
+
+	// Orthographic counterpart of InitCamera. The camera's aspect is set to flWidth / flHeight.
+	void InitOrthoCamera( const Vector &origin, const QAngle &angles, float flNear, float flFar, float flWidth, float flHeight,
+		float flClipSpaceBottomLeftX = -1.0f, float flClipSpaceBottomLeftY = -1.0f, float flClipSpaceTopRightX = 1.0f, float flClipSpaceTopRightY = 1.0f )
+	{
+		m_camera.InitOrtho( origin, angles, flNear, flFar, flWidth, flHeight );
+		m_camera.m_flAspect = flWidth / flHeight;
+
+		m_flClipSpaceBottomLeftX = flClipSpaceBottomLeftX;
+		m_flClipSpaceBottomLeftY = flClipSpaceBottomLeftY;
+		m_flClipSpaceTopRightX = flClipSpaceTopRightX;
+		m_flClipSpaceTopRightY = flClipSpaceTopRightY;
+		m_bDirty = true;
+	}
+
+	bool IsOrthographic() const { return m_camera.IsOrthographic(); }
+
+	void SetCameraPosition( const Vector &origin );
+	const Vector &GetCameraPosition() const { return m_camera.m_origin; }
+
+	void SetCameraAngles( const QAngle &angles );
+	const QAngle &GetCameraAngles() const { return m_camera.m_angles; }
+
+	// Sets the distance from the camera to the near/far clipping plane in world units.
+	void SetCameraNearFarPlanes( float flNear, float flFar );
+	void GetCameraNearFarPlanes( float &flNear, float &flFar ) const { flNear = m_camera.m_flZNear; flFar = m_camera.m_flZFar; }
+
+	// Sets the distance from the camera to the near clipping plane in world units.
+	void SetCameraNearPlane( float flNear );
+	float GetCameraNearPlane() const { return m_camera.m_flZNear; }
+
+	// Sets the distance from the camera to the far clipping plane in world units.
+	void SetCameraFarPlane( float flFar );
+	float GetCameraFarPlane() const { return m_camera.m_flZFar; }
+
+	// Set the field of view (in degrees)
+	void SetCameraFOV( float flFOV );
+	float GetCameraFOV() const { return m_camera.m_flFOVX; }
+
+	void SetCameraWidthHeight( float flWidth, float flHeight );
+	void GetCameraWidthHeight( float &width, float &height ) const { width = m_camera.m_flWidth; height = m_camera.m_flHeight; }
+
+	void SetCameraWidth( float flWidth );
+	float GetCameraWidth() const { return m_camera.m_flWidth; }
+
+	void SetCameraHeight( float flHeight );
+	float GetCameraHeight() const { return m_camera.m_flHeight; }
+
+	void SetCameraAspect( float flAspect );
+	float GetCameraAspect() const { return m_camera.m_flAspect; }
+
+	/// Returns mask of BOXCHECK_FLAGS_xxx indicating the status of the box with respect to this
+	/// frustum's near and far clip planes.
+	int CheckBoxAgainstNearAndFarPlanes( const VectorAligned &minBounds, const VectorAligned &maxBounds ) const;
+
+	/// given an AABB, return the values of the near and far plane which will enclose the box
+	void GetNearAndFarPlanesAroundBox( float *pNear, float *pFar, AABB_t const &inBox, Vector &vOriginShift ) const;
+
+	/// Compute the approximate size of a sphere. Rough calculation suitable for lod selection,
+	/// etc.  Result is in terms of approximate % coverage of the viewport, not taking clipping
+	/// into account.
+	float ComputeScreenSize( Vector vecOrigin, float flRadius ) const;
+
+	/// Return the Sin of the FOV
+	FORCEINLINE float SinFOV( void ) const { return sin( DEG2RAD( GetCameraFOV() ) ); }
+
+	const Camera_t &GetCameraStruct() const { return m_camera; }
+
+	//--------------------------------------------------------------------------------------------------
+	// Frustum fxns
+	//--------------------------------------------------------------------------------------------------
+
+	void SetFrustumStruct( const Frustum_t &frustumStruct ) { m_frustumStruct = frustumStruct; }
+
+	// Camera oriented directions
+	const Vector &CameraForward() const { return m_forward; }
+	const Vector &CameraLeft() const { return m_left; }
+	const Vector &CameraUp() const { return m_up; }
+
+	// View oriented directions i.e. view align matrix has been applied
+	void ViewForward( Vector& vViewForward ) const;
+	void ViewLeft( Vector& vViewLeft ) const;
+	void ViewUp(  Vector& vViewUp ) const;
+
+	// The matrix setters below take their argument by const reference: they only read it,
+	// so callers may pass const matrices and temporaries as well as mutable lvalues.
+	void SetView( const VMatrix &mWorldToView ) { m_worldToView = mWorldToView.As3x4(); }
+	const matrix3x4_t &GetView() const { return m_worldToView; }
+
+	void SetProj( const VMatrix &mProj ) { m_projection = mProj; }
+	const VMatrix &GetProj() const { return m_projection; }
+	const VMatrix &GetInvProj() const { return m_invProjection; }
+
+	// The viewProj and invViewProj matrices are NOT transposed.
+	void SetViewProj( const VMatrix &viewProj ) { m_viewProj = viewProj; }
+	const VMatrix &GetViewProj() const { return m_viewProj; }
+	VMatrix GetViewProjTranspose() const { return m_viewProj.Transpose(); }
+
+	void SetInvViewProj( const VMatrix &invViewProj ) { m_invViewProj = invViewProj; }
+	const VMatrix &GetInvViewProj() const { return m_invViewProj; }
+	VMatrix GetInvViewProjTranspose() const { return m_invViewProj.Transpose(); }
+
+	// The box is translated by -camera origin (and -vOriginShift when given) before it is
+	// tested against the stored frustum struct.
+	bool BoundingVolumeIntersectsFrustum( AABB_t const &box ) const;
+	bool BoundingVolumeIntersectsFrustum( Vector const &mins, Vector const &maxes ) const;
+	bool BoundingVolumeIntersectsFrustum( AABB_t const &box, Vector &vOriginShift ) const;
+
+	/// Update the matrix and clip planes for this frustum to reflect the state of the embedded Camera_t
+	void UpdateFrustumFromCamera();
+
+	/// build a full frustum from rotation vectors plus camera vars
+	void BuildFrustumFromVectors( const Vector &origin, float flNear, float flFar, float flFOV, float flAspect,
+		Vector const &vecForward, Vector const &vecLeft, Vector const &vecUp );
+
+	void BuildShadowFrustum( VMatrix &newWorldToView, VMatrix &newProj );
+
+	void BuildFrustumFromParameters(
+		const Vector &origin, const QAngle &angles, 
+		float flNear, float flFar, float flFOV, float flAspect,
+		const VMatrix &worldToView, const VMatrix &viewToProj );
+
+	/// calculate the projection of the 4 view-frustum corner rays onto a plane
+	void CalcFarPlaneCameraRelativePoints( Vector *p4PointsOut, float flFarPlane, 
+		float flClipSpaceBottomLeftX = -1.0f, float flClipSpaceBottomLeftY = -1.0f,
+		float flClipSpaceTopRightX = 1.0f, float flClipSpaceTopRightY = 1.0f ) const;
+
+
+	/// concatenate the projection and view matrices, and also update the inverse view and
+	/// projection matrices 
+	void CalcViewProj( );
+
+	/// Transform a point by the combined world->projection matrix (GetViewProj) and return
+	/// the homogeneous result.
+	Vector4D TransformPointToHomogenousViewCoordinates( Vector const &pnt ) const;
+
+	void SetClipSpaceBounds( float flClipSpaceBottomLeftX, float flClipSpaceBottomLeftY, float flClipSpaceTopRightX, float flClipSpaceTopRightY );
+
+	void GetClipSpaceBounds( float &flClipSpaceBottomLeftX, float &flClipSpaceBottomLeftY, float &flClipSpaceTopRightX, float &flClipSpaceTopRightY ) const
+	{
+		flClipSpaceBottomLeftX = m_flClipSpaceBottomLeftX;
+		flClipSpaceBottomLeftY = m_flClipSpaceBottomLeftY;
+		flClipSpaceTopRightX = m_flClipSpaceTopRightX;
+		flClipSpaceTopRightY = m_flClipSpaceTopRightY;
+	}
+
+	// NOTE: Not tested with ortho projections
+	void ComputeBounds( Vector *pMins, Vector *pMaxs ) const;
+	void ComputeGeometry( Vector *pVertsOut8 ) const { m_camera.ComputeGeometry( pVertsOut8, m_forward, m_left, m_up ); }
+
+	const Frustum_t &GetFrustumStruct() const { return m_frustumStruct; }
+
+	//--------------------------------------------------------------------------------------------------
+	void ViewToWorld( const Vector2D &vViewMinusOneToOne, Vector *pOutWorld );
+	void BuildRay( const Vector2D &vViewMinusOneToOne, Vector *pOutRayStart, Vector *pOutRayDirection );
+
+protected:
+
+	Camera_t m_camera; // NOTE: SIMD-aligned
+
+	Frustum_t m_frustumStruct; // NOTE: SIMD-aligned
+
+	// For off-center projection
+	float m_flClipSpaceBottomLeftX;
+	float m_flClipSpaceBottomLeftY;
+	float m_flClipSpaceTopRightX;
+	float m_flClipSpaceTopRightY;
+
+	Vector m_forward;
+	Vector m_left;
+	Vector m_up;
+
+	// Camera/view matrices. (The space order is: world->camera->view->projection->screenspace.)
+	matrix3x4_t	m_cameraToWorld;	// camera->world (NOT view->world, and not the inverse of m_worldToView)
+	matrix3x4_t	m_worldToView;		// world->view
+
+	// Projection matrices.
+	VMatrix m_projection;			// view->proj
+	VMatrix m_invProjection;		// proj->view
+
+	// Combined world->projection matrices.
+	VMatrix m_viewProj;				// world->proj
+	VMatrix m_invViewProj;			// proj->world
+
+	// Raised by the camera setters and SetClipSpaceBounds whenever a value changes.
+	bool m_bDirty;
+};
+
+// Default state: a perspective camera at the origin (near 10, far 100, FOV 90, aspect 1)
+// with full clip space bounds, and every derived vector/matrix zeroed. The frustum starts dirty.
+inline CFrustum::CFrustum()
+{
+	InitCamera( Vector( 0, 0, 0 ), QAngle( 0, 0, 0 ), 10, 100, 90, 1.0f );
+
+	// Frustum planes
+	V_memset( &m_frustumStruct, 0, sizeof( m_frustumStruct ) );
+
+	// Camera basis vectors
+	m_forward.Init( 0, 0, 0 );
+	m_left.Init( 0, 0, 0 );
+	m_up.Init( 0, 0, 0 );
+
+	// 3x4 camera/view matrices
+	V_memset( &m_cameraToWorld, 0, sizeof( m_cameraToWorld ) );
+	V_memset( &m_worldToView, 0, sizeof( m_worldToView ) );
+
+	// 4x4 projection and combined matrices
+	V_memset( &m_projection, 0, sizeof( m_projection ) );
+	V_memset( &m_invProjection, 0, sizeof( m_invProjection ) );
+	V_memset( &m_viewProj, 0, sizeof( m_viewProj ) );
+	V_memset( &m_invViewProj, 0, sizeof( m_invViewProj ) );
+
+	m_bDirty = true;
+}
+
+// Moves the camera. Does nothing when the origin is unchanged; otherwise stores it,
+// asserts that it is valid and reasonable, and marks the frustum dirty.
+inline void CFrustum::SetCameraPosition( const Vector &origin )
+{
+	const bool bUnchanged = ( m_camera.m_origin == origin );
+	if ( !bUnchanged )
+	{
+		m_camera.m_origin = origin;
+		Assert( origin.IsValid() && origin.IsReasonable() );
+
+		m_bDirty = true;
+	}
+}
+
+// Re-orients the camera; marks the frustum dirty only when the angles actually change.
+inline void CFrustum::SetCameraAngles( const QAngle &angles )
+{
+	const bool bUnchanged = ( m_camera.m_angles == angles );
+	if ( !bUnchanged )
+	{
+		m_camera.m_angles = angles;
+		m_bDirty = true;
+	}
+}
+
+
+// Sets both clip distances; marks the frustum dirty only when at least one of them changes.
+inline void CFrustum::SetCameraNearFarPlanes( float flNear, float flFar )
+{
+	const bool bSameNear = ( m_camera.m_flZNear == flNear );
+	const bool bSameFar = ( m_camera.m_flZFar == flFar );
+	if ( !( bSameNear && bSameFar ) )
+	{
+		m_camera.m_flZNear = flNear;
+		m_camera.m_flZFar = flFar;
+		m_bDirty = true;
+	}
+}
+
+// Sets the near clip distance; marks the frustum dirty only when the value changes.
+inline void CFrustum::SetCameraNearPlane( float flNear )
+{
+	if ( m_camera.m_flZNear != flNear )
+	{
+		m_camera.m_flZNear = flNear;
+		m_bDirty = true;
+	}
+}
+
+// Sets the far clip distance; marks the frustum dirty only when the value changes.
+inline void CFrustum::SetCameraFarPlane( float flFar )
+{
+	if ( m_camera.m_flZFar != flFar )
+	{
+		m_camera.m_flZFar = flFar;
+		m_bDirty = true;
+	}
+}
+
+/// Set the field of view (in degrees); marks the frustum dirty only when the value changes.
+inline void CFrustum::SetCameraFOV( float flFOV )
+{
+	if ( m_camera.m_flFOVX != flFOV )
+	{
+		m_camera.m_flFOVX = flFOV;
+		m_bDirty = true;
+	}
+}
+
+// Sets the view size and recomputes the aspect as width / height. Nothing happens
+// (and the aspect is not recomputed) when both values are unchanged.
+inline void CFrustum::SetCameraWidthHeight( float flWidth, float flHeight )
+{
+	const bool bSameWidth = ( m_camera.m_flWidth == flWidth );
+	const bool bSameHeight = ( m_camera.m_flHeight == flHeight );
+	if ( !( bSameWidth && bSameHeight ) )
+	{
+		m_camera.m_flWidth = flWidth;
+		m_camera.m_flHeight = flHeight;
+		m_camera.m_flAspect = flWidth / flHeight;
+		m_bDirty = true;
+	}
+}
+
+// Sets the view width and recomputes the aspect from the stored height.
+// Nothing happens when the width is unchanged.
+inline void CFrustum::SetCameraWidth( float flWidth )
+{
+	if ( m_camera.m_flWidth != flWidth )
+	{
+		m_camera.m_flWidth = flWidth;
+		m_camera.m_flAspect = flWidth / m_camera.m_flHeight;
+		m_bDirty = true;
+	}
+}
+
+// Sets the view height and recomputes the aspect from the stored width.
+// Nothing happens when the height is unchanged.
+inline void CFrustum::SetCameraHeight( float flHeight )
+{
+	if ( m_camera.m_flHeight != flHeight )
+	{
+		m_camera.m_flHeight = flHeight;
+		m_camera.m_flAspect = m_camera.m_flWidth / flHeight;
+		m_bDirty = true;
+	}
+}
+
+// Sets the aspect directly; marks the frustum dirty only when the value changes.
+inline void CFrustum::SetCameraAspect( float flAspect )
+{
+	if ( m_camera.m_flAspect != flAspect )
+	{
+		m_camera.m_flAspect = flAspect;
+		m_bDirty = true;
+	}
+}
+
+
+// Tests an AABB against the stored frustum struct after translating the box by
+// minus the camera origin.
+inline bool CFrustum::BoundingVolumeIntersectsFrustum( AABB_t const &box ) const
+{
+	const Vector &vCameraOrigin = m_camera.m_origin;
+
+	Vector vRelativeMins = box.m_vMinBounds - vCameraOrigin;
+	Vector vRelativeMaxs = box.m_vMaxBounds - vCameraOrigin;
+	return m_frustumStruct.Intersects( vRelativeMins, vRelativeMaxs );
+}
+
+// Tests a mins/maxes box against the stored frustum struct after translating the box
+// by minus the camera origin.
+inline bool CFrustum::BoundingVolumeIntersectsFrustum( Vector const &mins, Vector const &maxes ) const
+{
+	const Vector &vCameraOrigin = m_camera.m_origin;
+
+	Vector vRelativeMins = mins - vCameraOrigin;
+	Vector vRelativeMaxs = maxes - vCameraOrigin;
+	return m_frustumStruct.Intersects( vRelativeMins, vRelativeMaxs );
+}
+
+// Tests an AABB against the stored frustum struct after translating the box by
+// minus the camera origin and then by minus vOriginShift.
+inline bool CFrustum::BoundingVolumeIntersectsFrustum( AABB_t const &box, Vector &vOriginShift ) const
+{
+	const Vector &vCameraOrigin = m_camera.m_origin;
+
+	Vector vRelativeMins = box.m_vMinBounds - vCameraOrigin - vOriginShift;
+	Vector vRelativeMaxs = box.m_vMaxBounds - vCameraOrigin - vOriginShift;
+	return m_frustumStruct.Intersects( vRelativeMins, vRelativeMaxs );
+}
+
+// Multiplies ( pnt.x, pnt.y, pnt.z, 1 ) by the combined world->projection matrix and
+// returns the resulting 4D vector.
+inline Vector4D CFrustum::TransformPointToHomogenousViewCoordinates( Vector const &pnt ) const
+{
+	const Vector4D vHomogeneousPoint( pnt.x, pnt.y, pnt.z, 1.0 );
+
+	Vector4D vTransformed;
+	m_viewProj.V4Mul( vHomogeneousPoint, vTransformed );
+	return vTransformed;
+}
+
+// View-space forward direction: the negated, normalized Z row of the world->view matrix.
+inline void CFrustum::ViewForward( Vector& vViewForward ) const
+{
+	Vector vAxis;
+	MatrixGetRow( VMatrix( m_worldToView ), Z_AXIS, &vAxis );
+	VectorNormalize( vAxis );
+	vViewForward = -vAxis;
+}
+
+// View-space left direction: the negated, normalized X row of the world->view matrix.
+inline void CFrustum::ViewLeft( Vector& vViewLeft ) const
+{
+	Vector vAxis;
+	MatrixGetRow( VMatrix( m_worldToView ), X_AXIS, &vAxis );
+	VectorNormalize( vAxis );
+
+	const Vector vNegated = -vAxis;
+	vViewLeft = vNegated;
+}
+
+// View-space up direction: the normalized Y row of the world->view matrix (not negated).
+inline void CFrustum::ViewUp(  Vector& vViewUp ) const
+{
+	Vector vAxis;
+	const VMatrix mWorldToView( m_worldToView );
+	MatrixGetRow( mWorldToView, Y_AXIS, &vAxis );
+	VectorNormalize( vAxis );
+	vViewUp = vAxis;
+}
+
+
+// Stores the off-center projection window (clip space corners) and always marks the
+// frustum dirty, even when the values are unchanged.
+inline void CFrustum::SetClipSpaceBounds( float flClipSpaceBottomLeftX, float flClipSpaceBottomLeftY, float flClipSpaceTopRightX, float flClipSpaceTopRightY )
+{
+	// Top right corner
+	m_flClipSpaceTopRightX = flClipSpaceTopRightX;
+	m_flClipSpaceTopRightY = flClipSpaceTopRightY;
+
+	// Bottom left corner
+	m_flClipSpaceBottomLeftX = flClipSpaceBottomLeftX;
+	m_flClipSpaceBottomLeftY = flClipSpaceBottomLeftY;
+
+	m_bDirty = true;
+}
+
+#endif // CAMERA_H
+
--- a/public/mathlib/capsule.h
+++ b/public/mathlib/capsule.h
@@ -0,0 +1,10 @@
+//========= Copyright © Valve Corporation, All rights reserved. ============//
+#ifndef MATHLIB_CAPSULE_HDR
+#define MATHLIB_CAPSULE_HDR
+
+#include "vector.h"
+#include "rubikon/param_types.h"
+
+// Declaration only; the definition is not part of this header.
+// NOTE(review): judging by the signature, this casts the ray (vRayStart, vRayDelta) against a
+// capsule and writes the result to 'out'; vCenter[] presumably holds the two end-point centers
+// of the capsule's inner segment and flRadius its radius - confirm against the implementation.
+void CastCapsuleRay( CShapeCastResult& out, const Vector& vRayStart, const Vector& vRayDelta, const Vector vCenter[], float flRadius );
+
+#endif
--- a/public/mathlib/cholesky.h
+++ b/public/mathlib/cholesky.h
@@ -0,0 +1,70 @@
+//========= Copyright c 1996-2009, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+#ifndef CHOLESKY_HDR
+#define CHOLESKY_HDR
+
+#include "mathlib/vector.h"
+#include "mathlib/mathlib.h"
+
+struct matrix3x4_t;
+
+// Holds the Cholesky decomposition of a 3x3 matrix: the lower-triangular factor L with
+// L * L' = input matrix. Init() is only given the lower triangle of the input
+// (a00, a10, a11, a20, a21, a22), so the input is treated as symmetric.
+struct Cholesky3x3_t
+{
+	// lower diagonal matrix L such that LL' = input matrix
+	float m_00, m_11, m_22; // these aren't used in computations! they're only for debugging and returning "canonical" form (L or R)
+	float m_10, m_20, m_21; // these are off-diagonals used in computations
+	float m_inv00, m_inv11, m_inv22; // these are reciprocals of diagonals used in all computations
+public:
+	/// @group Construction and initialization {
+
+	// Decomposes the 3x3 part of m. The bool returned by Init() is discarded here.
+	Cholesky3x3_t( const matrix3x4_t &m )
+	{
+		Init( m );
+	}
+	// Decomposes the symmetric matrix given by its lower triangle. The bool returned by Init()
+	// is discarded here.
+	Cholesky3x3_t( float a00, float a10, float a11, float a20, float a21, float a22 )
+	{
+		Init( a00, a10, a11, a20, a21, a22 );
+	}
+	// Computes the decomposition from the lower-triangle elements (defined out of line).
+	// NOTE(review): the return value presumably reports whether the decomposition succeeded
+	// (i.e. the input was positive definite) - confirm against the implementation.
+	bool Init( float a00, float a10, float a11, float a20, float a21, float a22 );
+	// Convenience overload: forwards the lower triangle of the 3x3 part of m to the Init() above.
+	bool Init( const matrix3x4_t &m )
+	{
+		return Init( m[0][0], m[1][0], m[1][1], m[2][0], m[2][1], m[2][2] );
+	}
+
+	//@}
+
+	// NOTE(review): presumably write the left (L) and right (R = L') factors into the given
+	// matrix - defined out of line, confirm.
+	void FillLeft( matrix3x4_t & l );
+	void FillRight( matrix3x4_t & r );
+	// NOTE(review): presumably reports whether the stored decomposition is usable - confirm.
+	bool IsValid( );
+
+	// Solve against the right (R) and left (L) factor respectively; see Solve() for how they combine.
+	const Vector SolveRight( const Vector &b );
+	const Vector SolveLeft( const Vector &b );
+
+	// using this decomposition LL', solve the following equation and return the result: LL' x = rhs
+	const Vector Solve( const Vector &rhs )
+	{
+		// L R x =           b
+		//   R x =      L^-1 b
+		//     x = R^-1 L^-1 b
+		return SolveRight( SolveLeft( rhs ) );
+	}
+};
+
+//
+/// Treats m as [A | t], with A the 3x3 part and t the fourth column, and returns the x that
+/// satisfies A*x + t = 0, i.e. M*(x,1) = (0,0,0,1) when M is extended with a (0,0,0,1) row.
+/// A is factored with a Cholesky decomposition, so it is assumed to be symmetric positive
+/// definite.
+//
+inline Vector CholeskySolve( const matrix3x4_t &m )
+{
+	Cholesky3x3_t decomposition( m );
+
+	// Right-hand side is the translation column; the sign flip moves it across the equation.
+	const Vector vTranslation( m.m_flMatVal[0][3], m.m_flMatVal[1][3], m.m_flMatVal[2][3] );
+	return -decomposition.Solve( vTranslation );
+}
+
+
+#endif
--- a/public/mathlib/compressed_3d_unitvec.h
+++ b/public/mathlib/compressed_3d_unitvec.h
@@ -0,0 +1,284 @@
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+#ifndef _3D_UNITVEC_H
+#define _3D_UNITVEC_H
+
+
+// Expands to the out-of-class definitions of cUnitVector's two static data members
+// (mUVAdjustment and mTmpVec). The class below only declares them.
+#define UNITVEC_DECLARE_STATICS \
+   float cUnitVector::mUVAdjustment[0x2000]; \
+   Vector cUnitVector::mTmpVec;
+
+// Bit layout of the 16-bit packed value: 3 sign bits | 6 x bits | 7 y bits.
+
+// upper 3 bits
+#define SIGN_MASK  0xe000
+#define XSIGN_MASK 0x8000
+#define YSIGN_MASK 0x4000
+#define ZSIGN_MASK 0x2000
+
+// middle 6 bits - xbits
+#define TOP_MASK  0x1f80
+
+// lower 7 bits - ybits
+#define BOTTOM_MASK  0x007f
+
+// unitcomp.cpp : A Unit Vector to 16-bit word conversion
+// algorithm based on work of Rafael Baptista (rafael@oroboro.com)
+// Accuracy improved by O.D. (punkfloyd@rocketmail.com)
+// Used with Permission.
+
+// a compressed unit vector. reasonable fidelty for unit
+// vectors in a 16 bit package. Good enough for surface normals
+// we hope.
+class cUnitVector // : public c3dMathObject
+{
+public:
+   // Default: packed value 0 (no sign bits, xbits = ybits = 0).
+   cUnitVector() { mVec = 0; }
+   // Packs the direction of 'vec' (it does not need to be unit length).
+   cUnitVector( const Vector& vec )
+   {
+      packVector( vec );
+   }
+   // Adopts an already packed 16-bit value verbatim.
+   cUnitVector( unsigned short val ) { mVec = val; }
+
+   cUnitVector& operator=( const Vector& vec )
+   { packVector( vec ); return *this; }
+
+   // Unpacks into a local and returns it by value. The conversion used to go
+   // through the shared static mTmpVec, which made it non-reentrant and unsafe
+   // to call from several threads at once; it is now also usable on const objects.
+   operator Vector() const
+   {
+      Vector vec;
+      unpackVector( vec );
+      return vec;
+   }
+
+   // Quantizes the direction of 'vec' into mVec (3 sign bits, 6 x bits, 7 y bits).
+   void packVector( const Vector& vec )
+   {
+      // convert from Vector to cUnitVector
+
+      Assert( vec.IsValid());
+      Vector tmp = vec;
+
+      // input vector does not have to be unit length
+      // Assert( tmp.length() <= 1.001f );
+
+      // Record the signs and continue with the absolute values.
+      mVec = 0;
+      if ( tmp.x < 0 ) { mVec |= XSIGN_MASK; tmp.x = -tmp.x; }
+      if ( tmp.y < 0 ) { mVec |= YSIGN_MASK; tmp.y = -tmp.y; }
+      if ( tmp.z < 0 ) { mVec |= ZSIGN_MASK; tmp.z = -tmp.z; }
+
+      // A zero-length input has no direction and would make the projection
+      // below divide by zero (NaN -> integer conversion is undefined). Leave
+      // mVec at 0 in that case, which unpacks to +Z.
+      float sum = tmp.x + tmp.y + tmp.z;
+      if ( sum <= 0.0f )
+      {
+         return;
+      }
+
+      // project the normal onto the plane that goes through
+      // X0=(1,0,0),Y0=(0,1,0),Z0=(0,0,1).
+      // on that plane we choose an (projective!) coordinate system
+      // such that X0->(0,0), Y0->(126,0), Z0->(0,126),(0,0,0)->Infinity
+
+      // a little slower... old pack was 4 multiplies and 2 adds.
+      // This is 2 multiplies, 2 adds, and a divide....
+      float w = 126.0f / sum;
+      long xbits = (long)( tmp.x * w );
+      long ybits = (long)( tmp.y * w );
+
+      Assert( xbits <  127 );
+      Assert( xbits >= 0   );
+      Assert( ybits <  127 );
+      Assert( ybits >= 0   );
+
+      // Now we can be sure that 0<=xp<=126, 0<=yp<=126, 0<=xp+yp<=126
+      // however for the sampling we want to transform this triangle
+      // into a rectangle.
+      if ( xbits >= 64 )
+      {
+         xbits = 127 - xbits;
+         ybits = 127 - ybits;
+      }
+
+      // now that we have xp in the range [0,63] (6 bits) and yp in
+      // the range [0,127] (7 bits), we can pack all the bits together
+      mVec |= ( xbits << 7 );
+      mVec |= ybits;
+   }
+
+   // Expands mVec back into a unit-length Vector. Requires initializeStatics()
+   // to have filled mUVAdjustment beforehand.
+   void unpackVector( Vector& vec ) const
+   {
+      // if we do a straightforward backward transform
+      // we will get points on the plane X0,Y0,Z0
+      // however we need points on a sphere that goes through
+      // these points. Therefore we need to adjust x,y,z so
+      // that x^2+y^2+z^2=1 by normalizing the vector. We have
+      // already precalculated the amount by which we need to
+      // scale, so all we do is a table lookup and a
+      // multiplication
+
+      // get the x and y bits
+      long xbits = (( mVec & TOP_MASK ) >> 7 );
+      long ybits = ( mVec & BOTTOM_MASK );
+
+      // map the numbers back to the triangle (0,0)-(0,126)-(126,0)
+      if (( xbits + ybits ) >= 127 )
+      {
+         xbits = 127 - xbits;
+         ybits = 127 - ybits;
+      }
+
+      // do the inverse transform and normalization
+      // costs 3 extra multiplies and 2 subtracts. No big deal.
+      float uvadj = mUVAdjustment[mVec & ~SIGN_MASK];
+      vec.x = uvadj * (float) xbits;
+      vec.y = uvadj * (float) ybits;
+      vec.z = uvadj * (float)( 126 - xbits - ybits );
+
+      // set all the sign bits
+      if ( mVec & XSIGN_MASK ) vec.x = -vec.x;
+      if ( mVec & YSIGN_MASK ) vec.y = -vec.y;
+      if ( mVec & ZSIGN_MASK ) vec.z = -vec.z;
+
+      Assert( vec.IsValid());
+   }
+
+   // Fills mUVAdjustment with one normalization factor per 13-bit (xbits, ybits) code.
+   static void initializeStatics()
+   {
+      for ( int idx = 0; idx < 0x2000; idx++ )
+      {
+         long xbits = idx >> 7;
+         long ybits = idx & BOTTOM_MASK;
+
+         // map the numbers back to the triangle (0,0)-(0,126)-(126,0)
+         if (( xbits + ybits ) >= 127 )
+         {
+            xbits = 127 - xbits;
+            ybits = 127 - ybits;
+         }
+
+         // convert to 3D vectors
+         float x = (float)xbits;
+         float y = (float)ybits;
+         float z = (float)( 126 - xbits - ybits );
+		
+         // calculate the amount of normalization required
+         mUVAdjustment[idx] = 1.0f / sqrtf( y*y + z*z + x*x );
+         Assert( _finite( mUVAdjustment[idx]));
+
+         //cerr << mUVAdjustment[idx] << "\t";
+         //if ( xbits == 0 ) cerr << "\n";
+      }
+   }
+
+#if 0
+   void test()
+   {
+      #define TEST_RANGE 4
+      #define TEST_RANDOM 100
+      #define TEST_ANGERROR 1.0
+
+      float maxError = 0;
+      float avgError = 0;
+      int numVecs = 0;
+
+      {for ( int x = -TEST_RANGE; x < TEST_RANGE; x++ )
+      {
+         for ( int y = -TEST_RANGE; y < TEST_RANGE; y++ )
+         {
+            for ( int z = -TEST_RANGE; z < TEST_RANGE; z++ )
+            {
+               if (( x + y + z ) == 0 ) continue;
+
+               Vector vec( (float)x, (float)y, (float)z );
+               Vector vec2;
+
+               vec.normalize();
+               packVector( vec );
+               unpackVector( vec2 );
+
+               float ang = vec.dot( vec2 );
+               ang = (( fabs( ang ) > 0.99999f ) ? 0 : (float)acos(ang));
+
+               if (( ang > TEST_ANGERROR ) | ( !_finite( ang )))
+               {
+                  cerr << "error: " << ang << endl;
+                  cerr << "orig vec:       " << vec.x << ",\t"
+                       << vec.y << ",\t" << vec.z << "\tmVec: "
+                       << mVec << endl;
+                  cerr << "quantized vec2: " << vec2.x
+                       << ",\t" << vec2.y << ",\t"
+                       << vec2.z << endl << endl;
+               }
+               avgError += ang;
+               numVecs++;
+               if ( maxError < ang ) maxError = ang;
+            }
+         }
+      }}
+
+      for ( int w = 0; w < TEST_RANDOM; w++ )
+      {
+         Vector vec( genRandom(), genRandom(), genRandom());
+         Vector vec2;
+         vec.normalize();
+
+         packVector( vec );
+         unpackVector( vec2 );
+
+         float ang =vec.dot( vec2 );
+         ang = (( ang > 0.999f ) ? 0 : (float)acos(ang));
+
+         if (( ang > TEST_ANGERROR ) | ( !_finite( ang )))
+         {
+            cerr << "error: " << ang << endl;
+            cerr << "orig vec:       " << vec.x << ",\t"
+                 << vec.y << ",\t" << vec.z << "\tmVec: "
+                 << mVec << endl;
+            cerr << "quantized vec2: " << vec2.x << ",\t"
+                 << vec2.y << ",\t"
+                 << vec2.z << endl << endl;
+         }
+         avgError += ang;
+         numVecs++;
+         if ( maxError < ang ) maxError = ang;
+      }
+
+      { for ( int x = 0; x < 50; x++ )
+      {
+         Vector vec( (float)x, 25.0f, 0.0f );
+         Vector vec2;
+
+         vec.normalize();
+         packVector( vec );
+         unpackVector( vec2 );
+
+         float ang = vec.dot( vec2 );
+         ang = (( fabs( ang ) > 0.999f ) ? 0 : (float)acos(ang));
+
+         if (( ang > TEST_ANGERROR ) | ( !_finite( ang )))
+         {
+            cerr << "error: " << ang << endl;
+            cerr << "orig vec:       " << vec.x << ",\t"
+                 << vec.y << ",\t" << vec.z << "\tmVec: "
+                 << mVec << endl;
+            cerr << "   quantized vec2: " << vec2.x << ",\t"
+                 << vec2.y << ",\t" << vec2.z << endl << endl;
+         }
+
+         avgError += ang;
+         numVecs++;
+         if ( maxError < ang ) maxError = ang;
+      }}
+
+      cerr << "max angle error: " << maxError
+           << ", average error: " << avgError / numVecs
+           << ", num tested vecs: " << numVecs << endl;
+   }
+
+   friend ostream& operator<< ( ostream& os, const cUnitVector& vec )
+   { os << vec.mVec; return os; }
+#endif
+
+//protected: // !!!!
+
+   unsigned short mVec;                  // packed value: 3 sign bits | 6 x bits | 7 y bits
+   static float mUVAdjustment[0x2000];   // normalization factors, filled by initializeStatics()
+   static Vector mTmpVec;                // no longer used by this class; kept because UNITVEC_DECLARE_STATICS defines it
+};
+
+#endif // _3D_UNITVEC_H
+
+
--- a/public/mathlib/compressed_light_cube.h
+++ b/public/mathlib/compressed_light_cube.h
@@ -0,0 +1,24 @@
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=============================================================================//
+
+#ifndef COMPRESSED_LIGHT_CUBE_H
+#define COMPRESSED_LIGHT_CUBE_H
+#ifdef _WIN32
+#pragma once
+#endif
+
+
+#include "mathlib/mathlib.h"
+
+
+// Six ColorRGBExp32 color entries stored together.
+// NOTE(review): presumably one entry per face/axis direction of a light cube - confirm.
+struct CompressedLightCube
+{
+	DECLARE_BYTESWAP_DATADESC();
+	ColorRGBExp32 m_Color[6];
+};
+
+
+#endif // COMPRESSED_LIGHT_CUBE_H
--- a/public/mathlib/compressed_vector.h
+++ b/public/mathlib/compressed_vector.h
--- a/public/mathlib/disjoint_set_forest.h
+++ b/public/mathlib/disjoint_set_forest.h
@@ -0,0 +1,139 @@
+//========= Copyright © Valve Corporation, All rights reserved. ============//
+
+
+#ifndef MATHLIB_DISJOINT_SET_FOREST_HDR
+#define MATHLIB_DISJOINT_SET_FOREST_HDR
+
+#include "tier1/utlvector.h"
+
+/// An excellent overview of the concept is here:
+/// http://en.wikipedia.org/wiki/Disjoint-set_data_structure
+/// This implementation uses both path compression and union by rank, so finding a node's island
+/// representative element or uniting two sets takes essentially amortized constant time. (The
+/// "essentially" means the amortized complexity is the inverse Ackermann function, which stays
+/// below 5 for any input size that can occur in any kind of software development.)
+
+// Disjoint-set forest over integer node indices. Each node stores its parent index and a rank;
+// a node whose parent is itself is the representative (root) of its set.
+class CDisjointSetForest
+{
+public:
+	// Creates nCount nodes, each in its own set.
+	CDisjointSetForest( int nCount );
+	
+	//void Flatten();
+	// Returns the representative node of the set containing nNode (compresses the path as it goes).
+	int Find( int nNode );
+	// Merges the sets containing nNodeA and nNodeB.
+	void Union( int nNodeA, int nNodeB );
+	// Grows the forest, if needed, so that index nNode is valid; new nodes start as singleton sets.
+	void EnsureExists( int nNode );
+	// Number of nodes currently in the forest.
+	int GetNodeCount()const { return m_Nodes.Count(); }
+protected:
+	struct Node_t
+	{
+		// nRank: rank used by Union() to decide which root becomes the parent.
+		// nParent: index of the parent node; equal to the node's own index for a root.
+		int nRank, nParent;
+	};
+	CUtlVector< Node_t > m_Nodes;
+};
+
+
+inline CDisjointSetForest::CDisjointSetForest( int nCount )
+{
+	// Start with nCount singleton sets: every node is its own parent and has rank 0.
+	m_Nodes.SetCount( nCount );
+	for ( int nNode = 0; nNode < nCount; ++nNode )
+	{
+		Node_t &node = m_Nodes[nNode];
+		node.nRank = 0;
+		node.nParent = nNode;
+	}
+}
+
+
+inline void CDisjointSetForest::EnsureExists( int nNode )
+{
+	const int nFirstNewNode = m_Nodes.Count();
+	if ( nNode < nFirstNewNode )
+	{
+		// Index already valid; nothing to do.
+		return;
+	}
+
+	// Grow while keeping the existing nodes, then make each added node a singleton set.
+	m_Nodes.SetCountNonDestructively( nNode + 1 );
+	for ( int nAdded = nFirstNewNode; nAdded <= nNode; ++nAdded )
+	{
+		Node_t &added = m_Nodes[ nAdded ];
+		added.nRank = 0;
+		added.nParent = nAdded;
+	}
+}
+
+
+/// Find the representative element for the node. The representative is the same for all
+/// connected nodes (vertices) in the graph, and it is itself one of the nodes of that connected
+/// set. Implemented as two iterative passes rather than recursively.
+inline int CDisjointSetForest::Find( int nStartNode )
+{
+	// Pass 1: climb the parent links until reaching the node that is its own parent.
+	int nRoot = nStartNode;
+	while ( m_Nodes[nRoot].nParent != nRoot )
+	{
+		nRoot = m_Nodes[nRoot].nParent;
+	}
+
+	// Pass 2: walk the same path again and point every visited node directly at the root
+	// (path compression).
+	int nNode = nStartNode;
+	int nNext = m_Nodes[nNode].nParent;
+	while ( nNode != nNext )
+	{
+		m_Nodes[nNode].nParent = nRoot;
+		nNode = nNext;
+		nNext = m_Nodes[nNode].nParent;
+	}
+	Assert( nNext == nRoot );
+	return nRoot;
+}
+
+
+/// Merge the sets containing the two nodes (no-op when they already share a set)
+inline void CDisjointSetForest::Union( int nNodeA, int nNodeB )
+{
+	const int nRootA = Find( nNodeA );
+	const int nRootB = Find( nNodeB );
+	if ( nRootA == nRootB )
+	{
+		// Already in the same set.
+		return;
+	}
+
+	Node_t &rootA = m_Nodes[nRootA];
+	Node_t &rootB = m_Nodes[nRootB];
+	if ( rootA.nRank < rootB.nRank )
+	{
+		// B's tree has the higher rank: hang A under it, ranks stay as they are.
+		rootA.nParent = nRootB;
+		return;
+	}
+
+	// A's rank is higher or equal: hang B under A.
+	rootB.nParent = nRootA;
+	if ( rootA.nRank == rootB.nRank )
+	{
+		// Equal ranks: the merged tree's rank goes up by one.
+		rootA.nRank = rootA.nRank + 1;
+	}
+}
+
+
+/// Given a graph that implements GetParent(), set the bit of every bone that belongs to the
+/// subtree rooted at nSubtreeTipBone (the tip itself included). Bits in pSubtree are only ever
+/// set here, never cleared.
+template <typename Graph_t, class BitVec_t>
+inline void ComputeSubtree( const Graph_t *pGraph, int nSubtreeTipBone, BitVec_t *pSubtree )
+{
+	int nBoneCount = pSubtree->GetNumBits();
+	Assert( nSubtreeTipBone >= 0 && nSubtreeTipBone < nBoneCount );
+
+	// Join every bone with its parent, except for the tip bone.
+	CDisjointSetForest forest( nBoneCount );
+	for ( int nBone = 0; nBone < nBoneCount; ++nBone )
+	{
+		if ( nBone == nSubtreeTipBone )
+		{
+			// Important: sever the link between the subtree tip bone and the rest of the tree to find the disjoint subtree
+			continue;
+		}
+
+		int nParent = pGraph->GetParent( nBone );
+		if ( nParent < 0 || nParent >= nBoneCount )
+		{
+			continue;
+		}
+		forest.Union( nBone, nParent );
+	}
+
+	// Everything that ended up in the tip bone's set is part of the subtree.
+	int nTipIsland = forest.Find( nSubtreeTipBone );
+	for ( int nBone = 0; nBone < nBoneCount; ++nBone )
+	{
+		if ( forest.Find( nBone ) != nTipIsland )
+		{
+			continue;
+		}
+		pSubtree->Set( nBone );
+	}
+}
+
+
+#endif //MATHLIB_DISJOINT_SET_FOREST_HDR
+
--- a/public/mathlib/dynamictree.h
+++ b/public/mathlib/dynamictree.h
@@ -0,0 +1,177 @@
+//========= Copyright © Valve Corporation, All rights reserved. ============//
+#ifndef DYNAMIC_TREE_HDR
+#define DYNAMIC_TREE_HDR
+
+#include "vector.h"
+#include "aabb.h"
+
+#include "utlvector.h"
+
+// This class implements a dynamic AABB tree and allows node insertion, removal and updates.
+// The user needs to provide an AABB on construction and will receive a unique identifier
+// to update and remove the node later. On construction you can also associate some arbitrary
+// user data. The tree is built using the SAH and uses AVL rotations for balancing. Nodes are
+// managed in a free list and no pointers are returned. This allows for fast allocations and
+// we can still grow the tree. Note that no memory is released until destruction.
+
+// If you have a large number of proxies and all are moving each frame it is recommended to 
+// insert an inflated AABB and only trigger an update if the original AABB moved out of the
+// fat AABB in the tree. This approach is very successfully used in the Rubikon broadphase.
+
+// Casting:
+// Ray, sphere and box casting uses a simple callback mechanism. 
+// The callback signature is: float f( pUserData, vRayStart, vRayDelta, flBestT );
+// The callback mechanism allows us to implement any-, closest-, and
+// all hit(s) queries in one function. We expect from the client to 
+// return the following for this to work:
+// Any: t = 0 (if hit something)
+// Closest: 0 < t < 1
+// All: t = 1 (always)
+
+// Queries:
+// * The dynamic tree supports sphere and box queries to find all proxies intersected by
+//   the specified volume.
+// * The dynamic tree supports 'closest proxy' queries
+
+
+//--------------------------------------------------------------------------------------------------
+// Proxy vector
+//--------------------------------------------------------------------------------------------------
+// List of proxy ids, used as the output container of CDynamicTree::Query() and ClosestProxies().
+// NOTE(review): CUtlVectorFixedGrowable< int32, 512 > presumably keeps the first 512 ids in
+// fixed storage and only allocates beyond that - confirm against utlvector.h.
+typedef CUtlVectorFixedGrowable< int32, 512 > CProxyVector;
+
+
+//--------------------------------------------------------------------------------------------------
+// Dynamic tree
+//--------------------------------------------------------------------------------------------------
+// Dynamic AABB tree. Proxies are identified by int32 ids, and nodes live in an index-addressed
+// pool (CNodePool) rather than being handed out as pointers.
+class CDynamicTree
+{
+public:
+	// Construction 
+	CDynamicTree();
+
+	// Proxy interface
+	int ProxyCount() const;
+	// Adds a proxy with the given bounds and optional user data; returns its id.
+	int32 CreateProxy( const AABB_t& bounds, void* pUserData = NULL );
+	// Removes the proxy. NOTE(review): the returned void* is presumably the proxy's user data - confirm.
+	void* DestroyProxy( int32 nProxyId );
+	// Updates the bounds of an existing proxy.
+	void MoveProxy( int32 nProxyId, const AABB_t& bounds );
+
+	// Accessors for the data stored with a proxy; both debug-assert that the id refers to a leaf.
+	void* GetUserData( int32 nProxyId ) const;
+	AABB_t GetBounds( int32 nProxyId ) const;
+
+	// Casting
+	// For every leaf that is not culled, the functor is invoked as:
+	//   CastRay:    callback( pUserData, vRayStart, vRayDelta, flBestT )
+	//   CastSphere: callback( pUserData, vRayStart, vRayDelta, flRadius, flBestT )
+	//   CastBox:    callback( pUserData, vRayStart, vRayDelta, vExtent, flBestT )
+	// The returned value lowers flBestT (via fpmin); returning exactly 0 ends the traversal.
+	template < typename Functor >
+	void CastRay( const Vector& vRayStart, const Vector &vRayDelta, Functor& callback ) const;
+	template< typename Functor >
+	void CastSphere( const Vector& vRayStart, const Vector& vRayDelta, float flRadius, Functor& callback ) const;
+	template< typename Functor >
+	void CastBox( const Vector& vRayStart, const Vector& vRayDelta, const Vector& vExtent, Functor& callback ) const;
+	
+	// Queries
+	// Collect the proxies intersected by a box, respectively by a sphere (center + radius).
+	void Query( CProxyVector& proxies, const AABB_t& aabb ) const;
+	void Query( CProxyVector& proxies, const Vector& vCenter, float flRadius ) const;
+
+    // Returns the distance to the closest proxy; FLT_MAX if no proxies were
+    // found (i.e. your tree is empty)
+    float ClosestProxies( CProxyVector& proxies, const Vector &vQuery ) const;
+
+private:
+	// Implementation
+	enum 
+	{ 
+		NULL_NODE = -1,		// "no node" marker for node indices (root, children)
+		STACK_DEPTH = 64	// capacity of the explicit traversal stack used by the Cast*() functions
+	};
+		
+
+	void InsertLeaf( int32 nLeaf );
+	void RemoveLeaf( int32 nLeaf );
+
+	void AdjustAncestors( int32 nNode );
+	int32 Balance( int32 nNode );
+
+	// Ray in origin + delta form with a precomputed per-axis reciprocal of the delta.
+	struct Ray_t
+	{
+		// Leaves all members uninitialized.
+		Ray_t() {}
+
+		Ray_t( const Vector& vStart, const Vector& vEnd ) 
+		{
+			vOrigin = vStart;
+			vDelta = vEnd - vStart;
+
+			// Pre-compute inverse
+			// (an axis with zero delta gets FLT_MAX instead of a division by zero)
+			vDeltaInv.x = vDelta.x != 0.0f ? 1.0f / vDelta.x : FLT_MAX;
+			vDeltaInv.y = vDelta.y != 0.0f ? 1.0f / vDelta.y : FLT_MAX;
+			vDeltaInv.z = vDelta.z != 0.0f ? 1.0f / vDelta.z : FLT_MAX;
+		}
+
+		Vector vOrigin;
+		Vector vDelta;
+		Vector vDeltaInv;
+	};
+
+	// Return a copy of aabb grown by the extent on every side (uniformly, or per axis).
+	AABB_t Inflate( const AABB_t& aabb, float flExtent ) const;
+	AABB_t Inflate( const AABB_t& aabb, const Vector& vExtent ) const;
+	// Narrows the parametric interval [flMinT, flMaxT] of the ray to the part inside aabb.
+	void ClipRay( const Ray_t& ray, const AABB_t& aabb, float& flMinT, float& flMaxT ) const;
+	
+	
+
+	struct Node_t
+	{
+		AABB_t m_Bounds;
+		int32 m_nHeight;
+		int32 m_nParent;
+		int32 m_nChild1;
+		int32 m_nChild2;
+		void* m_pUserData;
+
+		// A node counts as a leaf when it has no first child.
+		FORCEINLINE bool IsLeaf() const	
+		{
+			return m_nChild1 == NULL_NODE;
+		}
+	};
+
+	// Index-addressed storage for Node_t objects.
+	class CNodePool
+	{
+	public:
+		// Construction / Destruction
+		CNodePool();
+		~CNodePool();
+
+		// Memory management
+		void Clear();
+		void Reserve( int nCapacity );
+
+		int32 Alloc();
+		void Free( int32 id );
+
+		// Accessors
+		// Both overloads debug-assert that id lies in [0, m_nCapacity).
+		FORCEINLINE Node_t& operator[]( int32 id )
+		{
+			AssertDbg( 0 <= id && id < m_nCapacity );
+			return m_pObjects[ id ];
+		}
+
+		FORCEINLINE const Node_t& operator[]( int32 id ) const
+		{
+			AssertDbg( 0 <= id && id < m_nCapacity );
+			return m_pObjects[ id ];
+		}
+
+	private:
+		int m_nCapacity;
+		Node_t* m_pObjects;
+		// NOTE(review): presumably the head of the free list of unused nodes - confirm in the implementation.
+		int32 m_nNext;
+	};
+
+	// Data members
+	int32 m_nRoot;	// index of the root node; the Cast*() functions treat a negative value as "empty tree"
+
+	int m_nProxyCount;
+	CNodePool m_NodePool;
+
+};
+
+
+#include "dynamictree.inl"
+
+#endif
--- a/public/mathlib/dynamictree.inl
+++ b/public/mathlib/dynamictree.inl
@@ -0,0 +1,206 @@
+//===================== Copyright (c) Valve Corporation. All Rights Reserved. ======================
+
+
+//--------------------------------------------------------------------------------------------------
+// Dynamic tree
+//--------------------------------------------------------------------------------------------------
+FORCEINLINE void* CDynamicTree::GetUserData( int32 nProxyId ) const
+{
+	// Proxy ids index leaf nodes of the pool; hand back the pointer stored with that leaf.
+	const Node_t& proxyNode = m_NodePool[ nProxyId ];
+	AssertDbg( proxyNode.IsLeaf() );
+	return proxyNode.m_pUserData;
+}
+
+
+//--------------------------------------------------------------------------------------------------
+FORCEINLINE AABB_t CDynamicTree::GetBounds( int32 nProxyId ) const
+{
+	// Returns a copy of the bounds stored in the proxy's leaf node.
+	const Node_t& proxyNode = m_NodePool[ nProxyId ];
+	AssertDbg( proxyNode.IsLeaf() );
+	return proxyNode.m_Bounds;
+}
+
+
+//-------------------------------------------------------------------------------------------------
+FORCEINLINE AABB_t CDynamicTree::Inflate( const AABB_t& aabb, float flExtent ) const
+{
+	// Grow the box by the same amount along every axis, in both directions.
+	const Vector vPadding( flExtent, flExtent, flExtent );
+
+	AABB_t inflated;
+	inflated.m_vMaxBounds = aabb.m_vMaxBounds + vPadding;
+	inflated.m_vMinBounds = aabb.m_vMinBounds - vPadding;
+	return inflated;
+}
+
+
+//-------------------------------------------------------------------------------------------------
+FORCEINLINE AABB_t CDynamicTree::Inflate( const AABB_t& aabb, const Vector& vExtent ) const
+{
+	// Grow the box by vExtent per axis: the maximum moves out by +vExtent, the minimum by -vExtent.
+	AABB_t inflated;
+	inflated.m_vMaxBounds = aabb.m_vMaxBounds + vExtent;
+	inflated.m_vMinBounds = aabb.m_vMinBounds - vExtent;
+	return inflated;
+}
+
+
+//--------------------------------------------------------------------------------------------------
+FORCEINLINE void CDynamicTree::ClipRay( const Ray_t& ray, const AABB_t& aabb, float& flMinT, float& flMaxT ) const
+{
+	// Slab test: for each axis, intersect the incoming [flMinT, flMaxT] interval with the
+	// parameter range in which the ray lies between the two bounding planes of that axis.
+	for ( int i = 0; i < 3; ++i )
+	{
+		const float flToMinPlane = ( aabb.m_vMinBounds[ i ] - ray.vOrigin[ i ] ) * ray.vDeltaInv[ i ];
+		const float flToMaxPlane = ( aabb.m_vMaxBounds[ i ] - ray.vOrigin[ i ] ) * ray.vDeltaInv[ i ];
+
+		// Which plane is entered first depends on the sign of the ray direction.
+		const float flEnter = fpmin( flToMinPlane, flToMaxPlane );
+		const float flExit = fpmax( flToMinPlane, flToMaxPlane );
+
+		flMinT = fpmax( flMinT, flEnter );
+		flMaxT = fpmin( flMaxT, flExit );
+	}
+}
+
+
+//--------------------------------------------------------------------------------------------------
+template < typename Functor > void CDynamicTree::CastRay( const Vector& vRayStart, const Vector& vRayDelta, Functor& callback ) const
+{
+	// Nothing to traverse when the tree has no root.
+	if ( m_nRoot < 0 )
+	{
+		AssertDbg( m_nRoot == NULL_NODE );
+		return;
+	}
+
+	const Ray_t ray( vRayStart, vRayStart + vRayDelta );
+	float flClosestT = 1.0f;
+
+	// Explicit traversal stack, seeded with the root.
+	int32 pending[ STACK_DEPTH ];
+	int nPending = 0;
+	pending[ nPending++ ] = m_nRoot;
+
+	while ( nPending > 0 )
+	{
+		const int32 nCurrent = pending[ --nPending ];
+		const Node_t& current = m_NodePool[ nCurrent ];
+
+		// Clip the [0,1] ray interval against the node bounds; skip the node when the
+		// interval is empty or starts beyond the best result so far.
+		float flEnterT = 0.0f;
+		float flExitT = 1.0f;
+		ClipRay( ray, current.m_Bounds, flEnterT, flExitT );
+		if ( flEnterT > flExitT || flEnterT > flClosestT )
+		{
+			continue;
+		}
+
+		if ( current.IsLeaf() )
+		{
+			// Let the client test the proxy; its answer can only shorten the ray.
+			const float flHitT = callback( GetUserData( nCurrent ), vRayStart, vRayDelta, flClosestT );
+			flClosestT = fpmin( flHitT, flClosestT );
+
+			if ( flHitT == 0.0f )
+			{
+				// The user terminated the query.
+				return;
+			}
+			continue;
+		}
+
+		// Inner node: push child 2 first so that child 1 is visited first.
+		AssertDbg( nPending + 2 <= STACK_DEPTH );
+		pending[ nPending++ ] = current.m_nChild2;
+		pending[ nPending++ ] = current.m_nChild1;
+	}
+}
+
+
+//--------------------------------------------------------------------------------------------------
+template< typename Functor > void CDynamicTree::CastSphere( const Vector& vRayStart, const Vector& vRayDelta, float flRadius, Functor& callback ) const
+{
+	// Nothing to traverse when the tree has no root.
+	if ( m_nRoot < 0 )
+	{
+		AssertDbg( m_nRoot == NULL_NODE );
+		return;
+	}
+
+	const Ray_t ray( vRayStart, vRayStart + vRayDelta );
+	float flClosestT = 1.0f;
+
+	// Explicit traversal stack, seeded with the root.
+	int32 pending[ STACK_DEPTH ];
+	int nPending = 0;
+	pending[ nPending++ ] = m_nRoot;
+
+	while ( nPending > 0 )
+	{
+		const int32 nCurrent = pending[ --nPending ];
+		const Node_t& current = m_NodePool[ nCurrent ];
+
+		// The swept sphere is handled as a ray against the node bounds grown by the radius.
+		// Skip the node when the clipped interval is empty or starts beyond the best result.
+		float flEnterT = 0.0f;
+		float flExitT = 1.0f;
+		ClipRay( ray, Inflate( current.m_Bounds, flRadius ), flEnterT, flExitT );
+		if ( flEnterT > flExitT || flEnterT > flClosestT )
+		{
+			continue;
+		}
+
+		if ( current.IsLeaf() )
+		{
+			// Let the client test the proxy; its answer can only shorten the cast.
+			const float flHitT = callback( GetUserData( nCurrent ), vRayStart, vRayDelta, flRadius, flClosestT );
+			flClosestT = fpmin( flHitT, flClosestT );
+
+			if ( flHitT == 0.0f )
+			{
+				// The user terminated the query
+				return;
+			}
+			continue;
+		}
+
+		// Inner node: push child 2 first so that child 1 is visited first.
+		AssertDbg( nPending + 2 <= STACK_DEPTH );
+		pending[ nPending++ ] = current.m_nChild2;
+		pending[ nPending++ ] = current.m_nChild1;
+	}
+}
+
+
+//--------------------------------------------------------------------------------------------------
+template< typename Functor > void CDynamicTree::CastBox( const Vector& vRayStart, const Vector& vRayDelta, const Vector& vExtent, Functor& callback ) const
+{
+	// Nothing to traverse when the tree has no root.
+	if ( m_nRoot < 0 )
+	{
+		AssertDbg( m_nRoot == NULL_NODE );
+		return;
+	}
+
+	const Ray_t ray( vRayStart, vRayStart + vRayDelta );
+	float flClosestT = 1.0f;
+
+	// Explicit traversal stack, seeded with the root.
+	int32 pending[ STACK_DEPTH ];
+	int nPending = 0;
+	pending[ nPending++ ] = m_nRoot;
+
+	while ( nPending > 0 )
+	{
+		const int32 nCurrent = pending[ --nPending ];
+		const Node_t& current = m_NodePool[ nCurrent ];
+
+		// The swept box is handled as a ray against the node bounds grown by the box extent.
+		// Skip the node when the clipped interval is empty or starts beyond the best result.
+		float flEnterT = 0.0f;
+		float flExitT = 1.0f;
+		ClipRay( ray, Inflate( current.m_Bounds, vExtent ), flEnterT, flExitT );
+		if ( flEnterT > flExitT || flEnterT > flClosestT )
+		{
+			continue;
+		}
+
+		if ( current.IsLeaf() )
+		{
+			// Let the client test the proxy; its answer can only shorten the cast.
+			const float flHitT = callback( GetUserData( nCurrent ), vRayStart, vRayDelta, vExtent, flClosestT );
+			flClosestT = fpmin( flHitT, flClosestT );
+
+			if ( flHitT == 0.0f )
+			{
+				// The user terminated the query
+				return;
+			}
+			continue;
+		}
+
+		// Inner node: push child 2 first so that child 1 is visited first.
+		AssertDbg( nPending + 2 <= STACK_DEPTH );
+		pending[ nPending++ ] = current.m_nChild2;
+		pending[ nPending++ ] = current.m_nChild1;
+	}
+}
--- a/public/mathlib/eigen.h
+++ b/public/mathlib/eigen.h
@@ -0,0 +1,26 @@
+//========= Copyright c 1996-2009, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: anything remotely related to eigen-[problem,solver,value,vector, whatever] goes here
+//
+// $NoKeywords: $
+//=============================================================================//
+
+#ifndef EIGEN_H
+#define EIGEN_H
+
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+// NOTE(review): defined elsewhere; presumably diagonalizes the symmetric matrix A, returning the
+// eigenvector basis as a quaternion and the eigenvalues in d - confirm against the implementation.
+extern Quaternion Diagonalizer( const matrix3x4_t &A, Vector &d );
+
+// Routines which use Diagonalizer() to compute tight oriented bounding boxes for point sets
+
+// Computes the mean point of a set of points, used by ComputeCovarianceMatrix
+extern Vector ComputeMeanPoint( const Vector *pPointList, int nPointCount );
+// Computes a covariance matrix for a set of points which measures spatial dispersion of the points against the mean of the points, the covariance matrix is symmetric and suitable for use in Diagonalizer()
+extern void ComputeCovarianceMatrix( matrix3x4_t &covarianceMatrix, const Vector *pPointList, int nPointCount );
+// Computes the center and scale using qEigenVectors as the orientation to transform a unit cube at the origin to contain the specified point list, calls ComputeCovarianceMatrix(), Diagonalizer()
+extern void ComputeExtents( Vector &vCenter, Vector &vScale, const Quaternion &qEigenVectors, const Vector *pPointList, int nPointCount );
+// Wraps up all of the above calls to compute the matrix to transform a unit cube at the origin to contain the specified point list
+extern void ComputeBoundingBoxMatrix( matrix3x4_t &boundingBoxMatrix, const Vector *pPointList, int nPointCount );
+
+#endif
--- a/public/mathlib/expressioncalculator.h
+++ b/public/mathlib/expressioncalculator.h
@@ -0,0 +1,83 @@
+//====== Copyright c 1996-2008, Valve Corporation, All rights reserved. =======
+
+#ifndef MATHLIB_EXPRESSION_CALCULATOR_H
+#define MATHLIB_EXPRESSION_CALCULATOR_H
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include "tier1/utlstring.h"
+#include "tier1/utlstack.h"
+#include "tier1/utlvector.h"
+
+
+//-----------------------------------------------------------------------------
+// Calculator Parsing class
+// precedence order:
+//		unary operators: + - ! func var
+//		* / %
+//		+ -
+//		< > <= >=
+//		== !=
+//		&&
+//		||
+//		?:
+//-----------------------------------------------------------------------------
+// Parses and evaluates an arithmetic/logical expression held as a string, with named float
+// variables. The Parse*() members form one level each of the precedence ladder.
+class CExpressionCalculator
+{
+public:
+	// Stores the expression text. m_bIsBuildingArgumentList is explicitly initialized here:
+	// it used to be left indeterminate by this constructor, so any code reading it before
+	// something else assigned it would have read an uninitialized bool.
+	CExpressionCalculator( const char *expr = NULL ) : m_expr( expr ), m_bIsBuildingArgumentList( false ) {}
+
+	// Copy operations are defined out of line.
+	CExpressionCalculator( const CExpressionCalculator& x );
+	CExpressionCalculator& operator=( const CExpressionCalculator& x );
+	
+public:
+	// Replaces the expression text; variables that were already set are not touched here.
+	void SetExpression( const char *expr ) 
+	{
+		m_expr = expr;
+	}
+
+	// Assign a value to a variable, addressed by name or by index.
+	void SetVariable( const char *var, float value );
+	void SetVariable( int nVariableIndex, float value );
+	// NOTE(review): presumably adjusts the current value of an existing variable - confirm.
+	void ModifyVariable( const char *var, float value );
+
+	// NOTE(review): presumably returns a negative index when the name is unknown - confirm.
+	int FindVariableIndex( const char *var );
+
+	// Evaluates the expression into 'value'.
+	// NOTE(review): the bool result presumably reports whether parsing/evaluation succeeded - confirm.
+	bool Evaluate( float &value );
+
+	// Builds a list of variable names from the expression
+	bool BuildVariableListFromExpression( );
+
+	// Iterate over variables
+	int VariableCount();
+	const char *VariableName( int nIndex );
+
+private:
+	// Recursive-descent style parsers; each takes the parse cursor by reference.
+	bool ParseExpr		 ( const char *&expr );
+	bool ParseConditional( const char *&expr );
+	bool ParseOr		 ( const char *&expr );
+	bool ParseAnd		 ( const char *&expr );
+	bool ParseEquality	 ( const char *&expr );
+	bool ParseLessGreater( const char *&expr );
+	bool ParseAddSub	 ( const char *&expr );
+	bool ParseDivMul	 ( const char *&expr );
+	bool ParseUnary		 ( const char *&expr );
+	bool ParsePrimary	 ( const char *&expr );
+	bool Parse1ArgFunc	 ( const char *&expr );
+	bool Parse2ArgFunc	 ( const char *&expr );
+	bool Parse3ArgFunc	 ( const char *&expr );
+	//	bool Parse4ArgFunc	 ( const char *&expr );
+	bool Parse5ArgFunc	 ( const char *&expr );
+
+	CUtlString m_expr;						// expression text
+	CUtlVector< CUtlString > m_varNames;	// variable names
+	CUtlVector<float> m_varValues;			// variable values
+	CUtlStack<float> m_stack;				// evaluation stack
+	bool m_bIsBuildingArgumentList;			// false by default; see BuildVariableListFromExpression()
+};
+
+// Simple wrapper for using CExpressionCalculator: evaluates pExprString in one call.
+// NOTE(review): flValueToReturnIfFailure is presumably returned when evaluation fails - the
+// definition is not visible here, confirm.
+float EvaluateExpression( char const *pExprString, float flValueToReturnIfFailure );
+
+
+#endif // MATHLIB_EXPRESSION_CALCULATOR_H
--- a/public/mathlib/feagglomerator.h
+++ b/public/mathlib/feagglomerator.h
@@ -0,0 +1,152 @@
+//========= Copyright © Valve Corporation, All rights reserved. ============//
+//
+// Agglomerative clustering algorithm variant suitable for FE deformable body collision detection
+// Clusters given set of points, with given connectivity, bottom-up (agglomerative clustering, see http://en.wikipedia.org/wiki/Hierarchical_clustering).
+// 
+// All connected elements are first merged into a few big clusters. When there's no more connectivity left, brute-force N^3 agglomerative clustering algorithm 
+// merges the remaining few elements into a large tree. This formulation is especially invented to work with FeModelBuilder : the bigger clusters correspond
+// to multiple disconnected pieces of cloth that can move about freely without destroying their respective cluster spatial coherence. Within those clusters,
+// the nodes correspond to connected pieces of cloth, so that the sub-clusters do not expand uncontrollably during simulation.
+//
+// All in all, it's much simpler and hopefully faster than the classical agglomerative clustering, because it takes domain information into account
+//
+#ifndef FE_AGGLOMERATOR_HDR
+#define FE_AGGLOMERATOR_HDR
+
+
+////////////////////////////////////////////////////////////////////////////
+//
+// Agglomerative clustering can be solved in N^3. If we only limit ourselves to m links to check, it may be faster
+// We can use an unordered list/vector to store connectivity or distances between clusters, or we can use a small tree sorted by Distance 
+// We can also keep all clusters in a heap, to make searching the "best" cluster O(1) and deleting clusters O(lnN)
+//
+// To maintain links between clusters, if we keep them in heaps, we can just lazily delete outdated links to clusters that have been agglomerated into bigger clusters
+// We can simply insert new links as we go, and when we need to find the "Cluster closest to given cluster", we'll just remove outdated links from the top of the heap until we find a non-outdated link
+//
+// To maintain the priority queue of all clusters (sorted by the "closest cluster distance" property), we can, again, lazily delete agglomerated clusters as they come to the top of the heap. 
+//
+// This way, we can improve O( N^2m + Nm^2 ) algorithm (with lists) to O( N m ln m ) (with heaps)
+// To simplify, we can use only the global heap of clusters to find the best cluster in O(1) instead of O(N), and unordered list for the links between clusters, making it O( Nm^2 )
+// for cloth, typical N~= 400, m ~= 8..40, so the difference between O(N m lnm ) and O(Nm^2) is a factor of 4..10
+//
+// NOTE: it's much cleaner to use RB trees (or something like that) in place of heaps above. Lazy-delete nodes in heaps will mean the same cluster will be in the heap in multiple places, and we need to keep track of outdated entries
+// Also, it's cleaner (but slower) to use heaps and update them (by percolating the corresponding elements when clusters change connectivity)
+//
+
+
+#include "tier1/utlpriorityqueue.h"
+#include "mathlib/aabb.h"
+
+
+
+// Bottom-up clustering of nodes into a binary tree of CCluster objects.
+// Leaves are registered with SetNode(), connectivity with LinkNodes(), and the tree is produced by Build().
+class CFeAgglomerator
+{
+public:
+	CFeAgglomerator( uint nReserveNodes );
+	~CFeAgglomerator();
+
+	enum ConstEnum_t { INVALID_INDEX = -1 };
+
+	class CCluster;
+
+	// A connection from one cluster to another, together with its cost
+	class CLink
+	{
+	public:
+		CCluster *m_pOtherCluster;
+		// the metric of cost of this node, proportional to the probability of collision with a feature-sized object
+		float m_flCost; 
+	};
+
+	// Ordering functor for the per-cluster link queue: lhs compares "less" when its cost is the greater one
+	struct LinkLessFunc_t
+	{
+		bool operator()( const CLink &lhs, const CLink &rhs, bool( *lessFuncPtr )( CLink const&, CLink const& ) )
+		{
+			// the function pointer argument is not used
+			return rhs.m_flCost < lhs.m_flCost;
+
+			// Disabled alternative that broke ties by cluster pointer:
+			//	if ( lhs.m_flDistance > rhs.m_flDistance )
+			//		return true;
+			//
+			//	if ( lhs.m_flDistance == rhs.m_flDistance )
+			//		return lhs.m_pOtherCluster < rhs.m_pOtherCluster;
+			//
+			//	return false;
+		}
+	};
+
+	typedef CUtlPriorityQueue< CLink, LinkLessFunc_t >ClusterLinkQueue_t;
+
+	// A node of the cluster tree, with its own queue of links to other clusters
+	class CCluster
+	{
+	public:
+		int m_nIndex;
+		int m_nPriorityIndex;	// written by ClusterSetIndexFunc_t::SetIndex
+		int m_nChildLeafs;
+		
+		AABB_t m_Aabb;
+		int m_nParent;
+		int m_nChild[ 2 ];
+		ClusterLinkQueue_t m_Links;
+
+	public:
+		CCluster( int nIndex, int nLeafCount );
+
+		bool HasLinks()const;
+		float GetBestCost()const;
+		void RemoveLink( CCluster *pOtherCluster );
+		const CLink *FindLink( CCluster *pOtherCluster );
+		float ComputeCost( const Vector &vSize, int nChildLeafs );
+		void AddLink( CCluster *pOtherCluster );
+	};
+
+	// Ordering functor for the global cluster queue: compares clusters by GetBestCost(), greater cost is "less"
+	struct ClusterLessFunc_t
+	{
+		bool operator()( const CCluster *pLeft, const CCluster *pRight, bool( lessFuncPtr )( CCluster*const&, CCluster*const& ) )
+		{
+			// the function argument is not used
+			return pRight->GetBestCost() < pLeft->GetBestCost();
+		}
+	};
+
+	// Index-tracking hook for the cluster queue: records the new heap slot inside the cluster itself
+	struct ClusterSetIndexFunc_t
+	{
+		inline static void SetIndex( CCluster* heapElement, int nNewIndex )
+		{
+			heapElement->m_nPriorityIndex = nNewIndex;
+		}
+	};
+
+	// Number of entries in m_Clusters
+	int GetClusterCount() const
+	{
+		return m_Clusters.Count();
+	}
+
+	// Cluster stored at nIndex
+	CCluster *GetCluster( int nIndex ) const
+	{
+		return m_Clusters[ nIndex ];
+	}
+public:
+	// Creates the leaf cluster for node nIndex on first use (leaf count 1), then sets its AABB to the single point 'origin'.
+	// Indexes m_Clusters directly, so the slot for nIndex must already exist.
+	CCluster* SetNode( int nIndex, const Vector &origin )
+	{
+		CCluster *&pSlot = m_Clusters[ nIndex ];
+		if ( !pSlot )
+		{
+			pSlot = new CCluster( nIndex, 1 );
+		}
+		pSlot->m_Aabb.SetToPoint( origin );
+		return pSlot;
+	}
+
+	typedef CUtlPriorityQueue< CCluster*, ClusterLessFunc_t, CUtlMemory< CCluster* >, ClusterSetIndexFunc_t > ClustersPriorityQueue_t;
+
+	void LinkNodes( int nNode0, int nNode1 ); // call before building priority queue
+
+	void Build( bool bSingleRoot );
+
+protected:
+	void Process( ClustersPriorityQueue_t &queue );
+	void Validate( ClustersPriorityQueue_t *pQueue  = NULL );
+	void AddLink( CCluster* pCluster0, CCluster *pCluster1, ClustersPriorityQueue_t &queue );
+protected:
+	// the first N clusters are the original nodes that have no children, 
+	// the next N-1 (or less) clusters are the parents
+	CUtlVector< CCluster* > m_Clusters;
+};
+
+
+#endif //FE_AGGLOMERATOR_HDR
--- a/public/mathlib/femodel.h
+++ b/public/mathlib/femodel.h
--- a/public/mathlib/femodel.inl
+++ b/public/mathlib/femodel.inl
@@ -0,0 +1,230 @@
+//----------------------------------------------------------------------------------------------------------
+// Gathers the four positions POS[ IDX[ 0..3 ] ] and transposes them into V:
+// V.x = ( p0.x, p1.x, p2.x, p3.x ), V.y and V.z likewise; the last lane of each position is not used.
+// The macro expands to a bare { } block and pastes its arguments unparenthesized.
+#define LOAD_NODES_POS( POS, V, IDX ) { \
+	fltx4 _pos0 = POS[ IDX[ 0 ] ]; \
+	fltx4 _pos1 = POS[ IDX[ 1 ] ]; \
+	fltx4 _pos2 = POS[ IDX[ 2 ] ]; \
+	fltx4 _pos3 = POS[ IDX[ 3 ] ]; \
+	__m128 _xy01 = _mm_shuffle_ps( ( _pos0 ), ( _pos1 ), 0x44 ); /* p0.x p0.y p1.x p1.y */ \
+	__m128 _xy23 = _mm_shuffle_ps( ( _pos2 ), ( _pos3 ), 0x44 ); /* p2.x p2.y p3.x p3.y */ \
+	__m128 _zw01 = _mm_shuffle_ps( ( _pos0 ), ( _pos1 ), 0xEE ); /* p0.z p0.w p1.z p1.w */ \
+	__m128 _zw23 = _mm_shuffle_ps( ( _pos2 ), ( _pos3 ), 0xEE ); /* p2.z p2.w p3.z p3.w */ \
+	V.x = _mm_shuffle_ps( _xy01, _xy23, 0x88 ); /* even lanes: the x components */ \
+	V.y = _mm_shuffle_ps( _xy01, _xy23, 0xDD ); /* odd lanes: the y components */ \
+	V.z = _mm_shuffle_ps( _zw01, _zw23, 0x88 ); /* even lanes: the z components */ \
+}
+
+// Same gather-and-transpose as LOAD_NODES_POS, reading from the array named pPos at the point of use
+#define LOAD_NODES( V, IDX ) LOAD_NODES_POS( pPos, V, IDX )
+
+
+// Scatters V back to four positions: POS[ IDX[ i ] ] = ( V.x[i], V.y[i], V.z[i], lane taken from Four_Zeros ).
+// All four shuffles of V are computed before the first store to POS.
+// The macro expands to a bare { } block and pastes its arguments unparenthesized.
+#define SAVE_NODES_POS( POS, V, IDX ) { \
+	__m128 _xy01 = _mm_shuffle_ps( V.x, V.y, 0x44 ); /* x0 x1 y0 y1 */ \
+	__m128 _xy23 = _mm_shuffle_ps( V.x, V.y, 0xEE ); /* x2 x3 y2 y3 */ \
+	__m128 _z01 = _mm_shuffle_ps( V.z, Four_Zeros, 0x44 ); /* z0 z1 + two lanes of Four_Zeros */ \
+	__m128 _z23 = _mm_shuffle_ps( V.z, Four_Zeros, 0xEE ); /* z2 z3 + two lanes of Four_Zeros */ \
+	POS[ IDX[ 0 ] ] = _mm_shuffle_ps( _xy01, _z01, 0x88 ); \
+	POS[ IDX[ 1 ] ] = _mm_shuffle_ps( _xy01, _z01, 0xDD ); \
+	POS[ IDX[ 2 ] ] = _mm_shuffle_ps( _xy23, _z23, 0x88 ); \
+	POS[ IDX[ 3 ] ] = _mm_shuffle_ps( _xy23, _z23, 0xDD ); \
+}
+
+
+
+// Same transpose-and-scatter as SAVE_NODES_POS, writing to the array named pPos at the point of use
+#define SAVE_NODES( V, IDX ) SAVE_NODES_POS( pPos, V, IDX )
+
+
+
+
+// Overwrites the matrix with the single term that AddForWahba would accumulate for weight m and vector x:
+// diagonal = m * ( sum of the squares of the two other components ), off-diagonal = -m * ( product of the two components )
+inline void CovMatrix3::InitForWahba( float m, const Vector &x )
+{
+	const float flNegM = -m;
+
+	// diagonal
+	m_vDiag.x = m * ( Sqr( x.y ) + Sqr( x.z ) );
+	m_vDiag.y = m * ( Sqr( x.x ) + Sqr( x.z ) );
+	m_vDiag.z = m * ( Sqr( x.x ) + Sqr( x.y ) );
+
+	// off-diagonal (the matrix is symmetric, so only three terms are stored)
+	m_flXY = flNegM * x.x * x.y;
+	m_flXZ = flNegM * x.x * x.z;
+	m_flYZ = flNegM * x.y * x.z;
+}
+
+// Clears the matrix: diagonal to vec3_origin, the three off-diagonal terms to zero
+inline void CovMatrix3::Reset()
+{
+	m_vDiag = vec3_origin;
+	m_flYZ = 0;
+	m_flXZ = m_flYZ;
+	m_flXY = m_flXZ;
+}
+
+// Accumulates the outer product d * d^T into the matrix.
+// d is supposedly a vector relative to the mean of the set; i.e. we assume here that we're actually summing up both d and -d
+inline void CovMatrix3::AddCov( const Vector &d )
+{
+	// diagonal: squares of the components
+	m_vDiag.x = m_vDiag.x + Sqr( d.x );
+	m_vDiag.y = m_vDiag.y + Sqr( d.y );
+	m_vDiag.z = m_vDiag.z + Sqr( d.z );
+
+	// off-diagonal: pairwise products
+	m_flXY = m_flXY + d.x * d.y;
+	m_flXZ = m_flXZ + d.x * d.z;
+	m_flYZ = m_flYZ + d.y * d.z;
+}
+
+
+// Accumulates the outer product d * d^T, scaled by the weight m, into the matrix.
+// d is supposedly a vector relative to the mean of the set; i.e. we assume here that we're actually summing up both d and -d
+inline void CovMatrix3::AddCov( const Vector &d, float m )
+{
+	// diagonal: weighted squares of the components
+	m_vDiag.x = m_vDiag.x + m * Sqr( d.x );
+	m_vDiag.y = m_vDiag.y + m * Sqr( d.y );
+	m_vDiag.z = m_vDiag.z + m * Sqr( d.z );
+
+	// off-diagonal: weighted pairwise products
+	m_flXY = m_flXY + m * d.x * d.y;
+	m_flXZ = m_flXZ + m * d.x * d.z;
+	m_flYZ = m_flYZ + m * d.y * d.z;
+}
+
+// The element of the sum on the left side of the approximate solution of Wahba's problem (see wahba.nb for details).
+// This is essentially Sum[Mi Xi * w * Xi], Mi = weights, "*" means cross product, Xi is a deformed polygon vertex relative to center of mass.
+// 21 flops, with madd
+inline void CovMatrix3::AddForWahba( float m, const Vector &x )
+{
+	// diagonal grows by m * ( sum of the squares of the two other components )
+	m_vDiag.x = m_vDiag.x + m * ( Sqr( x.y ) + Sqr( x.z ) );
+	m_vDiag.y = m_vDiag.y + m * ( Sqr( x.x ) + Sqr( x.z ) );
+	m_vDiag.z = m_vDiag.z + m * ( Sqr( x.x ) + Sqr( x.y ) );
+
+	// off-diagonal shrinks by m * ( product of the two components )
+	m_flXY = m_flXY - m * x.x * x.y;
+	m_flXZ = m_flXZ - m * x.x * x.z;
+	m_flYZ = m_flYZ - m * x.y * x.z;
+}
+
+// Scales the whole matrix by 1 / trace, so that the diagonal sums to 1 afterwards.
+// There is no guard here against a zero trace.
+inline void CovMatrix3::NormalizeEigenvalues( )
+{
+	// trace is the sum of eigenvalues; it's not a perfect way, but it's one way
+	const float flTrace = m_vDiag.x + m_vDiag.y + m_vDiag.z;
+	const float flInvTrace = 1.0f / flTrace;
+
+	m_vDiag *= flInvTrace;
+	m_flXY *= flInvTrace;
+	m_flXZ *= flInvTrace;
+	m_flYZ *= flInvTrace;
+}
+
+// Biases the diagonal slightly, normalizes by the trace, then biases the diagonal again by a larger amount
+inline void CovMatrix3::RegularizeEigenvalues( )
+{
+	// small bias applied before the division by the trace inside NormalizeEigenvalues
+	const Vector vPreBias( .001f, .001f, .001f );
+	m_vDiag += vPreBias;
+
+	NormalizeEigenvalues( );
+
+	// larger bias applied to the normalized matrix
+	const Vector vPostBias( .1f, .1f, .1f );
+	m_vDiag += vPostBias;
+}
+
+// Product of the symmetric matrix with the column vector d
+inline Vector CovMatrix3::operator * ( const Vector &d )
+{
+	// one matrix row per output component; the off-diagonal terms are shared between mirrored entries
+	const float flRowX = m_vDiag.x * d.x + m_flXY * d.y + m_flXZ * d.z;
+	const float flRowY = m_flXY * d.x + m_vDiag.y * d.y + m_flYZ * d.z;
+	const float flRowZ = m_flXZ * d.x + m_flYZ * d.y + m_vDiag.z * d.z;
+	return Vector( flRowX, flRowY, flRowZ );
+}
+
+
+
+// Four-wide counterpart of CovMatrix3::InitForWahba: overwrites the matrices with the term for weights m and vectors x
+inline void FourCovMatrices3::InitForWahba( const fltx4 &m, const FourVectors &x )
+{
+	// diagonal = m * ( sum of the squares of the two other components )
+	m_vDiag.x = m * ( x.y * x.y + x.z * x.z );
+	m_vDiag.y = m * ( x.x * x.x + x.z * x.z );
+	m_vDiag.z = m * ( x.x * x.x + x.y * x.y );
+
+	// off-diagonal = -m * ( product of the two components )
+	m_flXY = -m * x.x * x.y;
+	m_flXZ = -m * x.x * x.z;
+	m_flYZ = -m * x.y * x.z;
+}
+
+// The element of the sum on the left side of the approximate solution of Wahba's problem (see wahba.nb for details).
+// This is essentially Sum[Mi Xi * w * Xi], Mi = weights, "*" means cross product, Xi is a deformed polygon vertex relative to center of mass.
+// 21 flops, with madd
+inline void FourCovMatrices3::AddForWahba( const fltx4 &m, const FourVectors &x )
+{
+	// diagonal grows by m * ( sum of the squares of the two other components )
+	m_vDiag.x += m * ( x.y * x.y + x.z * x.z );
+	m_vDiag.y += m * ( x.x * x.x + x.z * x.z );
+	m_vDiag.z += m * ( x.x * x.x + x.y * x.y );
+
+	// off-diagonal shrinks by m * ( product of the two components )
+	m_flXY -= m * x.x * x.y;
+	m_flXZ -= m * x.x * x.z;
+	m_flYZ -= m * x.y * x.z;
+}
+
+// Four-wide product of the symmetric matrices with the column vectors d
+inline FourVectors FourCovMatrices3::operator * ( const FourVectors &d )
+{
+	// one matrix row per output component
+	return FourVectors(
+		/* row x */ m_vDiag.x * d.x + m_flXY * d.y + m_flXZ * d.z,
+		/* row y */ m_flXY * d.x + m_vDiag.y * d.y + m_flYZ * d.z,
+		/* row z */ m_flXZ * d.x + m_flYZ * d.y + m_vDiag.z * d.z
+		);
+}
+
+
+// Sum of the lengths of the four sides of the quad's rest shape, walking vShape 0-1-2-3-0
+inline float Perimeter( const FeQuad_t &quad )
+{
+	float flPerimeter = 0.0f;
+	for ( int nCorner = 0; nCorner < 4; ++nCorner )
+	{
+		const int nNext = ( nCorner + 1 ) % 4; // wraps the last side back to corner 0
+		flPerimeter += ( quad.vShape[ nCorner ].AsVector3D( ) - quad.vShape[ nNext ].AsVector3D( ) ).Length( );
+	}
+	return flPerimeter;
+}
+
+// Four-wide version: sum of the lengths of the four sides of each quad's rest shape, walking vShape 0-1-2-3-0
+inline fltx4 Perimeter( const FeSimdQuad_t &quad )
+{
+	const fltx4 f4Side01 = ( quad.vShape[ 0 ] - quad.vShape[ 1 ] ).Length( );
+	const fltx4 f4Side12 = ( quad.vShape[ 1 ] - quad.vShape[ 2 ] ).Length( );
+	const fltx4 f4Side23 = ( quad.vShape[ 2 ] - quad.vShape[ 3 ] ).Length( );
+	const fltx4 f4Side30 = ( quad.vShape[ 3 ] - quad.vShape[ 0 ] ).Length( );
+	return f4Side01 + f4Side12 + f4Side23 + f4Side30;
+}
+
+
+// Perimeter of a triangle given in its local 2D frame.
+// The formula treats the corners as ( 0, 0 ), ( v1x, 0 ) and v2.
+inline float Perimeter( const FeTri_t &tri )
+{
+	const float flSide01 = fabsf( tri.v1x );
+	const float flSide02 = tri.v2.Length();
+	const float flSide12 = sqrtf( tri.v2.y * tri.v2.y + ( tri.v2.x - tri.v1x ) * ( tri.v2.x - tri.v1x ) );
+	return flSide01 + flSide02 + flSide12;
+}
+
+// Four-wide version of the triangle perimeter.
+// The formula treats the corners as ( 0, 0 ), ( v1x, 0 ) and v2.
+inline fltx4 Perimeter( const FeSimdTri_t &tri )
+{
+	const fltx4 f4Side01 = AbsSIMD( tri.v1x );
+	const fltx4 f4Side02 = tri.v2.Length( );
+	const fltx4 f4Side12 = SqrtSIMD( tri.v2.y * tri.v2.y + ( tri.v2.x - tri.v1x ) * ( tri.v2.x - tri.v1x ) );
+	return f4Side01 + f4Side02 + f4Side12;
+}
+
+
+// Z component of the cross product of two 2D vectors
+FORCEINLINE float CrossProductZ( const Vector2D &v1, const Vector2D &v2 )
+{
+	const float flZ = v1.x * v2.y - v1.y * v2.x;
+	return flZ;
+}
+
+// Z component of the cross product, using only the x and y components of the 4D operand
+FORCEINLINE float CrossProductZ( const Vector2D &v1, const Vector4D &v2 )
+{
+	const float flZ = v1.x * v2.y - v1.y * v2.x;
+	return flZ;
+}
+
+// Z component of the cross product, using only the x and y components of the 4D operand
+FORCEINLINE float CrossProductZ( const Vector4D &v1, const Vector2D &v2 )
+{
+	const float flZ = v1.x * v2.y - v1.y * v2.x;
+	return flZ;
+}
+
+// Four-wide Z component of the cross product, using only the x and y components of v1
+FORCEINLINE fltx4 CrossProductZ( const FourVectors &v1, const FourVectors2D &v2 )
+{
+	const fltx4 f4Z = v1.x * v2.y - v1.y * v2.x;
+	return f4Z;
+}
+
+// 2D dot product: only the x and y components of the 4D operand take part
+FORCEINLINE float DotProduct( const Vector4D &v1, const Vector2D &v2 )
+{
+	const float flDot = v1.x * v2.x + v1.y * v2.y;
+	return flDot;
+}
+
+
+// Applies AndSIMD( component, right ) to each of the x, y and z components of 'left'
+FORCEINLINE FourVectors AndSIMD( const FourVectors &left, const fltx4 &right )
+{
+	FourVectors masked;
+
+	masked.x = AndSIMD( left.x, right );
+	masked.y = AndSIMD( left.y, right );
+	masked.z = AndSIMD( left.z, right );
+
+	return masked;
+}
--- a/public/mathlib/femodelbuilder.h
+++ b/public/mathlib/femodelbuilder.h
@@ -0,0 +1,449 @@
+//===================== Copyright (c) Valve Corporation. All Rights Reserved. ======================
+#ifndef FINITE_ELEMENT_MODEL_BUILDER_HDR
+#define FINITE_ELEMENT_MODEL_BUILDER_HDR
+
+#include "tier1/utlvector.h"
+#include "tier1/utlhashtable.h"
+#include "tier1/utlsortvector.h"
+#include "mathlib/femodel.h"
+#include "tier1/utlbufferstrider.h"
+
+
+// A CUtlVector of T* that deletes every stored pointer in Purge() and in its destructor.
+// Only Purge() and the destructor delete; the other inherited operations are not overridden here.
+template <typename T>
+class CUtlVectorOfPointers: public CUtlVector< T * >
+{
+public:
+	~CUtlVectorOfPointers( )
+	{
+		Purge( );
+	}
+
+	// Resizes to nCount elements and stores fn( i ) into every element i
+	template <typename Functor>
+	void SetCountAndInit( int nCount, Functor fn )
+	{
+		this->SetCount( nCount );
+		for ( int nElement = 0; nElement < nCount; ++nElement )
+		{
+			this->Element( nElement ) = fn( nElement );
+		}
+	}
+
+	// Deletes every stored pointer in index order, then purges the underlying vector
+	void Purge( )
+	{
+		for ( int nElement = 0; nElement < this->Count( ); ++nElement )
+		{
+			delete this->Element( nElement );
+		}
+		CUtlVector< T * >::Purge( );
+	}
+};
+
+
+
+// Builder for CFeModel: holds the authoring-side inputs (nodes, elements, rods, springs, collision shapes)
+// together with the arrays generated from them. Finish() is the entry point declared for producing the model.
+class CFeModelBuilder: public CFeModel, public CMultiBufferHelper< CFeModelBuilder >
+{
+public:
+	struct BuildElem_t;
+
+	// Zero-fills the CFeModel base subobject, then sets the builder options to their defaults
+	CFeModelBuilder( )
+	{
+		V_memset( static_cast< CFeModel* >( this ), 0, sizeof( CFeModel ) );
+		m_bIdentityCtrlOrder = false;
+		m_bEnableExplicitNodeMasses = false;
+		m_bUnitlessDamping = false;
+		m_bAddStiffnessRods = true;
+		m_bUsePerNodeLocalForceAndRotation = false;
+		m_flQuadBendTolerance = 0.05f;
+		m_bRigidEdgeHinges = false;
+		m_nFitMatrixMinInfluences = 8;
+		m_bNeedBacksolvedBasesOnly = false;
+	}
+
+	typedef CUtlVectorFixedGrowable< FeFitWeight_t, 8 > FitWeightArray_t;
+
+	// Option setters
+	void EnableUnitlessDamping( bool bEnableUnitlessDamping ) { m_bUnitlessDamping = bEnableUnitlessDamping;  }
+	void EnableIdentityCtrlOrder( ) { m_bIdentityCtrlOrder = true; }
+	void EnableExplicitNodeMasses( bool bExplicit ) { m_bEnableExplicitNodeMasses = bExplicit; }
+	// note: this one stores into m_bRigidEdgeHinges
+	void EnableRigidStiffnessRods( bool bRigidStiffnessRods ) { m_bRigidEdgeHinges = bRigidStiffnessRods; }
+	void SetQuadBendTolerance( float flQuadBendTolerance ) { m_flQuadBendTolerance = flQuadBendTolerance; }
+
+	bool Finish( bool bTriangulate, float flAddCurvature, float flAddSlack );
+	void AdjustQuads();
+	float ElemNormalLength( const uint nNode[4] );
+	float NodeDist( uint nNode0, uint nNode1 );
+	Vector TriNormal( uint nNode0, uint nNode1, uint nNode2 );
+
+	// Adds curvature, scaled by k, to the Kelager bends and the axial edges that have already been built
+	void AddBendCurvature( float k )
+	{
+		// Kelager bends: flHeight0 grows by k times the shortest side between consecutive, distinct nodes of the bend
+		for ( int nBend = 0; nBend < m_KelagerBends.Count( ); ++nBend )
+		{
+			FeKelagerBend_t &kbend = m_KelagerBends[ nBend ];
+			float flShortestSide = FLT_MAX;
+			for ( int nCorner = 0; nCorner < 4; ++nCorner )
+			{
+				const uint nFrom = kbend.m_nNode[ nCorner ];
+				const uint nTo = kbend.m_nNode[ ( nCorner + 1 ) % 4 ];
+				if ( nTo == nFrom )
+				{
+					continue; // the same node twice in a row: not a side
+				}
+
+				const float flSide = ( m_Nodes[ nFrom ].transform.m_vPosition - m_Nodes[ nTo ].transform.m_vPosition ).Length( );
+				if ( flSide < flShortestSide )
+				{
+					flShortestSide = flSide;
+				}
+			}
+			kbend.flHeight0 += flShortestSide * k;
+		}
+
+		// Axial edges: flDist grows by k times the summed distances from nodes 2 and 3 to a point on segment 0-1
+		for ( int nEdge = 0; nEdge < m_AxialEdges.Count( ); ++nEdge )
+		{
+			FeAxialEdgeBend_t &edge = m_AxialEdges[ nEdge ];
+			// positions of nodes 0 and 1 blended with weights te and ( 1 - te )
+			Vector vAxisPoint = m_Nodes[ edge.nNode[ 0 ] ].transform.m_vPosition * edge.te + m_Nodes[ edge.nNode[ 1 ] ].transform.m_vPosition * ( 1 - edge.te );
+			float flSpan = ( m_Nodes[ edge.nNode[ 2 ] ].transform.m_vPosition - vAxisPoint ).Length( ) + ( m_Nodes[ edge.nNode[ 3 ] ].transform.m_vPosition - vAxisPoint ).Length( );
+			edge.flDist += flSpan * k;
+		}
+	}
+	
+	struct MbaContext_t;
+
+	int FindBuildNodeIndex( const char *pName );
+
+	// Individual build steps
+	void BuildAxialEdges( );
+	void BuildOldFeEdges( );
+	void BuildKelagerBends( );
+	void BuildAndSortRods( float flCurvatureAngle, bool bTriangulate );
+	void BuildRod( float flCurvatureAngle, uint v0, uint v1, uint nElem0, uint nElem1, uint nEdge0, uint nEdge1, CUtlHashtable< uint32, uint32 > &edgeToRod );
+	void BuildQuads( CUtlVector< FeQuad_t > &quads, bool bSkipTris );
+	void BuildTris( CUtlVector< FeTri_t > &quads, bool bTriangulate );
+	void BuildNodeSlack( float flSlackMultiplier );
+	void BuildFeEdgeDesc( );
+	void BuildInvMassesAndSortNodes( );
+	int ReconcileElemStaticNodes();
+	int RemoveFullyStaticElems();
+	void BuildBaseRecovery( );
+	void BuildRopes( );
+	void BuildFreeNodes( MbaContext_t &context );
+	void BuildCtrlOffsets( );
+	void BuildNodeFollowers( CUtlVector< FeFollowNode_t > &nodeFollowers );
+	uint BuildCollisionSpheres( CUtlVector< FeCollisionSphere_t > &collisionSpheres );
+	void BuildCollisionPlanes( CUtlVector< FeCollisionPlane_t > &collisionPlanes );
+	void BuildWorldCollisionNodes( CUtlVector< FeWorldCollisionParams_t > &worldCollisionParams, CUtlVector< uint16 > &worldCollisionNodes );
+	void BuildFitMatrices( MbaContext_t &context );
+	void RemoveStandaloneNodeBases( MbaContext_t &context );
+	Vector ComputeCenter( const FitWeightArray_t &weights );
+	void CheckIdentityCtrlOrder( );
+	void BuildSprings( CUtlVector< FeSpringIntegrator_t > &springs );
+	void ValidateBases( );
+	void PrintNodeTree( uint nNode, const CUtlString &prefix );
+
+	void CleanupElements();
+	void RecomputeMasses( CUtlVector< float >& nodeMass );
+	void BalanceGlobalMassMultipliers( CUtlVector< float >& nodeMass );
+
+	int CountSimulatedNodesIn( const FeRodConstraint_t & rod );
+	int CountSimulatedNodesIn( const BuildElem_t& elem );
+	void BuildTree();
+
+	FeTri_t BuildTri( const BuildElem_t &buildElem, int nTriStaticNodes, int nSubTri );
+
+	// Bitwise OR of BuildNode_t::GetDampingFlags() over all nodes
+	uint GetDampingFlags( )const
+	{
+		uint nCombinedFlags = 0;
+		for ( int nNode = 0; nNode < m_Nodes.Count( ); ++nNode )
+		{
+			nCombinedFlags |= m_Nodes[ nNode ].GetDampingFlags( );
+		}
+		return nCombinedFlags;
+	}
+
+	bool HasLegacyStretchForce( ) const;
+
+	// Scratch data passed between the build steps and OnAllocateMultiBuffer
+	struct MbaContext_t
+	{
+		CUtlVector< FeQuad_t > quads;
+		CUtlVector< FeTri_t > tris;
+		CUtlVectorAligned< FeSimdRodConstraint_t > simdRods;
+		CUtlVectorAligned< FeSimdQuad_t > simdQuads[ 3 ];
+		CUtlVectorAligned< FeSimdTri_t > simdTris[ 3 ];
+		CUtlVectorAligned< FeSimdNodeBase_t > simdBases;
+		CUtlVector< FeSpringIntegrator_t > springs;
+		CUtlVectorAligned< FeSimdSpringIntegrator_t > simdSprings;
+		CUtlVector< FeFollowNode_t > nodeFollowers;
+		CUtlVector< FeCollisionSphere_t > collisionSpheres;
+		CUtlVector< FeCollisionPlane_t > collisionPlanes;
+		CUtlVector< FeWorldCollisionParams_t > worldCollisionParams;
+		CUtlVector< uint16 > worldCollisionNodes;
+		CUtlVectorAligned< FeFitMatrix_t > fitMatrices;
+		FitWeightArray_t fitWeights;
+		uint m_nFitMatrices1;
+		uint m_nFitMatrices2;
+		uint nLegacyStretchForceCount;
+		uint nCollisionEllipsoidsInclusive;
+		uint nNodeIntegratorCount;
+		uint nStringsMemSize;
+		uint nCtrlNameCount;
+		bool m_bHasNodeCollisionRadii;
+		bool m_bUsePerNodeLocalRotation;
+		bool m_bUsePerNodeLocalForce;
+	};
+
+	template <typename Allocator > void OnAllocateMultiBuffer( Allocator &a, MbaContext_t &context );
+
+protected:
+	// Replaces a control index with the matching node index from m_CtrlToNode, in place.
+	// The signed overload leaves negative values untouched.
+	void ConvertCtrlToNode( int &refCtrl )
+	{
+		if ( refCtrl < 0 )
+		{
+			return;
+		}
+		Assert( refCtrl < m_CtrlToNode.Count() );
+		refCtrl = m_CtrlToNode[ refCtrl ];
+	}
+	void ConvertCtrlToNode( uint16 &refCtrl )
+	{
+		Assert( int( refCtrl ) < m_CtrlToNode.Count() );
+		refCtrl = m_CtrlToNode[ refCtrl ];
+	}
+	void ConvertCtrlToNode( uint32 &refCtrl )
+	{
+		Assert( int( refCtrl ) < m_CtrlToNode.Count() );
+		refCtrl = m_CtrlToNode[ refCtrl ];
+	}
+
+	// Weighted sum of the ranks of the six nodes of an axial edge bend
+	float GetRank( const FeAxialEdgeBend_t &bend )const
+	{
+		// ad-hoc ranking of bends; if a bend affects some node a lot, then it's ranked closer to that node.
+		// Ranks of nodes within one bend shouldn't differ by more than 2, because the bends are comprised
+		// of adjacent quads' nodes, so this won't deviate much from strict ranking by the lowest-rank node in the bend
+
+		// which of the four weights applies to each of the six nodes: nodes 2,3 share weight 2, nodes 4,5 share weight 3
+		static const int s_nWeightOfNode[ 6 ] = { 0, 1, 2, 2, 3, 3 };
+
+		float flRank = 0;
+		for ( int nSlot = 0; nSlot < 6; ++nSlot )
+		{
+			flRank += bend.flWeight[ s_nWeightOfNode[ nSlot ] ] * m_Nodes[ bend.nNode[ nSlot ] ].nRank;
+		}
+		return flRank;
+	}
+public:
+	// Authoring-side element: a quad, or a triangle when the last two node indices coincide
+	struct BuildElem_t
+	{
+		enum { MAX_NODES = 4 };
+		BuildElem_t()
+		{
+			nRank = 0;
+			flSlack = 0;
+			nStaticNodes = 0;
+			for ( int nSlot = 0; nSlot < MAX_NODES; ++nSlot )
+			{
+				nNode[ nSlot ] = 0;
+			}
+		}
+
+		uint nNode[ MAX_NODES ];
+		// 3 when nodes 2 and 3 are the same index, otherwise 4
+		uint NumNodes( )const
+		{
+			if ( nNode[ 3 ] == nNode[ 2 ] )
+			{
+				return 3;
+			}
+			return 4;
+		}
+		uint nStaticNodes; // 0..2
+		float flSlack;
+		int nRank; // 0 means static, then it means "how many elements removed from the closest static"
+
+		// Sort predicate: more static nodes first; ties are ordered by ascending rank
+		static bool Order( const BuildElem_t &left, const BuildElem_t &right )
+		{
+			const int nLeftStatic = int( left.nStaticNodes );
+			const int nRightStatic = int( right.nStaticNodes );
+			if ( nLeftStatic != nRightStatic )
+			{
+				return nRightStatic < nLeftStatic;
+			}
+			return left.nRank < right.nRank;
+		}
+	};
+	struct BuildSpring_t
+	{
+		uint16 nNode[ 2 ];
+		float32 flSpringConstant;
+		float32 flSpringDamping;
+		float32 flStretchiness; // Not Implemented!
+	};
+
+	struct BuildCollisionSphere_t
+	{
+		int m_nParent;
+		int m_nChild;
+		float m_flRadius;
+		Vector m_vOrigin;
+		bool m_bInclusive;
+		float m_flStickiness;
+
+		BuildCollisionSphere_t( )
+		{
+			m_nParent = -1;
+			m_nChild = -1;
+			m_flRadius = 0;
+			m_vOrigin = vec3_origin;
+			m_bInclusive = true;
+			m_flStickiness = 0;
+		}
+
+		// True when either index is unset (negative), or when a non-inclusive sphere has a radius below 1e-3
+		bool IsDegenerate( )const
+		{
+			if ( m_nParent < 0 || m_nChild < 0 )
+			{
+				return true;
+			}
+			return !m_bInclusive && m_flRadius < 1e-3f;
+		}
+	};
+
+	struct BuildCollisionPlane_t
+	{
+		int m_nParent;
+		int m_nChild;
+		RnPlane_t m_Plane;
+		float m_flStickiness;
+		BuildCollisionPlane_t( )
+		{
+			m_nParent = -1;
+			m_nChild = -1;
+			m_Plane.m_flOffset = 0;
+			m_flStickiness = 0;
+			m_Plane.m_vNormal = vec3_origin;
+		}
+
+		// True when either index is unset (negative), or when the plane normal is practically zero-length
+		bool IsDegenerate( )const
+		{
+			if ( m_nParent < 0 || m_nChild < 0 )
+			{
+				return true;
+			}
+			return m_Plane.m_vNormal.LengthSqr( ) < 1e-12f;
+		}
+	};
+
+
+	struct BuildNode_t
+	{
+		CTransform transform; // relaxed position
+		float flMassMultiplier;
+		float flMassBias;
+		float invMass;
+		float flSlack;
+		float flGravityZ;
+		float flCollisionRadius;
+		FeNodeIntegrator_t integrator;
+		const char *pName;
+		int nParent;
+		int nRank; // 0 means static, then it means "how many elements removed from the closest static"
+		uint nCollisionMask;
+
+		int nFollowParent;
+		float flFollowWeight;
+		float flWorldFriction; // not really a friction coefficient, this corresponds to WorldFriction coefficient from Source1 cloth
+		float flGroundFriction;
+
+		float flLegacyStretchForce;
+		float flLocalForce;
+		float flLocalRotation;
+		
+		bool bSimulated : 1;
+		bool bForceSimulated : 1;
+		bool bFreeRotation : 1;
+		bool bAnimRotation : 1;
+		bool bMassMultiplierGlobal : 1; // if true, the mass multipliers are gathered and distributed so that all nodes with "global" multipliers keep the mass ratio = multiplier ratio
+		bool bVirtual : 1;
+		bool bNeedNodeBase : 1;
+		bool bWorldCollision : 1;
+		bool bOsOffset : 1;
+
+		// Gives every field a defined default
+		BuildNode_t( )
+		{
+			bSimulated = false;
+			bForceSimulated = false;
+			bFreeRotation = false; // true only makes sense for non-simulated
+			bAnimRotation = false;
+			bMassMultiplierGlobal = false;
+			bVirtual = false;
+			bNeedNodeBase = false;
+			bWorldCollision = false;
+			bOsOffset = false;
+			flWorldFriction = 1.0f;
+			flGroundFriction = 0.0f;
+			flLegacyStretchForce = 0;
+			transform = g_TransformIdentity;
+			flMassMultiplier = 1.0f; // can be 0
+			flMassBias = 0.0f;
+			invMass = 0.0f; // can be arbitrary
+			flSlack = 0.0f;
+			flCollisionRadius = 0;
+			flGravityZ = 360;
+			pName = NULL;
+			nRank = 0;
+			nParent = -1;
+			nFollowParent = -1;
+			flFollowWeight = 0;
+			nCollisionMask = 0;
+			flLocalForce = 1.0f;
+			flLocalRotation = 0.0f;
+			integrator.Init();
+		}
+
+		// Collects the FE_FLAG_HAS_* bits implied by this node's integrator settings and stretch force
+		uint GetDampingFlags( )const
+		{
+			uint nFlags = 0;
+			if ( integrator.flPointDamping != 0 )
+			{
+				nFlags |= FE_FLAG_HAS_NODE_DAMPING;
+			}
+			if ( integrator.flAnimationVertexAttraction != 0 )
+			{
+				nFlags |= FE_FLAG_HAS_ANIMATION_VERTEX_ATTRACTION;
+			}
+			if ( integrator.flAnimationForceAttraction != 0 )
+			{
+				nFlags |= FE_FLAG_HAS_ANIMATION_FORCE_ATTRACTION;
+			}
+			if ( integrator.flGravity != 360 )
+			{
+				// any gravity other than 360 counts as custom
+				nFlags |= FE_FLAG_HAS_CUSTOM_GRAVITY;
+			}
+			if ( bSimulated && flLegacyStretchForce != 0 )
+			{
+				nFlags |= FE_FLAG_HAS_STRETCH_VELOCITY_DAMPING;
+			}
+			return nFlags;
+		}
+	};
+
+
+public:
+	// this is to precompute orientations of bones that we have the freedom to orient
+	static FeNodeBase_t BuildNodeBasisFast( const CUtlVectorAligned< BuildNode_t > &nodes, uint nNode, const CUtlSortVector< int > &neighbors );
+	static void BuildNodeBases( const CUtlVectorAligned< BuildNode_t > &nodes, const CUtlVector< BuildElem_t > &elems, const CUtlVector< FeNodeBase_t > &presetNodeBases, CUtlVector< FeNodeBase_t > &nodeBases, CUtlVectorOfPointers< CUtlSortVector< int > > &neighbors );
+	void BuildNodeBases();
+	FeNodeBase_t BuildNodeBasisFast( uint nNode );
+public:
+
+// 	class CFitMatrix
+// 	{
+// 	public:
+// 		int m_nStaticWeights; // should normally be 0,1 or 2. If 3+ static nodes influence this fit matrix, maybe we should only keep those static node influences and say we have 0 static nodes, so that it back-solves to static nodes only
+// 		CUtlVector< InfluenceWeight_t > m_Weights;
+// 	};
+
+
+
+	// generated
+	CUtlVectorOfPointers< CUtlSortVector< int > > m_NodeNeighbors;
+	CUtlVector< FeCtrlOffset_t > m_CtrlOffsets;
+	CUtlVector< FeCtrlOsOffset_t > m_CtrlOsOffsets;
+	CUtlVector< FeAxialEdgeBend_t > m_AxialEdges;
+	CUtlVector< OldFeEdge_t > m_OldFeEdges;
+	CUtlVector< FeKelagerBend_t > m_KelagerBends;
+
+	CUtlVector< FeEdgeDesc_t > m_FeEdgeDesc;
+
+	CUtlVector< int > m_NodeToCtrl;
+	CUtlVector< int > m_CtrlToNode;
+	CUtlVectorOfPointers< CUtlVector< int > > m_Ropes;
+
+	CUtlVector< FeNodeBase_t > m_NodeBases;
+	CUtlVector< FeNodeReverseOffset_t > m_ReverseOffsets;
+	CUtlVector< uint16 > m_TreeParents; // dynamic nodes (N) + clusters (N-1)
+	CUtlVector< FeTreeChildren_t > m_TreeChildren; // clusters (N-1) * 2
+	CUtlVector< uint16 > m_FreeNodes;
+
+	// input
+	CUtlVector< FeNodeBase_t > m_PresetNodeBases;
+	CUtlVector< FeTaperedCapsuleStretch_t > m_TaperedCapsuleStretches;
+	CUtlVector< FeTaperedCapsuleRigid_t > m_TaperedCapsuleRigids;
+	CUtlVector< FeSphereRigid_t > m_SphereRigids;
+	CUtlVector< BuildCollisionSphere_t > m_CollisionSpheres;
+	CUtlVector< BuildCollisionPlane_t > m_CollisionPlanes;
+	CUtlVector< BuildSpring_t > m_Springs;
+	CUtlVector< FeRodConstraint_t > m_Rods;
+	CUtlVector< BuildElem_t > m_Elems;
+	CUtlVectorAligned< BuildNode_t > m_Nodes; // in-out
+	CUtlVector< FeFitInfluence_t > m_FitInfluences;
+	CUtlVectorOfPointers< CUtlSortVector< int > > m_Neighbors;
+
+	// build options; defaults are set in the constructor
+	bool m_bIdentityCtrlOrder;
+	bool m_bEnableExplicitNodeMasses;
+	bool m_bUnitlessDamping;
+	bool m_bAddStiffnessRods;
+	bool m_bUsePerNodeLocalForceAndRotation;
+	bool m_bRigidEdgeHinges;
+	float m_flQuadBendTolerance;
+	int m_nFitMatrixMinInfluences;
+	bool m_bNeedBacksolvedBasesOnly;
+};
+
+
+#endif
--- a/public/mathlib/femodeldesc.h
+++ b/public/mathlib/femodeldesc.h
@@ -0,0 +1,165 @@
+//========= Copyright © Valve Corporation, All rights reserved. ============//
+#ifndef FE_MODEL_DESC
+#define FE_MODEL_DESC
+
+#include "resourcefile/resourcestream.h"
+#include "tier1/utlvector.h"
+
+class CTransform;
+class CFeModel;
+struct FeNodeBase_t;
+struct FeSimdNodeBase_t;
+struct FeQuad_t;
+struct FeSimdQuad_t ;
+struct FeSimdTri_t ;
+struct FeSimdRodConstraint_t ;
+struct FeRodConstraint_t ;
+struct FeAxialEdgeBend_t ;
+struct FeCtrlOffset_t ;
+struct FeCtrlOsOffset_t ;
+struct FeFollowNode_t ;
+struct FeCollisionSphere_t ;
+struct FeCollisionPlane_t ;
+struct FeNodeIntegrator_t;
+struct FeSpringIntegrator_t ;
+struct FeSimdSpringIntegrator_t ;
+struct FeWorldCollisionParams_t ;
+struct FeTaperedCapsuleStretch_t ;
+struct FeTaperedCapsuleRigid_t ;
+struct FeSphereRigid_t ;
+struct FeTreeChildren_t ;
+struct FeFitMatrix_t ;
+struct FeSimdFitMatrices_t ;
+struct FeFitWeight_t ;
+struct FeNodeReverseOffset_t;
+
+//
+// On-disk structure holding Finite Element Model data
+//
+// When making changes to this structure, also change:
+//   CFeModel - runtime reflection of this same structure, used for easy runtime changes
+//   CLockedResource< PhysFeModelDesc_t > Clone( CFeModel *pFeModel, CResourceStream *pStream ) in physicsresourcehelpers.cpp
+//   CVPhysics2Interface::CreateAggregateDataFromDiskData in vphysics.cpp
+//   CFeModelBuilder (::Finish() ) in mathlib\femodelbuilder.cpp
+//   perhaps CAuthPhysFx::Compile in mdlobjects\authphysmodel.cpp
+//
+struct PhysFeModelDesc_t
+{
+	// Per-control hash and name arrays
+	CResourceArray< uint32 > m_CtrlHash;
+	CResourceArray< CResourceString > m_CtrlName;
+
+	// Scalar flags, defaults and element counts
+	uint32 m_nStaticNodeFlags;
+	uint32 m_nDynamicNodeFlags;
+	float32 m_flLocalForce;
+	float32 m_flLocalRotation;
+	uint16 m_nNodeCount;
+	uint16 m_nStaticNodes;
+	uint16 m_nRotLockStaticNodes;
+	uint16 m_nSimdTriCount1;
+	uint16 m_nSimdTriCount2;
+	uint16 m_nSimdQuadCount1;
+	uint16 m_nSimdQuadCount2;
+	uint16 m_nQuadCount1;
+	uint16 m_nQuadCount2;
+	uint16 m_nCollisionSphereInclusiveCount;
+	uint16 m_nTreeDepth;
+	uint16 m_nFitMatrixCount1;
+	uint16 m_nFitMatrixCount2;
+	uint16 m_nSimdFitMatrixCount1;
+	uint16 m_nSimdFitMatrixCount2;
+
+	uint16 m_nRopeCount;
+	CResourceArray< uint16 > m_Ropes; // first, there's the "end" indices of each rope (1st rope "begin" is assumed to be == m_nRopeCount ). Then, there are ropes: with indices from the parent/anchor (ground truth w.r.t. orientation ) to child
+	CResourceArray< FeNodeBase_t > m_NodeBases;
+	CResourceArray< FeSimdNodeBase_t > m_SimdNodeBases;
+	
+	// Constraint, pose and collision arrays; scalar and Simd variants are stored side by side
+	CResourceArray< FeQuad_t > m_Quads;
+	CResourceArray< FeSimdQuad_t > m_SimdQuads;
+	CResourceArray< FeSimdTri_t > m_SimdTris;
+	CResourceArray< FeSimdRodConstraint_t > m_SimdRods;
+	CResourceArray< CTransform > m_InitPose; // its element count is what GetNodeCount() reports
+	CResourceArray< FeRodConstraint_t > m_Rods;
+	CResourceArray< FeAxialEdgeBend_t > m_AxialEdges;
+	CResourceArray< float32 > m_NodeInvMasses;
+	CResourceArray< FeCtrlOffset_t > m_CtrlOffsets;
+	CResourceArray< FeCtrlOsOffset_t > m_CtrlOsOffsets;
+	CResourceArray< FeFollowNode_t > m_FollowNodes;
+	CResourceArray< FeCollisionSphere_t > m_CollisionSpheres;
+	CResourceArray< FeCollisionPlane_t > m_CollisionPlanes;
+
+	// either 0 elements (implying 0 damping for all nodes), or m_nNodeCount elements;
+	// static nodes are damped as they come in from animation,
+	// dynamic nodes as they simulate; 
+	// damping is multiplied by the time step (in Dota, they are not)
+	CResourceArray< FeNodeIntegrator_t > m_NodeIntegrator; 
+
+	// this is to simulate spring forces (acceleration level) with the verlet integrator: it gets applied as a separate step, just adding a*t^2 to the corresponding nodes
+	// if nodes have different damping, it needs to be figured out in the weight here. If damping is not 1.0, it needs to be premultiplied in both the constant and damping
+	CResourceArray< FeSpringIntegrator_t > m_SpringIntegrator;
+	CResourceArray< FeSimdSpringIntegrator_t > m_SimdSpringIntegrator;
+	
+	CResourceArray< FeWorldCollisionParams_t > m_WorldCollisionParams;
+	CResourceArray< float > m_LegacyStretchForce;
+	CResourceArray< float > m_NodeCollisionRadii;
+	CResourceArray< float > m_LocalRotation;
+	CResourceArray< float > m_LocalForce;
+	CResourceArray< FeTaperedCapsuleStretch_t > m_TaperedCapsuleStretches;
+	CResourceArray< FeTaperedCapsuleRigid_t > m_TaperedCapsuleRigids;
+	CResourceArray< FeSphereRigid_t > m_SphereRigids;
+	CResourceArray< uint16 > m_WorldCollisionNodes;
+	CResourceArray< uint16 > m_TreeParents;
+	CResourceArray< uint16 > m_TreeCollisionMasks;
+	CResourceArray< FeTreeChildren_t > m_TreeChildren;
+	CResourceArray< uint16 > m_FreeNodes;
+	CResourceArray< FeFitMatrix_t > m_FitMatrices;
+	CResourceArray< FeSimdFitMatrices_t > m_SimdFitMatrices;
+	CResourceArray< FeFitWeight_t > m_FitWeights;
+	CResourceArray< FeNodeReverseOffset_t > m_ReverseOffsets;
+	// 30 reserved 32-bit words
+	// NOTE(review): presumably padding so new members can be added without changing the on-disk size - confirm
+	uint32 m_nReserved[ 30 ];
+
+	// Model-wide float parameters and velocity-smoothing iteration counts
+	float32 m_flWindage;
+	float32 m_flWindDrag;
+	float32 m_flDefaultSurfaceStretch;
+	float32 m_flDefaultThreadStretch;
+	float32 m_flDefaultGravityScale;
+	float32 m_flDefaultVelAirDrag;
+	float32 m_flDefaultExpAirDrag;
+	float32 m_flDefaultVelQuadAirDrag;
+	float32 m_flDefaultExpQuadAirDrag;
+	float32 m_flDefaultVelRodAirDrag;
+	float32 m_flDefaultExpRodAirDrag;
+	float32 m_flRodVelocitySmoothRate;
+	float32 m_flQuadVelocitySmoothRate;
+	float32 m_flAddWorldCollisionRadius;
+	float32 m_flDefaultVolumetricSolveAmount;
+	uint16 m_nRodVelocitySmoothIterations;
+	uint16 m_nQuadVelocitySmoothIterations;
+
+	// Node count as given by the number of elements in m_InitPose (m_nNodeCount is not consulted here)
+	uint GetNodeCount()const { return m_InitPose.Count(); }
+	// m_nNodeCount minus m_nStaticNodes; the uint16 operands are promoted to int for the subtraction, then converted to uint
+	uint GetDynamicNodeCount() const { return m_nNodeCount - m_nStaticNodes; }
+};
+
+CLockedResource< PhysFeModelDesc_t > Clone( CFeModel *pFeModel, CResourceStream *pStream );
+void Clone( const PhysFeModelDesc_t *pFeDesc, intp nOffsetBytes, char **pCtrlNames, CFeModel *pFeModel );
+
+
+class CFeModel;
+// Holds a pair of models (old and new) together with index remapping tables between them,
+// for both nodes and controls, in both directions.
+// All accessors are read-only and therefore const, so the context can be passed by const reference.
+class CFeModelReplaceContext
+{
+public:
+	// The remapping tables are filled by the constructor (defined out of line).
+	CFeModelReplaceContext( const CFeModel *pOld, const CFeModel *pNew );
+
+	// The two models this context was constructed with
+	const CFeModel *GetOld() const { return m_pOld; }
+	const CFeModel *GetNew() const { return m_pNew; }
+
+	// Table lookups; i indexes directly into the corresponding table and is not range-checked here
+	int OldToNewNode( int i ) const { return m_OldToNewNode[ i ]; }
+	int NewToOldNode( int i ) const { return m_NewToOldNode[ i ]; }
+	int OldToNewCtrl( int i ) const { return m_OldToNewCtrl[ i ]; }
+	int NewToOldCtrl( int i ) const { return m_NewToOldCtrl[ i ]; }
+protected:
+	const CFeModel *m_pOld;
+	const CFeModel *m_pNew;
+	CUtlVector< int > m_OldToNewNode, m_NewToOldNode;
+	CUtlVector< int > m_OldToNewCtrl, m_NewToOldCtrl;
+};
+
+
+#endif
--- a/public/mathlib/fltx4.h
+++ b/public/mathlib/fltx4.h
@@ -0,0 +1,107 @@
+//===== Copyright 1996-2010, Valve Corporation, All rights reserved. ======//
+//
+// Purpose: - defines the type fltx4 - Avoid cyclic inclusion.
+//
+//===========================================================================//
+
+#ifndef FLTX4_H
+#define FLTX4_H
+
+// USE_STDC_FOR_SIMD selects the portable (non-intrinsic) fltx4 definition when non-zero.
+// NOTE(review): both branches below currently define it to 0, so the GNUC test has no effect.
+#if defined(GNUC)
+#define USE_STDC_FOR_SIMD 0
+#else
+#define USE_STDC_FOR_SIMD 0
+#endif
+
+// _SSE1 is defined for every non-PPC build that does not use the portable fallback.
+#if (!defined(PLATFORM_PPC) && (USE_STDC_FOR_SIMD == 0))
+#define _SSE1 1
+#endif
+
+// I thought about defining a class/union for the SIMD packed floats instead of using fltx4,
+// but decided against it because (a) the nature of SIMD code which includes comparisons is to blur
+// the relationship between packed floats and packed integer types and (b) not sure that the
+// compiler would handle generating good code for the intrinsics.
+
+// Per-platform definition of the 4-wide SIMD types:
+//   fltx4 - four packed floats, i32x4 / u32x4 - four packed signed / unsigned 32-bit ints,
+//   bi32x4 - the type used for packed boolean (comparison mask) results.
+#if USE_STDC_FOR_SIMD
+// NOTE(review): this #error fires whenever the portable branch is selected, so the branch cannot
+// currently be built; the message text does not say why.
+#error "hello"
+typedef union
+{
+	float  m128_f32[4];
+	uint32 m128_u32[4];
+} fltx4;
+
+typedef fltx4 i32x4;
+typedef fltx4 u32x4;
+
+// These repeat the two typedefs just above with the same target type (a redundant but legal redeclaration in C++).
+#ifdef _PS3
+typedef fltx4 u32x4;
+typedef fltx4 i32x4;
+#endif
+typedef fltx4 bi32x4;
+
+#elif ( defined( _PS3 ) )
+
+// PS3: native vector types; fltx4_union exists only for per-component access.
+typedef union
+{
+	// This union allows float/int access (which generally shouldn't be done in inner loops)
+
+	vec_float4	vmxf;
+	vec_int4	vmxi;
+	vec_uint4	vmxui;
+#if defined(__SPU__)
+	vec_uint4	vmxbi;
+#else
+	__vector bool vmxbi;
+#endif
+
+	struct 
+	{
+		float x;
+		float y;
+		float z;
+		float w;
+	};
+
+	float		m128_f32[4];
+	uint32		m128_u32[4];
+	int32		m128_i32[4];
+
+} fltx4_union;
+
+typedef vec_float4 fltx4;
+typedef vec_uint4  u32x4;
+typedef vec_int4   i32x4;
+
+// The boolean vector type differs between SPU and PPU builds.
+#if defined(__SPU__)
+typedef vec_uint4 bi32x4;
+#else
+typedef __vector bool bi32x4;
+#endif
+
+#define DIFFERENT_NATIVE_VECTOR_TYPES // true if the compiler has different types for float4, uint4, int4, etc
+
+#elif ( defined( _X360 ) )
+
+// Xbox 360: every SIMD type is the same __vector4 register type.
+typedef union
+{
+	// This union allows float/int access (which generally shouldn't be done in inner loops)
+	__vector4	vmx;
+	float		m128_f32[4];
+	uint32		m128_u32[4];
+} fltx4_union;
+
+typedef __vector4 fltx4;
+typedef __vector4 i32x4; // a VMX register; just a way of making it explicit that we're doing integer ops.
+typedef __vector4 u32x4; // a VMX register; just a way of making it explicit that we're doing unsigned integer ops.
+typedef fltx4 bi32x4;
+#else
+
+// Default (SSE): float, int and unsigned vectors all alias __m128; shortx8 uses the integer register type __m128i.
+typedef __m128 fltx4;
+typedef __m128 i32x4;
+typedef __m128 u32x4;
+typedef __m128i shortx8;
+typedef fltx4 bi32x4;
+
+#endif
+
+#endif
--- a/public/mathlib/halton.h
+++ b/public/mathlib/halton.h
@@ -0,0 +1,70 @@
+// $Id$
+
+// halton.h - classes, etc for generating numbers using the Halton pseudo-random sequence.  See
+// http://halton-sequences.wikiverse.org/.
+//
+// what this function is useful for is any sort of sampling/integration problem where
+// you want to solve it by random sampling. Each call to NextValue() generates
+// a random number between 0 and 1, in an unclumped manner, so that the space can be more
+// or less evenly sampled with a minimum number of samples.
+//
+// It is NOT useful for generating random numbers dynamically, since the outputs aren't
+// particularly random.
+//
+// To generate multidimensional sample values (points in a plane, etc), use two
+// HaltonSequenceGenerator_t's, with different (primes) bases.
+
+#ifndef HALTON_H
+#define HALTON_H
+
+#include <tier0/platform.h>
+#include <mathlib/vector.h>
+
+// Generator for one dimension of a Halton sequence in a given base.
+// NextValue() walks the sequence; GetElement() is defined out of line.
+class HaltonSequenceGenerator_t
+{
+	int seed;												//< index handed to GetElement() by NextValue(), then post-incremented
+	int base;
+	float fbase;											//< base as a float
+
+public:
+	HaltonSequenceGenerator_t(int base);					//< base MUST be prime, >=2
+
+	// NOTE(review): presumably returns the sequence value at index 'element' - confirm against the implementation
+	float GetElement(int element);
+
+	// Returns GetElement() for the current seed and advances the seed by one.
+	inline float NextValue(void)
+	{
+		return GetElement(seed++);
+	}
+
+};
+
+
+class DirectionalSampler_t									//< pseudo-random sphere sampling
+{
+	HaltonSequenceGenerator_t zdot;							//< base-2 sequence, drives the z coordinate
+	HaltonSequenceGenerator_t vrot;							//< base-3 sequence, drives the rotation about z
+
+public:
+	DirectionalSampler_t()
+		: zdot( 2 ), vrot( 3 )
+	{
+	}
+
+	/// Produce the next direction: z is drawn from zdot and remapped to -1..1,
+	/// the angle around the z axis is drawn from vrot and remapped to 0..2*pi.
+	Vector NextValue()
+	{
+		// z component: map the 0..1 sample onto -1..1
+		float flZ = zdot.NextValue();
+		flZ = 2*flZ-1.0;
+
+		// polar angle whose cosine is z
+		float flPolar = acos( flZ );
+
+		// rotation angle for the x/y components
+		float flAzimuth = 2.0*M_PI*vrot.NextValue();
+
+		// radius of the x/y circle at this height
+		float flRing = sin( flPolar );
+
+		return Vector( cos( flAzimuth )*flRing,
+					   sin( flAzimuth )*flRing,
+					   flZ );
+	}
+};
+
+
+
+
+#endif // halton_h
--- a/public/mathlib/intvector3d.h
+++ b/public/mathlib/intvector3d.h
@@ -0,0 +1,192 @@
+//============ Copyright (c) Valve Corporation, All rights reserved. ============
+//
+// A simple 3D integer vector class.
+//
+//===============================================================================
+
+#ifndef INTVECTOR3D_H
+#define INTVECTOR3D_H
+
+#if defined( COMPILER_MSVC )
+#pragma once
+#endif
+
+//-----------------------------------------------------------------------------
+// A simple, 3-component, 32-bit integer vector.
+//
+// Use when SIMD versions aren't appropriate (e.g. for branch-heavy code, 
+// when readability/ease-of-use trump performance).
+//-----------------------------------------------------------------------------
+class IntVector3D
+{
+public:
+	int x, y, z;
+
+	// The default constructor leaves x, y and z uninitialized.
+	IntVector3D() { }
+	IntVector3D( int nX, int nY, int nZ ) : x( nX ), y( nY ), z( nZ ) { }
+	// Sets all three components to the same value.
+	explicit IntVector3D( int nReplicate ) : x( nReplicate ), y( nReplicate ), z( nReplicate ) { }
+	// Adds flEpsilon to each float component, then converts to int (truncation toward zero).
+	explicit IntVector3D( Vector v, float flEpsilon )
+		: x( static_cast<int>( v.x + flEpsilon ) )
+		, y( static_cast<int>( v.y + flEpsilon ) )
+		, z( static_cast<int>( v.z + flEpsilon ) )
+	{
+	}
+
+	// Converts each component to float.
+	Vector ToVector() const
+	{
+		return Vector( static_cast<float>( x ), static_cast<float>( y ), static_cast<float>( z ) );
+	}
+
+	//-------------------------------------------------------------------------
+	// Component-wise arithmetic. Division and modulo use plain int semantics,
+	// so n == 0 is not guarded against.
+	//-------------------------------------------------------------------------
+	IntVector3D operator+( const IntVector3D &rhs ) const
+	{
+		return IntVector3D( x + rhs.x, y + rhs.y, z + rhs.z );
+	}
+
+	IntVector3D operator-( const IntVector3D &rhs ) const
+	{
+		return IntVector3D( x - rhs.x, y - rhs.y, z - rhs.z );
+	}
+
+	IntVector3D operator-() const
+	{
+		return IntVector3D( -x, -y, -z );
+	}
+
+	IntVector3D operator*( int n ) const
+	{
+		return IntVector3D( n * x, n * y, n * z );
+	}
+
+	// Component-wise product (not a dot or cross product).
+	IntVector3D operator*( const IntVector3D &rhs ) const
+	{
+		return IntVector3D( x * rhs.x, y * rhs.y, z * rhs.z );
+	}
+
+	IntVector3D operator/( int n ) const
+	{
+		return IntVector3D( x / n, y / n, z / n );
+	}
+
+	IntVector3D operator%( int n ) const
+	{
+		return IntVector3D( x % n, y % n, z % n );
+	}
+
+	IntVector3D& operator+=( const IntVector3D &rhs )
+	{
+		x += rhs.x;
+		y += rhs.y;
+		z += rhs.z;
+		return *this;
+	}
+
+	IntVector3D& operator-=( const IntVector3D &rhs )
+	{
+		x -= rhs.x;
+		y -= rhs.y;
+		z -= rhs.z;
+		return *this;
+	}
+	
+	IntVector3D& operator*=( int n )
+	{
+		x *= n;
+		y *= n;
+		z *= n;
+		return *this;
+	}
+
+	IntVector3D& operator/=( int n )
+	{
+		x /= n;
+		y /= n;
+		z /= n;
+		return *this;
+	}
+
+	IntVector3D& operator%=( int n )
+	{
+		x %= n;
+		y %= n;
+		z %= n;
+		return *this;
+	}
+
+	bool operator==( const IntVector3D &rhs ) const
+	{
+		return x == rhs.x && y == rhs.y && z == rhs.z;
+	}
+
+	bool operator!=( const IntVector3D &rhs ) const
+	{
+		return x != rhs.x || y != rhs.y || z != rhs.z;
+	}
+
+	//-------------------------------------------------------------------------
+	// Indexed access: 0 -> x, 1 -> y, 2 -> z. The object is viewed as an array
+	// of three ints, which relies on x, y, z being laid out contiguously.
+	// The const overload keeps the pointer const instead of casting constness away.
+	//-------------------------------------------------------------------------
+	const int& operator[]( const int i ) const
+	{
+		Assert( i >= 0 && i < 3 );
+		return reinterpret_cast<const int *>( this )[i];
+	}
+
+	int& operator[]( const int i )
+	{
+		Assert( i >= 0 && i < 3 );
+		return reinterpret_cast<int *>( this )[i];
+	}
+
+	int Dot( const IntVector3D &rhs ) const
+	{
+		return x * rhs.x + y * rhs.y + z * rhs.z;
+	}
+
+	int LengthSqr() const
+	{
+		return x * x + y * y + z * z;
+	}
+
+	//-------------------------------------------------------------------------
+	// "Strictly*" comparisons hold only when the relation is true for ALL three
+	// components; "Any*" comparisons hold when it is true for AT LEAST ONE.
+	//-------------------------------------------------------------------------
+	bool StrictlyGreater( const IntVector3D &rhs ) const
+	{
+		return x > rhs.x && y > rhs.y && z > rhs.z;
+	}
+
+	bool StrictlyGreaterOrEqual( const IntVector3D &rhs ) const
+	{
+		return x >= rhs.x && y >= rhs.y && z >= rhs.z;
+	}
+
+	bool StrictlyLess( const IntVector3D &rhs ) const
+	{
+		return x < rhs.x && y < rhs.y && z < rhs.z;
+	}
+
+	bool StrictlyLessOrEqual( const IntVector3D &rhs ) const
+	{
+		return x <= rhs.x && y <= rhs.y && z <= rhs.z;
+	}
+
+	bool AnyGreater( const IntVector3D &rhs ) const
+	{
+		return x > rhs.x || y > rhs.y || z > rhs.z;
+	}
+
+	bool AnyGreaterOrEqual( const IntVector3D &rhs ) const
+	{
+		return x >= rhs.x || y >= rhs.y || z >= rhs.z;
+	}
+
+	bool AnyLess( const IntVector3D &rhs ) const
+	{
+		return x < rhs.x || y < rhs.y || z < rhs.z;
+	}
+
+	bool AnyLessOrEqual( const IntVector3D &rhs ) const
+	{
+		return x <= rhs.x || y <= rhs.y || z <= rhs.z;
+	}
+};
+
+// Component-wise maximum of two integer vectors.
+inline IntVector3D Max( const IntVector3D &lhs, const IntVector3D &rhs )
+{
+	const int nMaxX = MAX( lhs.x, rhs.x );
+	const int nMaxY = MAX( lhs.y, rhs.y );
+	const int nMaxZ = MAX( lhs.z, rhs.z );
+	return IntVector3D( nMaxX, nMaxY, nMaxZ );
+}
+
+// Component-wise minimum of two integer vectors.
+// The parameters are named in (lhs, rhs) order to match Max(); the result does not depend on the order.
+inline IntVector3D Min( const IntVector3D &lhs, const IntVector3D &rhs )
+{
+	return IntVector3D( MIN( lhs.x, rhs.x ), MIN( lhs.y, rhs.y ), MIN( lhs.z, rhs.z ) );
+}
+
+// The zero vector. Declared 'static' in a header, so every translation unit that includes this file gets its own copy.
+static const IntVector3D INT_VECTOR3_ORIGIN( 0, 0, 0 );
+
+#endif // INTVECTOR3D_H
--- a/public/mathlib/lightdesc.h
+++ b/public/mathlib/lightdesc.h
@@ -0,0 +1,185 @@
+//===== Copyright © 1996-2005, Valve Corporation, All rights reserved. ======//
+//
+// Purpose: 
+//
+//===========================================================================//
+
+// light structure definitions.
+#ifndef LIGHTDESC_H
+#define LIGHTDESC_H
+
+#include <mathlib/ssemath.h>
+#include <mathlib/vector.h>
+
+//-----------------------------------------------------------------------------
+// Light structure
+//-----------------------------------------------------------------------------
+// Kind of light stored in LightDesc_t::m_Type.
+enum LightType_t
+{
+	MATERIAL_LIGHT_DISABLE = 0,
+	MATERIAL_LIGHT_POINT,			// set by LightDesc_t::InitPoint
+	MATERIAL_LIGHT_DIRECTIONAL,		// set by LightDesc_t::InitDirectional
+	MATERIAL_LIGHT_SPOT,			// set by LightDesc_t::InitSpot; the only type that is cone-tested
+};
+
+// Single-bit flag values (1, 2, 4, 8) that can be OR'ed together.
+// NOTE(review): presumably stored in LightDesc_t::m_Flags by RecalculateDerivedValues - confirm against the implementation.
+enum LightType_OptimizationFlags_t
+{
+	LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 = 1,
+	LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 = 2,
+	LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 = 4,
+	LIGHTTYPE_OPTIMIZATIONFLAGS_DERIVED_VALUES_CALCED = 8,
+};
+
+// Description of a single light (point, directional or spot) plus values derived from it
+// for software lighting.
+struct LightDesc_t 
+{
+	LightType_t m_Type;										//< MATERIAL_LIGHT_xxx
+	Vector m_Color;											//< color+intensity 
+	Vector m_Position;										//< light source center position
+	Vector m_Direction;										//< for SPOT, direction it is pointing
+	float  m_Range;											//< distance range for light.0=infinite
+	float m_Falloff;										//< angular falloff exponent for spot lights
+	float m_Attenuation0;									//< constant distance falloff term
+	float m_Attenuation1;									//< linear term of falloff
+	float m_Attenuation2;									//< quadratic term of falloff
+
+	// NOTE: theta and phi are *half angles*
+	float m_Theta;											//< inner cone angle. no angular falloff 
+															//< within this cone
+	float m_Phi;											//< outer cone angle
+
+	// the values below are derived from the above settings for optimizations
+	// These aren't used by DX8. . used for software lighting.
+
+	// NOTE: These dots are cos( m_Theta ), cos( m_Phi )
+	float m_ThetaDot;
+	float m_PhiDot;
+	float m_OneOverThetaDotMinusPhiDot;
+	unsigned int m_Flags;
+protected:
+	float m_RangeSquared;
+public:
+
+	void RecalculateDerivedValues(void);			 // calculate m_xxDot, m_Type for changed parms
+	void RecalculateOneOverThetaDotMinusPhiDot();
+
+	// The default constructor leaves every member uninitialized; call one of the Init*() functions before use.
+	LightDesc_t(void)
+	{
+	}
+
+	// constructors for various useful subtypes
+
+	// a point light with infinite range
+	LightDesc_t( const Vector &pos, const Vector &color )
+	{
+		InitPoint( pos, color );
+	}
+
+	// Bitwise copy of the whole structure, including the derived values.
+	// Self-assignment is skipped: memcpy must not be called with overlapping source and destination.
+	LightDesc_t &operator=( const LightDesc_t &src )
+	{
+		if ( this != &src )
+		{
+			memcpy( this, &src, sizeof(LightDesc_t) );
+		}
+		return *this;
+	}
+
+	/// a simple light. cone boundaries in radians. you pass a look_at point and the
+	/// direction is derived from that.
+	LightDesc_t( const Vector &pos, const Vector &color, const Vector &point_at,
+				float inner_cone_boundary, float outer_cone_boundary )
+	{
+		InitSpot( pos, color, point_at, inner_cone_boundary, outer_cone_boundary );
+	}
+
+	void InitPoint( const Vector &pos, const Vector &color );
+	void InitDirectional( const Vector &dir, const Vector &color );
+	void InitSpot(const Vector &pos, const Vector &color, const Vector &point_at,
+		float inner_cone_boundary, float outer_cone_boundary );
+
+	/// Given 4 points and 4 normals, ADD lighting from this light into "color".
+	void ComputeLightAtPoints( const FourVectors &pos, const FourVectors &normal,
+							   FourVectors &color, bool DoHalfLambert=false ) const;
+	void ComputeNonincidenceLightAtPoints( const FourVectors &pos, FourVectors &color ) const;
+	void ComputeLightAtPointsForDirectional( const FourVectors &pos,
+											 const FourVectors &normal,
+											 FourVectors &color, bool DoHalfLambert=false ) const;
+
+	// warning - modifies color!!! set color first!!
+	void SetupOldStyleAttenuation( float fQuadatricAttn, float fLinearAttn, float fConstantAttn );
+
+	void SetupNewStyleAttenuation( float fFiftyPercentDistance, float fZeroPercentDistance );
+
+
+	/// given a direction relative to the light source position, is this ray within the
+	/// light cone (for spotlights..non spots consider all rays to be within their cone)
+	bool IsDirectionWithinLightCone(const Vector &rdir) const
+	{
+		return ( ( m_Type != MATERIAL_LIGHT_SPOT ) || ( rdir.Dot(m_Direction) >= m_PhiDot ) );
+	}
+
+	/// Accessor for the cached m_OneOverThetaDotMinusPhiDot value.
+	float OneOverThetaDotMinusPhiDot() const
+	{
+		return m_OneOverThetaDotMinusPhiDot;
+	}
+
+	float DistanceAtWhichBrightnessIsLessThan( float flAmount ) const;
+};
+
+
+//-----------------------------------------------------------------------------
+// a point light with infinite range
+//-----------------------------------------------------------------------------
+inline void LightDesc_t::InitPoint( const Vector &pos, const Vector &color )
+{
+	// what and where
+	m_Type = MATERIAL_LIGHT_POINT;
+	m_Position = pos;
+	m_Color = color;
+
+	// constant attenuation only, no range limit (0 = infinite)
+	m_Attenuation0 = 1.0f;
+	m_Attenuation1 = 0.0f;
+	m_Attenuation2 = 0.0f;
+	m_Range = 0.0f;
+
+	// refresh the cached/derived members now that the inputs are set
+	RecalculateDerivedValues();
+}
+
+
+//-----------------------------------------------------------------------------
+// a directional light with infinite range
+//-----------------------------------------------------------------------------
+inline void LightDesc_t::InitDirectional( const Vector &dir, const Vector &color )
+{
+	// what and which way; the direction is stored as given (it is not normalized here)
+	m_Type = MATERIAL_LIGHT_DIRECTIONAL;
+	m_Direction = dir;
+	m_Color = color;
+
+	// constant attenuation only, no range limit (0 = infinite)
+	m_Attenuation0 = 1.0f;
+	m_Attenuation1 = 0.0f;
+	m_Attenuation2 = 0.0f;
+	m_Range = 0.0f;
+
+	// refresh the cached/derived members now that the inputs are set
+	RecalculateDerivedValues();
+}
+
+
+//-----------------------------------------------------------------------------
+// a simple light. cone boundaries in radians. you pass a look_at point and the
+// direction is derived from that.
+//-----------------------------------------------------------------------------
+inline void LightDesc_t::InitSpot(const Vector &pos, const Vector &color, const Vector &point_at,
+	float inner_cone_boundary, float outer_cone_boundary)
+{
+	m_Type = MATERIAL_LIGHT_SPOT;
+	m_Position = pos;
+	m_Color = color;
+
+	// aim from the light position toward the look-at point, then normalize
+	m_Direction = point_at;
+	m_Direction -= pos;
+	VectorNormalizeFast( m_Direction );
+
+	// cone shape: inner/outer boundaries and the angular falloff between them
+	m_Theta = inner_cone_boundary;
+	m_Phi = outer_cone_boundary;
+	m_Falloff = 5.0f;
+
+	// constant attenuation only, no range limit (0 = infinite)
+	m_Attenuation0 = 1.0f;
+	m_Attenuation1 = 0.0f;
+	m_Attenuation2 = 0.0f;
+	m_Range = 0.0f;
+
+	// refresh the cached/derived members now that the inputs are set
+	RecalculateDerivedValues();
+}
+
+
+#endif
+
--- a/public/mathlib/math_pfns.h
+++ b/public/mathlib/math_pfns.h
@@ -0,0 +1,355 @@
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=====================================================================================//
+
+#ifndef _MATH_PFNS_H_
+#define _MATH_PFNS_H_
+
+#include <limits>
+
+// YUP_ACTIVE is from Source2. It's (obviously) not supported on this branch, just including it here to help merge camera.cpp/.h and the CSM shadow code.
+//#define YUP_ACTIVE 1
+
+// Names for matrix axis / column indices.
+// FORWARD/LEFT/UP map onto different indices depending on whether YUP_ACTIVE is defined.
+enum MatrixAxisType_t
+{
+#ifdef YUP_ACTIVE
+	FORWARD_AXIS = 2,
+	LEFT_AXIS = 0,
+	UP_AXIS = 1,
+#else
+	FORWARD_AXIS = 0,
+	LEFT_AXIS = 1,
+	UP_AXIS = 2,
+#endif
+
+	X_AXIS = 0,
+	Y_AXIS = 1,
+	Z_AXIS = 2,
+	// ORIGIN and PROJECTIVE are two names for the same index (3)
+	ORIGIN = 3,
+	PROJECTIVE = 3,
+};
+
+#if defined( _X360 )
+#include <xboxmath.h>
+#elif defined(_PS3)
+
+#ifdef SPU
+#include <vectormath/c/vectormath_aos.h>
+#include <spu_intrinsics.h>
+#else
+#include <ppu_asm_intrinsics.h>
+#endif
+
+// Note that similar defines exist in ssemath.h
+// Maybe we should consolidate in one place for all platforms.
+
+#define _VEC_0x7ff		(vec_int4){0x7ff,0x7ff,0x7ff,0x7ff}
+#define _VEC_0x3ff		(vec_int4){0x3ff,0x3ff,0x3ff,0x3ff}
+#define _VEC_22L		(vector unsigned int){22,22,22,22}
+#define _VEC_11L		(vector unsigned int){11,11,11,11}
+#define _VEC_0L			(vector unsigned int){0,0,0,0}
+#define _VEC_255F		(vector float){255.0f,255.0f,255.0f,255.0f}
+#define _VEC_NEGONEF	(vector float){-1.0f,-1.0f,-1.0f,-1.0f}
+#define _VEC_ONEF		(vector float){1.0f,1.0f,1.0f,1.0f}
+#define _VEC_ZEROF		(vector float){0.0f,0.0f,0.0f,0.0f}
+#define _VEC_ZEROxyzONEwF (vector float){0.0f,0.0f,0.0f,1.0f}
+#define _VEC_HALFF		(vector float){0.5f,0.5f,0.5f,0.5f}
+#define _VEC_HALFxyzZEROwF	(vector float){0.5f,0.5f,0.5f,0.0f}
+#define _VEC_PERMUTE_XYZ0W1   (vector unsigned char){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x1c,0x1d,0x1e,0x1f}
+
+#define _VEC_IEEEHACK (vector float){(float)(1 << 23),(float)(1 << 23),(float)(1 << 23),(float)(1 << 23)}
+#define _VEC_PERMUTE_FASTFTOC (vector unsigned char){0,0,0,0,0,0,0,0,0,0,0,0,0x03,0x07,0x0b,0x0f}
+
+// AngleQuaternion
+#define _VEC_PERMUTE_AQsxsxcxcx (vector unsigned char) {0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x10,0x11,0x12,0x13}	
+#define _VEC_PERMUTE_AQczszszcz (vector unsigned char) {0x18,0x19,0x1a,0x1b,0x08,0x09,0x0a,0x0b,0x08,0x09,0x0a,0x0b,0x18,0x19,0x1a,0x1b}	
+#define _VEC_PERMUTE_AQcxcxsxsx (vector unsigned char) {0x10,0x11,0x12,0x13,0x10,0x11,0x12,0x13,0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03}	
+#define _VEC_PERMUTE_AQszczczsz (vector unsigned char) {0x08,0x09,0x0a,0x0b,0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b,0x08,0x09,0x0a,0x0b}	
+#define _VEC_PERMUTE_ANGLEQUAT  (vector unsigned char) {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x18,0x19,0x1a,0x1b,0x0c,0x0d,0x0e,0x0f}	
+
+#define _VEC_EPSILONF		(__vector float)			{FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON}
+
+#endif
+
+#if !(defined( PLATFORM_PPC ) || defined(SPU))
+// If we are not PPC based or SPU based, then assumes it is SSE2. We should make this code cleaner.
+
+#include <xmmintrin.h>
+
+
+
+// These globals are initialized by mathlib and redirected based on available fpu features
+
+// The following are not declared as macros because they are often used in limiting situations,
+// and sometimes the compiler simply refuses to inline them for some reason
+// Scalar square root computed with the SSE sqrtss instruction.
+FORCEINLINE float VECTORCALL FastSqrt( float x )
+{
+	// _mm_cvtss_f32 reads lane 0 of the register directly, avoiding a pointer type-pun on the __m128
+	return _mm_cvtss_f32( _mm_sqrt_ss( _mm_load_ss( &x ) ) );
+}
+
+// Raw hardware estimate of 1/sqrt(x) from the SSE rsqrtss instruction (no refinement step).
+FORCEINLINE float VECTORCALL FastRSqrtFast( float x )
+{
+	// _mm_cvtss_f32 reads lane 0 of the register directly, avoiding a pointer type-pun on the __m128
+	return _mm_cvtss_f32( _mm_rsqrt_ss( _mm_load_ss( &x ) ) );
+}
+// Single iteration NewtonRaphson reciprocal square root:
+// 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x)) 	
+// Very low error, and fine to use in place of 1.f / sqrtf(x).	
+FORCEINLINE float VECTORCALL FastRSqrt( float x )
+{
+	// start from the hardware estimate...
+	const float flEstimate = FastRSqrtFast( x );
+
+	// ...and apply one Newton-Raphson step: 0.5 * e * (3 - x * e * e)
+	const float flHalfEstimate = 0.5f * flEstimate;
+	const float flCorrection = 3.f - ( x * flEstimate ) * flEstimate;
+	return flHalfEstimate * flCorrection;
+}
+
+void FastSinCos( float x, float* s, float* c );  // any x
+float FastCos( float x );
+
+
+
+// Reciprocal of x, computed with an ordinary float division on this platform.
+inline float FastRecip( float x )
+{
+	const float flReciprocal = 1.0f / x;
+	return flReciprocal;
+}
+// Simple SSE rsqrt.  Usually accurate to around 6 (relative) decimal places 
+// or so, so ok for closed transforms.  (ie, computing lighting normals)
+// Square root estimate: (estimate of 1/sqrt(x)) * x.
+inline float FastSqrtEst( float x )
+{
+	const float flInvRoot = FastRSqrtFast( x );
+	return flInvRoot * x;
+}
+
+
+#else // !defined( PLATFORM_PPC ) && !defined(_SPU)
+
+#ifndef SPU
+// We may not need this for SPU, so let's not bother for now
+
+// Single-precision square root via the __fsqrts intrinsic.
+FORCEINLINE float _VMX_Sqrt( float x )
+{
+	const float flRoot = __fsqrts( x );
+	return flRoot;
+}
+
+// Reciprocal square root: __frsqrte estimate refined by one Newton-Raphson step.
+FORCEINLINE double _VMX_RSqrt( double x )
+{
+	const double flEstimate = __frsqrte( x );
+
+	// 0.5 * e * (3 - x * e * e)
+	const double flHalfEstimate = 0.5f * flEstimate;
+	const double flCorrection = 3.0f - ( x * flEstimate ) * flEstimate;
+	return flHalfEstimate * flCorrection;
+}
+
+// Unrefined reciprocal square root estimate straight from __frsqrte.
+FORCEINLINE double _VMX_RSqrtFast( double x )
+{
+	const double flEstimate = __frsqrte( x );
+	return flEstimate;
+}
+
+#ifdef _X360
+// Xbox 360: sine and cosine of 'a' in one call; results are written through pS and pC.
+FORCEINLINE void _VMX_SinCos( float a, float *pS, float *pC )
+{
+	// note the argument order of the XM call: outputs first, angle last
+	XMScalarSinCos( pS, pC, a );
+}
+
+// Xbox 360: cosine of 'a' via XMScalarCos.
+FORCEINLINE float _VMX_Cos( float a )
+{
+	const float flCosine = XMScalarCos( a );
+	return flCosine;
+}
+#endif
+
+// the 360 has fixed hw and calls directly
+#define FastSqrt(x)			_VMX_Sqrt(x)
+#define	FastRSqrt(x)		_VMX_RSqrt(x)
+#define FastRSqrtFast(x)	_VMX_RSqrtFast(x)
+#define FastSinCos(x,s,c)	_VMX_SinCos(x,s,c)
+#define FastCos(x)			_VMX_Cos(x)
+
+// Reciprocal via the __fres intrinsic.
+inline double FastRecip( double x )
+{
+	return __fres( x );
+}
+// Square root estimate: (__frsqrte estimate of 1/sqrt(x)) * x.
+inline double FastSqrtEst( double x )
+{
+	return __frsqrte( x ) * x;
+}
+
+#endif // #ifndef SPU
+
+
+// if x is infinite, return FLT_MAX
+inline float FastClampInfinity( float x )
+{
+#ifdef PLATFORM_PPC
+	// infinity - x is NaN only when x is +infinity.
+	// NOTE(review): presumably fsel returns its second argument when the first is >= 0 and the third otherwise - confirm
+	return fsel( std::numeric_limits<float>::infinity() - x, x, FLT_MAX );
+#else
+	// only +infinity compares greater than FLT_MAX; NaN and everything else pass through unchanged
+	if ( x > FLT_MAX )
+	{
+		return FLT_MAX;
+	}
+	return x;
+#endif
+}
+
+#if defined (_PS3) 
+
+#if defined(__SPU__)
+
+// SPU: rotate 'a' left by 'count' using the vector rotate intrinsic on element 0.
+inline int _rotl( int a, int count )
+{
+	// place the scalar in element 0 of a vector, rotate, then read element 0 back
+	vector signed int vRotated = spu_rl( spu_promote( a, 0 ), count );
+	return spu_extract( vRotated, 0 );
+}
+
+#else
+
+// extern float cosvf(float);      /* single precision cosine      */
+// extern float sinvf(float);      /* single precision sine        */
+// TODO: need a faster single precision equivalent
+#define cosvf cosf
+#define sinvf sinf
+
+// PPU: 32-bit rotate left, expressed through the __rlwimi intrinsic with the full 0..31 mask.
+inline int _rotl( int x, int c )
+{
+	const int nRotated = __rlwimi( x, x, c, 0, 31 );
+	return nRotated;
+}
+
+// PPU: 64-bit rotate left, expressed through the __rldicl intrinsic with a mask start of 0.
+inline int64 _rotl64( int64 x, int c )
+{
+	const int64 nRotated = __rldicl( x, c, 0 );
+	return nRotated;
+}
+
+
+/*
+FORCEINLINE float _VMX_Sqrt( float x )
+{
+	vector_float_union vIn, vOut;
+
+	vIn.f[0] = x;
+
+	vOut.vf = sqrtf4(vIn.vf);
+	
+	return vOut.f[0];
+}
+
+FORCEINLINE float _VMX_RSqrt( float x )
+{
+	vector_float_union vIn, vOut;
+
+	vIn.f[0] = x;
+
+	vOut.vf = rsqrtf4(vIn.vf);
+
+	return vOut.f[0];
+}
+
+FORCEINLINE float _VMX_RSqrtFast( float x )
+{
+	vector_float_union vIn, vOut;
+
+	vIn.f[0] = x;
+
+	vOut.vf = rsqrtf4fast(vIn.vf);
+
+	return vOut.f[0];
+}
+*/
+
+// PS3 (PPU): sine and cosine of 'a', written through pS and pC.
+FORCEINLINE void _VMX_SinCos( float a, float *pS, float *pC )
+{
+	// sine is computed and stored before cosine
+	*pS = sinvf( a );
+	*pC = cosvf( a );
+}
+
+// PS3 (PPU): cosine of 'a' via cosvf.
+FORCEINLINE float _VMX_Cos( float a )
+{
+	return cosvf( a );
+}
+
+
+// the 360 has fixed hw and calls directly
+/*
+#define FastSqrt(x)			_VMX_Sqrt(x)
+#define	FastRSqrt(x)		_VMX_RSqrt(x)
+#define FastRSqrtFast(x)	_VMX_RSqrtFast(x)
+#define FastSinCos(x,s,c)	_VMX_SinCos(x,s,c)
+#define FastCos(x)			_VMX_Cos(x)
+*/
+
+#endif
+
+
+#if defined(__SPU__)
+
+// do we need these optimized yet?
+
+// SPU: plain library square root.
+FORCEINLINE float FastSqrt( float x )
+{
+	const float flRoot = sqrtf( x );
+	return flRoot;
+}
+
+// SPU: reciprocal square root as 1 / (sqrt(x) + FLT_EPSILON).
+// The added epsilon keeps the divisor non-zero when x is 0.
+FORCEINLINE float FastRSqrt( float x )
+{
+	return 1.f / ( sqrtf( x ) + FLT_EPSILON );
+}
+
+
+#define FastRSqrtFast(x)	FastRSqrt(x)
+
+
+#endif
+
+
+
+//-----------------------------------------------------------------
+// Vector Unions
+//-----------------------------------------------------------------
+
+//-----------------------------------------------------------------
+// Floats
+//-----------------------------------------------------------------
+// Overlays a native 'vector float' with an array of its four float lanes for per-element access.
+typedef union
+{
+	vector float vf;
+	float f[4];
+} vector_float_union;
+
+#if !defined(__SPU__)
+//-----------------------------------------------------------------
+// Ints
+//-----------------------------------------------------------------
+// Each union below overlays one native vector type with a same-sized array of its elements
+// (4 x 32-bit, 8 x 16-bit or 16 x 8-bit), in signed and unsigned flavours.
+typedef union
+{
+	vector int vi;
+	int i[4];
+} vector_int4_union;
+
+typedef union
+{
+	vector unsigned int vui;
+	unsigned int ui[4];
+} vector_uint4_union;
+
+//-----------------------------------------------------------------
+// Shorts
+//-----------------------------------------------------------------
+typedef union
+{
+	vector signed short vs;
+	signed short s[8];
+} vector_short8_union;
+
+typedef union
+{
+	vector unsigned short vus;
+	unsigned short us[8];
+} vector_ushort8_union;
+
+//-----------------------------------------------------------------
+// Chars
+//-----------------------------------------------------------------
+typedef union
+{
+	vector signed char vc;
+	signed char c[16];
+} vector_char16_union;
+
+typedef union
+{
+	vector unsigned char vuc;
+	unsigned char uc[16];
+} vector_uchar16_union;
+#endif
+
+
+
+#endif	// _PS3
+#endif	// !( defined( PLATFORM_PPC ) || defined( SPU ) )
+
+#endif // _MATH_PFNS_H_
--- a/public/mathlib/mathlib.h
+++ b/public/mathlib/mathlib.h
--- a/public/mathlib/matrixmath.h
+++ b/public/mathlib/matrixmath.h
@@ -0,0 +1,385 @@
+//===== Copyright © 1996-2011, Valve Corporation, All rights reserved. ======//
+//
+// Purpose: 
+//
+//  A set of generic, template-based matrix functions.
+//===========================================================================//
+
+#ifndef MATRIXMATH_H
+#define MATRIXMATH_H
+
+#include <stdarg.h>
+
+// The operations in this file can perform basic matrix operations on matrices represented
+// using any class that supports the necessary operations:
+//
+//  .Element( row, col )  - return the element at a given matrix position
+//  .SetElement( row, col, val ) - modify an element
+//  .Width(), .Height() - get dimensions
+//  .SetDimensions( nrows, ncols) - set a matrix to be un-initted and the appropriate size
+//
+// Generally, vectors can be used with these functions by using N x 1 matrices to represent them.
+//  Matrices are addressed as row, column, and indices are 0-based
+//
+//
+// Note that the template versions of these routines are defined for generality - it is expected
+// that template specialization is used for common high performance cases.
+
+namespace MatrixMath
+{
+	/// M *= flScaleValue
+	/// Multiplies every element of the matrix by flScaleValue, in place. Elements are visited
+	/// row by row and rewritten through Element()/SetElement().
+	template<class MATRIXCLASS>
+	void ScaleMatrix( MATRIXCLASS &matrix, float flScaleValue )
+	{
+		for( int nRow = 0; nRow < matrix.Height(); ++nRow )
+		{
+			for( int nCol = 0; nCol < matrix.Width(); ++nCol )
+			{
+				matrix.SetElement( nRow, nCol,
+								   flScaleValue * matrix.Element( nRow, nCol ) );
+			}
+		}
+	}
+
+	/// AppendElement - same as setting the element, except that it only works when all calls
+	/// happen in top-to-bottom, left-to-right order, and you have to call FinishedAppending()
+	/// when done. For normal matrix classes this is no different than SetElement, but for
+	/// CSparseMatrix it is an accelerated way to fill a matrix from scratch.
+	///
+	/// This generic version is the default implementation and just forwards to SetElement().
+	template<class MATRIXCLASS>
+	FORCEINLINE void AppendElement( MATRIXCLASS &matrix, int nRow, int nCol, float flValue )
+	{
+		matrix.SetElement( nRow, nCol, flValue );
+	}
+
+	/// Marks the end of a sequence of AppendElement() calls. The default implementation has
+	/// nothing to finalize and therefore does nothing.
+	template<class MATRIXCLASS>
+	FORCEINLINE void FinishedAppending( MATRIXCLASS &matrix )
+	{
+	}
+
+	/// M += fl
+	/// Adds the scalar flAddend to every element of the matrix, in place.
+	template<class MATRIXCLASS>
+	void AddToMatrix( MATRIXCLASS &matrix, float flAddend )
+	{
+		for( int nRow = 0; nRow < matrix.Height(); ++nRow )
+		{
+			for( int nCol = 0; nCol < matrix.Width(); ++nCol )
+			{
+				matrix.SetElement( nRow, nCol,
+								   flAddend + matrix.Element( nRow, nCol ) );
+			}
+		}
+	}
+
+	/// transpose
+	/// Writes the transpose of matrixIn into *pMatrixOut. The output is first resized so that
+	/// its row count is the input's width and its column count is the input's height, then
+	/// filled row by row through AppendElement() and closed with FinishedAppending().
+	template<class MATRIXCLASSIN, class MATRIXCLASSOUT>
+	void TransposeMatrix( MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut )
+	{
+		pMatrixOut->SetDimensions( matrixIn.Width(), matrixIn.Height() );
+		for( int nOutRow = 0; nOutRow < pMatrixOut->Height(); ++nOutRow )
+		{
+			for( int nOutCol = 0; nOutCol < pMatrixOut->Width(); ++nOutCol )
+			{
+				// output (row, col) is input (col, row)
+				AppendElement( *pMatrixOut, nOutRow, nOutCol,
+							   matrixIn.Element( nOutCol, nOutRow ) );
+			}
+		}
+		FinishedAppending( *pMatrixOut );
+	}
+
+	/// copy
+	/// Resizes *pMatrixOut to the dimensions of matrixIn and copies every element across,
+	/// row by row, through AppendElement(); FinishedAppending() is called at the end.
+	template<class MATRIXCLASSIN, class MATRIXCLASSOUT>
+	void CopyMatrix( MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut )
+	{
+		pMatrixOut->SetDimensions( matrixIn.Height(), matrixIn.Width() );
+		for( int nRow = 0; nRow < matrixIn.Height(); ++nRow )
+		{
+			for( int nCol = 0; nCol < matrixIn.Width(); ++nCol )
+			{
+				AppendElement( *pMatrixOut, nRow, nCol,
+							   matrixIn.Element( nRow, nCol ) );
+			}
+		}
+		FinishedAppending( *pMatrixOut );
+	}
+
+
+
+	/// M+=M
+	/// Adds matrixIn element-wise into *pMatrixOut. The loop bounds come from matrixIn and the
+	/// output is not resized, so the caller must supply an output at least as large.
+	template<class MATRIXCLASSIN, class MATRIXCLASSOUT>
+	void AddMatrixToMatrix( MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut )
+	{
+		for( int nRow = 0; nRow < matrixIn.Height(); ++nRow )
+		{
+			for( int nCol = 0; nCol < matrixIn.Width(); ++nCol )
+			{
+				pMatrixOut->SetElement( nRow, nCol,
+										pMatrixOut->Element( nRow, nCol ) + matrixIn.Element( nRow, nCol ) );
+			}
+		}
+	}
+
+	/// M += scale * M
+	/// Adds flScale * matrixIn element-wise into *pMatrixOut. The loop bounds come from
+	/// matrixIn and the output is not resized, so the caller must supply an output at least
+	/// as large.
+	template<class MATRIXCLASSIN, class MATRIXCLASSOUT>
+	void AddScaledMatrixToMatrix( float flScale, MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut )
+	{
+		for( int nRow = 0; nRow < matrixIn.Height(); ++nRow )
+		{
+			for( int nCol = 0; nCol < matrixIn.Width(); ++nCol )
+			{
+				pMatrixOut->SetElement( nRow, nCol,
+										pMatrixOut->Element( nRow, nCol ) + flScale * matrixIn.Element( nRow, nCol ) );
+			}
+		}
+	}
+
+
+	/// Fills *pMatrixOut with flDiagonalValue on the main diagonal and zero everywhere else.
+	/// The matrix is not resized: its current Height()/Width() are used, so it must already
+	/// have the desired dimensions. Elements are written through AppendElement().
+	template<class MATRIXCLASSOUT> 
+	void SetMatrixToIdentity( MATRIXCLASSOUT *pMatrixOut, float flDiagonalValue = 1.0 )
+	{
+		for( int nRow = 0; nRow < pMatrixOut->Height(); ++nRow )
+		{
+			for( int nCol = 0; nCol < pMatrixOut->Width(); ++nCol )
+			{
+				const float flValue = ( nRow == nCol ) ? flDiagonalValue : 0.0f;
+				AppendElement( *pMatrixOut, nRow, nCol, flValue );
+			}
+		}
+		FinishedAppending( *pMatrixOut );
+	}
+
+	/// Simple way to initialize a matrix with constants from code. The matrix is resized to
+	/// nRows x nCols and then filled in row-major order from the variable arguments.
+	/// Every variadic argument is read as a double, so exactly nRows * nCols floating point
+	/// values must be passed (an integer literal such as 1 instead of 1.0 is not valid).
+	template<class MATRIXCLASSOUT> 
+	void SetMatrixValues( MATRIXCLASSOUT *pMatrix, int nRows, int nCols, ... )
+	{
+		va_list args;
+		va_start( args, nCols );
+
+		pMatrix->SetDimensions( nRows, nCols );
+		for( int r = 0; r < nRows; ++r )
+		{
+			for( int c = 0; c < nCols; ++c )
+			{
+				double flNext = va_arg( args, double );
+				pMatrix->SetElement( r, c, flNext );
+			}
+		}
+		va_end( args );
+	}
+
+
+	/// Row and column accessors: each treats a single row or column of a matrix as a column
+	/// vector (an N x 1 matrix), reading straight from the wrapped matrix.
+	///
+	/// MatrixRowAccessor exposes row nRow. Only a pointer to the matrix is stored, so the
+	/// matrix must outlive the accessor.
+	template<class MATRIXTYPE> class MatrixRowAccessor
+	{
+	public:
+		FORCEINLINE MatrixRowAccessor( MATRIXTYPE const &matrix, int nRow )
+			: m_pMatrix( &matrix ), m_nRow( nRow )
+		{
+		}
+
+		/// Entry nRow of the vector is column nRow of the wrapped row; nCol must be 0.
+		FORCEINLINE float Element( int nRow, int nCol ) const
+		{
+			Assert( nCol == 0 );
+			return m_pMatrix->Element( m_nRow, nRow );
+		}
+
+		/// Always one column wide.
+		FORCEINLINE int Width( void ) const
+		{
+			return 1;
+		}
+
+		/// One entry per column of the wrapped matrix.
+		FORCEINLINE int Height( void ) const
+		{
+			return m_pMatrix->Width();
+		}
+
+	private:
+		MATRIXTYPE const *m_pMatrix;
+		int m_nRow;
+	};
+
+	/// Presents column nColumn of a matrix as an N x 1 column vector. Only a pointer to the
+	/// matrix is stored, so the matrix must outlive the accessor.
+	template<class MATRIXTYPE> class MatrixColumnAccessor
+	{
+	public:
+		FORCEINLINE MatrixColumnAccessor( MATRIXTYPE const &matrix, int nColumn )
+			: m_pMatrix( &matrix ), m_nColumn( nColumn )
+		{
+		}
+
+		/// Entry nRow of the vector is row nRow of the wrapped column; nColumn must be 0.
+		FORCEINLINE float Element( int nRow, int nColumn ) const
+		{
+			Assert( nColumn == 0 );
+			return m_pMatrix->Element( nRow, m_nColumn );
+		}
+
+		/// Always one column wide.
+		FORCEINLINE int Width( void ) const
+		{
+			return 1;
+		}
+
+		/// One entry per row of the wrapped matrix.
+		FORCEINLINE int Height( void ) const
+		{
+			return m_pMatrix->Height();
+		}
+
+	private:
+		MATRIXTYPE const *m_pMatrix;
+		int m_nColumn;
+	};
+
+	/// Read-only proxy for the transpose of a matrix: element (row, col) of the proxy is
+	/// element (col, row) of the wrapped matrix, and width and height are swapped. Only a
+	/// pointer to the matrix is stored, so the matrix must outlive the accessor.
+	template<class MATRIXTYPE> class MatrixTransposeAccessor
+	{
+	public:
+		FORCEINLINE MatrixTransposeAccessor( MATRIXTYPE const & matrix )
+			: m_pMatrix( &matrix )
+		{
+		}
+
+		FORCEINLINE float Element( int nRow, int nColumn ) const
+		{
+			return m_pMatrix->Element( nColumn, nRow );
+		}
+
+		FORCEINLINE int Width( void ) const
+		{
+			return m_pMatrix->Height();
+		}
+
+		FORCEINLINE int Height( void ) const
+		{
+			return m_pMatrix->Width();
+		}
+
+	private:
+		MATRIXTYPE const *m_pMatrix;
+	};
+
+	/// This transpose returns a wrapper around its argument, allowing things like
+	/// AddMatrixToMatrix( TransposeMatrix( matA ), &matB ) without an extra copy. The wrapper
+	/// refers to matrixIn rather than copying it.
+	template<class MATRIXCLASSIN>
+	MatrixTransposeAccessor<MATRIXCLASSIN> TransposeMatrix( MATRIXCLASSIN const &matrixIn )
+	{
+		MatrixTransposeAccessor<MATRIXCLASSIN> transposed( matrixIn );
+		return transposed;
+	}
+
+
+	/// Retrieve rows and columns.
+	/// MatrixColumn returns an accessor that presents column nColumn of the matrix as a
+	/// column vector; the accessor refers to the matrix rather than copying it.
+	template<class MATRIXTYPE>
+	FORCEINLINE MatrixColumnAccessor<MATRIXTYPE> MatrixColumn( MATRIXTYPE const &matrix, int nColumn )
+	{
+		MatrixColumnAccessor<MATRIXTYPE> column( matrix, nColumn );
+		return column;
+	}
+
+	/// Returns an accessor that presents row nRow of the matrix as a column vector; the
+	/// accessor refers to the matrix rather than copying it.
+	template<class MATRIXTYPE>
+	FORCEINLINE MatrixRowAccessor<MATRIXTYPE> MatrixRow( MATRIXTYPE const &matrix, int nRow )
+	{
+		MatrixRowAccessor<MATRIXTYPE> row( matrix, nRow );
+		return row;
+	}
+
+	/// Dot product between two vectors (or rows and/or columns via accessors). Both operands
+	/// must be one column wide and of equal height. The sum is accumulated in double
+	/// precision and narrowed to float on return.
+	template<class MATRIXACCESSORATYPE, class MATRIXACCESSORBTYPE >
+	float InnerProduct( MATRIXACCESSORATYPE const &vecA, MATRIXACCESSORBTYPE const &vecB )
+	{
+		Assert( vecA.Width() == 1 );
+		Assert( vecB.Width() == 1 );
+		Assert( vecA.Height() == vecB.Height() );
+
+		double flSum = 0;
+		for( int nEntry = 0; nEntry < vecA.Height(); ++nEntry )
+		{
+			flSum += vecA.Element( nEntry, 0 ) * vecB.Element( nEntry, 0 );
+		}
+		return flSum;
+	}
+
+
+
+	/// Matrix x matrix multiplication: *pMatrixOut = matA * matB. The output is resized to
+	/// matA.Height() x matB.Width(), and each element is the inner product of a row of matA
+	/// with a column of matB. The output is written while the inputs are still being read,
+	/// so pMatrixOut must not be the same object as matA or matB.
+	template<class MATRIXATYPE, class MATRIXBTYPE, class MATRIXOUTTYPE>
+	void MatrixMultiply( MATRIXATYPE const &matA, MATRIXBTYPE const &matB, MATRIXOUTTYPE *pMatrixOut )
+	{
+		Assert( matA.Width() == matB.Height() );
+		pMatrixOut->SetDimensions( matA.Height(), matB.Width() );
+		for( int nRow = 0; nRow < matA.Height(); ++nRow )
+		{
+			// the row accessor is just a (pointer, index) pair, so build it once per row
+			MatrixRowAccessor<MATRIXATYPE> rowOfA = MatrixRow( matA, nRow );
+			for( int nCol = 0; nCol < matB.Width(); ++nCol )
+			{
+				float flDot = InnerProduct( rowOfA, MatrixColumn( matB, nCol ) );
+				pMatrixOut->SetElement( nRow, nCol, flDot );
+			}
+		}
+	}
+
+	/// Solve Ax=B via the conjugate gradient method. Code and naming conventions are based on
+	/// the wikipedia article.
+	///
+	/// vecX is both input and output: on entry it holds the initial guess, on return the
+	/// refined solution. At most 100 iterations are run; iteration stops early once the squared
+	/// residual norm drops below flTolerance, or when the search direction can make no further
+	/// progress (p'Ap == 0, e.g. because the initial guess already solves the system). Without
+	/// that last check the step size would be computed as x/0 and fill vecX with NaN/Inf.
+	template<class ATYPE, class XTYPE, class BTYPE>
+	void ConjugateGradient( ATYPE const &matA, BTYPE const &vecB, XTYPE &vecX, float flTolerance = 1.0e-20 )
+	{
+		// r = b - A * x
+		XTYPE vecR;
+		vecR.SetDimensions( vecX.Height(), 1 );
+		MatrixMultiply( matA, vecX, &vecR );
+		ScaleMatrix( vecR, -1 );
+		AddMatrixToMatrix( vecB, &vecR );
+		// p = r
+		XTYPE vecP;
+		CopyMatrix( vecR, &vecP );
+		float flRsOld = InnerProduct( vecR, vecR );
+		for( int nIter = 0; nIter < 100; nIter++ )
+		{
+			XTYPE vecAp;
+			MatrixMultiply( matA, vecP, &vecAp );
+			float flDivisor = InnerProduct( vecAp, vecP );
+			if ( flDivisor == 0.0f )
+			{
+				// no step along p is possible; leave vecX as it is rather than poisoning it
+				break;
+			}
+			float flAlpha = flRsOld / flDivisor;
+			AddScaledMatrixToMatrix( flAlpha, vecP, &vecX );			// x += alpha * p
+			AddScaledMatrixToMatrix( -flAlpha, vecAp, &vecR );			// r -= alpha * A * p
+			float flRsNew = InnerProduct( vecR, vecR );
+			if ( flRsNew < flTolerance )
+			{
+				break;
+			}
+			// p = r + ( rsNew / rsOld ) * p
+			ScaleMatrix( vecP, flRsNew / flRsOld );
+			AddMatrixToMatrix( vecR, &vecP );
+			flRsOld = flRsNew;
+		}
+	}
+
+	/// Solve (A'*A) x=B via the conjugate gradient method. Code and naming conventions are
+	/// based on the wikipedia article. Same as the single-matrix ConjugateGradient, but allows
+	/// passing in two matrices whose product is used as the A matrix (in order to preserve
+	/// sparsity): every product with (A'*A) is evaluated as matAPrime * ( matA * v ).
+	///
+	/// vecX is both input and output: on entry it holds the initial guess, on return the
+	/// refined solution. At most 100 iterations are run; iteration stops early once the squared
+	/// residual norm drops below flTolerance, or when the search direction can make no further
+	/// progress (p'(A'A)p == 0, e.g. because the initial guess already solves the system).
+	/// Without that last check the step size would be computed as x/0 and fill vecX with
+	/// NaN/Inf.
+	template<class ATYPE, class APRIMETYPE, class XTYPE, class BTYPE>
+	void ConjugateGradient( ATYPE const &matA, APRIMETYPE const &matAPrime, BTYPE const &vecB, XTYPE &vecX, float flTolerance = 1.0e-20 )
+	{
+		// r = b - A' * ( A * x )
+		XTYPE vecR1;
+		vecR1.SetDimensions( vecX.Height(), 1 );
+		MatrixMultiply( matA, vecX, &vecR1 );
+		XTYPE vecR;
+		vecR.SetDimensions( vecR1.Height(), 1 );
+		MatrixMultiply( matAPrime, vecR1, &vecR );
+		ScaleMatrix( vecR, -1 );
+		AddMatrixToMatrix( vecB, &vecR );
+		// p = r
+		XTYPE vecP;
+		CopyMatrix( vecR, &vecP );
+		float flRsOld = InnerProduct( vecR, vecR );
+		for( int nIter = 0; nIter < 100; nIter++ )
+		{
+			// Ap = A' * ( A * p )
+			XTYPE vecAp1;
+			MatrixMultiply( matA, vecP, &vecAp1 );
+			XTYPE vecAp;
+			MatrixMultiply( matAPrime, vecAp1, &vecAp );
+			float flDivisor = InnerProduct( vecAp, vecP );
+			if ( flDivisor == 0.0f )
+			{
+				// no step along p is possible; leave vecX as it is rather than poisoning it
+				break;
+			}
+			float flAlpha = flRsOld / flDivisor;
+			AddScaledMatrixToMatrix( flAlpha, vecP, &vecX );			// x += alpha * p
+			AddScaledMatrixToMatrix( -flAlpha, vecAp, &vecR );			// r -= alpha * Ap
+			float flRsNew = InnerProduct( vecR, vecR );
+			if ( flRsNew < flTolerance )
+			{
+				break;
+			}
+			// p = r + ( rsNew / rsOld ) * p
+			ScaleMatrix( vecP, flRsNew / flRsOld );
+			AddMatrixToMatrix( vecR, &vecP );
+			flRsOld = flRsNew;
+		}
+	}
+
+	
+	/// Least-squares solution of A x = B through the normal equations (A'A) x = A'B, which are
+	/// handed to the two-matrix ConjugateGradient(). vecX is resized to matA.Width() x 1 and
+	/// overwritten; its previous contents are not used as a starting point.
+	template<class ATYPE,  class XTYPE, class BTYPE>
+	void LeastSquaresFit( ATYPE const &matA, BTYPE const &vecB, XTYPE &vecX )
+	{
+		// right-hand side of the normal equations: A' * B
+		BTYPE vecAtB;
+		MatrixMath::MatrixMultiply( MatrixMath::TransposeMatrix( matA ), vecB, &vecAtB );
+
+		// starting guess for the solver: SetMatrixToIdentity on an N x 1 vector yields
+		// ( 1, 0, 0, ... )
+		vecX.SetDimensions( matA.Width(), 1 );
+		MatrixMath::SetMatrixToIdentity( &vecX );
+
+		// explicit A' for the left-hand side product A' * A
+		ATYPE matAt;
+		TransposeMatrix( matA, &matAt );
+		ConjugateGradient( matA, matAt, vecAtB, vecX, 1.0e-20 );
+	}
+
+};
+
+/// A simple fixed-size matrix class: NUMROWS x NUMCOLS floats stored inline in row-major
+/// order. It provides the Element/SetElement/Width/Height/SetDimensions interface that the
+/// MatrixMath templates expect. Elements are not initialized on construction.
+template<int NUMROWS, int NUMCOLS> class CFixedMatrix
+{
+public:
+	FORCEINLINE int Width( void ) const
+	{
+		return NUMCOLS;
+	}
+
+	FORCEINLINE int Height( void ) const
+	{
+		return NUMROWS;
+	}
+
+	FORCEINLINE float Element( int nRow, int nCol ) const
+	{
+		return m_flValues[nRow][nCol];
+	}
+
+	FORCEINLINE void SetElement( int nRow, int nCol, float flValue )
+	{
+		m_flValues[nRow][nCol] = flValue;
+	}
+
+	/// The size is fixed at compile time, so this only asserts that the request matches it.
+	FORCEINLINE void SetDimensions( int nNumRows, int nNumCols )
+	{
+		Assert( ( nNumRows == NUMROWS ) && ( nNumCols == NUMCOLS ) );
+	}
+
+private:
+	float m_flValues[NUMROWS][NUMCOLS];
+};
+
+
+
+#endif //matrixmath_h
--- a/public/mathlib/noise.h
+++ b/public/mathlib/noise.h
@@ -0,0 +1,35 @@
+//========= Copyright © 1996-2006, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=====================================================================================//
+
+#ifndef NOISE_H
+#define NOISE_H
+
+#include <math.h>
+#include "basetypes.h"
+#include "mathlib/vector.h"
+#include "tier0/dbg.h"
+
+
+// The following code is the c-ification of Ken Perlin's new noise algorithm
+// "JAVA REFERENCE IMPLEMENTATION OF IMPROVED NOISE - COPYRIGHT 2002 KEN PERLIN"
+// as available here: http://mrl.nyu.edu/~perlin/noise/
+// it generates a single octave of noise in the -1..1 range
+// this should at some point probably replace SparseConvolutionNoise - jd
+float ImprovedPerlinNoise( Vector const &pnt );
+
+// get the noise value at a point. Output range is 0..1.
+float SparseConvolutionNoise( Vector const &pnt );
+
+// get the noise value at a point, passing a custom noise shaping function. The noise shaping
+// function should map the domain 0..1 to 0..1.
+float SparseConvolutionNoise(Vector const &pnt, float (*pNoiseShapeFunction)(float) );
+
+// returns a 1/f noise. more octaves take longer
+float FractalNoise( Vector const &pnt, int n_octaves );
+
+// returns an abs(f)*1/f noise, i.e. turbulence
+float Turbulence( Vector const &pnt, int n_octaves );
+#endif // NOISE_H
--- a/public/mathlib/planefit.h
+++ b/public/mathlib/planefit.h
@@ -0,0 +1,40 @@
+//============ Copyright (c) Valve Corporation, All rights reserved. ============
+//
+// Code to compute the equation of a plane with a least-squares residual fit.
+//
+//===============================================================================
+
+#ifndef PLANEFIT_H
+#define PLANEFIT_H
+
+#if defined( COMPILER_MSVC )
+#pragma once
+#endif
+
+class VPlane;
+
+//-----------------------------------------------------------------------------
+// Finds a plane to best fit a set of points.  The least-squares residual 
+// error is computed along the X/Y/Z-axis, not orthogonally to the plane,
+// since doing the latter requires an SVD or a 3x3 eigendecomposition.
+//-----------------------------------------------------------------------------
+bool ComputeLeastSquaresPlaneFitX( const Vector *pPoints, int nNumPoints, VPlane *pFitPlane );
+bool ComputeLeastSquaresPlaneFitY( const Vector *pPoints, int nNumPoints, VPlane *pFitPlane );
+bool ComputeLeastSquaresPlaneFitZ( const Vector *pPoints, int nNumPoints, VPlane *pFitPlane );
+
+
+//-----------------------------------------------------------------------------
+// *WORK-IN-PROGRESS*
+// Finds a plane to best fit a set of points.  The least-squares residual 
+// error is computed along the optimal axis, orthogonally to the plane,
+// and requires a 3x3 eigendecomposition.
+//-----------------------------------------------------------------------------
+bool ComputeLeastSquaresOrthogonalPlaneFit( const Vector *pPoints, int nNumPoints, VPlane *pFitPlane );
+
+//-----------------------------------------------------------------------------
+// Given a plane and a set of points, computes the sum of
+// squared orthogonal residuals.
+//-----------------------------------------------------------------------------
+float ComputeSquaredError( const Vector *pPoints, int nNumPoints, const VPlane *pFitPlane );
+
+#endif // PLANEFIT_H
--- a/public/mathlib/polygon.h
+++ b/public/mathlib/polygon.h
@@ -0,0 +1,105 @@
+//============ Copyright (c) Valve Corporation, All rights reserved. ============
+//
+// Utility functions for polygon simplification / convex decomposition.
+//
+//===============================================================================
+
+#ifndef POLYGON_H
+#define POLYGON_H
+
+#if defined( COMPILER_MSVC )
+#pragma once
+#endif
+
+#include "utlvector.h"
+
+//-----------------------------------------------------------------------------
+// NOTE: Polygons are assumed to be wound clockwise unless otherwise noted.
+// Holes in polygons wind counter-clockwise.
+//-----------------------------------------------------------------------------
+
+
+
+static const float POINT_IN_POLYGON_EPSILON = 0.01f;
+
+//-----------------------------------------------------------------------------
+// Simplifies a polygon by removing points such that the area will not
+// decrease by more than the specified amount (area increases are not
+// allowed).
+// With a low max deviation, this will perfectly remove all colinear points.
+//-----------------------------------------------------------------------------
+void SimplifyPolygon( CUtlVector< Vector > *pPoints, const Vector &vNormal, float flMaxDeviation );
+
+//-----------------------------------------------------------------------------
+// Simplifies a polygon using quadric error metrics.
+//-----------------------------------------------------------------------------
+void SimplifyPolygonQEM( CUtlVector< Vector > *pPoints, const Vector &vNormal, float flMaximumSquaredError, bool bUseOptimalPointPlacement );
+
+//-----------------------------------------------------------------------------
+// Returns whether a vertex of a polygon (v1) is concave, given its normal 
+// and previous & next vertices (v0 and v2, respectively).
+//-----------------------------------------------------------------------------
+bool IsConcave( const Vector &v0, const Vector &v1, const Vector &v2, const Vector &vNormal );
+
+//-----------------------------------------------------------------------------
+// Returns whether a vertex (points[nVertex]) of a polygon is 
+// concave, given the polygon's vertices, and its normal.
+//-----------------------------------------------------------------------------
+bool IsConcave( const Vector *pPolygonPoints, int nPointCount, int nVertex, const Vector &vNormal );
+
+//-----------------------------------------------------------------------------
+// Returns whether a polygon is concave.
+//-----------------------------------------------------------------------------
+bool IsConcave( const Vector *pPolygonPoints, int nPointCount, const Vector &vNormal );
+
+//-----------------------------------------------------------------------------
+// Given a set of points (i.e. vertex buffer), this represents an ordered
+// subset of points which comprise a polygon (i.e. index buffer).
+// All lookups wrap around, so indices below zero or past the end address the
+// polygon cyclically. The count being wrapped against must be non-zero.
+//-----------------------------------------------------------------------------
+struct SubPolygon_t
+{
+	CUtlVector< int > m_Indices;
+	
+	// Returns the i-th entry of m_Indices, with i wrapped into [0, Count).
+	int GetVertexIndex( int i ) const
+	{
+		const int nCount = m_Indices.Count();
+		int nWrapped = i % nCount;
+		// '%' keeps the sign of the dividend, so shift negative remainders up into range
+		if ( nWrapped < 0 )
+		{
+			nWrapped += nCount;
+		}
+		return m_Indices[nWrapped];
+	}
+
+	// Returns pPolygonPoints[nVertex], with nVertex wrapped into [0, nPointCount).
+	static const Vector &GetPoint( const Vector *pPolygonPoints, int nPointCount, int nVertex )
+	{
+		int nWrapped = nVertex % nPointCount;
+		if ( nWrapped < 0 )
+		{
+			nWrapped += nPointCount;
+		}
+		return pPolygonPoints[nWrapped];
+	}
+
+	// Same wrapped lookup, taking the points from a CUtlVector.
+	static const Vector &GetPoint( const CUtlVector< Vector > &originalPoints, int nVertex )
+	{
+		const Vector *pBase = originalPoints.Base();
+		return GetPoint( pBase, originalPoints.Count(), nVertex );
+	}
+};
+
+//-----------------------------------------------------------------------------
+// Attempts to strip off one convex region from a concave/convex polygon.
+//-----------------------------------------------------------------------------
+void DecomposePolygon_Step( const CUtlVector< Vector > &polygonPoints, const Vector &vNormal, CUtlVector< SubPolygon_t > *pHoles, SubPolygon_t *pNewPartition, SubPolygon_t *pRemainingPolygon, int *pFirstIndex );
+
+//-----------------------------------------------------------------------------
+// Decomposes a polygon into one or more convex, non-overlapping parts.
+//-----------------------------------------------------------------------------
+void DecomposePolygon( const CUtlVector< Vector > &polygonPoints, const Vector &vNormal, SubPolygon_t *pOriginalPolygon, CUtlVector< SubPolygon_t > *pHoles, CUtlVector< SubPolygon_t > *pPartitions );
+
+//-----------------------------------------------------------------------------
+// Is a point in the prism formed by extruding the polygon?
+// If so, what is its height above/below the plane of the polygon?
+//-----------------------------------------------------------------------------
+bool IsPointInPolygonPrism( const Vector *pPolygonPoints, int nPointCount, const Vector &vPoint, float flThreshold = 0.0f, float *pHeight = NULL );
+
+#endif // POLYGON_H
--- a/public/mathlib/polyhedron.h
+++ b/public/mathlib/polyhedron.h
@@ -0,0 +1,73 @@
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef POLYHEDRON_H_
+#define	POLYHEDRON_H_
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include "mathlib/mathlib.h"
+
+
+
+struct Polyhedron_IndexedLine_t	// one line (edge), stored as the indices of its two end points
+{
+	unsigned short iPointIndices[2];	// NOTE(review): presumably indices into CPolyhedron::pVertices - confirm
+};
+
+struct Polyhedron_IndexedLineReference_t	// a polygon's reference to one line, including the direction it is traversed in
+{
+	unsigned short iLineIndex;	// NOTE(review): presumably an index into CPolyhedron::pLines - confirm
+	unsigned char iEndPointIndex; //since two polygons reference any one line, one of them needs to traverse the line backwards; this flags that behavior
+};
+
+struct Polyhedron_IndexedPolygon_t	// one polygon: a run of iIndexCount references starting at iFirstIndex, plus its normal
+{
+	unsigned short iFirstIndex;	// NOTE(review): presumably the first entry of this polygon in CPolyhedron::pIndices - confirm
+	unsigned short iIndexCount;	// number of consecutive references belonging to this polygon
+	Vector polyNormal;
+};
+
+class CPolyhedron //made into a class because it's going virtual to support distinctions between temp and permanent versions
+{
+public:
+	Vector *pVertices;	// NOTE(review): the four arrays are presumably sized by the matching counts below - confirm against the allocators
+	Polyhedron_IndexedLine_t *pLines;
+	Polyhedron_IndexedLineReference_t *pIndices;
+	Polyhedron_IndexedPolygon_t *pPolygons;
+	
+	unsigned short iVertexCount;
+	unsigned short iLineCount;
+	unsigned short iIndexCount;
+	unsigned short iPolygonCount;
+
+	virtual ~CPolyhedron( void ) {};
+	virtual void Release( void ) = 0;	// implemented per allocation strategy; how the memory is returned is up to the subclass
+	Vector Center( void ) const;	// defined outside this header
+};
+
+class CPolyhedron_AllocByNew : public CPolyhedron	// polyhedron created through Allocate(); the constructor is private
+{
+public:
+	virtual void Release( void );
+	static CPolyhedron_AllocByNew *Allocate( unsigned short iVertices, unsigned short iLines, unsigned short iIndices, unsigned short iPolygons ); //creates the polyhedron along with enough memory to hold all its data in a single allocation
+
+private:
+	CPolyhedron_AllocByNew( void ) { }; //CPolyhedron_AllocByNew::Allocate() is the only way to create one of these.
+};
+
+CPolyhedron *GeneratePolyhedronFromPlanes( const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory = false ); //be sure to polyhedron->Release()
+CPolyhedron *ClipPolyhedron( const CPolyhedron *pExistingPolyhedron, const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory = false ); //this does NOT modify/delete the existing polyhedron
+
+CPolyhedron *GetTempPolyhedron( unsigned short iVertices, unsigned short iLines, unsigned short iIndices, unsigned short iPolygons ); //grab the temporary polyhedron. Avoids new/delete for quick work. Can only be in use by one chunk of code at a time
+
+
+#endif //#ifndef POLYHEDRON_H_
+
--- a/public/mathlib/quadric.h
+++ b/public/mathlib/quadric.h
@@ -0,0 +1,144 @@
+//=========== Copyright © Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Quadric math functionality used for squared distance error metrics.
+//
+//===========================================================================//
+
+#ifndef QUADRIC_H
+#define QUADRIC_H
+
+#if defined( COMPILER_MSVC )
+#pragma once
+#endif
+
+#include "vector.h"
+#include "cholesky.h"
+
+// this class holds a quadric error function and implements integrating and evaluating these functions.
+// see "Surface Simplification Using Quadric Error Metrics", Garland & Heckbert
+// http://mgarland.org/files/papers/quadric2.pdf  (updated version)
+// NOTE: This will be expanded using Hugues Hoppe's method for including interpolated vertex attributes in a future version
+class CQuadricError
+{
+public:
+	CQuadricError() {}
+	// used this to track down error
+	void CheckDebug()
+	{
+		Assert( IsFinite(m_coefficients[0]) );
+	}
+
+	// integrate these by summing coefficients
+	FORCEINLINE CQuadricError&	operator+=(const CQuadricError &inError)
+	{
+		for ( int i = 0; i < ARRAYSIZE(m_coefficients); i++ )
+		{
+			m_coefficients[i] += inError.m_coefficients[i];
+		}
+		return *this;
+	}
+
+	CQuadricError operator+(const CQuadricError& inError0 ) const
+	{
+		CQuadricError tmp;
+		for ( int i = 0; i < ARRAYSIZE(m_coefficients); i++ )
+		{
+			tmp.m_coefficients[i] = inError0.m_coefficients[i] + m_coefficients[i];
+		}
+		return tmp;
+	}
+	// assignment
+	CQuadricError& operator=(const CQuadricError &inError0)
+	{
+		for ( int i = 0; i < ARRAYSIZE(m_coefficients); i++ )
+		{
+			m_coefficients[i] = inError0.m_coefficients[i];
+		}
+		return *this;
+	}
+
+	CQuadricError& operator*=( float flScale )
+	{
+		for ( int i = 0; i < ARRAYSIZE(m_coefficients); i++ )
+		{
+			m_coefficients[i] *= flScale;
+		}
+		return *this;
+	}
+
+
+	// solves for the point with minimum error (inverts the matrix)
+	Vector SolveForMinimumError()
+	{
+		matrix3x4_t tmp( 
+			m_coefficients[0],		m_coefficients[1]*0.5f,	m_coefficients[2]*0.5f,		m_coefficients[3]*0.5f, 
+			m_coefficients[1]*0.5f, m_coefficients[4],		m_coefficients[5]*0.5f,		m_coefficients[6]*0.5f,
+			m_coefficients[2]*0.5f, m_coefficients[5]*0.5f, m_coefficients[7],			m_coefficients[8]*0.5f );
+
+		return CholeskySolve( tmp );
+	}
+
+	// clear all coefficients
+	void SetToZero()
+	{
+		for ( int i = 0; i < ARRAYSIZE(m_coefficients); i++ )
+		{
+			m_coefficients[i] = 0.0f;
+		}
+	}
+
+	// usually these are initialized by summing quadrics for the planes coincident at each vert (one per triangle)
+	// these are helpers to do that
+	void InitFromPlane( const Vector &vNormal, float flDist, float flScale )
+	{
+		float flScale2 = flScale * 2.0f;
+		m_coefficients[0] = vNormal.x * vNormal.x * flScale;	// a^2
+		m_coefficients[1] = vNormal.x * vNormal.y * flScale2;	// 2ab
+		m_coefficients[2] = vNormal.x * vNormal.z * flScale2;	// 2ac
+		m_coefficients[3] = vNormal.x * flDist * flScale2;		// 2ad
+		m_coefficients[4] = vNormal.y * vNormal.y * flScale;	// b^2
+		m_coefficients[5] = vNormal.y * vNormal.z * flScale2;	// 2bc
+		m_coefficients[6] = vNormal.y * flDist * flScale2;		// 2bd
+		m_coefficients[7] = vNormal.z * vNormal.z * flScale;	// c^2
+		m_coefficients[8] = vNormal.z * flDist * flScale2;		// cd
+		m_coefficients[9] = flDist * flDist * flScale;	// d^2
+	}
+
+	void InitFromTriangle( const Vector &v0, const Vector &v1, const Vector &v2, float flMinArea )
+	{
+		Vector vNormal = CrossProduct( v2 - v0, v1 - v0 );
+		float flArea = 0.5f * vNormal.NormalizeInPlace();
+		flArea = MAX(flMinArea, flArea);
+		float flDist = -DotProduct(vNormal, v0);
+		InitFromPlane( vNormal, flDist, flArea );
+	}
+
+	// this evaluates the error at a point in space
+	inline float ComputeError( const Vector &v0 )
+	{
+		float x = v0.x;
+		float y = v0.y;
+		float z = v0.z;
+		float flVertex[9];
+		flVertex[0] = x * x;
+		flVertex[1] = x * y;
+		flVertex[2] = x * z;
+		flVertex[3] = x;
+		flVertex[4] = y * y;
+		flVertex[5] = y * z;
+		flVertex[6] = y;
+		flVertex[7] = z * z;
+		flVertex[8] = z;
+		float flTotal = m_coefficients[9];
+		for ( int i = 0; i < 9; i++ )
+		{
+			flTotal += flVertex[i] * m_coefficients[i];
+		}
+		return flTotal;
+	}
+
+private:
+	float m_coefficients[10];
+};
+
+#endif // QUADRIC_H
--- a/public/mathlib/quantize.h
+++ b/public/mathlib/quantize.h
@@ -0,0 +1,141 @@
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+#ifndef QUANTIZE_H
+#define QUANTIZE_H
+
+#ifndef STRING_H
+#include <string.h>
+#endif
+
+#define MAXDIMS 768
+#define MAXQUANT 16000
+
+
+#include <tier0/platform.h>
+
+struct Sample;
+
+struct QuantizedValue {										// one node of the quantization tree
+	double MinError;											// minimum possible error. used
+	// for neighbor searches.
+	struct QuantizedValue *Children[2];						// splits
+	int32 value;											// only exists for leaf nodes
+	struct Sample *Samples;									// every sample quantized into this
+	// entry
+	int32 NSamples;											// how many were quantized to this.
+	int32 TotSamples;
+	double *ErrorMeasure;									// variance measure for each dimension
+	double TotalError;										// sum of errors
+	uint8 *Mean;											// average value of each dimension
+	uint8 *Mins;											// min box for children and this
+	uint8 *Maxs;											// max box for children and this
+	int NQuant;												// the number of samples which were
+															// quantized to this node since the
+															// last time OptimizeQuantizer()
+															// was called.
+	int *Sums;												// sum used by OptimizeQuantizer
+	int sortdim;											// dimension currently sorted along.
+};
+
+struct Sample {												// variable-length record; step between records with NthSample()
+	int32 ID;												// identifier of this sample. can
+															// be used for any purpose.
+	int32 Count;											// number of samples this sample
+															// represents
+	int32 QNum;										   // what value this sample ended up quantized
+															// to.
+	struct QuantizedValue *qptr;							// ptr to what this was quantized to.
+	uint8 Value[1];										   // array of values for multi-dimensional
+	// variables. Declared with one entry; AllocSamples() reserves (ndims-1) extra bytes per record.
+};
+
+void FreeQuantization(struct QuantizedValue *t);
+
+struct QuantizedValue *Quantize(struct Sample *s, int nsamples, int ndims,
+								int nvalues, uint8 *weights, int value0=0);
+
+int CompressSamples(struct Sample *s, int nsamples, int ndims);
+
+struct QuantizedValue *FindMatch(uint8 const *sample,
+								 int ndims,uint8 *weights,
+								 struct QuantizedValue *QTable);
+void PrintSamples(struct Sample const *s, int nsamples, int ndims);
+
+struct QuantizedValue *FindQNode(struct QuantizedValue const *q, int32 code);
+
+// Returns the i-th record of a packed Sample array whose records each carry nd value
+// bytes. Records are sizeof(Sample) + (nd - 1) bytes apart, so plain pointer arithmetic
+// on a Sample* cannot be used to step through them.
+inline struct Sample *NthSample(struct Sample *s, int i, int nd)
+{
+	uint8 *pBytes = (uint8 *) s;
+	return (struct Sample *) ( pBytes + i*(sizeof(*s)+(nd-1)) );
+}
+
+// Allocates a zero-filled packed array of ns samples with nd value bytes each, and sets
+// every sample's Count to 1. The storage comes from new uint8[].
+// NOTE(review): no matching free routine is visible here; presumably the caller releases
+// the buffer with delete[] on a uint8 pointer - confirm.
+inline struct Sample *AllocSamples(int ns, int nd)
+{
+	size_t nTotalBytes=(sizeof(struct Sample)+(nd-1))*ns;
+	void *pBuffer=new uint8[nTotalBytes];
+	memset(pBuffer,0,nTotalBytes);
+
+	struct Sample *pSamples=(struct Sample *) pBuffer;
+	for(int nSample=0;nSample<ns;nSample++)
+	{
+		NthSample(pSamples,nSample,nd)->Count=1;
+	}
+	return pSamples;
+}
+
+
+// MinimumError: what is the min error which will occur if quantizing
+// a sample to the given qnode? This is just the error if the qnode
+// is a leaf.
+double MinimumError(struct QuantizedValue const *q, uint8 const *sample,
+					int ndims, uint8 const *weights);
+double MaximumError(struct QuantizedValue const *q, uint8 const *sample,
+					int ndims, uint8 const *weights);
+
+void PrintQTree(struct QuantizedValue const *p,int idlevel=0);
+void OptimizeQuantizer(struct QuantizedValue *q, int ndims);
+
+// RecalculateValues: update the means in a sample tree, based upon
+// the samples. can be used to reoptimize when samples are deleted,
+// for instance.
+
+void RecalculateValues(struct QuantizedValue *q, int ndims);
+
+extern double SquaredError;	// may be reset and examined. updated by
+															// FindMatch()
+
+
+
+
+// the routines below can be used for uniform quantization via dart-throwing.
+typedef void (*GENERATOR)(void *);    // generate a random sample
+typedef double (*COMPARER)(void const *a, void const *b);
+
+void *DartThrow(int NResults, int NTries, size_t itemsize, GENERATOR gen,
+				COMPARER cmp);
+void *FindClosestDart(void *items,int NResults, size_t itemsize,
+					  COMPARER cmp, void *lookfor, int *idx);
+
+
+
+
+// color quantization of 24 bit images
+#define QUANTFLAGS_NODITHER 1	// don't do Floyd-steinberg dither
+
+extern void ColorQuantize(
+uint8 const	*pImage,			// 4 byte pixels ARGB
+int			nWidth,
+int			nHeight,
+int			nFlags, 			// QUANTFLAGS_xxx
+int			nColors,			// # of colors to fill in in palette
+uint8		*pOutPixels,		// where to store resulting 8 bit pixels
+uint8		*pOutPalette,		// where to store resulting 768-byte palette
+int			nFirstColor);		// first color to use in mapping
+
+
+
+
+
+#endif
--- a/public/mathlib/simdvectormatrix.h
+++ b/public/mathlib/simdvectormatrix.h
@@ -0,0 +1,142 @@
+//====== Copyright © 1996-2006, Valve Corporation, All rights reserved. =======//
+//
+// Purpose: Provide a class (SSE/SIMD only) holding a 2d matrix of class FourVectors,
+// for high speed processing in tools.
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef SIMDVECTORMATRIX_H
+#define SIMDVECTORMATRIX_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+
+#include <string.h>
+#include "tier0/platform.h"
+#include "tier0/dbg.h"
+#include "tier1/utlsoacontainer.h"
+#include "mathlib/ssemath.h"
+
+// A width x height matrix of 3-vectors, stored four-at-a-time in FourVectors so that whole-image
+// math can run on SIMD lanes. Each row holds m_nPaddedWidth FourVectors, i.e. ( width + 3 ) / 4.
+// The object owns m_pData (new[] / delete[]); copying performs a deep copy.
+class CSIMDVectorMatrix
+{
+public:
+	int m_nWidth;											// in actual vectors
+	int m_nHeight;
+
+	int m_nPaddedWidth;										// # of 4x wide elements
+
+	FourVectors *m_pData;
+
+protected:
+	// Put the object into the empty state (no storage, zero dimensions).
+	void Init( void )
+	{
+		m_pData = NULL;
+		m_nWidth = 0;
+		m_nHeight = 0;
+		m_nPaddedWidth = 0;
+	}
+
+	// Number of FourVectors elements in the allocation.
+	int NVectors( void ) const
+	{
+		return m_nHeight * m_nPaddedWidth;
+	}
+
+public:
+	// constructors and destructors
+	CSIMDVectorMatrix( void )
+	{
+		Init();
+	}
+
+	// Deep-copying copy constructor. The compiler-generated one would copy the m_pData pointer,
+	// and both objects would then delete[] the same buffer in their destructors.
+	CSIMDVectorMatrix( CSIMDVectorMatrix const &src )
+	{
+		Init();
+		*this = src;
+	}
+
+	~CSIMDVectorMatrix( void )
+	{
+		if ( m_pData )
+			delete[] m_pData;
+	}
+
+	// set up storage and fields for m x n matrix. destroys old data
+	// (storage is kept as-is when the requested dimensions match the current ones)
+	void SetSize( int width, int height )
+	{
+		if ( ( ! m_pData ) || ( width != m_nWidth ) || ( height != m_nHeight ) )
+		{
+			if ( m_pData )
+				delete[] m_pData;
+			
+			m_nWidth = width;
+			m_nHeight = height;
+			
+			// round the row length up to a whole number of 4-wide elements
+			m_nPaddedWidth = ( m_nWidth + 3) >> 2;
+			m_pData = NULL;
+			if ( width && height )
+				m_pData = new FourVectors[ m_nPaddedWidth * m_nHeight ];
+		}
+	}
+
+	CSIMDVectorMatrix( int width, int height )
+	{
+		Init();
+		SetSize( width, height );
+	}
+
+	// Deep copy: resize to match src, then copy its elements.
+	CSIMDVectorMatrix &operator=( CSIMDVectorMatrix const &src )
+	{
+		// self-assignment would memcpy a buffer onto itself; nothing to do in that case
+		if ( this == &src )
+			return *this;
+
+		SetSize( src.m_nWidth, src.m_nHeight );
+		if ( m_pData )
+			memcpy( m_pData, src.m_pData, m_nHeight*m_nPaddedWidth*sizeof(m_pData[0]) ); 
+		return *this;
+	}
+
+	CSIMDVectorMatrix &operator+=( CSIMDVectorMatrix const &src );
+
+	CSIMDVectorMatrix &operator*=( Vector const &src );
+
+	// create from an RGBA float bitmap. alpha ignored.
+	void CreateFromRGBA_FloatImageData(int srcwidth, int srcheight, float const *srcdata );
+
+	// create from 3 fields in a csoa
+	void CreateFromCSOAAttributes( CSOAContainer const *pSrc,
+								   int nAttrIdx0, int nAttrIdx1, int nAttrIdx2 );
+
+	// Element access. If you are calling this a lot, you don't want to use this class, because
+	// you're not getting the sse advantage
+	Vector Element(int x, int y) const
+	{
+		Assert( m_pData );
+		Assert( x >= 0 && x < m_nWidth );
+		Assert( y >= 0 && y < m_nHeight );
+		Vector ret;
+		// x >> 2 selects the FourVectors within the row, x & 3 selects the lane inside it
+		FourVectors const *pData=m_pData+y*m_nPaddedWidth+(x >> 2);
+
+		int xo=(x & 3);
+		ret.x=pData->X( xo );
+		ret.y=pData->Y( xo );
+		ret.z=pData->Z( xo );
+		return ret;
+	}
+
+	//addressing the individual fourvectors elements
+	// (x is in units of 4-wide elements here, not individual vectors)
+	FourVectors &CompoundElement(int x, int y)
+	{
+		Assert( m_pData );
+		Assert( y >= 0 && y < m_nHeight );
+		Assert( x >= 0 && x < m_nPaddedWidth );
+		return m_pData[x + m_nPaddedWidth*y ];
+	}
+
+	// math operations on the whole image
+	// Zero every byte of the element storage.
+	void Clear( void )
+	{
+		Assert( m_pData );
+		memset( m_pData, 0, m_nHeight*m_nPaddedWidth*sizeof(m_pData[0]) );
+	}
+
+	void RaiseToPower( float power );
+};
+
+
+
+#endif
--- a/public/mathlib/simplex.h
+++ b/public/mathlib/simplex.h
@@ -0,0 +1,91 @@
+//========= Copyright c 1996-2009, Valve Corporation, All rights reserved. ============//
+//
+// Purpose:
+//
+// $NoKeywords: $
+//=============================================================================//
+
+#ifndef VPHYSICS_PHYSX_SIMPLEX_H
+#define VPHYSICS_PHYSX_SIMPLEX_H
+
+#include "tier0/dbg.h"
+
+//////////////////////////////////////////////////////////////////////////
+// Direct simplex LP solver using tableau and Gauss pivoting rule;
+// see http://www.teachers.ash.org.au/mikemath/mathsc/linearprogramming/simplex.PDF
+// After constructing an instance of this class with the appropriate number of vars and constraints,
+// fill in constraints using SetConstraintFactor and SetConstraintConst
+//
+// Here's the problem in its canonical form:
+//  Maximize objective = c'x : x[i] >= 0, A.x <= b; and make that c' positive! negative will automatically mean 0 is your best answer
+//  Vector x is the vector of unknowns (variables) and has dimensionality of numVariables
+//  Vector c is dotted with the x, so has the same dimensionality; you set it with SetObjectiveFactor()
+//  every component of x must be non-negative in a feasible solution
+//  A is constraint matrix and has dims: m_numConstraints by m_numVariables; you set it with SetConstraintFactor();
+//  b has dims: m_numConstraints, you set it with SetConstraintConst()
+//
+// This is solved with the simplest possible simplex method (I have no good reason to implement pivot rules now) 
+// The simplex tableau (m_pTableau) starts like this:
+// | A  | b |
+// | c' | 0 |
+//
+// Simplex LP solver operating on a dense, row-major tableau of NumRows() x NumColumns() floats,
+// where NumRows() is m_numConstraints + 1 and NumColumns() is m_numVariables + 1.
+class CSimplex
+{
+public:
+	int m_numConstraints, m_numVariables;
+	float *m_pTableau;
+	float *m_pInitialTableau;
+	float *m_pSolution;
+	int *m_pBasis; // indices of basis variables, corresponding to each row in the tableau; >= numVars if the slack var corresponds to that row
+	int *m_pNonBasis; // indices of non-basis primal variables (labels on the top of the classic Tucker(?) tableau)
+	enum StateEnum{kInfeasible, kUnbound, kOptimal, kUnknown, kCannotPivot};
+	StateEnum m_state;
+	//CVarBitVec m_isBasis;
+public:
+	CSimplex();
+	CSimplex(int numVariables, int numConstraints);
+	~CSimplex();
+
+	void Init(int numVariables, int numConstraints);
+	void InitTableau(const float *pTableau);
+	void SetObjectiveFactors(int numFactors, const float *pFactors);
+
+	void SetConstraintFactor(int nConstraint, int nConstant, float fFactor);
+	void SetConstraintConst(int nConstraint, float fConst);
+	void SetObjectiveFactor(int nConstant, float fFactor);
+
+	StateEnum Solve(float flThreshold = 1e-5f, int maxStallIterations = 128);
+	StateEnum SolvePhase1(float flThreshold = 1e-5f, int maxStallIterations = 128);
+	StateEnum SolvePhase2(float flThreshold = 1e-5f, int maxStallIterations = 128);
+	float GetSolution(int nVariable)const;
+	float GetSlack(int nConstraint)const;
+	float GetObjective()const;
+	void PrintTableau()const;
+
+protected:
+	void Destruct();
+
+	// Pointer to the first element of the given tableau row.
+	float *operator [] (int row) {Assert(row >= 0 && row < NumRows());return m_pTableau + row * NumColumns();}
+
+	// Mutable access to one tableau cell. Both indices are checked in debug builds: an
+	// out-of-range column would otherwise silently alias a cell of a neighbouring row.
+	float &Tableau(int row, int col)
+	{
+		Assert(row >= 0 && row < NumRows());
+		Assert(col >= 0 && col < NumColumns());
+		return m_pTableau[row * NumColumns()+col];
+	}
+
+	// Read-only access to one tableau cell, with the same debug-build range checks.
+	float Tableau(int row, int col)const
+	{
+		Assert(row >= 0 && row < NumRows());
+		Assert(col >= 0 && col < NumColumns());
+		return m_pTableau[row * NumColumns()+col];
+	}
+
+	// Read one cell of m_pInitialTableau, which uses the same row-major layout as m_pTableau.
+	float GetInitialTableau(int row, int col)const
+	{
+		Assert(row >= 0 && row < NumRows());
+		Assert(col >= 0 && col < NumColumns());
+		return m_pInitialTableau[row * NumColumns()+col];
+	}
+
+	bool IteratePhase1();
+	bool IteratePhase2();
+
+	// Tableau dimensions: one extra row and one extra column beyond the constraints/variables.
+	int NumRows()const {return m_numConstraints + 1;}
+	int NumColumns()const{return m_numVariables + 1;}
+	void Validate();
+	void PrepareTableau();
+	void GatherSolution();
+	bool Pivot(int nPivotRow, int nPivotColumn);
+
+	void MultiplyRow(int nRow, float fFactor);
+	void AddRowMulFactor(int nTargetRow, int nPivotRow, float fFactor);
+
+	int FindPivotColumn();
+	int FindPivotRow(int nColumn);
+
+	int FindLastNegConstrRow();
+	int ChooseNegativeElementInRow(int nRow);
+};
+
+
+#endif
--- a/public/mathlib/softbody.h
+++ b/public/mathlib/softbody.h
@@ -0,0 +1,536 @@
+//========= Copyright © Valve Corporation, All rights reserved. ============//
+//
+// A port of CRnSoftbody
+// Note: mathlib is tentative place for this code. We will probably move it to a separate lib or dll or vphysics.dll
+//
+#ifndef MATHLIB_SOFTBODY_HDR
+#define MATHLIB_SOFTBODY_HDR
+
+#include "rubikon/param_types.h"
+#include "rubikon/debugname.h"
+#include "rubikon/intersection.h"
+
+// #include "rnmath.h"
+// #include "geometry.h"
+// #include "mass.h"
+// #include "graphedge.h"
+// #include "collisionfilter.h"
+// #include "rnserialize.h"
+// #include "legacyobject.h"
+#include "tier1/utlbuffer.h"
+#include "mathlib/femodel.h"
+#include "tier1/hierarchicalbitvec.h"
+#include "rubikon/serializehelpers.h"
+
+class CRnSoftbodyDesc;
+class CSoftbodyEnvironment;
+class CRnBody;
+class CRnJoint;
+struct PhysSoftbodyDesc_t;
+class CJob ;
+class CSoftbody;
+class CFeModel;
+struct FilterTransformsParams_t;
+struct SetNodePositionsParams_t;
+class CFeModelReplaceContext;
+class IVDebugOverlay;
+class IMesh;
+
+
+// Parameter block taken by CSoftbody::FilterTransforms( const FilterTransformsParams_t & ).
+// The struct has no constructor, so every field must be filled in by the caller.
+struct FilterTransformsParams_t
+{
+	matrix3x4a_t *pOutputWorldTransforms;
+	const int16 *pCtrlToBone;
+	const uint32 *pValidTransforms;
+	VectorAligned *pNodePos;
+	// NOTE(review): the "fl" prefix and the comment below describe a float scale, but the field is
+	// declared bool, so any non-zero value collapses to true. Confirm whether float was intended.
+	bool flMatrixScale; // Scale to convey in matrices; 1.0 will not scale anything (appropriate for Hammer conventions). 0.0 means use the model scale (appropriate for game conventions)
+};
+
+
+// Parameter block taken by CSoftbody::SetNodePositions( SetNodePositionsParams_t & ).
+struct SetNodePositionsParams_t
+{
+	// Start from a well-defined state: no position array, zero count, frame 1.
+	// Previously only nFrame was set, leaving pPos and nCount holding garbage until the caller
+	// assigned them. vAbsOrigin and vAbsAngles are still left to their own default constructors.
+	SetNodePositionsParams_t()
+		: pPos( NULL )
+		, nCount( 0 )
+		, nFrame( 1 )
+	{
+	}
+	const VectorAligned *pPos;
+	int nCount;
+	Vector vAbsOrigin;
+	QAngle vAbsAngles;
+	int nFrame;
+};
+
+
+
+// Glue record for one particle: how sticky it is and which one or two parent nodes it follows.
+class CParticleGlue
+{
+public:
+	float m_flStickiness;
+	float m_flWeight1; // 0: only use parent [0], >0 : blend from [0] to [1]
+	uint16 m_nParentNode[ 2 ]; // this is actual node index; we're mostly parented to static nodes
+public:
+	// Default construction deliberately initializes nothing.
+	CParticleGlue(){}
+
+	// Glue to a single parent: both parent slots receive nParentNode and the blend weight is zero.
+	CParticleGlue( uint nParentNode, float flStickiness )
+		: m_flStickiness( flStickiness )
+		, m_flWeight1( 0 )
+	{
+		m_nParentNode[ 1 ] = nParentNode;
+		m_nParentNode[ 0 ] = m_nParentNode[ 1 ];
+	}
+};
+
+
+class ALIGN16 CSoftbody
+{
+public:
+	CSoftbody( void );
+	CSoftbody( CSoftbodyEnvironment *pWorld, const CFeModel *pFeModel );
+	~CSoftbody();
+
+	void Shutdown( );
+
+	void Init( int numModelBones = 0);
+	void Init( CSoftbodyEnvironment *pWorld, const CFeModel *pFeModel, int numModelBones );
+
+	void InitFeModel( int numModelBones = 0 );
+
+	void InitDefaults( );
+	
+	void Integrate( float flTimeStep );
+	void IntegrateWind( VectorAligned *pPos, float flTimeStep );
+	void RawSimulate( int nIterations, float flTimeStep );
+	void ValidatingSimulate( int nIterations, float flTimeStep );
+	void AddAnimationAttraction( float flTimeStep );
+	void Predict( float flTimeStep );
+	void Post( );
+	void Collide();
+	void CollideWithWorldInternal();
+	void CollideWithRigidsInternal();
+// 	void CollideTaperedCapsule( uint16 nCollisionMask, const VectorAligned &vCenter0, float flRadius0, const VectorAligned &vCenter1, float flRadius1, const Vector &vAxis, float flDist );
+// 	void CollideTaperedCapsule( uint16 nCollisionMask, const VectorAligned &vCenter0, float flRadius0, const VectorAligned &vCenter1, float flRadius1 );
+
+	// @begin_publish
+	uint Step( int nIterations, float flTimeStep );
+	uint Step( float flTimeStep );
+	uint GetStateHash()const;
+	void TouchAnimatedTransforms();
+	void SetAnimatedTransform( int nParticle, const matrix3x4a_t &transform );
+	void SetAnimatedTransforms( const matrix3x4a_t *pSimulationWorldTransforms );
+	void SetAnimatedTransformsNoScale( const matrix3x4a_t *pSimulationWorldTransforms );
+	matrix3x4a_t* GetParticleTransforms( const VectorAligned *pInputNodePos = NULL, uint nFlags = 0 );
+	const VectorAligned* GetNodePositions( int nFrame = 1 ) const { return nFrame ? m_pPos1 : m_pPos0; }
+	VectorAligned* GetNodePositions( int nFrame = 1 ) { return nFrame ? m_pPos1 : m_pPos0; }
+	void SetNodePositions( const Vector *pPos, int nCount, int nFrame = 1 );
+	void SetNodePositions( SetNodePositionsParams_t &params );
+	uint GetNodeCount() const { return m_nNodeCount; }
+	uint GetCtrlCount() const { return m_nParticleCount; }
+	// Node velocity, computed as ( m_pPos1[ nNode ] - m_pPos0[ nNode ] ) / m_flLastTimestep.
+	// NOTE(review): the division by m_flLastTimestep is unguarded - confirm it cannot be zero here.
+	Vector GetNodeVelocity( uint nNode ) const { return ( m_pPos1[ nNode ] - m_pPos0[ nNode ] ) / m_flLastTimestep;  }
+	Vector GetCtrlVelocity( uint nCtrl ) const;
+	matrix3x4a_t* GetAnimatedTransforms( ) { return m_pParticles; }
+	const matrix3x4a_t* GetAnimatedTransforms() const { return m_pParticles; }
+	matrix3x4a_t* GetSimulatedTransforms() { return m_pParticles + m_nParticleCount; }
+	const matrix3x4a_t* GetSimulatedTransforms() const { return m_pParticles + m_nParticleCount; }
+	const CFeModel *GetFeModel()const { return m_pFeModel; }
+	CFeModel *GetFeModel( ){ return m_pFeModel; }
+	CSoftbodyEnvironment *GetEnvironment( ) const { return m_pEnvironment;  }
+
+	int GetParticleCount( ) const { return m_nParticleCount; }
+
+	void SetDebugNameV( const char* pNameFormat, va_list args ) { m_DebugName.SetV( pNameFormat, args );}
+	const char *GetDebugName() const { return m_DebugName.Get(); }
+	void AppendDebugInfo( CUtlString &line );
+	void Draw( const RnDebugDrawOptions_t &options, IVDebugOverlay* pDebugOverlay );
+	void Draw( const RnDebugDrawOptions_t &options, IMesh *pDynamicMesh );
+	void SetPose() { SetPose( m_pFeModel->m_pInitPose ); }
+	void SetPose( const CTransform *pPose );
+	void SetPose( const CTransform &tm ) { SetPose( tm, m_pFeModel->m_pInitPose ); }
+	void SetPose( const CTransform &tm, const CTransform *pPose );
+	void SetPoseFromBones( const int16 *pCtrlToBone, const matrix3x4a_t *pBones, float flScale = 1.0f );
+	void SetCtrl( int nParticle, const CTransform &tm );
+	void SetDebugSelection( int nSelection );
+	void SetSimFlags( uint nNewSimFlags );
+	void AddSimFlags( uint nAddSimFlags ) { SetSimFlags( m_nSimFlags | nAddSimFlags ); }
+	void ClearSimFlags( uint nClearSimFlags ) { SetSimFlags( m_nSimFlags & ~nClearSimFlags ); }
+	void SetAnimSpace( PhysicsSoftbodyAnimSpace_t nAnimSpace ) { m_nAnimSpace = nAnimSpace; }
+	PhysicsSoftbodyAnimSpace_t GetAnimSpace()const { return m_nAnimSpace; }
+	uint GetSimFlags()const { return m_nSimFlags; }
+	int CastCone( const Vector &vStart, const Vector &vDir, float flConePitch );
+	uint GetContactCount( )const { return 0; }
+
+	void SetOverPredict( float flOverPredict ) { m_flOverPredict = flOverPredict;  }
+	float GetOverPredict( ) const { return m_flOverPredict; }
+
+	void ResetVelocities( );
+
+	void SetThreadStretch( float flBendUnderRelax ) { m_flThreadStretch = flBendUnderRelax; }
+	float GetThreadStretch(  )const  { return m_flThreadStretch; }
+
+	void SetSurfaceStretch( float flStretchUnderRelax ) { m_flSurfaceStretch = flStretchUnderRelax; }
+	float GetSurfaceStretch(  )const { return m_flSurfaceStretch; }
+
+	void SetStepUnderRelax( float flStepUnderRelax ) { m_flStepUnderRelax = flStepUnderRelax;  }
+	float GetStepUnderRelax( void ) const { return m_flStepUnderRelax; }
+	AABB_t BuildBounds( )const;
+	void FilterTransforms( const FilterTransformsParams_t &params );
+	void FilterTransforms( matrix3x4a_t *pModelBones );
+
+	void SetGravityScale( float flScale ) { m_flGravityScale = flScale; }
+	void EnableGravity( bool bEnableGravity ) { m_bGravityDisabled = !bEnableGravity; }
+	void EnableGravity( ) { m_bGravityDisabled = false; }
+	void DisableGravity( ) { m_bGravityDisabled = true; }
+	bool IsGravityEnabled( ) const { return !m_bGravityDisabled; }
+	bool IsGravityDisabled( ) const { return m_bGravityDisabled; }
+	bool IsFtlPassEnabled( ) const { return m_bEnableFtlPass; }
+	void EnableFtlPass( bool bEnable ) { m_bEnableFtlPass = bEnable; }
+
+	float GetGravityScale( void ) const{ return m_flGravityScale; }
+	Vector GetEffectiveGravity( void ) const;
+	float GetEffectiveGravityScale( void ) const;
+	void EnableAnimationAttraction( bool bEnable ) { m_bEnableAnimationAttraction = bEnable; }
+	bool IsAnimationAttractionEnabled( )const { return m_bEnableAnimationAttraction; }
+	void EnableFollowNode( bool bEnable ) { m_bEnableFollowNodes = bEnable;  }
+	bool IsFollowNodeEnabled( ) const { return m_bEnableFollowNodes;  }
+	void EnableSprings( bool bEnable ){ m_bEnableSprings = bEnable;  }
+	bool AreSpringsEnabled( )const { return m_bEnableSprings; }
+	void EnableInclusiveCollisionSpheres( bool bEnable ) { m_bEnableInclusiveCollisionSpheres = bEnable; }
+	bool AreInclusiveCollisionSpheresEnabled( ) const { return m_bEnableInclusiveCollisionSpheres; }
+	void EnableExclusiveCollisionSpheres( bool bEnable ) { m_bEnableExclusiveCollisionSpheres = bEnable; }
+	bool AreExclusiveCollisionSpheresEnabled( ) const { return m_bEnableExclusiveCollisionSpheres; }
+	void EnableCollisionPlanes( bool bEnable ) { m_bEnableCollisionPlanes = bEnable; }
+	bool AreCollisionPlanesEnabled( ) const { return m_bEnableCollisionPlanes; }
+	void EnableGroundCollision( bool bEnable ) { m_bEnableGroundCollision = bEnable; }
+	bool IsGroundCollisionEnabled( ) const { return m_bEnableGroundCollision; }
+	void EnableGroundTrace( bool bEnable ) { m_bEnableGroundTrace = bEnable; }
+	bool IsGroundTraceEnabled( ) const { return m_bEnableGroundTrace; }
+
+	float GetTimeStep( void )const { return m_flLastTimestep;  }
+	void SetModelScale( float flModelScale ) { m_flModelScale = flModelScale; }
+	float GetModelScale( )const { return m_flModelScale; } 
+	void SetVelocityDamping( float flDamping ) { m_flVelocityDamping = flDamping; } // 1.0f - full damping; 0.0 - no damping (default)
+	float GetVelocityDamping( )const { return m_flVelocityDamping; }
+	void SetUserData( uint nIndex, void *pData );
+	void* GetUserData( uint nIndex );
+	//void SetOrigin( const Vector &vAbsOrigin );
+	void InitializeTransforms( const int16 *pCtrlToBone, const matrix3x4a_t *pSimulationWorldTransforms );
+	void SetAbsAngles( const QAngle &vNewAngles, bool bTeleport );
+	const QAngle GetAbsAngles() const { return m_vSimAngles; }
+	void SetAbsOrigin( const Vector &vNewOrigin, bool bTeleport );
+	const Vector GetAbsOrigin()const { return m_vSimOrigin; }
+	float GetEnergy( PhysicsSoftbodyEnergyTypeEnum_t nEnergy )const;
+	float GetElasticEnergy( )const;
+	float GetPotentialEnergy( )const;
+	float GetKinematicEnergy( )const;
+	void SetDampingMultiplier( float flMul ) { m_flDampingMultiplier = flMul; }
+	float GetDampingMultiplier( )const { return m_flDampingMultiplier; }
+	void SetGroundZ( float flGroundZ ) { m_vGround.Init( m_vSimOrigin.x, m_vSimOrigin.y, flGroundZ );  }
+	float GetGroundZ( )const { return m_vGround.z; }
+
+	bool IsDormant( )const;
+	void GoDormant( );
+	bool AdvanceSleepCounter();
+	void GoWakeup( );
+	bool IsActive()const;
+	bool BeforeFilterTransforms( );
+	void SetDebugDrawTreeBeginLevel( int nLevel ) { m_nDebugDrawTreeBeginLevel = nLevel; }
+	int GetDebugDrawTreeBeginLevel() { return m_nDebugDrawTreeBeginLevel; }
+	void SetDebugDrawTreeEndLevel( int nLevel ) { m_nDebugDrawTreeEndLevel = nLevel; }
+	int GetDebugDrawTreeEndLevel() { return m_nDebugDrawTreeEndLevel; }
+	void SetDebugDrawTreeFlags( uint nFlags ) { m_nDebugDrawTreeFlags = nFlags; }
+	uint GetDebugDrawTreeFlags(){ return m_nDebugDrawTreeFlags; }
+	void EnableDebugDraw( bool bEnable ){ m_bDebugDraw = bEnable; }
+	void EnableDebugRendering( bool bEnable );
+
+	float GetVelAirDrag() const { return m_flVelAirDrag; }
+	void SetVelAirDrag( float flVelAirDrag ){ m_flVelAirDrag = flVelAirDrag; }
+	float GetExpAirDrag() const { return m_flExpAirDrag; }
+	void SetExpAirDrag( float flExpAirDrag ){ m_flExpAirDrag = flExpAirDrag; }
+	float GetVelQuadAirDrag() const { return m_flVelQuadAirDrag; }
+	void SetVelQuadAirDrag( float flVelQuadAirDrag ){ m_flVelQuadAirDrag = flVelQuadAirDrag; }
+	float GetExpQuadAirDrag() const { return m_flExpQuadAirDrag; }
+	void SetExpQuadAirDrag( float flExpQuadAirDrag ){ m_flExpQuadAirDrag = flExpQuadAirDrag; }
+	float GetVelRodAirDrag() const { return m_flVelRodAirDrag; }
+	void SetVelRodAirDrag( float flVelRodAirDrag ){ m_flVelRodAirDrag = flVelRodAirDrag; }
+	float GetExpRodAirDrag() const { return m_flExpRodAirDrag; }
+	void SetExpRodAirDrag( float flExpRodAirDrag ){ m_flExpRodAirDrag = flExpRodAirDrag; }
+	float GetQuadVelocitySmoothRate()const { return m_flQuadVelocitySmoothRate; }
+	void SetQuadVelocitySmoothRate( float flRate ){ m_flQuadVelocitySmoothRate = flRate; }
+	float GetRodVelocitySmoothRate()const { return m_flRodVelocitySmoothRate; }
+	void SetRodVelocitySmoothRate( float flRate ){ m_flRodVelocitySmoothRate = flRate; }
+	uint16 GetQuadVelocitySmoothIterations()const { return m_nQuadVelocitySmoothIterations; }
+	void SetQuadVelocitySmoothIterations( uint16 nIterations ){ m_nQuadVelocitySmoothIterations = nIterations; }
+	uint16 GetRodVelocitySmoothIterations()const { return m_nRodVelocitySmoothIterations; }
+	void SetRodVelocitySmoothIterations( uint16 nIterations ){ m_nRodVelocitySmoothIterations = nIterations; }
+
+	const RnCollisionAttr_t &GetCollisionAttributes() const { return m_CollisionAttributes; }
+	void SetCollisionAttributes( const RnCollisionAttr_t &attr );
+	int GetIndexInWorld() const { return m_nIndexInWorld; }
+
+	void ReplaceFeModel( CFeModelReplaceContext &context );
+	matrix3x4_t GetDifferenceTransform( const Vector &vAltOrigin, const QAngle &vAltAngles );
+	void ComputeInterpolatedNodePositions( float flFactor, VectorAligned *pPosOut );
+	void SetInstanceSettings( void *pSettings );
+	// Set or clear the frozen flag. (The argument used to be ignored and the flag was always set,
+	// so SetFrozen( false ) could never unfreeze the body.)
+	void SetFrozen( bool bFrozen ) { m_bFrozen = bFrozen; }
+	bool IsFrozen()const { return m_bFrozen; }
+	void SetVolumetricSolveAmount( float flVolumetricSolveAmount ) { m_flVolumetricSolveAmount = flVolumetricSolveAmount; }
+	float GetVolumetricSolveAmount()const { return m_flVolumetricSolveAmount; }
+	void ParseParticleState( CUtlBuffer &buf, float flTimeStep );
+	CUtlString PrintParticleState( )const;
+	int16 *GetModelBoneToCtrl() { return m_pModelBoneToCtrl; }
+	void BindModelBoneToCtrl( int nModelBone, int nCtrl );
+	//bool SetupCtrl( uint nCtrl, matrix3x4a_t &writeBone );
+	// @end_publish
+
+	void DebugDump( );
+	void UpdateCtrlOffsets( bool bOverridePose );
+
+	uint ComputeVirtualCtrls( CVarBitVec &virtualNodes );
+
+	matrix3x4a_t &GetAnim( int i ) { return GetAnimatedTransforms()[ i ]; }
+	const matrix3x4a_t &GetAnim( int i ) const { return GetAnimatedTransforms()[ i ]; }
+	matrix3x4a_t &GetSim( int i )  { return GetSimulatedTransforms()[ i ]; }
+	const matrix3x4a_t &GetSim( int i ) const { return GetSimulatedTransforms()[ i ]; }
+	uint ShouldUsePreconditioner()const { return m_nSimFlags & ( SOFTBODY_SIM_DIAGONAL_PRECONDITIONER | SOFTBODY_SIM_TRIDIAGONAL_PRECONDITIONER | SOFTBODY_SIM_RELAXATION_PRECONDITIONER ); }
+
+	class CWorldIndexPred
+	{
+	public:
+		static int GetIndex( const CSoftbody *pBody ) { return pBody->m_nIndexInWorld; }
+		static void SetIndex( CSoftbody *pBody, int nIndex ) { pBody->m_nIndexInWorld = nIndex; }
+	};
+
+	friend class CWorldIndexPred;
+	uint GetParticleArrayCount( ) const { return m_nParticleCount * 2; }
+
+	void Validate();
+	void DebugPreStep( float flTimeStep );
+	void DebugPostStep();
+
+	class CConstraintIterator
+	{
+	public:
+		CConstraintIterator( CSoftbody *pSoftbody );
+		~CConstraintIterator( );
+		void Iterate( int nIterations );
+	protected:
+		CSoftbody *m_pSoftbody;
+		CSoftbodyEnvironment *m_pEnvironment;
+		CUtlVectorFixedGrowable< VectorAligned, 128 > m_PosBeforeCorrect; // the biggest hero in source1 is Medusa, with 104 nodes (52 useful, 52 virtual)
+	};
+
+	friend class CConstraintIterator;
+
+	void GlueNode( uint nDynNode, uint nParentNode, float flStickiness );
+	void GlueNode( uint nDynNode, uint nParentNode0, uint nParentNode1, float flStickiness, float flWeight1 );
+	void GlueNode( uint nDynNode, const CParticleGlue &glue, float flReplacementStickiness );
+	void DebugTraceMove( const char *pMsg );
+protected:
+	void Integrate_S1( float flTimeStep );
+	void ResolveStretch_S1( float flTimeStep );
+	void ResolveAnimAttraction_S1( float flTimeStep );
+
+protected:
+	friend class CRnSoftbodyChangeGuard;
+	CRnDebugName m_DebugName;
+	CSoftbodyEnvironment *m_pEnvironment;
+	RnCollisionAttr_t m_CollisionAttributes;
+
+	CFeModel *m_pFeModel;  // Finite Element Model
+	float m_flThreadStretch; // positive: underrelax; negative: overrelax
+	float m_flSurfaceStretch;
+	float m_flStepUnderRelax;
+	float m_flOverPredict; // 0 : normal integration; positive: overpredict, correct, step back
+	float m_flGravityScale;
+	uint m_nNodeCount;     // actual simulated node count (includes static and dynamic nodes: even though static nodes are not simulated, we need their coordinates to simulate the other nodes connected to them)
+	uint m_nParticleCount; // Ctrl count
+	float m_flModelScale;
+	float m_flVelocityDamping;
+	float m_flDampingMultiplier;
+	float m_flClothScale;
+	Vector m_vGround;
+
+	Vector m_vSimOrigin;
+	QAngle m_vSimAngles;
+
+	matrix3x4a_t *m_pParticles SERIALIZE_ARRAY_SIZE( GetParticleArrayCount() );
+	VectorAligned *m_pPos0 SERIALIZE_ARRAY_SIZE( m_nNodeCount );
+	VectorAligned *m_pPos1 SERIALIZE_ARRAY_SIZE( m_nNodeCount );
+	FeAabb_t *m_pAabb SERIALIZE_ARRAY_SIZE( m_pFeModel->GetDynamicNodeCount() - 1 );
+
+	int16 *m_pModelBoneToCtrl;
+	int16 *m_pCtrlToModelBone;
+
+	CHierarchicalBitVector m_StickyBuffer; // sticky particles
+	CParticleGlue *m_pParticleGlue SERIALIZE_ARRAY_SIZE( m_pFeModel->GetDynamicNodeCount() );
+
+	enum StateEnum_t
+	{
+		STATE_ACTIVE, // actively simulating, taking in transforms, filtering out transforms
+		STATE_DORMANT,// not simulating, not taking in transforms, not filtering anything
+		STATE_WAKEUP,	// StateCounter == 0 : not simulating, readying to take in transforms, not filtering anything, not copying transforms
+						// StateCounter >  0 : not simulating, taking in transforms, not filtering anything, copying transforms
+		STEPS_INVISIBLE_BEFORE_DORMANT = 12,
+		FRAMES_INVISIBLE_BEFORE_DORMANT = 3
+	};
+
+	uint32 m_nSimFlags;
+	uint32 m_nStepsSimulated;
+
+	int8 m_nDebugDrawTreeEndLevel;
+	int8 m_nDebugDrawTreeBeginLevel;
+	uint8 m_nDebugDrawTreeFlags;
+
+	// STATE_ACTIVE: how many steps we've taken without having FilterTransforms called once
+	// STATE_DORMANT: doesn't matter
+	// STATE_WAKEUP: how many times we've set animated transforms (need 2 to switch to ACTIVE state)
+	uint8 m_nStateCounter; 
+	float m_flLastTimestep;
+	float m_flVelAirDrag;
+	float m_flExpAirDrag;
+	float m_flVelQuadAirDrag;
+	float m_flExpQuadAirDrag;
+	float m_flVelRodAirDrag;
+	float m_flExpRodAirDrag;
+	float m_flQuadVelocitySmoothRate;
+	float m_flRodVelocitySmoothRate;
+	float m_flVolumetricSolveAmount;
+	uint16 m_nQuadVelocitySmoothIterations;
+	uint16 m_nRodVelocitySmoothIterations;
+
+	int m_nIndexInWorld;
+	int m_nDebugSelection;
+	
+	Vector m_vRopeOffset; // <sergiy> a horrible S1 cloth rope hack I'm faithfully replicating so that dota cloth looks exactly the same
+
+	uintp m_pUserData[ 2 ] ; 
+	StateEnum_t m_nActivityState;
+
+	//uint m_nDebugNode;
+	uint8 m_nEnableWorldShapeCollision : 4;
+	PhysicsSoftbodyAnimSpace_t m_nAnimSpace : 2;
+	bool m_bAnimTransformChanged : 1; // True means that the animation transforms, that the game sets from outside, have changed and need to be propagated into the simulation
+	bool m_bSimTransformsOutdated : 1; // True means that the sim transforms, that the game queries from outside, are out of date and need to be copied from the simulation (and their rotations computed)
+	bool m_bGravityDisabled : 1;
+	bool m_bEnableAnimationAttraction : 1;
+	bool m_bEnableFollowNodes : 1;
+	bool m_bEnableSprings : 1;
+	bool m_bEnableInclusiveCollisionSpheres : 1;
+	bool m_bEnableExclusiveCollisionSpheres : 1;
+	bool m_bEnableCollisionPlanes : 1;
+	bool m_bEnableGroundCollision : 1;
+	bool m_bEnableGroundTrace : 1;
+	bool m_bEnableFtlPass : 1;
+	bool m_bFrozen : 1;
+	bool m_bDebugDraw : 1;
+	bool m_bEnableSimd : 1;
+	/*
+	bool m_bTeleportOnNextSetAbsOrigin : 1;
+	bool m_bTeleportOnNextSetAbsAngles : 1;
+*/
+
+	friend class CTaperedCapsuleColliderFunctor;
+	friend class CGluePredictFunctor;
+	friend class CSphereColliderFunctor;
+} ALIGN16_POST;
+
+
+
+// Glue a dynamic node to a single parent node: marks the node in the sticky buffer and fills
+// its glue record with the same parent in both slots and a zero blend weight.
+inline void CSoftbody::GlueNode( uint nDynNode, uint nParentNode, float flStickiness )
+{
+	Assert( nDynNode < m_pFeModel->GetDynamicNodeCount() );
+	m_StickyBuffer.Set( nDynNode );
+
+	CParticleGlue *pRecord = &m_pParticleGlue[ nDynNode ];
+	pRecord->m_nParentNode[ 0 ] = nParentNode;
+	pRecord->m_nParentNode[ 1 ] = nParentNode;
+	pRecord->m_flWeight1 = 0;
+	pRecord->m_flStickiness = flStickiness;
+}
+
+// Glue a dynamic node to two parent nodes: marks the node in the sticky buffer and stores both
+// parents together with the blend weight towards parent [1].
+inline void CSoftbody::GlueNode( uint nDynNode, uint nParentNode0, uint nParentNode1, float flStickiness, float flWeight1 )
+{
+	Assert( nDynNode < m_pFeModel->GetDynamicNodeCount() );
+	m_StickyBuffer.Set( nDynNode );
+
+	CParticleGlue *pRecord = &m_pParticleGlue[ nDynNode ];
+	pRecord->m_nParentNode[ 0 ] = nParentNode0;
+	pRecord->m_nParentNode[ 1 ] = nParentNode1;
+	pRecord->m_flWeight1 = flWeight1;
+	pRecord->m_flStickiness = flStickiness;
+}
+
+
+// Glue a dynamic node using the parents and blend weight of an existing glue record, but with a
+// replacement stickiness. Marks the node in the sticky buffer and overwrites its glue record.
+inline void CSoftbody::GlueNode( uint nDynNode, const CParticleGlue &glueBase, float flReplacementStickiness )
+{
+	// m_pParticleGlue is indexed by dynamic node; this is the same range check the other
+	// GlueNode overloads perform, which was missing here
+	Assert( nDynNode < m_pFeModel->GetDynamicNodeCount() );
+	m_StickyBuffer.Set( nDynNode );
+	CParticleGlue &glueNode = m_pParticleGlue[ nDynNode ];
+	glueNode.m_flStickiness = flReplacementStickiness;
+	glueNode.m_flWeight1 = glueBase.m_flWeight1; // 0: only use parent [0], >0 : blend from [0] to [1]
+	glueNode.m_nParentNode[ 0 ] = glueBase.m_nParentNode[ 0 ];
+	glueNode.m_nParentNode[ 1 ] = glueBase.m_nParentNode[ 1 ];
+}
+
+
+// Bundles the data needed to collide one sphere with a softbody: the sphere itself, the softbody,
+// pointers to the softbody's dynamic node positions and node radii, and the glue parameters.
+// Collide() is implemented elsewhere.
+class CSphereColliderFunctor
+{
+public:
+	VectorAligned m_Sphere;
+	VectorAligned *m_pDynPos1;
+	const float *m_pNodeCollisionRadii;
+	CSoftbody *m_pSoftbody;
+	float m_flStickiness;
+	uint16 m_nParentNode; // needed for gluing particle to this node
+public:
+	// Default construction deliberately initializes nothing.
+	CSphereColliderFunctor(){}
+
+	CSphereColliderFunctor( CSoftbody *pSoftbody, const Vector &vSphereCenter, float flSphereRadius, float flStickiness, uint16 nParentNode )
+		: m_pSoftbody( pSoftbody )
+		, m_flStickiness( flStickiness )
+		, m_nParentNode( nParentNode )
+	{
+		// the center is assigned to m_Sphere, then the radius is stored in its w component
+		m_Sphere = vSphereCenter;
+		m_Sphere.w = flSphereRadius;
+
+		// m_pDynPos1 starts m_nStaticNodes entries into the softbody's m_pPos1 array
+		const CFeModel *pModel = m_pSoftbody->GetFeModel();
+		m_pNodeCollisionRadii = pModel->m_pNodeCollisionRadii;
+		m_pDynPos1 = m_pSoftbody->m_pPos1 + pModel->m_nStaticNodes;
+	}
+
+	void Collide( uint16 nCollisionMask );
+};
+
+
+// Bundles the data needed to collide a tapered capsule (two spheres of possibly different radii)
+// with a softbody. Besides the two spheres it caches the normalized axis between their centers,
+// the distance between them, and the radius change per unit of distance (slope).
+// Collide() is implemented elsewhere.
+class CTaperedCapsuleColliderFunctor
+{
+public:
+	VectorAligned m_vSphereCenter0;
+	VectorAligned m_vSphereCenter1;
+	VectorAligned *m_pDynPos1;
+	const float *m_pNodeCollisionRadii;
+	CSoftbody *m_pSoftbody;
+	float m_flStickiness;
+	uint16 m_nParentNode[ 2 ];
+	float m_flSlope;
+	Vector m_vAxisX;
+	float m_flDist;
+public:
+	// Default construction deliberately initializes nothing.
+	CTaperedCapsuleColliderFunctor(){}
+
+	CTaperedCapsuleColliderFunctor( CSoftbody *pSoftbody, const Vector &vSphereCenter0, float flSphereRadius0, const Vector &vSphereCenter1, float flSphereRadius1, float flStickiness, uint16 nParentNodes0, uint16 nParentNodes1 )
+		: m_pSoftbody( pSoftbody )
+		, m_flStickiness( flStickiness )
+	{
+		// each center is assigned first, then its radius is stored in the w component
+		m_vSphereCenter0 = vSphereCenter0;
+		m_vSphereCenter0.w = flSphereRadius0;
+		m_vSphereCenter1 = vSphereCenter1;
+		m_vSphereCenter1.w = flSphereRadius1;
+
+		m_nParentNode[ 0 ] = nParentNodes0;
+		m_nParentNode[ 1 ] = nParentNodes1;
+
+		// m_pDynPos1 starts m_nStaticNodes entries into the softbody's m_pPos1 array
+		const CFeModel *pModel = m_pSoftbody->GetFeModel();
+		m_pNodeCollisionRadii = pModel->m_pNodeCollisionRadii;
+		m_pDynPos1 = m_pSoftbody->m_pPos1 + pModel->m_nStaticNodes;
+
+		// NOTE(review): both divisions below use m_flDist unguarded; coincident centers give a
+		// zero distance - confirm callers never pass a degenerate capsule.
+		m_vAxisX = ( vSphereCenter1 - vSphereCenter0 ) ;
+		m_flDist = m_vAxisX.Length();
+		m_flSlope = ( flSphereRadius1 - flSphereRadius0 ) / m_flDist;
+		m_vAxisX /= m_flDist;
+	}
+
+	void Collide( uint16 nCollisionMask );
+};
+
+
+
+#endif
--- a/public/mathlib/softbody.inl
+++ b/public/mathlib/softbody.inl
@@ -0,0 +1,13 @@
+//========= Copyright © Valve Corporation, All rights reserved. ============//
+#ifndef MATHLIB_SOFTBODY_INL_HDR
+#define MATHLIB_SOFTBODY_INL_HDR
+
+#include "mathlib/softbodyenvironment.h"
+// Unregisters this softbody from its environment, frees the particle transform array and clears
+// the sticky-particle buffer.
+inline void CSoftbody::Shutdown()
+{
+	m_pEnvironment->Unregister( this );
+	MemAlloc_FreeAligned( m_pParticles );
+	m_pParticles = NULL; // don't leave a dangling pointer behind for accessors or a later free
+	m_StickyBuffer.Clear();
+}
+
+#endif // MATHLIB_SOFTBODY_INL_HDR
--- a/public/mathlib/softbodyenvironment.h
+++ b/public/mathlib/softbodyenvironment.h
@@ -0,0 +1,178 @@
+//========= Copyright © Valve Corporation, All rights reserved. ============//
+//
+// A small subset of CRnWorld
+// Note: mathlib is tentative place for this code. We will probably move it to a separate lib or dll or vphysics.dll
+//
+#ifndef MATHLIB_SOFTBODY_ENV_HDR
+#define MATHLIB_SOFTBODY_ENV_HDR
+
+#include "mathlib/vector.h"
+#include "rubikon/param_types.h"
+#include "tier1/utlincrementalvector.h"
+#include "mathlib/softbody.h"
+#include "rubikon/intersection.h"
+#include "mathlib/aabb.h"
+#include "mathlib/dynamictree.h"
+
+class CDynamicTree;
+
+class CSoftbodyCollisionSphere;
+class CSoftbodyCollisionCapsule;
+
+
+// Collision-group filter for the softbody environment: a MAX_GROUPS x MAX_GROUPS table
+// of CollisionGroupPairFlags. Member functions are defined elsewhere.
+class CSoftbodyCollisionFilter
+{
+public:
+	CSoftbodyCollisionFilter();
+
+	// Initializes the table for nGroup; defaultFlags defaults to 0.
+	// NOTE(review): which entries are written is not visible here - see the definition.
+	void InitGroup( int nGroup, CollisionGroupPairFlags defaultFlags = 0 );
+	// Tests two collision attribute sets against each other and returns a 16-bit result.
+	// NOTE(review): presumably a collision mask where 0 means "no interaction" - confirm.
+	uint16 TestSimulation( const RnCollisionAttr_t &left, const RnCollisionAttr_t &right )const;
+
+
+public:
+	enum ConstEnum_t{ MAX_GROUPS = COLLISION_GROUPS_MAX_ALLOWED };
+	CollisionGroupPairFlags m_GroupPairs[ MAX_GROUPS ][ MAX_GROUPS ];
+};
+
+
+// Common base of the softbody collision shapes. There are no virtual functions:
+// the concrete type is recorded in m_nType and GetBbox() dispatches on it by hand.
+class CSoftbodyCollisionShape
+{
+public:
+	// Stores the shape type and marks the shape as having no proxy yet (proxy id -1).
+	CSoftbodyCollisionShape( PhysicsShapeType_t type ) : m_nType( type ), m_nProxyId( -1 ) {}
+// 	CSoftbodyCollisionSphere *IsSphere();
+// 	CSoftbodyCollisionCapsule *IsCapsule();
+
+	// Shape type
+	PhysicsShapeType_t GetType( void ) const { return m_nType; }
+	// Collision attributes (read-only and mutable access)
+	const RnCollisionAttr_t &GetCollisionAttributes( void ) const { return m_CollisionAttr; }
+	RnCollisionAttr_t &GetCollisionAttributes( void ) { return m_CollisionAttr; }
+	// Proxy id; -1 until SetProxyId() is called.
+	// NOTE(review): presumably the handle of this shape in the broadphase tree - confirm.
+	int32 GetProxyId() const { return m_nProxyId; }
+	void SetProxyId( int32 nProxyId ){ m_nProxyId = nProxyId; }
+
+	// Bounding box of the concrete shape; forwards to the derived class selected by m_nType.
+	AABB_t GetBbox()const;
+protected:
+	RnCollisionAttr_t m_CollisionAttr;
+	PhysicsShapeType_t m_nType;	 // not really necessary..
+
+	int32 m_nProxyId;
+};
+
+// Sphere collision shape (type SHAPE_SPHERE): a center and a radius.
+// The constructor does not initialize m_vCenter or m_flRadius; set both before use.
+class CSoftbodyCollisionSphere: public CSoftbodyCollisionShape
+{
+public:
+	CSoftbodyCollisionSphere() : CSoftbodyCollisionShape( SHAPE_SPHERE ){}
+	void SetRadius( float flRadius ) { m_flRadius = flRadius; }
+	float GetRadius() const { return m_flRadius; }
+
+	void SetCenter( const Vector &vCenter ){ m_vCenter = vCenter; }
+	const Vector &GetCenter()const { return m_vCenter; }
+	// Bounding box of the sphere (defined elsewhere); hides the base-class GetBbox().
+	AABB_t GetBbox()const;
+protected:
+	Vector m_vCenter;
+	float m_flRadius;
+};
+
+// Capsule collision shape (type SHAPE_CAPSULE): two centers and one shared radius.
+// The constructor does not initialize the centers or the radius; set them before use.
+class CSoftbodyCollisionCapsule : public CSoftbodyCollisionShape
+{
+public:
+	CSoftbodyCollisionCapsule() : CSoftbodyCollisionShape( SHAPE_CAPSULE ) {}
+	void SetRadius( float flRadius ) { m_flRadius = flRadius; }
+	float GetRadius() const { return m_flRadius; }
+
+	// nIndex selects one of the two centers (0 or 1); it is not range-checked.
+	void SetCenter( int nIndex, const Vector &vCenter ){ m_vCenter[ nIndex ] = vCenter; }
+	const Vector &GetCenter( int nIndex )const { return m_vCenter[ nIndex ]; }
+	// Bounding box of the capsule (defined elsewhere); hides the base-class GetBbox().
+	AABB_t GetBbox()const;
+protected:
+	Vector m_vCenter[2];
+	float m_flRadius;
+};
+
+// Container for a set of softbodies plus the state they share: gravity, wind,
+// iteration count, collision filter and a broadphase tree of collision shapes.
+class CSoftbodyEnvironment
+{
+public:
+	CSoftbodyEnvironment();
+
+	// Always 0 in this implementation.
+	uint GetSoftbodySimulationFlags()const { return 0; }
+	uint GetSoftbodyIterations() const { return m_nIterations; }
+	void SetSoftbodyIterations( int nIterations ) { m_nIterations = nIterations; }
+	CDynamicTree *GetBroadphaseTree() { return &m_BroadphaseTree; }
+
+	const VectorAligned &GetGravity()const { return m_vGravity; }
+	CDebugHighlightCone& GetDebugHighlightCone() { return m_DebugHighlightCone; }
+	
+	// Softbody bookkeeping: Register appends to the list, Unregister does a find-and-fast-remove
+	// (NOTE(review): presumably does not preserve list order - confirm in CUtlIncrementalVector).
+	void Register( CSoftbody *pSoftbody ) { m_Softbodies.AddToTail( pSoftbody ); }
+	void Unregister( CSoftbody *pSoftbody ) { m_Softbodies.FindAndFastRemove( pSoftbody ); }
+	// Accumulates dt and steps every registered softbody in fixed substeps of flSubstepDt,
+	// at most nMaxSubsteps per call.
+	void Step( float dt, float flSubstepDt = 1.0f / 60.0f, int nMaxSubsteps = 3 );
+
+	int GetSoftbodyCount() const { return m_Softbodies.Count(); }
+	// i is not range-checked here.
+	CSoftbody* GetSoftbody( int i ) { return m_Softbodies[ i ]; }
+
+	// Collision shape management (defined elsewhere).
+	void Add( CSoftbodyCollisionShape * pShape );
+	void AddOrUpdate( CSoftbodyCollisionShape * pShape );
+	void Update( CSoftbodyCollisionShape * pShape );
+	void Remove( CSoftbodyCollisionShape * pShape );
+	// Wind is stored as a normalized direction in xyz and a strength in w.
+	const Vector4DAligned &GetWindDesc() const { return m_vWindDesc; }
+	void SetWind( const Vector & vWind );
+	// NOTE(review): vWindDir is stored as-is; the caller presumably must pass a unit vector.
+	void SetWindDesc( const Vector &vWindDir, float flStrength ) { m_vWindDesc.Init( vWindDir, flStrength ); }
+	// Zero strength along +X.
+	void SetNoWind() { m_vWindDesc.Init( 1, 0, 0, 0 ); }
+public:
+	CSoftbodyCollisionFilter m_Filter;
+protected:
+	Vector4DAligned m_vWindDesc; // normalized direction in x,y,z and strength in w
+	VectorAligned m_vGravity;
+	CDynamicTree m_BroadphaseTree;
+	int m_nIterations;
+	CDebugHighlightCone m_DebugHighlightCone;
+	CUtlIncrementalVector< CSoftbody, CSoftbody::CWorldIndexPred > m_Softbodies;
+	float m_flAccumulatedTimeSlack; // time passed to Step() that has not been simulated yet
+};
+
+// Default environment: iteration count of 1, empty time accumulator,
+// gravity of ( 0, 0, -360 ) and no wind.
+inline CSoftbodyEnvironment::CSoftbodyEnvironment()
+	: m_nIterations( 1 )
+	, m_flAccumulatedTimeSlack( 0.0f )
+{
+	m_vGravity.Init( 0, 0, -360 );
+	SetNoWind();
+}
+
+
+// Advances every registered softbody by a whole number of fixed-size substeps.
+//   dt           - elapsed time to add to the accumulator
+//   flSubstepDt  - duration of one fixed substep; must be > 0
+//   nMaxSubsteps - cap on the substeps run per call; when the cap is reached the
+//                  leftover accumulated time is discarded instead of carried over
+// Time that does not fill a whole substep stays in m_flAccumulatedTimeSlack for the next call.
+inline void CSoftbodyEnvironment::Step( float dt, float flSubstepDt, int nMaxSubsteps )
+{
+	Assert( flSubstepDt > 0.0f );
+	if ( flSubstepDt <= 0.0f )
+		return; // a non-positive substep would divide by zero below
+
+	m_flAccumulatedTimeSlack += dt;
+	if ( m_flAccumulatedTimeSlack < flSubstepDt )
+		return;
+
+	int nSubsteps = int( m_flAccumulatedTimeSlack / flSubstepDt );
+	if ( nSubsteps < nMaxSubsteps )
+	{
+		// Consume exactly the time about to be simulated and keep the remainder.
+		// (Subtracting floorf( flSubsteps * flSubstepDt ) removed the floor of the accumulated
+		// time itself, which is 0 for sub-second values, so simulated time was never consumed.)
+		m_flAccumulatedTimeSlack -= nSubsteps * flSubstepDt;
+	}
+	else
+	{
+		// Falling behind: run the maximum and drop the backlog.
+		nSubsteps = nMaxSubsteps;
+		m_flAccumulatedTimeSlack = 0.0f;
+	}
+
+	for ( int i = 0; i < m_Softbodies.Count(); ++i )
+	{
+		CSoftbody *pSoftbody = m_Softbodies[ i ];
+		for ( int j = 0; j < nSubsteps; ++j )
+		{
+			pSoftbody->Step( flSubstepDt );
+		}
+	}
+}
+
+// Manual dispatch on the stored shape type: capsules use the capsule bbox, every other
+// type is treated as a sphere (asserting that it really is one).
+inline AABB_t CSoftbodyCollisionShape::GetBbox()const
+{
+	if ( m_nType == SHAPE_CAPSULE )
+	{
+		return static_cast< const CSoftbodyCollisionCapsule* >( this )->GetBbox();
+	}
+
+	Assert( m_nType == SHAPE_SPHERE );
+	return static_cast< const CSoftbodyCollisionSphere* >( this )->GetBbox();
+}
+
+#endif
+
--- a/public/mathlib/sphere.h
+++ b/public/mathlib/sphere.h
@@ -0,0 +1,10 @@
+//========= Copyright © Valve Corporation, All rights reserved. ============//
+#ifndef MATHLIB_SPHERE_HDR
+#define MATHLIB_SPHERE_HDR
+
+#include "rubikon/param_types.h"
+#include "vector.h"
+
+// Casts a ray against a sphere of radius flRadius and writes the result to out (defined elsewhere).
+// NOTE(review): presumably m is the sphere center, p the ray origin and d the ray delta - confirm against the definition.
+void CastSphereRay( CShapeCastResult& out, const Vector &m, const Vector& p, const Vector& d, float flRadius );
+
+#endif
--- a/public/mathlib/spherical_geometry.h
+++ b/public/mathlib/spherical_geometry.h
@@ -0,0 +1,73 @@
+//====== Copyright © 2007-2007, Valve Corporation, All rights reserved. =======//
+//
+// Purpose: Functions for spherical geometry.
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef SPHERICAL_GEOMETRY_H
+#define SPHERICAL_GEOMETRY_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include <math.h>
+#include <float.h>
+
+// see http://mathworld.wolfram.com/SphericalTrigonometry.html
+
+// Return the spherical (great-circle) distance, in radians, between 2 points on the unit sphere.
+// Both inputs must be unit length (asserted to within 1e-3). The result is in [0, pi].
+FORCEINLINE float UnitSphereLineSegmentLength( Vector const &a, Vector const &b )
+{
+	// check unit length
+	Assert( fabs( VectorLength( a ) - 1.0 ) < 1.0e-3 );
+	Assert( fabs( VectorLength( b ) - 1.0 ) < 1.0e-3 );
+
+	// Rounding (and the tolerance accepted above) can push the dot product slightly outside
+	// [-1,1], where acos() returns NaN. Clamp so coincident points give 0 and antipodal points pi.
+	float flDot = DotProduct( a, b );
+	if ( flDot > 1.0f )
+	{
+		flDot = 1.0f;
+	}
+	else if ( flDot < -1.0f )
+	{
+		flDot = -1.0f;
+	}
+	return acos( flDot );
+}
+
+
+// given 3 points on the unit sphere, return the spherical area (in steradians) of the triangle they form.
+// valid for "small" triangles.
+// Returns 0 when any two of the points coincide (zero-length side).
+FORCEINLINE float UnitSphereTriangleArea( Vector const &a, Vector const &b , Vector const &c )
+{
+	// side lengths (arc lengths); each side is named after the opposite vertex
+	float flLengthA = UnitSphereLineSegmentLength( b, c );
+	float flLengthB = UnitSphereLineSegmentLength( c, a );
+	float flLengthC = UnitSphereLineSegmentLength( a, b );
+	
+	if ( ( flLengthA == 0. ) || ( flLengthB == 0. ) || ( flLengthC == 0. ) )
+		return 0.;											// zero area triangle
+			
+	// now, find the 3 inscribed angles for the triangle, using the spherical half-angle
+	// formulas: tan(A/2) = sqrt( sin(s-b) sin(s-c) / ( sin(s) sin(s-a) ) ), s = half the perimeter
+	float flHalfSumLens = 0.5 * ( flLengthA + flLengthB + flLengthC );
+	float flSinSums = sin( flHalfSumLens );
+	float flSinSMinusA= sin( flHalfSumLens - flLengthA );
+	float flSinSMinusB= sin( flHalfSumLens - flLengthB );
+	float flSinSMinusC= sin( flHalfSumLens - flLengthC );
+	
+	float flTanAOver2 = sqrt ( ( flSinSMinusB * flSinSMinusC ) / ( flSinSums * flSinSMinusA ) );
+	float flTanBOver2 = sqrt ( ( flSinSMinusA * flSinSMinusC ) / ( flSinSums * flSinSMinusB ) );
+	float flTanCOver2 = sqrt ( ( flSinSMinusA * flSinSMinusB ) / ( flSinSums * flSinSMinusC ) );
+
+	// Girard's formula : area = sum of angles - pi.
+	return 2.0 * ( atan( flTanAOver2 ) + atan( flTanBOver2 ) + atan( flTanCOver2 ) ) - M_PI;
+}
+
+// spherical harmonics-related functions. Best explanation at http://www.research.scea.com/gdc2003/spherical-harmonic-lighting.pdf
+
+// Evaluate associated legendre polynomial P( l, m ) at flX, using recurrence relation
+float AssociatedLegendrePolynomial( int nL, int nM, float flX );
+
+// Evaluate order N spherical harmonic with spherical coordinates
+// nL = band, 0..N
+// nM = -nL .. nL
+// theta = 0..M_PI
+// phi = 0.. 2 * M_PI
+float SphericalHarmonic( int nL, int nM, float flTheta, float flPhi );
+
+// evaluate spherical harmonic with normalized vector direction
+float SphericalHarmonic( int nL, int nM, Vector const &vecDirection );
+
+
+#endif // SPHERICAL_GEOMETRY_H
--- a/public/mathlib/ssecholesky.h
+++ b/public/mathlib/ssecholesky.h
@@ -0,0 +1,49 @@
+//========= Copyright c 1996-2009, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+#ifndef SSE_CHOLESKY_HDR
+#define SSE_CHOLESKY_HDR
+
+#include "mathlib/vector.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/ssemath.h"
+
+// This class holds the Cholesky decomposition of four 3x3 matrices at once
+// (one matrix per SIMD lane). Only the lower triangle of each input matrix is passed in.
+struct SimdCholesky3x3_t
+{
+	/// lower diagonal matrix L such that LL' = input matrix
+	fltx4 m_10, m_20, m_21; // These are off-diagonals used in computations
+	fltx4 m_inv00, m_inv11, m_inv22; // These are reciprocals of diagonals used in all computations
+public:
+	/// @group Construction and initialization {
+	/// aRC is row R, column C of the input matrices.
+	SimdCholesky3x3_t( const fltx4 &a00, const fltx4 & a10, const fltx4 & a11, const fltx4 & a20, const fltx4 & a21, const fltx4 & a22 )
+	{
+		Init( a00, a10, a11, a20, a21, a22 );
+	}
+	void Init( const fltx4 & a00, const fltx4 & a10, const fltx4 & a11, const fltx4 & a20, const fltx4 & a21, const fltx4 & a22 );
+
+	//@}
+
+	/// Validity of the decomposition (defined elsewhere).
+	/// NOTE(review): presumably IsValid() is true only when all four lanes are valid and
+	/// GetValidMask() gives the per-lane result - confirm against the definitions.
+	bool IsValid( )const ;
+	fltx4 GetValidMask( )const;
+
+	/// Triangular solves used by Solve(); SolveLeft is applied first (L), then SolveRight (L').
+	const FourVectors SolveRight( const FourVectors &b );
+	const FourVectors SolveLeft( const FourVectors &b );
+
+	/// Using this decomposition LL', solve the following equation and return the result: LL' x = rhs
+	const FourVectors Solve( const FourVectors &rhs )
+	{
+		// L R x =           b
+		//   R x =      L^-1 b
+		//     x = R^-1 L^-1 b
+		return SolveRight( SolveLeft( rhs ) );
+	}
+};
+
+
+
+#endif
--- a/public/mathlib/ssemath.h
+++ b/public/mathlib/ssemath.h
--- a/public/mathlib/ssemath_360.h
+++ b/public/mathlib/ssemath_360.h
@@ -0,0 +1,801 @@
+//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//
+//
+// Purpose: Implementation of our SIMD functions for the 360.
+//==============================================================//
+
+
+#ifndef DBG_H
+#include "tier0/dbg.h"
+#endif
+
+
+//---------------------------------------------------------------------
+// X360 implementation
+//---------------------------------------------------------------------
+
+// Mutable reference to float component idx of a (views a as a fltx4_union).
+// idx is not range-checked.
+FORCEINLINE float & FloatSIMD( fltx4 & a, int idx )
+{
+	fltx4_union & a_union = (fltx4_union &)a;
+	return a_union.m128_f32[idx];
+}
+
+// Mutable reference to component idx of a viewed as a 32-bit unsigned integer.
+// idx is not range-checked.
+FORCEINLINE unsigned int & UIntSIMD( fltx4 & a, int idx )
+{
+	fltx4_union & a_union = (fltx4_union &)a;
+	return a_union.m128_u32[idx];
+}
+
+// a + b (wraps __vaddfp)
+FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
+{
+	return __vaddfp( a, b );
+}
+
+// a - b (wraps __vsubfp)
+FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )				// a-b
+{
+	return __vsubfp( a, b );
+}
+
+// a * b (wraps __vmulfp)
+FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )				// a*b
+{
+	return __vmulfp( a, b );
+}
+
+// Multiply-add: a*b + c (wraps __vmaddfp)
+FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )				// a*b + c
+{
+	return __vmaddfp( a, b, c );
+}
+
+// Negative multiply-subtract: c - a*b (wraps __vnmsubfp).
+// The stray ';' that followed the function body has been removed (extra-semicolon warning).
+FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )				// c - a*b
+{
+	return __vnmsubfp( a, b, c );
+}
+
+// 3-component dot product of a and b (wraps __vmsum3fp).
+FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
+{
+	return __vmsum3fp( a, b );
+}
+
+// 4-component dot product of a and b (wraps __vmsum4fp).
+FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
+{
+	return __vmsum4fp( a, b );
+}
+
+// Per-component sine of an angle in radians (forwards to XMVectorSin).
+FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
+{
+	return XMVectorSin( radians );
+}
+
+// Sine and cosine of radians written to sine / cosine. On this platform it is identical to
+// SinCosSIMD: all four components are computed by XMVectorSinCos.
+FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
+{
+	XMVectorSinCos( &sine, &cosine, radians ); 	
+}
+
+// Sine and cosine of radians written to sine / cosine (forwards to XMVectorSinCos).
+FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )			
+{
+	XMVectorSinCos( &sine, &cosine, radians ); 	
+}
+
+// Cosine of radians written to the cosine out-parameter (note: unlike SinSIMD, the
+// result is not returned). Forwards to XMVectorCos.
+FORCEINLINE void CosSIMD( fltx4 &cosine, const fltx4 &radians )				
+{
+	cosine = XMVectorCos( radians ); 	
+}
+
+// Per-component arcsine (forwards to XMVectorASin).
+FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
+{
+	return XMVectorASin( sine );
+}
+
+// Per-component arccosine (forwards to XMVectorACos).
+FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
+{
+	return XMVectorACos( cs );
+}
+
+// tan^-1(a/b) .. ie, pass sin in as a and cos in as b (forwards to XMVectorATan2)
+FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
+{
+	return XMVectorATan2( a, b );
+}
+
+// DivSIMD defined further down, since it uses ReciprocalSIMD
+
+// Per-component maximum (wraps __vmaxfp)
+FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )				// max(a,b)
+{
+	return __vmaxfp( a, b );
+}
+
+// Per-component minimum (wraps __vminfp)
+FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )				// min(a,b)
+{
+	return __vminfp( a, b );
+}
+
+// Bitwise AND of the two registers (wraps __vand)
+FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )				// a & b
+{
+    return __vand( a, b );
+}
+
+// Bitwise (~a) & b, matching the SSE argument convention.
+FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )			// ~a & b
+{
+	// NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
+    return __vandc( b, a );
+}
+
+// Bitwise XOR of the two registers (wraps __vxor)
+FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )				// a ^ b
+{
+    return __vxor( a, b );
+}
+
+// Bitwise OR of the two registers (wraps __vor)
+FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )				// a | b
+{
+    return __vor( a, b );
+}
+
+// Per-component negation (forwards to XMVectorNegate)
+FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
+{
+	return XMVectorNegate(a);
+}
+
+// True when every component of a compares equal to zero (float compare against Four_Zeros).
+FORCEINLINE bool IsAllZeros( const fltx4 & a )								// all floats of a zero?
+{
+	unsigned int equalFlags = 0;
+    __vcmpeqfpR( a, Four_Zeros, &equalFlags );
+    return XMComparisonAllTrue( equalFlags );
+}
+
+// True when at least one component of a compares equal to zero.
+FORCEINLINE bool IsAnyZeros( const fltx4 & a )								// any floats are zero?
+{
+	unsigned int conditionregister;
+	XMVectorEqualR(&conditionregister, a, XMVectorZero());
+	return XMComparisonAnyTrue(conditionregister);
+}
+
+// True when x, y or z of a compares equal to zero; w is ignored by overwriting it with x
+// before the four-wide compare.
+FORCEINLINE bool IsAnyXYZZero( const fltx4 &a )								// are any of x,y,z zero?
+{
+	// copy a's x component into w, in case w was zero.
+	fltx4 temp = __vrlimi(a, a, 1, 1);
+	unsigned int conditionregister;
+	XMVectorEqualR(&conditionregister, temp, XMVectorZero());
+	return XMComparisonAnyTrue(conditionregister);
+}
+
+/// for branching when a.xyzw > b.xyzw in ALL four components
+FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
+{
+	unsigned int cr;
+	XMVectorGreaterR(&cr,a,b);
+	return XMComparisonAllTrue(cr);
+}
+
+/// for branching when a.xyzw >= b.xyzw in ALL four components
+FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
+{
+	unsigned int cr;
+	XMVectorGreaterOrEqualR(&cr,a,b);
+	return XMComparisonAllTrue(cr);
+}
+
+/// for branching when a.xyzw > b.xyzw in ANY of the four components
+FORCEINLINE bool IsAnyGreaterThan( const fltx4 &a, const fltx4 &b )
+{
+	unsigned int cr;
+	XMVectorGreaterR(&cr,a,b);
+	return XMComparisonAnyTrue(cr);
+}
+
+/// for branching when a.xyzw >= b.xyzw in ANY of the four components
+FORCEINLINE bool IsAnyGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
+{
+	unsigned int cr;
+	XMVectorGreaterOrEqualR(&cr,a,b);
+	return XMComparisonAnyTrue(cr);
+}
+
+// For branching if all a.xyzw == b.xyzw (floating-point equality in every component)
+FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
+{
+	unsigned int cr;
+	XMVectorEqualR(&cr,a,b);
+	return XMComparisonAllTrue(cr);
+}
+
+
+// Builds a 4-bit mask of the sign bits of a: bit 0 = sign(x), bit 1 = sign(y),
+// bit 2 = sign(z), bit 3 = sign(w).
+FORCEINLINE int TestSignSIMD( const fltx4 & a )								// mask of which floats have the high bit set
+{
+	// NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
+	const fltx4_union & bits = (const fltx4_union &)a;
+
+	int nSignMask = 0;
+	for ( int i = 0; i < 4; ++i )
+	{
+		// move component i's sign bit (bit 31) down to bit i of the mask
+		nSignMask |= ( bits.m128_u32[i] >> 31 ) << i;
+	}
+	return nSignMask;
+}
+
+// Squelch the w component of a vector to +0.0.
+// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
+// Implemented by inserting the w lane (mask 1) of a zero vector into a.
+FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
+{
+	return __vrlimi( a, __vzero(), 1, 0 );
+}
+
+// True when any component of a has its sign bit set. The sign bits are isolated with an
+// integer mask and compared against zero, so -0.0 and "-NaN" also count as negative.
+FORCEINLINE bool IsAnyNegative( const fltx4 & a )							// (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
+{
+	// NOTE: this tests the top bits of each vector element using integer math
+	//       (so it ignores NaNs - it will return true for "-NaN")
+	unsigned int equalFlags = 0;
+    fltx4 signMask = __vspltisw( -1 );             // 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF (low order 5 bits of each element = 31)
+    signMask       = __vslw( signMask, signMask ); // 0x80000000 0x80000000 0x80000000 0x80000000 
+	__vcmpequwR( Four_Zeros, __vand( signMask, a ), &equalFlags );
+	return !XMComparisonAllTrue( equalFlags );
+}
+
+// Per-component a == b; each lane is all ones when true, zero otherwise.
+FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )				// (a==b) ? ~0:0
+{
+    return __vcmpeqfp( a, b );
+}
+
+
+// Per-component a > b; each lane is all ones when true, zero otherwise.
+FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )				// (a>b) ? ~0:0
+{
+    return __vcmpgtfp( a, b );
+}
+
+// Per-component a >= b; each lane is all ones when true, zero otherwise.
+FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )				// (a>=b) ? ~0:0
+{
+    return __vcmpgefp( a, b );
+}
+
+// Per-component a < b, implemented as b > a with the operands swapped.
+FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )				// (a<b) ? ~0:0
+{
+    return __vcmpgtfp( b, a );
+}
+
+// Per-component a <= b, implemented as b >= a with the operands swapped.
+FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )				// (a<=b) ? ~0:0
+{
+    return __vcmpgefp( b, a );
+}
+
+// Per-component test that a lies within [-b, b] (forwards to XMVectorInBounds).
+FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )		// (a <= b && a >= -b) ? ~0 : 0
+{
+	return XMVectorInBounds( a, b );
+}
+
+// Like CmpEqSIMD but yields 1.0f / 0.0f per lane: the compare mask is ANDed with Four_Ones.
+FORCEINLINE fltx4 Cmp01EqSIMD( const fltx4 & a, const fltx4 & b )				// (a==b) ? 1.0:0
+{
+    return AndSIMD( Four_Ones, __vcmpeqfp( a, b ) );
+}
+
+
+// Like CmpGtSIMD but yields 1.0f / 0.0f per lane.
+FORCEINLINE fltx4 Cmp01GtSIMD( const fltx4 & a, const fltx4 & b )				// (a>b) ? 1.0:0
+{
+    return AndSIMD( Four_Ones, __vcmpgtfp( a, b ) );
+}
+
+// Like CmpGeSIMD but yields 1.0f / 0.0f per lane.
+FORCEINLINE fltx4 Cmp01GeSIMD( const fltx4 & a, const fltx4 & b )				// (a>=b) ? 1.0:0
+{
+    return AndSIMD( Four_Ones, __vcmpgefp( a, b ) );
+}
+
+// Like CmpLtSIMD but yields 1.0f / 0.0f per lane.
+FORCEINLINE fltx4 Cmp01LtSIMD( const fltx4 & a, const fltx4 & b )				// (a<b) ? 1.0:0
+{
+    return AndSIMD( Four_Ones, __vcmpgtfp( b, a ) );
+}
+
+// Like CmpLeSIMD but yields 1.0f / 0.0f per lane.
+FORCEINLINE fltx4 Cmp01LeSIMD( const fltx4 & a, const fltx4 & b )				// (a<=b) ? 1.0:0
+{
+    return AndSIMD( Four_Ones, __vcmpgefp( b, a ) );
+}
+
+// Like CmpInBoundsSIMD but yields 1.0f / 0.0f per lane.
+FORCEINLINE fltx4 Cmp01InBoundsSIMD( const fltx4 & a, const fltx4 & b )		// (a <= b && a >= -b) ? 1.0 : 0
+{
+	return AndSIMD( Four_Ones, XMVectorInBounds( a, b ) );
+}
+
+
+// returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue
+// (bitwise select via __vsel; masks are expected to come from the Cmp*SIMD functions)
+FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
+{
+    return __vsel( OldValue, NewValue, ReplacementMask );
+}
+
+// AKA "Broadcast", "Splat": returns a vector with flValue in all four components.
+// The value is loaded from its stack copy, which is asserted to be 4-byte aligned.
+FORCEINLINE fltx4 ReplicateX4( float flValue )					//  a,a,a,a
+{
+	// NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
+	float * pValue = &flValue;
+	Assert( pValue );
+    Assert( ((unsigned int)pValue & 3) == 0);
+	return __vspltw( __lvlx( pValue, 0 ), 0 );
+}
+
+// Broadcast of the float that pValue points to; pValue must not be NULL (asserted).
+FORCEINLINE fltx4 ReplicateX4( const float *pValue )					//  a,a,a,a
+{
+	Assert( pValue );
+	return __vspltw( __lvlx( pValue, 0 ), 0 );
+}
+
+/// replicate a single 32 bit integer value to all 4 components of an m128
+/// (the bits are copied unchanged; no int-to-float conversion takes place)
+FORCEINLINE fltx4 ReplicateIX4( int nValue )
+{
+	// NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
+	int * pValue = &nValue;
+	Assert( pValue );
+    Assert( ((unsigned int)pValue & 3) == 0);
+	return __vspltw( __lvlx( pValue, 0 ), 0 );
+}
+
+// Round each component towards positive infinity (wraps __vrfip)
+FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
+{
+	return __vrfip(a);
+}
+
+// Round each component towards the nearest integer (wraps __vrfin)
+FORCEINLINE fltx4 RoundSIMD( const fltx4 &a )
+{
+	return __vrfin(a);
+}
+
+// Round each component towards negative infinity (wraps __vrfim)
+FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
+{
+	return __vrfim(a);
+}
+
+// Approximate per-component square root (forwards to XMVectorSqrtEst).
+FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )				// sqrt(a), more or less
+{
+	// This is emulated from rsqrt
+	return XMVectorSqrtEst( a );
+}
+
+// Per-component square root (forwards to XMVectorSqrt).
+FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )					// sqrt(a)
+{
+	// This is emulated from rsqrt
+	return XMVectorSqrt( a );
+}
+
+// Hardware estimate of 1/sqrt(a) per component (wraps __vrsqrtefp); zero inputs are not guarded.
+FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )		// 1/sqrt(a), more or less
+{
+    return __vrsqrtefp( a );
+}
+
+// Estimate of 1/sqrt(a) that avoids dividing by zero: components equal to zero are
+// replaced by Four_Epsilons before the estimate is taken.
+FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
+{
+	// lanes that are exactly zero contribute an epsilon; all other lanes contribute nothing
+	const fltx4 vEpsilonWhereZero = AndSIMD( Four_Epsilons, CmpEqSIMD( a, Four_Zeros ) );
+	return ReciprocalSqrtEstSIMD( OrSIMD( a, vEpsilonWhereZero ) );
+}
+
+// 1/sqrt(a) per component (forwards to XMVectorReciprocalSqrt).
+FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )			// 1/sqrt(a)
+{
+	// This uses Newton-Raphson to improve the HW result
+ 	return XMVectorReciprocalSqrt( a );
+}
+
+// Hardware estimate of 1/a per component (wraps __vrefp).
+FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )			// 1/a, more or less
+{
+    return __vrefp( a );
+}
+
+/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
+/// No error checking! (zero components are passed straight through to XMVectorReciprocal)
+FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a )				// 1/a
+{
+	// This uses Newton-Raphson to improve the HW result
+	return XMVectorReciprocal( a );
+}
+
+// a/b, computed as a * (1/b) using ReciprocalSIMD.
+// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need DivEstSIMD?)
+FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )	// a/b
+{
+	return MulSIMD( ReciprocalSIMD( b ), a );
+}
+
+
+// CHRISG: is it worth doing integer bitfiddling for this?
+// 2^x for all values (the antilog)
+// NOTE(review): relies on XMVectorExp being the base-2 exponential in this XDK - confirm.
+FORCEINLINE fltx4 PowerOfTwoSIMD( const fltx4 &toPower )
+{
+	return XMVectorExp(toPower);
+}
+
+// Clamps the components of a vector to a specified minimum and maximum range
+// (per component; forwards to XMVectorClamp).
+FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
+{
+	return XMVectorClamp(in, min, max);
+}
+
+// Loads four floats from memory with no alignment requirement (forwards to XMLoadVector4).
+FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
+{
+	return XMLoadVector4( pSIMD );
+}
+
+// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec). 
+// Forwards to XMLoadVector3, so only three floats are read from pSIMD.
+FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
+{
+	return XMLoadVector3( pSIMD );
+}
+
+// load a single unaligned float into the x component of a SIMD word
+// (the other components are whatever __lvlx leaves there; do not rely on them)
+FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
+{
+	return __lvlx( pFlt, 0 );
+}
+
+// Loads a fltx4 by dereferencing pSIMD as a fltx4 pointer; pSIMD must satisfy fltx4 alignment.
+FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
+{
+	return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
+}
+
+
+// Loads one aligned SIMD word of shorts (forwards to XMLoadVector4A).
+FORCEINLINE shortx8 LoadAlignedShortSIMD( const void *pSIMD )
+{
+	return XMLoadVector4A( pSIMD );
+}
+
+// Loads one SIMD word of shorts with no alignment requirement (forwards to XMLoadVector4).
+FORCEINLINE shortx8 LoadUnalignedShortSIMD( const void *pSIMD )
+{
+	return XMLoadVector4( pSIMD );
+}
+
+// Stores a to pSIMD by writing through a fltx4 pointer; pSIMD must satisfy fltx4 alignment.
+FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
+{
+	*( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
+}
+
+// Stores all four floats of a to pSIMD with no alignment requirement (forwards to XMStoreVector4).
+FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
+{
+	XMStoreVector4( pSIMD, a );
+}
+
+// Stores x, y and z of a to pSIMD with no alignment requirement (forwards to XMStoreVector3).
+FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
+{
+	XMStoreVector3( pSIMD, a );
+}
+
+
+
+// Fixed-point conversion and save as SIGNED INTS.
+// pDest->x = Int (vSrc.x)
+// note: some architectures have means of doing 
+// fixed point conversion when the fix depth is
+// specified as an immediate.. but there is no way 
+// to guarantee an immediate as a parameter to function
+// like this.
+// Here the conversion uses a fixed-point depth of 0 and the result is written with an
+// aligned store to pDest.
+FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
+{
+	fltx4 asInt = __vctsxs( vSrc, 0 );
+	XMStoreVector4A(pDest->Base(), asInt);
+}
+
+// Transposes, in place, the 4x4 matrix whose rows are x, y, z and w.
+FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
+{
+	XMMATRIX xyzwMatrix = _XMMATRIX( x, y, z, w );
+	xyzwMatrix = XMMatrixTranspose( xyzwMatrix );
+	x = xyzwMatrix.r[0];
+	y = xyzwMatrix.r[1];
+	z = xyzwMatrix.r[2];
+	w = xyzwMatrix.r[3];
+}
+
+// Return zero in the fastest way -- faster even than loading.
+FORCEINLINE fltx4 LoadZeroSIMD( void )
+{
+	return XMVectorZero();
+}
+
+// Return one (1.0 in every component) in the fastest way -- faster even than loading.
+FORCEINLINE fltx4 LoadOneSIMD( void )
+{
+	return XMVectorSplatOne();
+}
+
+// Returns a vector with a.x replicated into all four components.
+FORCEINLINE fltx4 SplatXSIMD( fltx4 a )
+{
+	return XMVectorSplatX( a );
+}
+
+// Returns a vector with a.y replicated into all four components.
+FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
+{
+	return XMVectorSplatY( a );
+}
+
+// Returns a vector with a.z replicated into all four components.
+FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
+{
+	return XMVectorSplatZ( a );
+}
+
+// Returns a vector with a.w replicated into all four components.
+FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
+{
+	return XMVectorSplatW( a );
+}
+
+// Returns a with its x component replaced by the x component of the second argument
+// (__vrlimi insert mask 8 = x lane, no rotation).
+FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
+{
+	return __vrlimi(a, x, 8, 0);
+}
+
+// Returns a with its y component replaced by the y component of the second argument
+// (__vrlimi insert mask 4 = y lane, no rotation).
+FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
+{
+	return __vrlimi(a, y, 4, 0);
+}
+
+// Returns a with its z component replaced by the z component of the second argument
+// (__vrlimi insert mask 2 = z lane, no rotation).
+FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
+{
+	return __vrlimi(a, z, 2, 0);
+}
+
+// Returns a with its w component replaced by the w component of the second argument
+// (__vrlimi insert mask 1 = w lane, no rotation).
+FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
+{
+	return __vrlimi(a, w, 1, 0);
+}
+
+
+// Rotates the components of a left by one lane: all four lanes (mask 8|4|2|1) are taken
+// from a rotated by 1.
+FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
+{
+	fltx4 compareOne = a;
+	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 1 );
+}
+
+// Rotates the components of a left by two lanes.
+FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
+{
+	fltx4 compareOne = a;
+	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );
+}
+
+// Rotates the components of a right by one lane, expressed as a left rotation by three.
+FORCEINLINE fltx4 RotateRight( const fltx4 & a )
+{
+	fltx4 compareOne = a;
+	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 3 );
+}
+
+// Rotates the components of a right by two lanes; for a 4-lane vector this is the same
+// operation as RotateLeft2, hence the identical rotation count of 2.
+FORCEINLINE fltx4 RotateRight2( const fltx4 & a )
+{
+	fltx4 compareOne = a;
+	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );
+}
+
+
+
+
+// find the lowest component of a.x, a.y, a.z,
+// and replicate it to the whole return value.
+// ignores a.w.
+// Though this is only five instructions long,
+// they are all dependent, making this stall city.
+// Forcing this inline should hopefully help with scheduling.
+FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
+{
+	// a is [x,y,z,G] (where G is garbage)
+	// rotate left by one 
+	fltx4 compareOne = a ;
+	compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
+	// compareOne is [y,z,G,G]
+	fltx4 retval = MinSIMD( a, compareOne );
+	// retVal is [min(x,y), min(y,z), G, G]
+	compareOne = __vrlimi( compareOne, a, 8 , 2);
+	// compareOne is [z, G, G, G]
+	retval = MinSIMD( retval, compareOne );
+	// retVal = [ min(min(x,y),z), G, G, G ]
+	
+	// splat the x component out to the whole vector and return
+	return SplatXSIMD( retval );
+}
+
+// find the highest component of a.x, a.y, a.z,
+// and replicate it to the whole return value.
+// ignores a.w.
+// Though this is only five instructions long,
+// they are all dependent, making this stall city.
+// Forcing this inline should hopefully help with scheduling.
+FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
+{
+	// a is [x,y,z,G] (where G is garbage)
+	// rotate left by one 
+	fltx4 compareOne = a ;
+	compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
+	// compareOne is [y,z,G,G]
+	fltx4 retval = MaxSIMD( a, compareOne );
+	// retVal is [max(x,y), max(y,z), G, G]
+	compareOne = __vrlimi( compareOne, a, 8 , 2);
+	// compareOne is [z, G, G, G]
+	retval = MaxSIMD( retval, compareOne );
+	// retVal = [ max(max(x,y),z), G, G, G ]
+
+	// splat the x component out to the whole vector and return
+	return SplatXSIMD( retval );
+}
+
+
+
+// ------------------------------------
+// INTEGER SIMD OPERATIONS.
+// ------------------------------------
+
+// Load 4 aligned words into a SIMD register (forwards to XMLoadVector4A)
+FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD)
+{
+	return XMLoadVector4A(pSIMD);
+}
+
+// Load 4 unaligned words into a SIMD register (forwards to XMLoadVector4)
+FORCEINLINE i32x4 LoadUnalignedIntSIMD(const void * RESTRICT pSIMD)
+{
+	return XMLoadVector4( pSIMD );
+}
+
+// save into four words, 16-byte aligned (the bits of a are written unchanged)
+FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
+{
+	*( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
+}
+
+// Overload that stores the bits of a into the storage of an intx4 (via its Base() pointer).
+FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
+{
+	*( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
+}
+
+// Stores four words to pSIMD with no alignment requirement (forwards to XMStoreVector4).
+FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
+{
+	XMStoreVector4(pSIMD, a);
+}
+
+// Load four consecutive uint16's, and turn them into floating point numbers.
+// This function isn't especially fast and could be made faster if anyone is
+// using it heavily.
+// Forwards to XMLoadUShort4, treating pInts as an XMUSHORT4.
+FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
+{
+	return XMLoadUShort4(reinterpret_cast<const XMUSHORT4 *>(pInts));
+}
+
+// returns { a.x, a.z, b.x, b.z }
+// combine two fltx4s by throwing away every other field.
+// (permute indices 0,2 select from a and 4,6 select from b)
+FORCEINLINE fltx4 CompressSIMD( fltx4 const & a, fltx4 const &b )
+{
+	return XMVectorPermute( a, b, XMVectorPermuteControl( 0, 2, 4, 6  )  );
+}
+
+// returns { a.x, b.x, c.x, d.x }
+// combine 4 fltx4s by throwing away 3/4s of the fields
+// TODO: make more efficient by doing this in a parallel way at the caller
+//    Compress4SIMD(FourVectors.. )
+// Each step rotates the source so that its x lands in the destination lane, then inserts
+// just that lane (mask 4 = y, 2 = z, 1 = w).
+FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d )
+{
+	fltx4 abcd = __vrlimi( a, b, 4, 3 );  // a.x, b.x, a.z, a.w
+	abcd = __vrlimi( abcd, c, 2, 2 );  // ax, bx, cx, aw
+	abcd = __vrlimi( abcd, d, 1, 1 );  // ax, bx, cx, dx
+
+	return abcd;
+}
+
+// Take a fltx4 containing fixed-point uints and 
+// return them as single precision floats. No
+// fixed point conversion is done.
+// (wraps __vcfux with a fixed-point depth of 0)
+FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA )
+{
+	return __vcfux( vSrcA, 0 );
+}
+
+// Take a fltx4 containing fixed-point sints and 
+// return them as single precision floats. No 
+// fixed point conversion is done.
+// (wraps __vcfsx with a fixed-point depth of 0)
+FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
+{
+	return __vcfsx( vSrcA, 0 );
+}
+
+// Take a fltx4 containing fixed-point uints and 
+// return them as single precision floats. Each uint
+// will be divided by 2^immed after conversion
+// (eg, this is fixed point math). 
+// This is a macro because uImmed must be a compile-time immediate.
+/* as if:
+   FORCEINLINE fltx4 UnsignedFixedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
+   {
+   return __vcfux( vSrcA, uImmed );
+   }
+*/
+#define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfux( (vSrcA), (uImmed) ))
+
+// Take a fltx4 containing fixed-point sints and 
+// return them as single precision floats. Each int
+// will be divided by 2^immed (eg, this is fixed point
+// math). 
+// This is a macro because uImmed must be a compile-time immediate.
+/* as if:
+   FORCEINLINE fltx4 SignedFixedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
+   {
+   return __vcfsx( vSrcA, uImmed );
+   }
+*/
+#define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfsx( (vSrcA), (uImmed) ))
+
+// set all components of a vector to a signed immediate int number.
+// This is a macro because the value must be a compile-time immediate.
+/* as if:
+   FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate)
+   {
+   return __vspltisw( toImmediate );
+   }
+*/
+#define IntSetImmediateSIMD(x) (__vspltisw(x))
+
+/*
+  works on fltx4's as if they are four uints.
+  the first parameter contains the words to be shifted,
+  the second contains the amount to shift by AS INTS
+  (only the low 5 bits of each shift word are used)
+
+  for i = 0 to 3
+  shift = vSrcB_i*32:(i*32)+4
+  vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
+*/
+FORCEINLINE fltx4 IntShiftLeftWordSIMD(fltx4 vSrcA, fltx4 vSrcB)
+{
+	return __vslw(vSrcA, vSrcB);
+}
+
+// Returns float component idx of a by value (idx is not range-checked).
+FORCEINLINE float SubFloat( const fltx4 & a, int idx )
+{
+	// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
+	const fltx4_union & a_union = (const fltx4_union &)a;
+	return a_union.m128_f32[ idx ];
+}
+
+// Returns a mutable reference to float component idx of a (idx is not range-checked).
+FORCEINLINE float & SubFloat( fltx4 & a, int idx )
+{
+	fltx4_union & a_union = (fltx4_union &)a;
+	return a_union.m128_f32[idx];
+}
+
+/// Set one component of a SIMD word with the given float value. 
+/// This function is a template because the native implementation of
+/// this on PPC platforms requires that the component be given as a 
+/// compiler immediate -- not a function parameter, not a const function
+/// parameter, not even a load from a const static array. It has to be
+/// a real immediate. 
+/// \param NCOMPONENT 0 is x, 1 is y, 2 is z, 3 is w.
+/// \note This function is not particularly performant on any platform (because of 
+///       the load from float), so prefer a masked assign from a fltx4 wherever
+///       possible. 
+/// \return a copy of a with component NCOMPONENT replaced by flValue.
+template < unsigned int NCOMPONENT >
+FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, float flValue )
+{
+	// vrlimi can only take an immediate opcode -- that is a constant
+	// passed in from the compiler, not a function parameter, nor an 
+	// element loaded from an array, not even a const static array.
+	// The macro maps the component index to the vrlimi lane mask (x=8, y=4, z=2, w=1).
+#define SETCOMPONENTSIMD_MASK_IMMEDIATE ( NCOMPONENT == 0 ) ? 8 :\
+										( NCOMPONENT == 1 ) ? 4 :\
+										( NCOMPONENT == 2 ) ? 2 :\
+										( NCOMPONENT == 3 ) ? 1 :\
+										17 //< a meaningless immediate intended to make the compiler angry
+
+	fltx4 val = ReplicateX4( flValue );
+	fltx4 result = __vrlimi(a, val, SETCOMPONENTSIMD_MASK_IMMEDIATE, 0);
+	return result;
+
+#undef SETCOMPONENTSIMD_MASK_IMMEDIATE
+}
+
+
+// Converts a to unsigned integers (__vctuxs, fixed-point depth 0) and returns component idx.
+// All four components are converted even though only one is returned; idx is not range-checked.
+FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
+{
+	fltx4 t = __vctuxs( a, 0 );
+	const fltx4_union & a_union = (const fltx4_union &)t;
+	return a_union.m128_u32[idx];
+}
+
+
+// Returns the raw 32 bits of component idx of a (no conversion; idx is not range-checked).
+FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
+{
+	const fltx4_union & a_union = (const fltx4_union &)a;
+	return a_union.m128_u32[idx];
+}
+
+// Returns a mutable reference to the raw 32 bits of component idx of a (idx is not range-checked).
+FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
+{
+	fltx4_union & a_union = (fltx4_union &)a;
+	return a_union.m128_u32[idx];
+}
--- a/public/mathlib/ssemath_emulated.h
+++ b/public/mathlib/ssemath_emulated.h
@@ -0,0 +1,921 @@
+//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//
+//
+// Purpose: Implementation of our SIMD function using generic c++ code and a struct. This
+// implementation will not be especially fast, but gets us up fast on new platforms and also acts
+// as an easy-to-understand reference implementation.
+//
+//==============================================================//
+
+
+//---------------------------------------------------------------------
+// Standard C (fallback/new platform) implementation (only there for compat - slow)
+//---------------------------------------------------------------------
+
+// Read float lane idx (not range-checked) of a.
+FORCEINLINE float SubFloat( const fltx4 & a, int idx )
+{
+	return *( a.m128_f32 + idx );
+}
+
+// Writable reference to float lane idx (not range-checked) of a.
+FORCEINLINE float & SubFloat( fltx4 & a, int idx )
+{
+	return *( a.m128_f32 + idx );
+}
+
+// Read lane idx (not range-checked) of a through its uint32 view.
+FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
+{
+	return *( a.m128_u32 + idx );
+}
+
+// Writable reference to lane idx (not range-checked) of a through its uint32 view.
+FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
+{
+	return *( a.m128_u32 + idx );
+}
+
+// Return zero in the fastest way -- on the x360, faster even than loading.
+// This reference implementation simply returns the Four_Zeros constant.
+FORCEINLINE fltx4 LoadZeroSIMD( void )
+{
+	return Four_Zeros;
+}
+
+// Return one in the fastest way -- on the x360, faster even than loading.
+// This reference implementation simply returns the Four_Ones constant.
+FORCEINLINE fltx4 LoadOneSIMD( void )
+{
+	return Four_Ones;
+}
+
+// Broadcast the x lane of a into all four lanes of the result.
+FORCEINLINE fltx4 SplatXSIMD( const fltx4 & a )
+{
+	const float flX = SubFloat( a, 0 );
+	fltx4 retVal;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( retVal, i ) = flX;
+	}
+	return retVal;
+}
+
+// Broadcast the y lane of a into all four lanes of the result.
+FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
+{
+	const float flY = SubFloat( a, 1 );
+	fltx4 retVal;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( retVal, i ) = flY;
+	}
+	return retVal;
+}
+
+// Broadcast the z lane of a into all four lanes of the result.
+FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
+{
+	const float flZ = SubFloat( a, 2 );
+	fltx4 retVal;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( retVal, i ) = flZ;
+	}
+	return retVal;
+}
+
+// Broadcast the w lane of a into all four lanes of the result.
+FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
+{
+	const float flW = SubFloat( a, 3 );
+	fltx4 retVal;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( retVal, i ) = flW;
+	}
+	return retVal;
+}
+
+// Return a copy of a whose x lane is taken from the x lane of x.
+FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
+{
+	const float flNewX = SubFloat( x, 0 );
+	fltx4 result( a );
+	SubFloat( result, 0 ) = flNewX;
+	return result;
+}
+
+// Return a copy of a whose y lane is taken from the y lane of y.
+FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
+{
+	const float flNewY = SubFloat( y, 1 );
+	fltx4 result( a );
+	SubFloat( result, 1 ) = flNewY;
+	return result;
+}
+
+// Return a copy of a whose z lane is taken from the z lane of z.
+FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
+{
+	const float flNewZ = SubFloat( z, 2 );
+	fltx4 result( a );
+	SubFloat( result, 2 ) = flNewZ;
+	return result;
+}
+
+// Return a copy of a whose w lane is taken from the w lane of w.
+FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
+{
+	const float flNewW = SubFloat( w, 3 );
+	fltx4 result( a );
+	SubFloat( result, 3 ) = flNewW;
+	return result;
+}
+
+/// Set one component of a SIMD word with the given float value. 
+/// This function is a template because the native implementation of
+/// this on PPC platforms requires that the component be given as a 
+/// compiler immediate -- not a function parameter, not a const function
+/// parameter, not even a load from a const static array. It has to be
+/// a real immediate. 
+/// \param NCOMPONENT 0 is x, 1 is y, 2 is z, 3 is w.
+/// \note This function is not particularly performant on any platform (because of 
+///       the load from float), so prefer a masked assign from a fltx4 wherever
+///       possible. 
+template < unsigned int NCOMPONENT >
+FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, float flValue )
+{
+	// Plain C++ version: copy the input and overwrite the one selected lane.
+	fltx4 vUpdated( a );
+	SubFloat( vUpdated, NCOMPONENT ) = flValue;
+	return vUpdated;
+}
+
+
+// Rotate the lanes left by one position: a b c d -> b c d a
+FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
+{
+	fltx4 retVal;
+	for ( int i = 0; i < 4; ++i )
+	{
+		// destination lane i takes source lane i+1, wrapping 3 -> 0
+		SubFloat( retVal, i ) = SubFloat( a, ( i + 1 ) & 3 );
+	}
+	return retVal;
+}
+
+// Rotate the lanes left by two positions: a b c d -> c d a b
+FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
+{
+	fltx4 retVal;
+	for ( int i = 0; i < 4; ++i )
+	{
+		// destination lane i takes source lane i+2, wrapping around
+		SubFloat( retVal, i ) = SubFloat( a, ( i + 2 ) & 3 );
+	}
+	return retVal;
+}
+
+// Body of a per-lane binary float operation. Expects parameters named a and b
+// to be in scope, declares retVal, applies op to each of the four float lanes
+// and returns the result. The macro is not #undef'd in this file.
+#define BINOP(op) 														\
+	fltx4 retVal;                                          				\
+	SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) op SubFloat( b, 0 ) );	\
+	SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) op SubFloat( b, 1 ) );	\
+	SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) op SubFloat( b, 2 ) );	\
+	SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) op SubFloat( b, 3 ) );	\
+    return retVal;
+
+// Body of a per-lane binary integer operation. Same shape as BINOP, but the
+// lanes are read and written through SubInt (the uint32 view), so op acts on
+// the raw 32-bit patterns. The macro is not #undef'd in this file.
+#define IBINOP(op) 														\
+	fltx4 retVal;														\
+	SubInt( retVal, 0 ) = ( SubInt( a, 0 ) op SubInt ( b, 0 ) );		\
+	SubInt( retVal, 1 ) = ( SubInt( a, 1 ) op SubInt ( b, 1 ) );		\
+	SubInt( retVal, 2 ) = ( SubInt( a, 2 ) op SubInt ( b, 2 ) );		\
+	SubInt( retVal, 3 ) = ( SubInt( a, 3 ) op SubInt ( b, 3 ) );		\
+    return retVal;
+
+// Per-lane sum: result[i] = a[i] + b[i].
+FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vSum;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vSum, i ) = ( SubFloat( a, i ) + SubFloat( b, i ) );
+	}
+	return vSum;
+}
+
+// Per-lane difference: result[i] = a[i] - b[i].
+FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )				// a-b
+{
+	// BINOP declares retVal, fills each float lane and returns it.
+	BINOP(-);
+}
+
+// Per-lane product: result[i] = a[i] * b[i].
+FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vProduct;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vProduct, i ) = ( SubFloat( a, i ) * SubFloat( b, i ) );
+	}
+	return vProduct;
+}
+
+// Per-lane quotient: result[i] = a[i] / b[i]. No check is made for zero divisors.
+FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vQuotient;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vQuotient, i ) = ( SubFloat( a, i ) / SubFloat( b, i ) );
+	}
+	return vQuotient;
+}
+
+
+// Multiply-add, a*b + c per lane, built from MulSIMD followed by AddSIMD
+// (two separate operations, not a fused multiply-add).
+FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
+{
+	const fltx4 vProduct = MulSIMD( a, b );
+	return AddSIMD( vProduct, c );
+}
+
+// Multiply-subtract, c - a*b per lane, built from MulSIMD followed by SubSIMD.
+// Note the operand order: the product is subtracted FROM c.
+FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )				// c - a*b
+{
+	return SubSIMD( c, MulSIMD(a,b) );
+}
+
+
+// Per-lane sine of an angle given in radians.
+FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
+{
+	fltx4 result;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( result, i ) = sin( SubFloat( radians, i ) );
+	}
+	return result;
+}
+
+// Sine and cosine of the x, y and z lanes of radians. The w lanes of sine and
+// cosine are not written.
+FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
+{
+	for ( int i = 0; i < 3; ++i )
+	{
+		SinCos( SubFloat( radians, i ), &SubFloat( sine, i ), &SubFloat( cosine, i ) );
+	}
+}
+
+// Sine and cosine of all four lanes of radians, written into sine and cosine.
+FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		SinCos( SubFloat( radians, i ), &SubFloat( sine, i ), &SubFloat( cosine, i ) );
+	}
+}
+
+// Per-lane arcsine.
+FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
+{
+	fltx4 result;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( result, i ) = asin( SubFloat( sine, i ) );
+	}
+	return result;
+}
+
+// Per-lane arccosine.
+FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
+{
+	fltx4 result;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( result, i ) = acos( SubFloat( cs, i ) );
+	}
+	return result;
+}
+
+// Per-lane tan^-1(a/b) computed with atan2 -- i.e. pass the sine in as a and
+// the cosine in as b.
+FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
+{
+	fltx4 result;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( result, i ) = atan2( SubFloat( a, i ), SubFloat( b, i ) );
+	}
+	return result;
+}
+
+// Per-lane maximum: result[i] = max( a[i], b[i] ).
+FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 retVal;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( retVal, i ) = max( SubFloat( a, i ), SubFloat( b, i ) );
+	}
+	return retVal;
+}
+
+// Per-lane minimum: result[i] = min( a[i], b[i] ).
+FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 retVal;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( retVal, i ) = min( SubFloat( a, i ), SubFloat( b, i ) );
+	}
+	return retVal;
+}
+
+// Bitwise AND of the raw 32-bit lane patterns: a & b.
+FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vBits;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( vBits, i ) = ( SubInt( a, i ) & SubInt( b, i ) );
+	}
+	return vBits;
+}
+
+// Bitwise AND-NOT of the raw lane patterns: ~a & b. Note that it is the
+// FIRST operand that is complemented.
+FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 retVal;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( retVal, i ) = ~SubInt( a, i ) & SubInt( b, i );
+	}
+	return retVal;
+}
+
+// Bitwise XOR of the raw 32-bit lane patterns: a ^ b.
+FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vBits;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( vBits, i ) = ( SubInt( a, i ) ^ SubInt( b, i ) );
+	}
+	return vBits;
+}
+
+// Bitwise OR of the raw 32-bit lane patterns: a | b.
+FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vBits;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( vBits, i ) = ( SubInt( a, i ) | SubInt( b, i ) );
+	}
+	return vBits;
+}
+
+// Per-lane arithmetic negation: result[i] = -a[i].
+FORCEINLINE fltx4 NegSIMD(const fltx4 &a)
+{
+	fltx4 vNegated;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vNegated, i ) = -SubFloat( a, i );
+	}
+	return vNegated;
+}
+
+// True only when every float lane of a compares equal to zero.
+FORCEINLINE bool IsAllZeros( const fltx4 & a )
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		if ( !( SubFloat( a, i ) == 0.0 ) )
+		{
+			return false;
+		}
+	}
+	return true;
+}
+
+
+// For branching: true only when a[i] > b[i] holds in all four lanes.
+FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		if ( !( SubFloat(a,i) > SubFloat(b,i) ) )
+		{
+			return false;
+		}
+	}
+	return true;
+}
+
+// For branching: true only when a[i] >= b[i] holds in all four lanes.
+FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		if ( !( SubFloat(a,i) >= SubFloat(b,i) ) )
+		{
+			return false;
+		}
+	}
+	return true;
+}
+
+// For branching: true only when a[i] == b[i] holds in all four lanes
+// (float comparison, not a bit-pattern comparison).
+FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		if ( !( SubFloat(a,i) == SubFloat(b,i) ) )
+		{
+			return false;
+		}
+	}
+	return true;
+}
+
+// For branching: true as soon as a[i] == b[i] holds in at least one lane.
+FORCEINLINE bool IsAnyEqual( const fltx4 & a, const fltx4 & b )
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		if ( SubFloat(a,i) == SubFloat(b,i) )
+		{
+			return true;
+		}
+	}
+	return false;
+}
+
+// Build a 4-bit mask of which lanes have their high (sign) bit set:
+// sign(x) -> bit 0, sign(y) -> bit 1, sign(z) -> bit 2, sign(w) -> bit 3.
+FORCEINLINE int TestSignSIMD( const fltx4 & a )
+{
+	int nRet = 0;
+	for ( int i = 0; i < 4; ++i )
+	{
+		// isolate bit 31 of lane i and move it down to bit i
+		nRet |= ( SubInt( a, i ) & 0x80000000 ) >> ( 31 - i );
+	}
+	return nRet;
+}
+
+// True when at least one lane has its sign bit set. Because this goes through
+// TestSignSIMD it looks at the sign bit only, so -0.0 also counts.
+FORCEINLINE bool IsAnyNegative( const fltx4 & a )
+{
+	const int nSignMask = TestSignSIMD( a );
+	return nSignMask != 0;
+}
+
+// Per-lane mask: all bits set where a[i] == b[i], otherwise zero.
+FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vMask;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( vMask, i ) = ( SubFloat( a, i ) == SubFloat( b, i ) ) ? ~0 : 0;
+	}
+	return vMask;
+}
+
+// Per-lane mask: all bits set where a[i] > b[i], otherwise zero.
+FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vMask;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( vMask, i ) = ( SubFloat( a, i ) > SubFloat( b, i ) ) ? ~0 : 0;
+	}
+	return vMask;
+}
+
+// Per-lane mask: all bits set where a[i] >= b[i], otherwise zero.
+FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vMask;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( vMask, i ) = ( SubFloat( a, i ) >= SubFloat( b, i ) ) ? ~0 : 0;
+	}
+	return vMask;
+}
+
+// Per-lane mask: all bits set where a[i] < b[i], otherwise zero.
+FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vMask;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( vMask, i ) = ( SubFloat( a, i ) < SubFloat( b, i ) ) ? ~0 : 0;
+	}
+	return vMask;
+}
+
+// Per-lane mask: all bits set where a[i] <= b[i], otherwise zero.
+FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vMask;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( vMask, i ) = ( SubFloat( a, i ) <= SubFloat( b, i ) ) ? ~0 : 0;
+	}
+	return vMask;
+}
+
+// Per-lane mask: all bits set where -b[i] <= a[i] <= b[i], otherwise zero.
+FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vMask;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( vMask, i ) = ( SubFloat( a, i ) <= SubFloat( b, i ) && SubFloat( a, i ) >= -SubFloat( b, i ) ) ? ~0 : 0;
+	}
+	return vMask;
+}
+
+///\name Functions which perform comparisons, resulting in a float value of either 0.0 or 1.0 (as opposed to resulting in a 32-bit integer mask ).
+///@{
+// Per-lane float result: 1.0 where a[i] == b[i], otherwise 0.
+FORCEINLINE fltx4 Cmp01EqSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vResult;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vResult, i ) = ( SubFloat( a, i ) == SubFloat( b, i ) ) ? 1.0 : 0;
+	}
+	return vResult;
+}
+
+// Per-lane float result: 1.0 where a[i] > b[i], otherwise 0.
+FORCEINLINE fltx4 Cmp01GtSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vResult;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vResult, i ) = ( SubFloat( a, i ) > SubFloat( b, i ) ) ? 1.0 : 0;
+	}
+	return vResult;
+}
+
+// Per-lane float result: 1.0 where a[i] >= b[i], otherwise 0.
+FORCEINLINE fltx4 Cmp01GeSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vResult;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vResult, i ) = ( SubFloat( a, i ) >= SubFloat( b, i ) ) ? 1.0 : 0;
+	}
+	return vResult;
+}
+
+// Per-lane float result: 1.0 where a[i] < b[i], otherwise 0.
+FORCEINLINE fltx4 Cmp01LtSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vResult;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vResult, i ) = ( SubFloat( a, i ) < SubFloat( b, i ) ) ? 1.0 : 0;
+	}
+	return vResult;
+}
+
+// Per-lane float result: 1.0 where a[i] <= b[i], otherwise 0.
+FORCEINLINE fltx4 Cmp01LeSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vResult;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vResult, i ) = ( SubFloat( a, i ) <= SubFloat( b, i ) ) ? 1.0 : 0;
+	}
+	return vResult;
+}
+
+// Per-lane float result: 1.0 where -b[i] <= a[i] <= b[i], otherwise 0.
+FORCEINLINE fltx4 Cmp01InBoundsSIMD( const fltx4 & a, const fltx4 & b )
+{
+	fltx4 vResult;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vResult, i ) = ( SubFloat( a, i ) <= SubFloat( b, i ) && SubFloat( a, i ) >= -SubFloat( b, i ) ) ? 1.0 : 0;
+	}
+	return vResult;
+}
+
+
+//@}
+
+
+
+// simd conditional. for example, a simd version of "( x > 0 ) ? a : b" would be expressed as
+// "MaskedAssign( CmpGtSIMD( x, Four_Zeros ), a, b )". A typical use is to conditionally update
+// subfields of a fltx4 based upon some test.
+FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
+{
+	// Bits set in the mask pick NewValue; bits clear in the mask keep OldValue.
+	const fltx4 vFromNew = AndSIMD( ReplacementMask, NewValue );
+	const fltx4 vFromOld = AndNotSIMD( ReplacementMask, OldValue );
+	return OrSIMD( vFromNew, vFromOld );
+}
+
+// Broadcast one float into all four lanes: a,a,a,a
+FORCEINLINE fltx4 ReplicateX4( float flValue )
+{
+	fltx4 vSplat;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vSplat, i ) = flValue;
+	}
+	return vSplat;
+}
+
+/// Broadcast a single 32 bit integer value into all 4 components of an m128.
+/// The value is stored through the uint32 lane view, so its bit pattern is kept.
+FORCEINLINE fltx4 ReplicateIX4( int nValue )
+{
+	fltx4 vSplat;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( vSplat, i ) = nValue;
+	}
+	return vSplat;
+}
+
+// Round each lane towards positive infinity.
+FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
+{
+	fltx4 vRounded;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vRounded, i ) = ceil( SubFloat( a, i ) );
+	}
+	return vRounded;
+}
+
+// Round each lane towards negative infinity.
+FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
+{
+	fltx4 vRounded;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vRounded, i ) = floor( SubFloat( a, i ) );
+	}
+	return vRounded;
+}
+
+// Per-lane sqrt(a). Named as an estimate, but this reference version calls
+// sqrt directly, exactly like SqrtSIMD.
+FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )
+{
+	fltx4 vRoot;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vRoot, i ) = sqrt( SubFloat( a, i ) );
+	}
+	return vRoot;
+}
+
+// Per-lane sqrt(a).
+FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )
+{
+	fltx4 vRoot;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vRoot, i ) = sqrt( SubFloat( a, i ) );
+	}
+	return vRoot;
+}
+
+// Per-lane 1/sqrt(a). Named as an estimate, but this reference version
+// computes it directly. Zero lanes are not guarded.
+FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )
+{
+	fltx4 vInvRoot;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vInvRoot, i ) = 1.0 / sqrt( SubFloat( a, i ) );
+	}
+	return vInvRoot;
+}
+
+// Per-lane 1/sqrt(a), except that a lane equal to zero is replaced by
+// FLT_EPSILON first, so the result for it is large but finite.
+FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
+{
+	fltx4 vInvRoot;
+	for ( int i = 0; i < 4; ++i )
+	{
+		const float flLane = SubFloat( a, i );
+		SubFloat( vInvRoot, i ) = 1.0 / sqrt( flLane != 0.0f ? flLane : FLT_EPSILON );
+	}
+	return vInvRoot;
+}
+
+// Per-lane 1/sqrt(a). Zero lanes are not guarded.
+FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )
+{
+	fltx4 vInvRoot;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vInvRoot, i ) = 1.0 / sqrt( SubFloat( a, i ) );
+	}
+	return vInvRoot;
+}
+
+// Per-lane 1/a. Named as an estimate, but this reference version divides
+// directly. Zero lanes are not guarded.
+FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )
+{
+	fltx4 vInverse;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vInverse, i ) = 1.0 / SubFloat( a, i );
+	}
+	return vInverse;
+}
+
+// Per-lane 1/a. Zero lanes are not guarded.
+FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a )
+{
+	fltx4 vInverse;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vInverse, i ) = 1.0 / SubFloat( a, i );
+	}
+	return vInverse;
+}
+
+/// 1/x for all 4 values.
+/// A lane equal to zero is replaced by FLT_EPSILON before dividing, so 1/0
+/// gives a big but NOT infinite result.
+FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
+{
+	fltx4 vInverse;
+	for ( int i = 0; i < 4; ++i )
+	{
+		const float flLane = SubFloat( a, i );
+		SubFloat( vInverse, i ) = 1.0 / ( flLane == 0.0f ? FLT_EPSILON : flLane );
+	}
+	return vInverse;
+}
+
+// Per-lane 1/a, with a lane equal to zero replaced by FLT_EPSILON before
+// dividing so the result stays finite.
+FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
+{
+	fltx4 vInverse;
+	for ( int i = 0; i < 4; ++i )
+	{
+		const float flLane = SubFloat( a, i );
+		SubFloat( vInverse, i ) = 1.0 / ( flLane == 0.0f ? FLT_EPSILON : flLane );
+	}
+	return vInverse;
+}
+
+/// 2^x for all values (the antilog), computed per lane with powf.
+FORCEINLINE fltx4 PowerOfTwoSIMD( const fltx4 &toPower )
+{
+	fltx4 vPow;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vPow, i ) = powf( 2, SubFloat(toPower, i) );
+	}
+	return vPow;
+}
+
+/// horizontal 3d dotproduct
+/// Multiplies the x, y and z lanes of a and b pairwise, sums the three
+/// products (w is ignored) and returns the sum replicated into all four lanes.
+FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
+{
+	// Summed left to right in a single expression: (x*x + y*y) + z*z.
+	float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
+		SubFloat( a, 1 ) * SubFloat( b, 1 ) + 
+		SubFloat( a, 2 ) * SubFloat( b, 2 );
+	return ReplicateX4( flDot );
+}
+
+/// horizontal 4d dotproduct
+/// Multiplies all four lanes of a and b pairwise, sums the products and
+/// returns the sum replicated into all four lanes.
+FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
+{
+	// Summed left to right in a single expression, x first and w last.
+	float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
+		SubFloat( a, 1 ) * SubFloat( b, 1 ) + 
+		SubFloat( a, 2 ) * SubFloat( b, 2 ) +
+		SubFloat( a, 3 ) * SubFloat( b, 3 );
+	return ReplicateX4( flDot );
+}
+
+/// Clamp every lane of in to the range [min, max], lane by lane: the upper
+/// bound is applied first, then the lower bound.
+FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
+{
+	const fltx4 vUpperBounded = MinSIMD( max, in );
+	return MaxSIMD( min, vUpperBounded );
+}
+
+/// Squelch the w component of a vector to +0.0.  Most efficient when you say a = SetWToZeroSIMD(a)
+/// (avoids a copy)
+/// \param a source vector; not modified.
+/// \return a copy of a with x, y and z untouched and w set to +0.0.
+FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
+{
+	fltx4 retval;
+	retval = a;
+	// w is lane 3. The previous code cleared lane 0 (x) here, which destroyed
+	// x and left w unchanged.
+	SubFloat( retval, 3 ) = 0;
+	return retval;
+}
+
+// Load four floats from pSIMD. This reference version is a plain dereference
+// of the pointer as a fltx4; it does nothing special for unaligned addresses.
+FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
+{
+	const fltx4 *pSource = reinterpret_cast< const fltx4 *> ( pSIMD );
+	return *pSource;
+}
+
+// Load from pSIMD by dereferencing it as a whole fltx4.
+// NOTE(review): despite the "3" in the name this reads all four lanes, so the
+// w lane comes from whatever follows the third float -- confirm callers always
+// pass at least 16 readable bytes.
+FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
+{
+	const fltx4 *pSource = reinterpret_cast< const fltx4 *> ( pSIMD );
+	return *pSource;
+}
+
+/// load a single unaligned float into the x component of a SIMD word
+/// \param pFlt pointer to the float to load; must be readable.
+/// \return a fltx4 with x = *pFlt and y, z, w = 0.
+FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
+{
+	fltx4 retval;
+	SubFloat( retval, 0 ) = *pFlt;
+	// Give the remaining lanes a defined value; previously they were returned
+	// uninitialized, so callers received indeterminate y/z/w.
+	SubFloat( retval, 1 ) = 0.0f;
+	SubFloat( retval, 2 ) = 0.0f;
+	SubFloat( retval, 3 ) = 0.0f;
+	return retval;
+}
+
+// Load a fltx4 from pSIMD by dereferencing the pointer as a fltx4.
+FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
+{
+	const fltx4 *pSource = reinterpret_cast< const fltx4 *> ( pSIMD );
+	return *pSource;
+}
+
+/// For the transitional class -- load a 3-component VectorAligned through its
+/// Base() pointer and squash the w component to an all-zero bit pattern.
+FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
+{
+	fltx4 vLoaded = LoadAlignedSIMD( pSIMD.Base() );
+	SubInt( vLoaded, 3 ) = 0;	// squelch w
+	return vLoaded;
+}
+
+// Store all four lanes of a to pSIMD by assigning through it as a fltx4.
+FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
+{
+	fltx4 *pDest = reinterpret_cast< fltx4 *> ( pSIMD );
+	*pDest = a;
+}
+
+// Store all four lanes of a to pSIMD. This reference version assigns through
+// the pointer as a fltx4; it does nothing special for unaligned addresses.
+FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
+{
+	fltx4 *pDest = reinterpret_cast< fltx4 *> ( pSIMD );
+	*pDest = a;
+}
+
+// Store only the x, y and z lanes of a to pSIMD[0..2]; pSIMD[3] is not touched.
+FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
+{
+	for ( int i = 0; i < 3; ++i )
+	{
+		pSIMD[i] = SubFloat(a, i);
+	}
+}
+
+/// Strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD.
+/// Forwards to StoreAlignedSIMD on the vector's Base() pointer, so all four
+/// lanes of a are written.
+FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
+{
+	float *pBase = pSIMD->Base();
+	StoreAlignedSIMD( pBase, a );
+}
+
+/// Store the x,y,z components of the four FLTX4 parameters into the four
+/// consecutive Vectors pDestination[0] .. pDestination[3]. The Vectors are
+/// assumed to be unaligned.
+FORCEINLINE void StoreFourUnalignedVector3SIMD( fltx4 a, fltx4 b, fltx4	c, FLTX4 d, // first three passed by copy (deliberate)
+											   Vector * const pDestination )
+{
+	StoreUnaligned3SIMD( pDestination[0].Base(), a );
+	StoreUnaligned3SIMD( pDestination[1].Base(), b );
+	StoreUnaligned3SIMD( pDestination[2].Base(), c );
+	StoreUnaligned3SIMD( pDestination[3].Base(), d );
+}
+
+// Store the x,y,z components of the four FLTX4 parameters
+// into the four consecutive Vectors:
+//    pDestination ,  pDestination + 1,  pDestination + 2,  pDestination + 3
+// The Vectors are assumed to start on an ALIGNED address, that is,
+// pDestination is 16-byte aligned (though obviously pDestination+1 is not).
+// This reference version does not exploit the alignment: it performs the same
+// four 3-float stores as the unaligned variant.
+FORCEINLINE void StoreFourAlignedVector3SIMD( fltx4 a, fltx4 b, fltx4	c, FLTX4 d, // first three passed by copy (deliberate)
+											 Vector * const pDestination )
+{
+	StoreUnaligned3SIMD( pDestination[0].Base(), a );
+	StoreUnaligned3SIMD( pDestination[1].Base(), b );
+	StoreUnaligned3SIMD( pDestination[2].Base(), c );
+	StoreUnaligned3SIMD( pDestination[3].Base(), d );
+}
+
+
+// In-place transpose of the 4x4 matrix whose rows are x, y, z and w: each
+// element above the diagonal is exchanged with its mirror below it.
+FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
+{
+
+	// Exchange lane _ia_ of _a_ with lane _ib_ of _b_.
+	// Note: this macro is not #undef'd, so it stays defined after the function.
+#define SWAP_FLOATS( _a_, _ia_, _b_, _ib_ )				\
+	{													\
+		float tmp = SubFloat( _a_, _ia_ );				\
+		SubFloat( _a_, _ia_ ) = SubFloat( _b_, _ib_ );	\
+		SubFloat( _b_, _ib_ ) = tmp;					\
+	}
+
+	// Six swaps cover every off-diagonal pair exactly once.
+	SWAP_FLOATS( x, 1, y, 0 );
+	SWAP_FLOATS( x, 2, z, 0 );
+	SWAP_FLOATS( x, 3, w, 0 );
+	SWAP_FLOATS( y, 2, z, 1 );
+	SWAP_FLOATS( y, 3, w, 1 );
+	SWAP_FLOATS( z, 3, w, 2 );
+}
+
+/// Find the lowest of a.x, a.y and a.z (w is ignored) and replicate it to the
+/// whole return value.
+FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
+{
+	const float flLowestXY = min( SubFloat(a, 0), SubFloat(a, 1) );
+	const float flLowest = min( flLowestXY, SubFloat(a, 2) );
+	return ReplicateX4( flLowest );
+}
+
+/// Find the highest of a.x, a.y and a.z (w is ignored) and replicate it to the
+/// whole return value.
+FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
+{
+	const float flHighestXY = max( SubFloat(a, 0), SubFloat(a, 1) );
+	const float flHighest = max( flHighestXY, SubFloat(a, 2) );
+	return ReplicateX4( flHighest );
+}
+
+/// Convert each float lane of vSrc and save it as a SIGNED INT:
+/// (*pDest)[i] = Int( vSrc[i] ). Note: some architectures have means of doing
+/// fixed point conversion when the fix depth is specified as an immediate, but
+/// there is no way to guarantee an immediate as a parameter to a function like
+/// this. This version relies on the implicit float-to-element conversion of
+/// the assignment.
+FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		(*pDest)[i] = SubFloat(vSrc, i);
+	}
+}
+
+///@group INTEGER SIMD OPERATIONS {
+
+/// Splat a signed int into all four components of a vector; the value is
+/// stored through the uint32 lane view.
+FORCEINLINE fltx4 IntSetImmediateSIMD( int nValue )
+{
+	fltx4 retval;
+	// Filled from lane 3 down to lane 0, the order the chained assignment used.
+	for ( int i = 3; i >= 0; --i )
+	{
+		SubInt( retval, i ) = nValue;
+	}
+	return retval;
+}
+
+/// Load 4 aligned words into a SIMD register by dereferencing pSIMD as an i32x4.
+FORCEINLINE i32x4 LoadAlignedIntSIMD(const void * RESTRICT pSIMD)
+{
+	const i32x4 *pWords = reinterpret_cast< const i32x4 *> ( pSIMD );
+	return *pWords;
+}
+
+/// Load 4 unaligned words into a SIMD register. This reference version is a
+/// plain dereference of pSIMD as an i32x4.
+FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD)
+{
+	const i32x4 *pWords = reinterpret_cast< const i32x4 *> ( pSIMD );
+	return *pWords;
+}
+
+/// Save a into four words at pSIMD (16-byte aligned) by assigning through the
+/// pointer as an i32x4.
+FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
+{
+	i32x4 *pWords = reinterpret_cast< i32x4 *> ( pSIMD );
+	*pWords = a;
+}
+
+// Save a into the storage of an intx4, addressed through its Base() pointer.
+FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
+{
+	i32x4 *pWords = reinterpret_cast< i32x4 *> ( pSIMD.Base() );
+	*pWords = a;
+}
+
+// Save a into four words at pSIMD. This reference version assigns through the
+// pointer as an i32x4; it does nothing special for unaligned addresses.
+FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
+{
+	i32x4 *pWords = reinterpret_cast< i32x4 *> ( pSIMD );
+	*pWords = a;
+}
+
+/// Load four consecutive uint16's, and turn them into floating point numbers.  This function isn't
+/// especially fast and could be made faster if anyone is using it heavily.
+/// \param pInts pointer to at least four readable uint16 values.
+/// \return a fltx4 whose lanes 0..3 hold pInts[0..3] converted to float.
+FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
+{
+	fltx4 retval;
+	SubFloat( retval, 0 ) = pInts[0];
+	SubFloat( retval, 1 ) = pInts[1];
+	SubFloat( retval, 2 ) = pInts[2];
+	SubFloat( retval, 3 ) = pInts[3];
+	// The function previously fell off the end without returning a value,
+	// which is undefined behavior for a non-void function.
+	return retval;
+}
+
+
+/// Take a vector containing fixed-point uints and return them as single
+/// precision floats. No fixed point conversion is done.
+/// The function asserts unconditionally (the source notes that pc has no such
+/// operation); where execution continues past the assert, each uint32 lane is
+/// converted with a plain cast.
+FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
+{
+	Assert(0);			/* pc has no such operation */
+	fltx4 vFloats;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( vFloats, i ) = ( (float) SubInt( vSrcA, i ) );
+	}
+	return vFloats;
+}
+
+
+#if 0				/* pc has no such op */
+// Take a fltx4 containing fixed-point sints and 
+// return them as single precision floats. No 
+// fixed point conversion is done.
+// (This definition sits inside an "#if 0" region and is currently not compiled.)
+FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
+{
+	fltx4 retval;
+	// Convert the lane VALUES. The previous code took the address of each lane
+	// and cast that pointer to float, which is not a valid conversion.
+	SubFloat( retval, 0 ) = ( (float) vSrcA.m128_s32[0] );
+	SubFloat( retval, 1 ) = ( (float) vSrcA.m128_s32[1] );
+	SubFloat( retval, 2 ) = ( (float) vSrcA.m128_s32[2] );
+	SubFloat( retval, 3 ) = ( (float) vSrcA.m128_s32[3] );
+	return retval;
+}
+
+
+///  works on fltx4's as if they are four uints.  the first parameter contains the words to be
+///  shifted, the second contains the amount to shift by AS INTS
+///
+///  for i = 0 to 3
+///  shift = vSrcB_i*32:(i*32)+4
+///  vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
+FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
+{
+	i32x4 retval;
+	SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0);
+	SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1);
+	SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2);
+	SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3);
+
+
+	return retval;
+}
+//@} 
--- a/public/mathlib/ssemath_x86.h
+++ b/public/mathlib/ssemath_x86.h
@@ -0,0 +1,920 @@
+//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//
+//
+// Purpose: Implementation of our SIMD functions for the x86 using SSE
+//==============================================================//
+
+
+#ifndef _MATH_PFNS_H_
+#include "mathlib/math_pfns.h"
+#endif
+
+#if defined( PLATFORM_WINDOWS_PC )
+#include <intrin.h>
+#else
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+#endif
+ 
+
+//---------------------------------------------------------------------
+// Intel/SSE implementation
+//---------------------------------------------------------------------
+
+/// Writes the four float lanes of a to pSIMD using the aligned SSE store.
+FORCEINLINE void StoreAlignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )
+{
+	_mm_store_ps( &pSIMD[0], a );
+}
+
+/// Writes the four float lanes of a to pSIMD using the unaligned SSE store.
+FORCEINLINE void StoreUnalignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )
+{
+	_mm_storeu_ps( &pSIMD[0], a );
+}
+
+/// Writes the 128 bits of a to pSIMD using the unaligned integer store.
+FORCEINLINE void StoreUnalignedSIMD( int * RESTRICT pSIMD, const i32x4 &a )
+{
+	__m128i *pDest = ( __m128i * ) pSIMD;
+	_mm_storeu_si128( pDest, a );
+}
+
+
+FORCEINLINE fltx4 RotateLeft( const fltx4 & a );	// forward declaration: a b c d -> b c d a (defined further down)
+FORCEINLINE fltx4 RotateLeft2( const fltx4 & a );	// forward declaration: a b c d -> c d a b (defined further down)
+
+/// Writes the x, y and z lanes of a to three consecutive floats at pSIMD, one scalar
+/// store per lane. The w lane is not written.
+FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
+{
+	_mm_store_ss( &pSIMD[0], a );				// x is already in lane 0
+	const fltx4 fl4YFirst = RotateLeft( a );	// bring y into lane 0
+	_mm_store_ss( &pSIMD[1], fl4YFirst );
+	const fltx4 fl4ZFirst = RotateLeft2( a );	// bring z into lane 0
+	_mm_store_ss( &pSIMD[2], fl4ZFirst );
+}
+
+
+
+/// Loads four floats from pSIMD using the aligned SSE load.
+FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
+{
+	const float *pFloats = reinterpret_cast< const float * >( pSIMD );
+	return _mm_load_ps( pFloats );
+}
+
+/// Loads 128 bits from pSIMD as a shortx8 using the aligned integer load.
+FORCEINLINE shortx8 LoadAlignedShortSIMD( const void *pSIMD )
+{
+	const shortx8 *pWords = reinterpret_cast< const shortx8 * >( pSIMD );
+	return _mm_load_si128( pWords );
+}
+
+/// Loads 128 bits from pSIMD as a shortx8 using the unaligned integer load.
+FORCEINLINE shortx8 LoadUnalignedShortSIMD( const void *pSIMD )
+{
+	const shortx8 *pWords = reinterpret_cast< const shortx8 * >( pSIMD );
+	return _mm_loadu_si128( pWords );
+}
+
+
+/// Bitwise AND of the two registers: a & b
+FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Result = _mm_and_ps( a, b );
+	return fl4Result;
+}
+
+/// Bitwise AND of the two integer registers: a & b
+FORCEINLINE i32x4 AndSIMD( const i32x4 &a, const i32x4 &b )
+{
+	const i32x4 n4Result = _mm_and_si128( a, b );
+	return n4Result;
+}
+
+/// Bitwise AND of b with the complement of a: ~a & b
+FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Result = _mm_andnot_ps( a, b );
+	return fl4Result;
+}
+
+/// Bitwise AND of b with the complement of a, on integer registers: ~a & b
+FORCEINLINE i32x4 AndNotSIMD( const i32x4 & a, const i32x4 & b )
+{
+	const i32x4 n4Result = _mm_andnot_si128( a, b );
+	return n4Result;
+}
+
+
+/// Bitwise exclusive OR of the two registers: a ^ b
+FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Result = _mm_xor_ps( a, b );
+	return fl4Result;
+}
+
+/// Bitwise OR of the two registers: a | b
+FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Result = _mm_or_ps( a, b );
+	return fl4Result;
+}
+
+/// Bitwise OR of the two integer registers: a | b
+FORCEINLINE i32x4 OrSIMD( const i32x4 &a, const i32x4 &b )
+{
+	const i32x4 n4Result = _mm_or_si128( a, b );
+	return n4Result;
+}
+
+
+// Squelch the w component of a vector to +0.0 by ANDing it with g_SIMD_clear_wmask.
+// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
+FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
+{
+	const fltx4 fl4ClearWMask = LoadAlignedSIMD( g_SIMD_clear_wmask );
+	return AndSIMD( a, fl4ClearWMask );
+}
+
+
+/// Loads four floats from pSIMD using the unaligned SSE load.
+FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
+{
+	const float *pFloats = reinterpret_cast<const float *>( pSIMD );
+	return _mm_loadu_ps( pFloats );
+}
+
+/// Loads a vector from unaligned memory at pSIMD.
+/// NOTE(review): this is the same four-float unaligned load as LoadUnalignedSIMD, so
+/// 16 bytes are read from pSIMD and the w lane holds whatever follows the third float --
+/// confirm callers always have a readable fourth float.
+FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
+{
+	const float *pFloats = reinterpret_cast<const float *>( pSIMD );
+	return _mm_loadu_ps( pFloats );
+}
+
+// Load one float (no alignment requirement) into the x component of a SIMD word
+// using the scalar SSE load.
+FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
+{
+	const fltx4 fl4Scalar = _mm_load_ss( pFlt );
+	return fl4Scalar;
+}
+
+/// Reinterprets the 128 bits of an integer register as a fltx4 (a cast, not a conversion).
+FORCEINLINE fltx4 CastToFltx4( i32x4 const & a )
+{
+	const fltx4 fl4SameBits = _mm_castsi128_ps( a );
+	return fl4SameBits;
+}
+
+/// Broadcast one 32 bit integer value into all 4 components of an m128
+FORCEINLINE i32x4 ReplicateIX4( int i )
+{
+	const i32x4 n4Splat = _mm_set1_epi32( i );
+	return n4Splat;
+}
+
+
+/// Broadcast one float into all four components of a fltx4.
+FORCEINLINE fltx4 ReplicateX4( float flValue )
+{
+	// put the value in lane 0, then shuffle lane 0 into every lane
+	const __m128 fl4InLane0 = _mm_set_ss( flValue );
+	return _mm_shuffle_ps( fl4InLane0, fl4InLane0, _MM_SHUFFLE( 0, 0, 0, 0 ) );
+}
+
+// Pass-by-pointer variant of ReplicateX4: reads *pValue and forwards to the by-value overload.
+// AltiVec compilers may have trouble inlining the pass-by-value variant, whereas they will have
+// absolutely no problem inlining this one. So it's better to use the pass-by-pointer variant
+// unless you're mixing scalar and vector code (which is bad for perf on AltiVec anyway)
+FORCEINLINE fltx4 ReplicateX4( const float *pValue )
+{
+	const float flValue = *pValue;
+	return ReplicateX4( flValue );
+}
+
+
+/// Reads component idx of a as a scalar float.
+FORCEINLINE float SubFloat( const fltx4 & a, int idx )
+{
+	// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
+#ifdef POSIX
+	float const *pLanes = reinterpret_cast<float const *>( &a );
+	return pLanes[idx];
+#else
+	return a.m128_f32[ idx ];
+#endif
+}
+
+/// Returns a writable reference to component idx of a.
+FORCEINLINE float & SubFloat( fltx4 & a, int idx )
+{
+#ifdef POSIX
+	float *pLanes = reinterpret_cast<float *>( &a );
+	return pLanes[idx];
+#else
+	return a.m128_f32[ idx ];
+#endif
+}
+
+/// Reads component idx of a as a float and converts its value to uint32.
+FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
+{
+	const float flLane = SubFloat( a, idx );
+	return static_cast<uint32>( flLane );
+}
+
+/// Reads the raw 32 bits of component idx of a as a uint32 (no float-to-int conversion).
+FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
+{
+#ifdef POSIX
+	uint32 const *pWords = reinterpret_cast<uint32 const *>( &a );
+	return pWords[idx];
+#else
+	return a.m128_u32[idx];
+#endif
+}
+
+/// Returns a writable uint32 reference to the raw 32 bits of component idx of a.
+FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
+{
+#ifdef POSIX
+	uint32 *pWords = reinterpret_cast<uint32 *>( &a );
+	return pWords[idx];
+#else
+	return a.m128_u32[idx];
+#endif
+}
+
+/// Reads 32-bit word idx of an integer register as a uint32.
+FORCEINLINE uint32 SubInt( i32x4 const & a, int idx )
+{
+#ifdef POSIX
+	const uint32 *pWords = reinterpret_cast<const uint32 *>( &a );
+	return pWords[idx];
+#else
+	return a.m128i_u32[idx];
+#endif
+}
+
+/// Returns a writable uint32 reference to 32-bit word idx of an integer register.
+FORCEINLINE uint32 & SubInt( i32x4 & a, int idx )
+{
+#ifdef POSIX
+	uint32 *pWords = reinterpret_cast<uint32 *>( &a );
+	return pWords[idx];
+#else
+	return a.m128i_u32[idx];
+#endif
+}
+
+// Gather four floats from an array: component i of the result is pData[ index i ].
+// Indices are in units of float size, and are read as unsigned 32-bit values.
+FORCEINLINE fltx4 GatherFltX4SIMD( float const *pData, i32x4 n4Indices )
+{
+	const float flX = pData[ SubInt( n4Indices, 0 ) ];
+	const float flY = pData[ SubInt( n4Indices, 1 ) ];
+	const float flZ = pData[ SubInt( n4Indices, 2 ) ];
+	const float flW = pData[ SubInt( n4Indices, 3 ) ];
+
+	fltx4 fl4Gathered;
+	SubFloat( fl4Gathered, 0 ) = flX;
+	SubFloat( fl4Gathered, 1 ) = flY;
+	SubFloat( fl4Gathered, 2 ) = flZ;
+	SubFloat( fl4Gathered, 3 ) = flW;
+	return fl4Gathered;
+}
+
+// Gather from an array of fltx4s viewed as a flat array of floats.
+// Indices are in units of float size (not fltx4 size).
+FORCEINLINE fltx4 GatherFltX4SIMD( fltx4 const *pData, i32x4 n4Indices )
+{
+	float const *pFloats = ( float const * ) pData;
+	return GatherFltX4SIMD( pFloats, n4Indices );
+}
+
+
+// Return zero in all four components, in the fastest way -- on the x360, faster even than loading.
+// Here it simply returns the Four_Zeros constant.
+FORCEINLINE fltx4 LoadZeroSIMD( void )
+{
+	return Four_Zeros;
+}
+
+// Return one in all four components, in the fastest way -- on the x360, faster even than loading.
+// Here it simply returns the Four_Ones constant.
+FORCEINLINE fltx4 LoadOneSIMD( void )
+{
+	return Four_Ones;
+}
+
+/// Bitwise select: takes bits from NewValue where ReplacementMask is set and from
+/// OldValue where it is clear, i.e. ( mask & new ) | ( ~mask & old ).
+FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
+{
+	const fltx4 fl4Taken = AndSIMD( ReplacementMask, NewValue );
+	const fltx4 fl4Kept = AndNotSIMD( ReplacementMask, OldValue );
+	return OrSIMD( fl4Taken, fl4Kept );
+}
+
+// Remember, SSE numbers its words 3 2 1 0, so _MM_SHUFFLE takes its selectors highest lane first.
+// The way we want to specify shuffles is backwards from that default:
+// MM_SHUFFLE_REV is in array index order (a selects for lane 0 ... d selects for lane 3); it just reverses the arguments.
+#define MM_SHUFFLE_REV(a,b,c,d) _MM_SHUFFLE(d,c,b,a)
+
+/// Copies the x component (lane 0) of a into all four lanes.
+FORCEINLINE fltx4 SplatXSIMD( fltx4 const & a )
+{
+	// every selector is the same, so argument order does not matter here
+	return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 0, 0, 0, 0 ) );
+}
+
+/// Copies the y component (lane 1) of a into all four lanes.
+FORCEINLINE fltx4 SplatYSIMD( fltx4 const &a )
+{
+	// every selector is the same, so argument order does not matter here
+	return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 1, 1, 1, 1 ) );
+}
+
+/// Copies the z component (lane 2) of a into all four lanes.
+FORCEINLINE fltx4 SplatZSIMD( fltx4 const &a )
+{
+	// every selector is the same, so argument order does not matter here
+	return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 2, 2, 2, 2 ) );
+}
+
+/// Copies the w component (lane 3) of a into all four lanes.
+FORCEINLINE fltx4 SplatWSIMD( fltx4 const &a )
+{
+	// every selector is the same, so argument order does not matter here
+	return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 3, 3, 3 ) );
+}
+
+/// Returns a with the bits selected by g_SIMD_ComponentMask[0] replaced by the matching bits of x.
+FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
+{
+	const fltx4 fl4Mask = LoadAlignedSIMD( g_SIMD_ComponentMask[0] );
+	return MaskedAssign( fl4Mask, x, a );
+}
+
+/// Returns a with the bits selected by g_SIMD_ComponentMask[1] replaced by the matching bits of y.
+FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
+{
+	const fltx4 fl4Mask = LoadAlignedSIMD( g_SIMD_ComponentMask[1] );
+	return MaskedAssign( fl4Mask, y, a );
+}
+
+/// Returns a with the bits selected by g_SIMD_ComponentMask[2] replaced by the matching bits of z.
+FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
+{
+	const fltx4 fl4Mask = LoadAlignedSIMD( g_SIMD_ComponentMask[2] );
+	return MaskedAssign( fl4Mask, z, a );
+}
+
+/// Returns a with the bits selected by g_SIMD_ComponentMask[3] replaced by the matching bits of w.
+FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
+{
+	const fltx4 fl4Mask = LoadAlignedSIMD( g_SIMD_ComponentMask[3] );
+	return MaskedAssign( fl4Mask, w, a );
+}
+
+/// Returns a copy of a in which one component has been replaced by flValue.
+/// The component is a template argument because the native implementation of
+/// this on PPC platforms requires that the component be given as a
+/// compiler immediate -- not a function parameter, not a const function
+/// parameter, not even a load from a const static array. It has to be
+/// a real immediate.
+/// Here the value is replicated to all four lanes and merged in through
+/// g_SIMD_ComponentMask[NCOMPONENT].
+/// \param NCOMPONENT 0 is x, 1 is y, 2 is z, 3 is w.
+/// \note This function is not particularly performant on any platform (because of
+///       the load from float), so prefer a masked assign from a fltx4 wherever
+///       possible.
+template < unsigned int NCOMPONENT >
+FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, float flValue )
+{
+	const fltx4 fl4Splat = ReplicateX4( flValue );
+	const fltx4 fl4Mask = LoadAlignedSIMD( g_SIMD_ComponentMask[NCOMPONENT] );
+	return MaskedAssign( fl4Mask, fl4Splat, a );
+}
+
+// Rotate the components one place to the left: a b c d -> b c d a
+FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
+{
+	const fltx4 fl4Rotated = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 2, 3, 0 ) );
+	return fl4Rotated;
+}
+
+// Rotate the components two places to the left: a b c d -> c d a b
+FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
+{
+	const fltx4 fl4Rotated = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) );
+	return fl4Rotated;
+}
+
+// Rotate the components one place to the right: a b c d -> d a b c
+FORCEINLINE fltx4 RotateRight( const fltx4 & a )
+{
+	const fltx4 fl4Rotated = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 3, 0, 1, 2 ) );
+	return fl4Rotated;
+}
+
+// Rotate the components two places to the right: a b c d -> c d a b
+// (the same shuffle as RotateLeft2)
+FORCEINLINE fltx4 RotateRight2( const fltx4 & a )
+{
+	const fltx4 fl4Rotated = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) );
+	return fl4Rotated;
+}
+
+/// Component-wise sum: a+b
+FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Sum = _mm_add_ps( a, b );
+	return fl4Sum;
+}
+
+/// Component-wise difference: a-b
+FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Difference = _mm_sub_ps( a, b );
+	return fl4Difference;
+}
+
+/// Component-wise product: a*b
+FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Product = _mm_mul_ps( a, b );
+	return fl4Product;
+}
+
+/// Component-wise quotient: a/b
+FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Quotient = _mm_div_ps( a, b );
+	return fl4Quotient;
+}
+
+/// Multiply-add built from a separate multiply and add: a*b + c
+FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
+{
+	const fltx4 fl4Product = MulSIMD( a, b );
+	return AddSIMD( fl4Product, c );
+}
+
+/// Multiply-subtract built from a separate multiply and subtract: c - a*b
+FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
+{
+	const fltx4 fl4Product = MulSIMD( a, b );
+	return SubSIMD( c, fl4Product );
+}
+
+/// Three-component dot product, ( a.x*b.x + a.y*b.y ) + a.z*b.z, replicated to all four lanes.
+/// The w components do not contribute.
+FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
+{
+	const fltx4 fl4Products = MulSIMD( a, b );
+	const fltx4 fl4XPlusY = AddSIMD( SplatXSIMD( fl4Products ), SplatYSIMD( fl4Products ) );
+	return AddSIMD( fl4XPlusY, SplatZSIMD( fl4Products ) );
+}
+
+/// Four-component dot product; every lane of the result holds a sum of all four products.
+FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
+{
+	// 4 instructions, serial, order of addition varies so individual elements may differ in the LSB on some CPUs
+	const fltx4 fl4XYZW = MulSIMD( a, b );
+	const fltx4 fl4YXWZ = _mm_shuffle_ps( fl4XYZW, fl4XYZW, MM_SHUFFLE_REV( 1, 0, 3, 2 ) );
+	const fltx4 fl4PairSums = AddSIMD( fl4XYZW, fl4YXWZ );	// [ X+Y, X+Y, Z+W, Z+W ]
+	const fltx4 fl4Swapped = RotateLeft2( fl4PairSums );		// [ Z+W, Z+W, X+Y, X+Y ]
+	return AddSIMD( fl4PairSums, fl4Swapped );
+}
+
+/// Computes sine and cosine of the x, y and z components of radians with the scalar
+/// SinCos, one component at a time. The w components of sine and cosine are not written.
+FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
+{
+	// FIXME: Make a fast SSE version
+	for ( int iComponent = 0; iComponent < 3; ++iComponent )
+	{
+		SinCos( SubFloat( radians, iComponent ), &SubFloat( sine, iComponent ), &SubFloat( cosine, iComponent ) );
+	}
+}
+
+
+// Arc sine of each of the four components, computed one at a time with the scalar asin.
+//TODO: implement as four-way Taylor series (see xbox implementation)
+FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
+{
+	// FIXME: Make a fast SSE version
+	fltx4 fl4Angles;
+	for ( int iComponent = 0; iComponent < 4; ++iComponent )
+	{
+		SubFloat( fl4Angles, iComponent ) = asin( SubFloat( sine, iComponent ) );
+	}
+	return fl4Angles;
+}
+
+/// Arc cosine of each of the four components, computed one at a time with the scalar acos.
+FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
+{
+	fltx4 fl4Angles;
+	for ( int iComponent = 0; iComponent < 4; ++iComponent )
+	{
+		SubFloat( fl4Angles, iComponent ) = acos( SubFloat( cs, iComponent ) );
+	}
+	return fl4Angles;
+}
+
+// tan^-1(a/b) per component, via the scalar atan2( a, b ) .. ie, pass sin in as a and cos in as b
+FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
+{
+	fltx4 fl4Angles;
+	for ( int iComponent = 0; iComponent < 4; ++iComponent )
+	{
+		SubFloat( fl4Angles, iComponent ) = atan2( SubFloat( a, iComponent ), SubFloat( b, iComponent ) );
+	}
+	return fl4Angles;
+}
+
+/// Horizontal pairwise add (an SSE3 instruction): [ a.x+a.y a.z+a.w b.x+b.y b.z+b.w ]
+FORCEINLINE fltx4 PairwiseHorizontalAddSIMD( const fltx4 &a, const fltx4 &b )
+{
+	const fltx4 fl4PairSums = _mm_hadd_ps( a, b );
+	return fl4PairSums;
+}
+
+/// Negate every component, computed as 0 - a.
+FORCEINLINE fltx4 NegSIMD(const fltx4 &a)
+{
+	const fltx4 fl4Zero = LoadZeroSIMD();
+	return SubSIMD( fl4Zero, a );
+}
+
+/// Returns a 4-bit mask of which floats have the high (sign) bit set; bit i corresponds to component i.
+FORCEINLINE int TestSignSIMD( const fltx4 & a )
+{
+	const int nSignBits = _mm_movemask_ps( a );
+	return nSignBits;
+}
+
+/// (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0), decided from the sign bits only.
+FORCEINLINE bool IsAnyNegative( const fltx4 & a )
+{
+	const int nSignBits = TestSignSIMD( a );
+	return nSignBits != 0;
+}
+
+
+/// Per-component compare producing a bit mask: (a==b) ? ~0:0
+FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Mask = _mm_cmpeq_ps( a, b );
+	return fl4Mask;
+}
+
+/// Per-component compare of 32 bit ints, with the mask returned as a fltx4: (a==b) ? ~0:0
+FORCEINLINE fltx4 CmpEqSIMD( const i32x4 & a, const i32x4 & b )
+{
+	const __m128i n4Mask = _mm_cmpeq_epi32( a, b );
+	return _mm_castsi128_ps( n4Mask );
+}
+
+/// Per-component compare producing a bit mask: (a>b) ? ~0:0
+FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Mask = _mm_cmpgt_ps( a, b );
+	return fl4Mask;
+}
+
+/// Per-component compare producing a bit mask: (a>=b) ? ~0:0
+FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Mask = _mm_cmpge_ps( a, b );
+	return fl4Mask;
+}
+
+/// Per-component compare producing a bit mask: (a<b) ? ~0:0
+FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Mask = _mm_cmplt_ps( a, b );
+	return fl4Mask;
+}
+
+/// Per-component compare producing a bit mask: (a<=b) ? ~0:0
+FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Mask = _mm_cmple_ps( a, b );
+	return fl4Mask;
+}
+
+/// Per-component range test producing a bit mask: (a <= b && a >= -b) ? ~0 : 0
+FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4BelowUpper = CmpLeSIMD( a, b );
+	const fltx4 fl4AboveLower = CmpGeSIMD( a, NegSIMD( b ) );
+	return AndSIMD( fl4BelowUpper, fl4AboveLower );
+}
+
+/// Per-component compare producing a float, by ANDing the compare mask with Four_Ones: (a==b) ? 1.0:0
+FORCEINLINE fltx4 Cmp01EqSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Mask = _mm_cmpeq_ps( a, b );
+	return AndSIMD( Four_Ones, fl4Mask );
+}
+
+/// Per-component compare producing a float, by ANDing the compare mask with Four_Ones: (a>b) ? 1.0:0
+FORCEINLINE fltx4 Cmp01GtSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Mask = _mm_cmpgt_ps( a, b );
+	return AndSIMD( Four_Ones, fl4Mask );
+}
+
+/// Per-component compare producing a float, by ANDing the compare mask with Four_Ones: (a>=b) ? 1.0:0
+FORCEINLINE fltx4 Cmp01GeSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Mask = _mm_cmpge_ps( a, b );
+	return AndSIMD( Four_Ones, fl4Mask );
+}
+
+/// Per-component compare producing a float, by ANDing the compare mask with Four_Ones: (a<b) ? 1.0:0
+FORCEINLINE fltx4 Cmp01LtSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Mask = _mm_cmplt_ps( a, b );
+	return AndSIMD( Four_Ones, fl4Mask );
+}
+
+/// Per-component compare producing a float, by ANDing the compare mask with Four_Ones: (a<=b) ? 1.0:0
+FORCEINLINE fltx4 Cmp01LeSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Mask = _mm_cmple_ps( a, b );
+	return AndSIMD( Four_Ones, fl4Mask );
+}
+
+/// Per-component range test producing a float: (a <= b && a >= -b) ? 1.0 : 0
+FORCEINLINE fltx4 Cmp01InBoundsSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4BelowUpper = CmpLeSIMD( a, b );
+	const fltx4 fl4AboveLower = CmpGeSIMD( a, NegSIMD( b ) );
+	const fltx4 fl4InBounds = AndSIMD( fl4BelowUpper, fl4AboveLower );
+	return AndSIMD( Four_Ones, fl4InBounds );
+}
+
+
+// for branching when a.xyzw > b.xyzw
+// (implemented as "no component of a compares <= the matching component of b")
+FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
+{
+	const int nLessOrEqual = TestSignSIMD( CmpLeSIMD( a, b ) );
+	return nLessOrEqual == 0;
+}
+
+// for branching when a.xyzw >= b.xyzw
+// (implemented as "no component of a compares < the matching component of b")
+FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
+{
+	const int nLess = TestSignSIMD( CmpLtSIMD( a, b ) );
+	return nLess == 0;
+}
+
+// For branching if all a.xyzw == b.xyzw
+// (all four bits of the equality mask must be set)
+FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
+{
+	const int nEqual = TestSignSIMD( CmpEqSIMD( a, b ) );
+	return nEqual == 0xf;
+}
+
+
+/// Component-wise minimum: min(a,b)
+FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Smaller = _mm_min_ps( a, b );
+	return fl4Smaller;
+}
+
+/// Component-wise maximum: max(a,b)
+FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Larger = _mm_max_ps( a, b );
+	return fl4Larger;
+}
+
+
+
+// SSE lacks rounding operations.
+// Really.
+// They can be emulated by setting the rounding mode for the whole
+// processor, converting to int, and converting back again. But
+// every change of rounding mode clears out the entire pipeline,
+// so that cannot be done per operation: it would have to be done
+// once, before the loop that calls these.
+//
+// Round towards positive infinity.
+// This version sidesteps the rounding mode entirely and calls the
+// scalar ceil on each component in turn.
+FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
+{
+	fltx4 fl4Rounded;
+	for ( int iComponent = 0; iComponent < 4; ++iComponent )
+	{
+		SubFloat( fl4Rounded, iComponent ) = ceil( SubFloat( a, iComponent ) );
+	}
+	return fl4Rounded;
+}
+
+fltx4 fabs( const fltx4 & x );	// declared here, defined elsewhere; FloorSIMD below calls it to get the magnitude of its argument
+
+
+// Round towards negative infinity
+// This is the implementation that was here before; the original note says it
+// assumes you are in round-to-floor mode, which is presumably usually the
+// case for us vis-a-vis SSE. It's totally unnecessary on
+// VMX, which has a native floor op.
+//
+// How it works: the magnitude of val has Four_2ToThe23s added and then subtracted
+// again, which leaves it rounded to a whole number; wherever that rounding went
+// above the magnitude, one is subtracted. Finally the sign bits of val are put back.
+//
+// NOTE(review): the correction is applied to the magnitude, so negative non-integer
+// inputs appear to come out rounded toward zero (e.g. -1.5 -> -1.0) rather than
+// toward negative infinity -- confirm whether callers rely on this.
+FORCEINLINE fltx4 FloorSIMD( const fltx4 &val )
+{
+	const fltx4 fl4Magnitude = fabs( val );
+	const fltx4 fl4SignBits = XorSIMD( val, fl4Magnitude );
+	fltx4 fl4Whole = SubSIMD( AddSIMD( fl4Magnitude, Four_2ToThe23s ), Four_2ToThe23s );
+	const fltx4 fl4RoundedUp = CmpGtSIMD( fl4Whole, fl4Magnitude );
+	fl4Whole = MaskedAssign( fl4RoundedUp, SubSIMD( fl4Whole, Four_Ones ), fl4Whole );
+	return XorSIMD( fl4Whole, fl4SignBits );			// restore sign bits
+}
+
+
+
+/// True if any of the four floats compares equal to zero.
+FORCEINLINE bool IsAnyZeros( const fltx4 & a )
+{
+	const int nZeroLanes = TestSignSIMD( CmpEqSIMD( a, Four_Zeros ) );
+	return nZeroLanes != 0;
+}
+
+/// True if all four floats compare equal to zero.
+inline bool IsAllZeros( const fltx4 & var )
+{
+	const int nZeroLanes = TestSignSIMD( CmpEqSIMD( var, Four_Zeros ) );
+	return nZeroLanes == 0xF;
+}
+
+/// sqrt(a), more or less. On SSE this is the same full square root instruction as SqrtSIMD.
+FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )
+{
+	const fltx4 fl4Root = _mm_sqrt_ps( a );
+	return fl4Root;
+}
+
+/// Component-wise square root: sqrt(a)
+FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )
+{
+	const fltx4 fl4Root = _mm_sqrt_ps( a );
+	return fl4Root;
+}
+
+/// 1/sqrt(a), more or less, from the SSE reciprocal square root estimate instruction.
+FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )
+{
+	const fltx4 fl4Estimate = _mm_rsqrt_ps( a );
+	return fl4Estimate;
+}
+
+/// Estimate of 1/sqrt(a) in which components equal to zero first have the bits of
+/// Four_Epsilons ORed in, so the estimate is never taken of an exact zero.
+FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
+{
+	const fltx4 fl4IsZero = CmpEqSIMD( a, Four_Zeros );
+	const fltx4 fl4EpsilonWhereZero = AndSIMD( Four_Epsilons, fl4IsZero );
+	const fltx4 fl4Safe = OrSIMD( a, fl4EpsilonWhereZero );
+	return ReciprocalSqrtEstSIMD( fl4Safe );
+}
+
+/// 1/sqrt(a): refines ReciprocalSqrtEstSIMD with one newton iteration for higher precision results.
+FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )
+{
+	// newton iteration for 1/sqrt(a) : y(n+1) = 1/2 (y(n)*(3-a*y(n)^2));
+	const fltx4 fl4Estimate = ReciprocalSqrtEstSIMD( a );
+	const fltx4 fl4EstimateSq = MulSIMD( fl4Estimate, fl4Estimate );
+	const fltx4 fl4Correction = SubSIMD( Four_Threes, MulSIMD( a, fl4EstimateSq ) );
+	const fltx4 fl4TwiceResult = MulSIMD( fl4Estimate, fl4Correction );
+	return MulSIMD( Four_PointFives, fl4TwiceResult );
+}
+
+/// 1/a, more or less, from the SSE reciprocal estimate instruction.
+FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )
+{
+	const fltx4 fl4Estimate = _mm_rcp_ps( a );
+	return fl4Estimate;
+}
+
+/// 1/a for all 4 values: the reciprocal approximation instruction refined by one newton iteration.
+/// No error checking!
+FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a )
+{
+	// newton iteration is: Y(n+1) = 2*Y(n)-a*Y(n)^2
+	const fltx4 fl4Estimate = ReciprocalEstSIMD( a );
+	const fltx4 fl4TwiceEstimate = AddSIMD( fl4Estimate, fl4Estimate );
+	const fltx4 fl4EstimateSq = MulSIMD( fl4Estimate, fl4Estimate );
+	return SubSIMD( fl4TwiceEstimate, MulSIMD( a, fl4EstimateSq ) );
+}
+
+// 2^x for all values (the antilog), computed one component at a time with powf.
+// CHRISG: is it worth doing integer bitfiddling for this?
+FORCEINLINE fltx4 PowerOfTwoSIMD( const fltx4 &toPower )
+{
+	fltx4 fl4Powers;
+	for ( int iComponent = 0; iComponent < 4; ++iComponent )
+	{
+		SubFloat( fl4Powers, iComponent ) = powf( 2, SubFloat( toPower, iComponent ) );
+	}
+	return fl4Powers;
+}
+
+// Clamps each component of in to the range [min, max]: first limited from above by max,
+// then from below by min.
+FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
+{
+	const fltx4 fl4CappedAbove = MinSIMD( max, in );
+	return MaxSIMD( min, fl4CappedAbove );
+}
+
+/// Transposes, in place, the 4x4 matrix whose rows are x, y, z and w
+/// (a wrapper around the _MM_TRANSPOSE4_PS macro).
+FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w)
+{
+	_MM_TRANSPOSE4_PS( x, y, z, w );
+}
+
+/// Returns min( min( a.x, a.y ), a.z ) replicated to all four components.
+/// The w component of a is ignored (it may hold garbage).
+FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 &a )
+{
+	// a is [x,y,z,G] (where G is garbage)
+	const fltx4 fl4YZGX = RotateLeft( a );		// [y,z,G,x]
+	const fltx4 fl4ZGXY = RotateLeft2( a );	// [z,G,x,y]
+
+	// lane 0 becomes min(x,y), then min(min(x,y),z); the other lanes are not used
+	const fltx4 fl4MinXY = MinSIMD( a, fl4YZGX );
+	const fltx4 fl4MinXYZ = MinSIMD( fl4MinXY, fl4ZGXY );
+
+	// splat the x component out to the whole vector and return
+	return SplatXSIMD( fl4MinXYZ );
+}
+
+/// Returns max( max( a.x, a.y ), a.z ) replicated to all four components.
+/// The w component of a is ignored (it may hold garbage).
+FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 &a )
+{
+	// a is [x,y,z,G] (where G is garbage)
+	const fltx4 fl4YZGX = RotateLeft( a );		// [y,z,G,x]
+	const fltx4 fl4ZGXY = RotateLeft2( a );	// [z,G,x,y]
+
+	// lane 0 becomes max(x,y), then max(max(x,y),z); the other lanes are not used
+	const fltx4 fl4MaxXY = MaxSIMD( a, fl4YZGX );
+	const fltx4 fl4MaxXYZ = MaxSIMD( fl4MaxXY, fl4ZGXY );
+
+	// splat the x component out to the whole vector and return
+	return SplatXSIMD( fl4MaxXYZ );
+}
+
+// ------------------------------------
+// INTEGER SIMD OPERATIONS.
+// ------------------------------------
+
+
+#if 0				/* pc does not have these ops */
+// Splat a signed immediate int number into all components of a vector
+// (written into the raw bits of a fltx4). Compiled out on pc.
+FORCEINLINE fltx4 IntSetImmediateSIMD(int to)
+{
+	//CHRISG: SSE2 has this, but not SSE1. What to do?
+	fltx4 fl4Splat;
+	for ( int iComponent = 0; iComponent < 4; ++iComponent )
+	{
+		SubInt( fl4Splat, iComponent ) = to;
+	}
+	return fl4Splat;
+}
+#endif
+
+// Load 4 words into a SIMD register using the aligned integer load
+FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD)
+{
+	const __m128i *pWords = reinterpret_cast<const __m128i *>( pSIMD );
+	return _mm_load_si128( pWords );
+}
+
+// Load 4 words into a SIMD register using the unaligned integer load
+FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD)
+{
+	const __m128i *pWords = reinterpret_cast<const __m128i *>( pSIMD );
+	return _mm_loadu_si128( pWords );
+}
+
+// Save the raw bits of a into four words, 16-byte aligned (an aligned float store
+// through a reinterpreted pointer; no float-to-int conversion takes place)
+FORCEINLINE void StoreAlignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
+{
+	float *pAsFloats = reinterpret_cast<float *>( pSIMD );
+	_mm_store_ps( pAsFloats, a );
+}
+
+/// Saves the raw bits of a into the storage returned by pSIMD.Base(), using the aligned
+/// float store (no float-to-int conversion takes place).
+FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
+{
+	float *pAsFloats = reinterpret_cast<float *>( pSIMD.Base() );
+	_mm_store_ps( pAsFloats, a );
+}
+
+/// Writes the 128 bits of a to pSIMD using the aligned integer store.
+FORCEINLINE void StoreAlignedSIMD( short * RESTRICT pSIMD, const shortx8 & a )
+{
+	shortx8 *pDest = (shortx8 *)pSIMD;
+	_mm_store_si128( pDest, a );
+}
+
+/// Saves the raw bits of a into four words at pSIMD using the unaligned float store
+/// (no float-to-int conversion takes place).
+FORCEINLINE void StoreUnalignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
+{
+	float *pAsFloats = reinterpret_cast<float *>( pSIMD );
+	_mm_storeu_ps( pAsFloats, a );
+}
+
+/// Writes the 128 bits of a to pSIMD using the unaligned integer store.
+FORCEINLINE void StoreUnalignedSIMD( short * RESTRICT pSIMD, const shortx8 & a )
+{
+	shortx8 *pDest = (shortx8 *)pSIMD;
+	_mm_storeu_si128( pDest, a );
+}
+// Combine two fltx4s by throwing away every other field:
+// result = { a.x, a.z, b.x, b.z }
+FORCEINLINE fltx4 CompressSIMD( fltx4 const & a, fltx4 const &b )
+{
+	const fltx4 fl4Compressed = _mm_shuffle_ps( a, b, MM_SHUFFLE_REV( 0, 2, 0, 2 ) );
+	return fl4Compressed;
+}
+
+// Load four consecutive uint16's from pInts and convert them to floating point numbers.
+// This function isn't especially fast and could be made faster if anyone is
+// using it heavily.
+FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
+{
+#ifndef POSIX
+	// Load the lower 64 bits of the value pointed to by pInts into the lower 64 bits of the register, zeroing the upper 64 bits.
+	__m128i n4Packed = _mm_loadl_epi64( (__m128i const*) pInts );
+	// unpack unsigned 16's to signed 32's by interleaving them with zeros
+	n4Packed = _mm_unpacklo_epi16( n4Packed, _mm_setzero_si128() );
+	return _mm_cvtepi32_ps( n4Packed );
+#else
+	// scalar path: convert one element at a time
+	fltx4 fl4Converted;
+	SubFloat( fl4Converted, 0 ) = pInts[0];
+	SubFloat( fl4Converted, 1 ) = pInts[1];
+	SubFloat( fl4Converted, 2 ) = pInts[2];
+	SubFloat( fl4Converted, 3 ) = pInts[3];
+	return fl4Converted;
+#endif
+}
+
+
+// Combine 4 fltx4s by throwing away 3/4s of the fields:
+// result = { a.x, b.x, c.x, d.x }
+FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d )
+{
+	const fltx4 fl4AACC = _mm_shuffle_ps( a, c, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );	// { a.x, a.x, c.x, c.x }
+	const fltx4 fl4BBDD = _mm_shuffle_ps( b, d, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );	// { b.x, b.x, d.x, d.x }
+	// g_SIMD_EveryOtherMask picks which lanes come from the b/d vector rather than the a/c one
+	const fltx4 fl4PickMask = LoadAlignedSIMD( g_SIMD_EveryOtherMask );
+	return MaskedAssign( fl4PickMask, fl4BBDD, fl4AACC );
+}
+
+// Duplicate each field of a across two outputs:
+// fl4OutA = { a.x, a.x, a.y, a.y }, fl4OutB = { a.z, a.z, a.w, a.w }
+FORCEINLINE void ExpandSIMD( fltx4 const &a, fltx4 &fl4OutA, fltx4 &fl4OutB )
+{
+	const fltx4 fl4XXYY = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 1, 1 ) );
+	fl4OutA = fl4XXYY;
+	const fltx4 fl4ZZWW = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 3, 3 ) );
+	fl4OutB = fl4ZZWW;
+}
+
+// CHRISG: the conversion functions all seem to operate on m64's only...
+// how do we make them work here?
+
+// Take a vector containing four unsigned 32-bit integers and
+// return them as single precision floats. No
+// fixed point conversion is done.
+// Each component is converted on its own as a scalar.
+FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
+{
+	fltx4 retval;
+	// read every lane from the source vector, never from the not-yet-written result
+	SubFloat( retval, 0 ) = ( (float) SubInt( vSrcA, 0 ) );
+	SubFloat( retval, 1 ) = ( (float) SubInt( vSrcA, 1 ) );
+	SubFloat( retval, 2 ) = ( (float) SubInt( vSrcA, 2 ) );
+	SubFloat( retval, 3 ) = ( (float) SubInt( vSrcA, 3 ) );
+	return retval;
+}
+
+
+// Convert the 4 signed 32-bit integers in vSrcA to single precision floats.
+FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
+{
+	const __m128i &n4Source = (const __m128i &)vSrcA;
+	return _mm_cvtepi32_ps( n4Source );
+}
+
+
+/*
+  works on the vectors as if they are four uints.
+  the first parameter contains the words to be shifted,
+  the second contains the amount to shift by AS INTS
+
+  for i = 0 to 3
+  shift = vSrcB_i*32:(i*32)+4
+  vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
+
+  Only five bits of each shift amount are used: the count is masked
+  with 31, because shifting a 32-bit value by 32 or more is undefined
+  behaviour in C++.
+*/
+FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
+{
+	i32x4 retval;
+	SubInt(retval, 0) = SubInt(vSrcA, 0) << ( SubInt(vSrcB, 0) & 31 );
+	SubInt(retval, 1) = SubInt(vSrcA, 1) << ( SubInt(vSrcB, 1) & 31 );
+	SubInt(retval, 2) = SubInt(vSrcA, 2) << ( SubInt(vSrcB, 2) & 31 );
+	SubInt(retval, 3) = SubInt(vSrcA, 3) << ( SubInt(vSrcB, 3) & 31 );
+
+	return retval;
+}
+
+
+// Convert the four floats of vSrc to SIGNED INTS, truncating toward zero, and
+// save them to *pDest.
+// pDest->x = Int (vSrc.x)
+// note: some architectures have means of doing
+// fixed point conversion when the fix depth is
+// specified as an immediate.. but there is no way
+// to guarantee an immediate as a parameter to function
+// like this. No fixed point scaling is applied here.
+//
+// All four lanes are converted with a single SSE2 truncating convert, so one
+// code path serves every compiler: no scalar fallback for 64-bit MSVC, no
+// __m64 (MMX) temporaries, and therefore no MMX state to clear afterwards.
+// The store is unaligned, so nothing is assumed about the alignment of *pDest.
+FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
+{
+	const __m128i n4Truncated = _mm_cvttps_epi32( vSrc );
+	_mm_storeu_si128( reinterpret_cast<__m128i *>( &(*pDest)[0] ), n4Truncated );
+}
+
+
+// some sse2 packed integer intrinsic wrappers
+#if _MSC_VER >= 1600 || defined(LINUX) || defined(OSX)
+/// Replicate a 16 bit integer value to all 8 16-bit positions in an fltx4
+/// (the integer register is reinterpreted as a fltx4, not converted).
+FORCEINLINE fltx4 ReplicateWordX8( uint16 nWord )
+{
+	const __m128i n8Words = _mm_set_epi16( nWord, nWord, nWord, nWord, nWord, nWord, nWord, nWord );
+	return _mm_castsi128_ps( n8Words );
+}
+/// Return a 16-bit mask consisting of the upper bit of each of the 16 bytes in the input
+FORCEINLINE int TestSignsOfBytesSIMD( fltx4 const &packedBytes )
+{
+	const __m128i n16Bytes = _mm_castps_si128( packedBytes );
+	return _mm_movemask_epi8( n16Bytes );
+}
+
+/// Compare each 16-bit field of the two inputs for equality; the resulting mask is
+/// returned reinterpreted as a fltx4.
+FORCEINLINE fltx4 CmpEqWordsSIMD( fltx4 const &flIn, fltx4 const &flValue )
+{
+	const __m128i n8In = _mm_castps_si128( flIn );
+	const __m128i n8Value = _mm_castps_si128( flValue );
+	return _mm_castsi128_ps( _mm_cmpeq_epi16( n8In, n8Value ) );
+}
+
+/// Take the 16 16-bit signed words held in two fltx4s and pack them, with signed
+/// saturation, into one register holding the 16 resulting bytes.
+FORCEINLINE fltx4 PackSignedWordsToBytesWithSaturateSIMD( fltx4 const &packedWorlds0, fltx4 const &packedWorlds1 )
+{
+	const __m128i n8Low = _mm_castps_si128( packedWorlds0 );
+	const __m128i n8High = _mm_castps_si128( packedWorlds1 );
+	return _mm_castsi128_ps( _mm_packs_epi16( n8Low, n8High ) );
+}
+
+
+/// Three-component cross product v1 x v2 in the x, y and z lanes.
+/// The fourth lane is computed as v1.x*v2.x - v1.x*v2.x.
+FORCEINLINE fltx4 CrossProduct3SIMD( const fltx4 &v1, const fltx4 &v2 )
+{
+	// left-hand product: ( v1.y, v1.z, v1.x, v1.x ) * ( v2.z, v2.x, v2.y, v2.x )
+	const fltx4 fl4V1_yzxx = _mm_shuffle_ps( v1, v1, MM_SHUFFLE_REV( 1,2,0,0 ) );
+	const fltx4 fl4V2_zxyx = _mm_shuffle_ps( v2, v2, MM_SHUFFLE_REV( 2,0,1,0 ) );
+	const fltx4 fl4Left = MulSIMD( fl4V1_yzxx, fl4V2_zxyx );
+
+	// right-hand product: ( v1.z, v1.x, v1.y, v1.x ) * ( v2.y, v2.z, v2.x, v2.x )
+	const fltx4 fl4V1_zxyx = _mm_shuffle_ps( v1, v1, MM_SHUFFLE_REV( 2,0,1,0 ) );
+	const fltx4 fl4V2_yzxx = _mm_shuffle_ps( v2, v2, MM_SHUFFLE_REV( 1,2,0,0 ) );
+	const fltx4 fl4Right = MulSIMD( fl4V1_zxyx, fl4V2_yzxx );
+
+	return SubSIMD( fl4Left, fl4Right );
+}
+
+
+
+
+#endif
--- a/public/mathlib/ssequaternion.h
+++ b/public/mathlib/ssequaternion.h
--- a/public/mathlib/svd.h
+++ b/public/mathlib/svd.h
@@ -0,0 +1,519 @@
+//////////////////////////////////////////////////////////////////////////
+//
+// Implementation of iterative 3x3 SVD without branches, using approximate Givens rotations,
+// applied sequentially to every off-diagonal element. The same code can compile into scalar, SSE and AVX
+// by templatizing on the Float data type. 
+//
+
+#include "ssemath.h"
+
+namespace SVD
+{
+	// SIMD "a < b": forwards to CmpLtSIMD and returns its per-lane result as a fltx4.
+	inline fltx4 CmpLt( const fltx4 &a, const fltx4 &b )
+	{
+		return CmpLtSIMD( a, b );
+	}
+	// Scalar "a < b", the float counterpart of the SIMD overload.
+	inline bool CmpLt( float a, float b )
+	{
+		return a < b;
+	}
+	
+	// True when a <= b holds in every lane; expressed as "b >= a" through IsAllGreaterThanOrEq.
+	inline bool AllLe( const fltx4 &a, const fltx4 &b )
+	{
+		return IsAllGreaterThanOrEq( b, a );
+	}
+	// Scalar counterpart of the SIMD overload: a single a <= b test.
+	inline bool AllLe( float a, float b )
+	{
+		return a <= b;
+	}
+
+
+	// Maps a Float type to the type used for its comparison results.
+	// Generic case: the comparison result has the same type as the value
+	// (e.g. a per-lane mask stored in a SIMD register).
+	template <typename Float >
+	struct FloatTraits
+	{
+		typedef Float Bool;
+	};
+
+	// Scalar float: comparisons produce a plain bool.
+	template <>
+	struct FloatTraits < float >
+	{
+		typedef bool Bool;
+	};
+
+	// Replicate<Float>( a ) produces a Float whose every component equals the scalar a.
+	// Only the specializations below are defined.
+	template< typename Float > inline Float Replicate( float a );
+
+	// scalar: the value itself
+	template <> inline float Replicate< float >( float a )
+	{
+		return a;
+	}
+
+	// SSE: broadcast into all four lanes
+	template <> inline fltx4 Replicate< fltx4 >( float a )
+	{
+		return _mm_set1_ps( a );
+	}
+
+	template <typename Float>
+	class SymMatrix3;
+
+
+	// SIMD reciprocal square root estimate; forwards to ReciprocalSqrtEstSIMD.
+	inline fltx4 RsqrtEst( const fltx4 &a )
+	{
+		return ReciprocalSqrtEstSIMD( a );
+	}
+	// Reciprocal square root: the hardware estimate refined by one Newton-Raphson step,
+	//   y' = 0.5 * y * ( 3 - a * y * y )
+	// The estimate is held in a Float (it used to be hard-coded to fltx4), so the template
+	// works for any Float type that provides RsqrtEst, Replicate and the arithmetic operators.
+	template <typename Float>
+	inline Float Rsqrt( const Float &a )
+	{
+		Float it = RsqrtEst( a );
+		// a single Newton iteration; can repeat multiple times
+		return Replicate<Float>( .5f ) * it * ( Replicate<Float>( 3.0f ) - ( a * it * it ) );
+	}
+	// Scalar reciprocal square root estimate: loads the float into a SIMD register,
+	// runs the SIMD estimate and stores the result back to a float.
+	inline float RsqrtEst( float a ) 
+	{
+		const fltx4 vEstimate = RsqrtEst( LoadUnalignedFloatSIMD( &a ) );
+		float flEstimate;
+		StoreUnalignedFloat( &flEstimate, vEstimate );
+		return flEstimate;
+	}
+	// Scalar specialization: 1 / sqrtf( a ) computed directly, without the estimate + Newton step.
+	template <>
+	inline float Rsqrt( const float &a )
+	{
+		const float flRoot = sqrtf( a );
+		return 1.0f / flRoot;
+	}
+
+	// Bitwise select: for every bit, takes b where mask is 1 and a where mask is 0.
+	// Computed as a ^ ( mask & ( b ^ a ) ).
+	inline fltx4 Select( const fltx4& a /*mask=0*/, const fltx4& b/*mask=1*/, const fltx4& mask )
+	{
+		const fltx4 vDiffBits = _mm_xor_ps( b, a );
+		const fltx4 vMaskedDiff = _mm_and_ps( mask, vDiffBits );
+		return _mm_xor_ps( a, vMaskedDiff );
+	}
+
+	// Scalar select: b when mask is true, a otherwise (same argument order as the SIMD overload).
+	inline float Select( float a, float b, bool mask )
+	{
+		if ( mask )
+			return b;
+		return a;
+	}
+
+
+	// Dense 3x3 matrix of Float (a scalar float or a SIMD vector type), stored as m[ row ][ column ].
+	// The default constructor leaves the elements uninitialized.
+	template <typename Float>
+	class Matrix3
+	{
+	public:
+		Matrix3() {}
+		Float m[ 3 ][ 3 ];
+
+		// Matrix product ( *this ) * other.
+		Matrix3 < Float > operator * ( const Matrix3< Float > &other )const
+		{
+			Matrix3 < Float > res;
+			for ( int i = 0; i < 3; ++i )
+				for ( int j = 0; j < 3; ++j )
+					res.m[ i ][ j ] = m[ i ][ 0 ] * other.m[ 0 ][ j ] + m[ i ][ 1 ] * other.m[ 1 ][ j ] + m[ i ][ 2 ] * other.m[ 2 ][ j ];
+			return res;
+		}
+		// Element-wise difference ( *this ) - other.
+		Matrix3 < Float > operator - ( const Matrix3< Float > &other )const
+		{
+			Matrix3 < Float > res;
+			for ( int i = 0; i < 3; ++i )
+				for ( int j = 0; j < 3; ++j )
+					res.m[ i ][ j ] = m[ i ][ j ] - other.m[ i ][ j ];
+			return res;
+		}
+		// Expands a symmetric matrix into full 3x3 storage; defined out of line once SymMatrix3 is complete.
+		// Declared with the injected class name: writing the constructor as "Matrix3< Float >( ... )"
+		// (a template-id in the declarator) is rejected by C++20 compilers.
+		explicit Matrix3( const SymMatrix3< Float > &other );
+		// Sum of the squares of all nine elements, i.e. the SQUARED Frobenius norm (no square root is taken).
+		Float FrobeniusNorm()const
+		{
+			Float sum = Replicate<Float>( 0.0f );
+			for ( int i = 0; i < 3; ++i )
+				for ( int j = 0; j < 3; ++j )
+					sum += m[ i ][ j ] * m[ i ][ j ];
+			return sum;
+		}
+
+		// Element ( i, j ) of transpose( M ) * M: the dot product of columns i and j.
+		Float AtA( int i, int j )const
+		{
+			return m[ 0 ][ i ] * m[ 0 ][ j ] + m[ 1 ][ i ] * m[ 1 ][ j ] + m[ 2 ][ i ] * m[ 2 ][ j ];
+		}
+
+		void SetIdentity()
+		{
+			m[ 0 ][ 0 ] = m[ 1 ][ 1 ] = m[ 2 ][ 2 ] = Replicate<Float>( 1.0f );
+			m[ 0 ][ 1 ] = m[ 1 ][ 0 ] = m[ 0 ][ 2 ] = m[ 2 ][ 0 ] = m[ 2 ][ 1 ] = m[ 1 ][ 2 ] = Replicate<Float>( 0.0f );
+		}
+		// Clears the whole object to zero bytes.
+		void SetZero()
+		{
+			memset( this, 0, sizeof( *this ) );
+		}
+		// Squared length of column j.
+		Float ColLenSqr( int j )const
+		{
+			return m[ 0 ][ j ] * m[ 0 ][ j ] + m[ 1 ][ j ] * m[ 1 ][ j ] + m[ 2 ][ j ] * m[ 2 ][ j ];
+		}
+		// Determinant, written out as its six signed triple products.
+		Float Det()const
+		{
+			return -( m[ 0 ][ 2 ]*m[ 1 ][ 1 ]*m[ 2 ][ 0 ] ) + m[ 0 ][ 1 ]*m[ 1 ][ 2 ]*m[ 2 ][ 0 ] + m[ 0 ][ 2 ]*m[ 1 ][ 0 ]*m[ 2 ][ 1 ] - m[ 0 ][ 0 ]*m[ 1 ][ 2 ]*m[ 2 ][ 1 ] - m[ 0 ][ 1 ]*m[ 1 ][ 0 ]*m[ 2 ][ 2 ] + m[ 0 ][ 0 ]*m[ 1 ][ 1 ]*m[ 2 ][ 2 ];
+		}
+	};
+
+
+	// Computes u * transpose( vt ): element ( row, col ) is the dot product of
+	// row "row" of u with row "col" of vt.
+	template <typename Float>
+	inline Matrix3 < Float > MulT( const Matrix3< Float > &u, const Matrix3< Float > &vt )
+	{
+		Matrix3 < Float > product;
+		for ( int nRow = 0; nRow < 3; ++nRow )
+		{
+			const Float ( &uRow )[ 3 ] = u.m[ nRow ];
+			for ( int nCol = 0; nCol < 3; ++nCol )
+			{
+				const Float ( &vtRow )[ 3 ] = vt.m[ nCol ];
+				product.m[ nRow ][ nCol ] = uRow[ 0 ] * vtRow[ 0 ] + uRow[ 1 ] * vtRow[ 1 ] + uRow[ 2 ] * vtRow[ 2 ];
+			}
+		}
+		return product;
+	}
+
+
+	// Measures how far the columns of m are from being mutually orthogonal:
+	// the sum of the squared dot products of the column pairs (0,1), (0,2) and (1,2).
+	// Column lengths are not checked.
+	template <typename Float>
+	Float OrthogonalityError( const Matrix3<Float> &m )
+	{
+		Float error = Replicate<Float>( 0.0f );
+		for ( int nFirst = 0; nFirst < 2; ++nFirst )
+		{
+			for ( int nSecond = nFirst + 1; nSecond < 3; ++nSecond )
+			{
+				// Matrix3::AtA( i, j ) is exactly the dot product of columns i and j
+				const Float columnDot = m.AtA( nFirst, nSecond );
+				error += columnDot * columnDot;
+			}
+		}
+		return error;
+	}
+
+	// Diagonal 3x3 matrix: only the three diagonal entries are stored, m[ i ] being element ( i, i ).
+	template <typename Float >
+	class DiagMatrix3
+	{
+	public:
+		Float m[ 3 ];
+	};
+
+	// Symmetric 3x3 matrix storing only the six unique elements (lower triangle, row by row).
+	// Index_t names the slots; aXY and aYX refer to the same slot.
+	template <typename Float>
+	class SymMatrix3
+	{
+	public:
+		enum Index_t
+		{
+			a00,
+			a10,
+			a11,
+			a20,
+			a21,
+			a22,
+			Count,
+
+			// upper-triangle names alias their mirrored lower-triangle slots
+			a01 = a10,
+			a02 = a20,
+			a12 = a21
+		};
+
+		Float m[ Count ];
+
+		// mutable element accessors
+		Float &m00() { return m[ a00 ]; }
+		Float &m01() { return m[ a01 ]; }
+		Float &m02() { return m[ a02 ]; }
+		Float &m11() { return m[ a11 ]; }
+		Float &m12() { return m[ a12 ]; }
+		Float &m22() { return m[ a22 ]; }
+
+		// Sum of squares of the three stored off-diagonal elements (each counted once, no square root).
+		Float OffDiagNorm()const
+		{
+			return m[ a01 ] * m[ a01 ] + m[ a21 ] * m[ a21 ] + m[ a02 ] * m[ a02 ];
+		}
+		// Sum of squares of the three diagonal elements (no square root).
+		Float DiagNorm()const
+		{
+			return m[ a00 ] * m[ a00 ] + m[ a11 ] * m[ a11 ] + m[ a22 ] * m[ a22 ];
+		}
+	};
+
+
+	// Expands a symmetric matrix into full 3x3 storage, mirroring each stored
+	// off-diagonal element into both triangles.
+	// Fix: the ( 0, 2 ) and ( 1, 2 ) pairs now read a02 and a12; they used to be
+	// filled from a01, so every off-diagonal element came out equal to a01.
+	template < typename Float >
+	Matrix3< Float >::Matrix3( const SymMatrix3< Float > &other )
+	{
+		m[ 0 ][ 0 ] = other.m[ other.a00 ];
+		m[ 0 ][ 1 ] = m[ 1 ][ 0 ] = other.m[ other.a01 ];
+		m[ 1 ][ 1 ] = other.m[ other.a11 ];
+		m[ 2 ][ 0 ] = m[ 0 ][ 2 ] = other.m[ other.a02 ];
+		m[ 2 ][ 1 ] = m[ 1 ][ 2 ] = other.m[ other.a12 ];
+		m[ 2 ][ 2 ] = other.m[ other.a22 ];
+	}
+
+
+	// A sine/cosine pair describing one rotation angle.
+	// Note the constructor takes the cosine FIRST, although s is declared first.
+	template <typename Float >
+	class SinCos
+	{
+	public:
+		Float s, c;
+
+		SinCos() {}
+		SinCos( const Float &_c, const Float &_s ) : s( _s ), c( _c ) {}
+
+		// Pair for twice the angle: sin 2t = 2 s c, cos 2t = c*c - s*s.
+		SinCos< Float> DoubleAngle()const
+		{
+			SinCos< Float> doubled;
+			doubled.s = Replicate<Float>( 2.0f ) * s * c;
+			doubled.c = c * c - s * s;
+			return doubled;
+		}
+	};
+
+	// Quaternion ( x, y, z, w ) with components of type Float (scalar float or a SIMD vector type).
+	// Members are left uninitialized on construction.
+	template <typename Float>
+	class Quaternion
+	{
+	public:
+		Float x, y, z, w;
+
+		// Sets the identity rotation ( 0, 0, 0, 1 ).
+		// Fix: replicates into Float rather than float, so the assignment is well-typed
+		// for SIMD instantiations too (the scalar case is unchanged).
+		void SetIdentity()
+		{
+			x = y = z = Replicate<Float>( 0.0f );
+			w = Replicate<Float>( 1.0f );
+		}
+
+		// Component-wise scaling by f.
+		Quaternion<Float> operator * ( const Float &f )const
+		{
+			Quaternion< Float > res;
+			res.x = x * f;
+			res.y = y * f;
+			res.z = z * f;
+			res.w = w * f;
+			return res;
+		}
+
+		// Squared length x*x + y*y + z*z + w*w.
+		Float LengthSqr() const
+		{
+			return x * x + y * y + z * z + w * w;
+		}
+	};
+
+	// Converts a quaternion into its 3x3 rotation matrix using the standard
+	// "1 - 2( .. ) / 2( .. )" formulas. No normalization is performed here, so the
+	// result is a pure rotation only when q has unit length.
+	template <typename Float>
+	Matrix3<Float> QuaternionMatrix( const Quaternion<Float> &q )
+	{
+		const Float one = Replicate<Float>( 1.0f );
+		const Float two = Replicate<Float>( 2.0f );
+
+		Matrix3<Float> rot;
+
+		// row 0
+		rot.m[ 0 ][ 0 ] = one - two * q.y * q.y - two * q.z * q.z;
+		rot.m[ 0 ][ 1 ] = two * q.x * q.y - two * q.w * q.z;
+		rot.m[ 0 ][ 2 ] = two * q.x * q.z + two * q.w * q.y;
+
+		// row 1
+		rot.m[ 1 ][ 0 ] = two * q.x * q.y + two * q.w * q.z;
+		rot.m[ 1 ][ 1 ] = one - two * q.x * q.x - two * q.z * q.z;
+		rot.m[ 1 ][ 2 ] = two * q.y * q.z - two * q.w * q.x;
+
+		// row 2
+		rot.m[ 2 ][ 0 ] = two * q.x * q.z - two * q.w * q.y;
+		rot.m[ 2 ][ 1 ] = two * q.y * q.z + two * q.w * q.x;
+		rot.m[ 2 ][ 2 ] = one - two * q.x * q.x - two * q.y * q.y;
+
+		return rot;
+	}
+
+	// Builds the symmetric product transpose( a ) * a, computing only the six unique
+	// elements (lower triangle) through Matrix3::AtA( i, j ).
+	template <typename Float >
+	inline SymMatrix3< Float > AtA( const Matrix3< Float > &a )
+	{
+		typedef SymMatrix3< Float > Sym;
+
+		Sym product;
+		// diagonal
+		product.m[ Sym::a00 ] = a.AtA( 0, 0 );
+		product.m[ Sym::a11 ] = a.AtA( 1, 1 );
+		product.m[ Sym::a22 ] = a.AtA( 2, 2 );
+		// off-diagonal
+		product.m[ Sym::a10 ] = a.AtA( 1, 0 );
+		product.m[ Sym::a20 ] = a.AtA( 2, 0 );
+		product.m[ Sym::a21 ] = a.AtA( 2, 1 );
+		return product;
+	}
+
+
+	// Computes the symmetric product transpose( q ) * a * q.
+	// The columns of ( a * q ) are evaluated once each and then dotted with the columns of q;
+	// only the six unique elements of the result are produced.
+	template <typename Float >
+	inline SymMatrix3< Float > QtAQ( const Matrix3< Float > &q, const SymMatrix3< Float > &a )
+	{
+		// element ( k, j ) of a * q is named aqKJ
+
+		// column 0 of a * q
+		const Float aq00 = a.m[ a.a00 ] * q.m[ 0 ][ 0 ] + a.m[ a.a01 ] * q.m[ 1 ][ 0 ] + a.m[ a.a02 ] * q.m[ 2 ][ 0 ];
+		const Float aq10 = a.m[ a.a01 ] * q.m[ 0 ][ 0 ] + a.m[ a.a11 ] * q.m[ 1 ][ 0 ] + a.m[ a.a12 ] * q.m[ 2 ][ 0 ];
+		const Float aq20 = a.m[ a.a02 ] * q.m[ 0 ][ 0 ] + a.m[ a.a12 ] * q.m[ 1 ][ 0 ] + a.m[ a.a22 ] * q.m[ 2 ][ 0 ];
+
+		// column 1 of a * q
+		const Float aq01 = a.m[ a.a00 ] * q.m[ 0 ][ 1 ] + a.m[ a.a01 ] * q.m[ 1 ][ 1 ] + a.m[ a.a02 ] * q.m[ 2 ][ 1 ];
+		const Float aq11 = a.m[ a.a01 ] * q.m[ 0 ][ 1 ] + a.m[ a.a11 ] * q.m[ 1 ][ 1 ] + a.m[ a.a12 ] * q.m[ 2 ][ 1 ];
+		const Float aq21 = a.m[ a.a02 ] * q.m[ 0 ][ 1 ] + a.m[ a.a12 ] * q.m[ 1 ][ 1 ] + a.m[ a.a22 ] * q.m[ 2 ][ 1 ];
+
+		// column 2 of a * q
+		const Float aq02 = a.m[ a.a00 ] * q.m[ 0 ][ 2 ] + a.m[ a.a01 ] * q.m[ 1 ][ 2 ] + a.m[ a.a02 ] * q.m[ 2 ][ 2 ];
+		const Float aq12 = a.m[ a.a01 ] * q.m[ 0 ][ 2 ] + a.m[ a.a11 ] * q.m[ 1 ][ 2 ] + a.m[ a.a12 ] * q.m[ 2 ][ 2 ];
+		const Float aq22 = a.m[ a.a02 ] * q.m[ 0 ][ 2 ] + a.m[ a.a12 ] * q.m[ 1 ][ 2 ] + a.m[ a.a22 ] * q.m[ 2 ][ 2 ];
+
+		SymMatrix3< Float > res;
+		res.m[ res.a00 ] = q.m[ 0 ][ 0 ] * aq00 + q.m[ 1 ][ 0 ] * aq10 + q.m[ 2 ][ 0 ] * aq20;
+		res.m[ res.a01 ] = q.m[ 0 ][ 1 ] * aq00 + q.m[ 1 ][ 1 ] * aq10 + aq20 * q.m[ 2 ][ 1 ];
+		res.m[ res.a02 ] = q.m[ 0 ][ 2 ] * aq00 + q.m[ 1 ][ 2 ] * aq10 + aq20 * q.m[ 2 ][ 2 ];
+		res.m[ res.a11 ] = q.m[ 0 ][ 1 ] * aq01 + q.m[ 1 ][ 1 ] * aq11 + q.m[ 2 ][ 1 ] * aq21;
+		res.m[ res.a12 ] = q.m[ 0 ][ 2 ] * aq01 + q.m[ 1 ][ 2 ] * aq11 + aq21 * q.m[ 2 ][ 2 ];
+		res.m[ res.a22 ] = q.m[ 0 ][ 2 ] * aq02 + q.m[ 1 ][ 2 ] * aq12 + q.m[ 2 ][ 2 ] * aq22;
+		return res;
+	}
+
+	// Computes the symmetric product q * a * transpose( q ).
+	// For each row i of q the vector a * transpose( row i ) is evaluated once and then
+	// dotted with the rows of q; only the six unique elements of the result are produced.
+	template <typename Float >
+	inline SymMatrix3< Float > QAQt( const Matrix3< Float > &q, const SymMatrix3< Float > &a )
+	{
+		// rIK is component k of a * transpose( row i of q )
+
+		// row 0 of q
+		const Float r00 = a.m[ a.a00 ] * q.m[ 0 ][ 0 ] + a.m[ a.a01 ] * q.m[ 0 ][ 1 ] + a.m[ a.a02 ] * q.m[ 0 ][ 2 ];
+		const Float r01 = a.m[ a.a01 ] * q.m[ 0 ][ 0 ] + a.m[ a.a11 ] * q.m[ 0 ][ 1 ] + a.m[ a.a12 ] * q.m[ 0 ][ 2 ];
+		const Float r02 = a.m[ a.a02 ] * q.m[ 0 ][ 0 ] + a.m[ a.a12 ] * q.m[ 0 ][ 1 ] + a.m[ a.a22 ] * q.m[ 0 ][ 2 ];
+
+		// row 1 of q
+		const Float r10 = a.m[ a.a00 ] * q.m[ 1 ][ 0 ] + a.m[ a.a01 ] * q.m[ 1 ][ 1 ] + a.m[ a.a02 ] * q.m[ 1 ][ 2 ];
+		const Float r11 = a.m[ a.a01 ] * q.m[ 1 ][ 0 ] + a.m[ a.a11 ] * q.m[ 1 ][ 1 ] + a.m[ a.a12 ] * q.m[ 1 ][ 2 ];
+		const Float r12 = a.m[ a.a02 ] * q.m[ 1 ][ 0 ] + a.m[ a.a12 ] * q.m[ 1 ][ 1 ] + a.m[ a.a22 ] * q.m[ 1 ][ 2 ];
+
+		// row 2 of q
+		const Float r20 = a.m[ a.a00 ] * q.m[ 2 ][ 0 ] + a.m[ a.a01 ] * q.m[ 2 ][ 1 ] + a.m[ a.a02 ] * q.m[ 2 ][ 2 ];
+		const Float r21 = a.m[ a.a01 ] * q.m[ 2 ][ 0 ] + a.m[ a.a11 ] * q.m[ 2 ][ 1 ] + a.m[ a.a12 ] * q.m[ 2 ][ 2 ];
+		const Float r22 = a.m[ a.a02 ] * q.m[ 2 ][ 0 ] + a.m[ a.a12 ] * q.m[ 2 ][ 1 ] + a.m[ a.a22 ] * q.m[ 2 ][ 2 ];
+
+		SymMatrix3< Float > res;
+		res.m[ res.a00 ] = q.m[ 0 ][ 0 ] * r00 + q.m[ 0 ][ 1 ] * r01 + q.m[ 0 ][ 2 ] * r02;
+		res.m[ res.a01 ] = q.m[ 1 ][ 0 ] * r00 + q.m[ 1 ][ 1 ] * r01 + r02 * q.m[ 1 ][ 2 ];
+		res.m[ res.a02 ] = q.m[ 2 ][ 0 ] * r00 + q.m[ 2 ][ 1 ] * r01 + r02 * q.m[ 2 ][ 2 ];
+		res.m[ res.a11 ] = q.m[ 1 ][ 0 ] * r10 + q.m[ 1 ][ 1 ] * r11 + q.m[ 1 ][ 2 ] * r12;
+		res.m[ res.a12 ] = q.m[ 2 ][ 0 ] * r10 + q.m[ 2 ][ 1 ] * r11 + r12 * q.m[ 2 ][ 2 ];
+		res.m[ res.a22 ] = q.m[ 2 ][ 0 ] * r20 + q.m[ 2 ][ 1 ] * r21 + q.m[ 2 ][ 2 ] * r22;
+		return res;
+	}
+
+
+	// Computes the symmetric product q * a * transpose( q ) for a DIAGONAL a.
+	// Each row of q is first scaled component-wise by the diagonal and then dotted
+	// with the rows of q; only the six unique elements of the result are produced.
+	template <typename Float >
+	inline SymMatrix3< Float > QAQt( const Matrix3< Float > &q, const DiagMatrix3< Float > &a )
+	{
+		// dIK = a.m[ k ] * q.m[ i ][ k ]  (row i of q scaled by the diagonal)
+		const Float d00 = a.m[ 0 ] * q.m[ 0 ][ 0 ], d01 = a.m[ 1 ] * q.m[ 0 ][ 1 ], d02 = a.m[ 2 ] * q.m[ 0 ][ 2 ];
+		const Float d10 = a.m[ 0 ] * q.m[ 1 ][ 0 ], d11 = a.m[ 1 ] * q.m[ 1 ][ 1 ], d12 = a.m[ 2 ] * q.m[ 1 ][ 2 ];
+		const Float d20 = a.m[ 0 ] * q.m[ 2 ][ 0 ], d21 = a.m[ 1 ] * q.m[ 2 ][ 1 ], d22 = a.m[ 2 ] * q.m[ 2 ][ 2 ];
+
+		SymMatrix3< Float > res;
+		res.m[ res.a00 ] = q.m[ 0 ][ 0 ] * d00 + q.m[ 0 ][ 1 ] * d01 + q.m[ 0 ][ 2 ] * d02;
+		res.m[ res.a01 ] = q.m[ 1 ][ 0 ] * d00 + q.m[ 1 ][ 1 ] * d01 + d02 * q.m[ 1 ][ 2 ];
+		res.m[ res.a02 ] = q.m[ 2 ][ 0 ] * d00 + q.m[ 2 ][ 1 ] * d01 + d02 * q.m[ 2 ][ 2 ];
+		res.m[ res.a11 ] = q.m[ 1 ][ 0 ] * d10 + q.m[ 1 ][ 1 ] * d11 + q.m[ 1 ][ 2 ] * d12;
+		res.m[ res.a12 ] = q.m[ 2 ][ 0 ] * d10 + q.m[ 2 ][ 1 ] * d11 + d12 * q.m[ 2 ][ 2 ];
+		res.m[ res.a22 ] = q.m[ 2 ][ 0 ] * d20 + q.m[ 2 ][ 1 ] * d21 + q.m[ 2 ][ 2 ] * d22;
+		return res;
+	}
+
+	// Applies the rotation R = [ c s ; -s c ] to the symmetric 2x2 matrix
+	// [ a11 a12 ; a12 a22 ] in place, i.e. replaces it with R * A * transpose( R ).
+	template <typename Float >
+	void PerformGivensRotation2x2( const SinCos<Float> &res, Float &a11, Float &a12, Float &a22 )
+	{
+		const Float two = Replicate<Float>( 2.0f );
+		const Float cosSqr = res.c * res.c;
+		const Float sinSqr = res.s * res.s;
+		const Float cosSin = res.c * res.s;
+
+		// all three results are computed from the old values before anything is written back
+		const Float new11 = cosSqr * a11 + two * cosSin * a12 + sinSqr * a22;
+		const Float new12 = cosSin * ( a22 - a11 ) + ( cosSqr - sinSqr ) * a12;
+		const Float new22 = sinSqr * a11 - two * cosSin * a12 + cosSqr * a22;
+
+		a11 = new11;
+		a12 = new12;
+		a22 = new22;
+	}
+
+
+	// Inverse of PerformGivensRotation3x3: the same update with the sign of the sine flipped.
+	// ( a00, a01, a11 ) is the 2x2 sub-block being rotated; a02 and a12 are the elements
+	// coupling that block to the third row/column.
+	template <typename Float>
+	void UnperformGivensRotation3x3( const SinCos<Float> &r, Float &a00, Float &a01, Float &a11, Float &a02, Float &a12 )
+	{
+		const Float two = Replicate<Float>( 2.0f );
+		const Float &c = r.c;
+		const Float &s = r.s;
+
+		// compute everything from the old values first, then write back
+		const Float new00 = a00 * c * c - s * ( two * a01 * c - a11 * s );
+		const Float new01 = c*( a01 * c + a00 * s ) - s * ( a11 * c + a01 * s );
+		const Float new11 = a11 * c * c + s * ( two * a01 * c + a00 * s );
+		const Float new02 = a02 *c - a12 * s;
+		const Float new12 = a12 *c + a02 * s;
+
+		a00 = new00;
+		a01 = new01;
+		a11 = new11;
+		a02 = new02;
+		a12 = new12;
+	}
+
+	// Applies a Givens rotation ( c, s ) to a symmetric 3x3 matrix in place.
+	// ( a00, a01, a11 ) is the 2x2 sub-block being rotated - it gets the same update as
+	// PerformGivensRotation2x2 - while a02 and a12, the elements coupling that block to
+	// the third row/column, are rotated as a 2-vector. The remaining diagonal element is untouched.
+	template <typename Float>
+	void PerformGivensRotation3x3( const SinCos<Float> &r, Float &a00, Float &a01, Float &a11, Float &a02, Float &a12 )
+	{
+		const Float two = Replicate<Float>( 2.0f );
+		const Float &c = r.c;
+		const Float &s = r.s;
+
+		// compute everything from the old values first, then write back
+		const Float new00 = a00 * c * c + s * ( two * a01 * c + a11 * s );
+		const Float new01 = c*( a01 *c - a00 * s ) + s * ( a11 *c - a01 * s );
+		const Float new11 = a11 * c *c - s * ( two * a01 * c - a00 * s );
+		const Float new02 = a02 *c + a12 * s;
+		const Float new12 = a12 *c - a02 * s;
+
+		a00 = new00;
+		a01 = new01;
+		a11 = new11;
+		a02 = new02;
+		a12 = new12;
+	}
+
+	// Exact (trigonometric) Givens rotation that zeroes a12 of the symmetric 2x2 matrix
+	// [ a11 a12 ; a12 a22 ]: theta = 0.5 * atan( 2 a12 / ( a11 - a22 ) ), or pi/4 when the
+	// two diagonal elements are within 1e-6 of each other. Debug builds apply the rotation
+	// to the local copies of the arguments and assert the off-diagonal element vanished.
+	inline SinCos< float > ComputeGivensRotation( float a11, float a12, float a22 )
+	{
+		float theta;
+		if ( fabsf( a11 - a22 ) > 1e-6f )
+			theta = 0.5f * atanf( 2 * a12 / ( a11 - a22 ) );
+		else
+			theta = 3.14159265358979323846f / 4;
+
+		SinCos< float > rotation( cosf( theta ), sinf( theta ) );
+#ifdef _DEBUG
+		PerformGivensRotation2x2( rotation, a11, a12, a22 );
+		Assert( fabsf( a12 ) < 0.001f * ( 1 + fabsf( a11 ) + fabsf( a22 ) ) );
+#endif
+		return rotation;
+	}
+
+	// Branch-free approximate Givens rotation for the symmetric 2x2 block [ a11 a12 ; a12 a22 ].
+	// Starts from the unnormalized pair ch = 2 ( a11 - a22 ), sh = a12 and normalizes it with the
+	// reciprocal square root estimate. When 5.828... * sh^2 >= ch^2 (5.828... is 3 + 2 sqrt 2) the
+	// fixed pair ( cos, sin ) of pi/8 is used instead; when ch^2 + sh^2 < 1e-12 the identity
+	// ( c = 1, s = 0 ) is returned.
+	// SvdIterator::Iterate applies the returned pair directly to the quaternion and its
+	// DoubleAngle() to the matrix.
+	template <typename Float >
+	inline SinCos< Float> ApproximateGivensRotation( const Float & a11, const Float & a12, const Float & a22 )
+	{
+		typedef typename FloatTraits<Float>::Bool Mask;
+
+		const Float ch = Replicate<Float>( 2.0f ) * ( a11 - a22 );
+		const Float sh = a12;
+		const Float r2 = ch*ch + sh *sh;
+
+		const Mask bUseNormalized = CmpLt( Replicate<Float>( 5.82842712474619f ) * sh*sh, ch*ch );
+		const Mask bDegenerate = CmpLt( r2, Replicate<Float>( 1e-12f ) );
+
+		const Float omega = RsqrtEst( r2 );
+
+		SinCos<Float> rotation;
+		// normalized ( ch, sh ) where usable, otherwise sin/cos of pi/8
+		rotation.s = Select( Replicate<Float>( 0.3826834323650897717284599840304f ), omega * sh, bUseNormalized );
+		rotation.c = Select( Replicate<Float>( 0.92387953251128675612818318939679f ), omega * ch, bUseNormalized );
+
+		// degenerate input: fall back to the identity rotation
+		rotation.s = Select( rotation.s, Replicate<Float>( 0.0f ), bDegenerate ); // todo: replace with And
+		rotation.c = Select( rotation.c, Replicate<Float>( 1.0f ), bDegenerate );
+
+		return rotation;
+	}
+
+
+
+	// Updates a quaternion in place with the rotation ( c, s ):
+	//   x' = c x + s w     y' = c y + s z
+	//   z' = c z - s y     w' = c w - s x
+	// Callers choose the rotation axis by permuting which quaternion components they
+	// pass as x, y and z. The result is not renormalized.
+	template <typename Float >
+	void PerformGivensRotationQuaternion( const SinCos<Float> &res, Float &x, Float &y, Float &z, Float &w )
+	{
+		// all four results are computed from the old values before anything is written back
+		const Float xRotated = res.c * x + res.s * w;
+		const Float yRotated = res.c * y + res.s * z;
+		const Float zRotated = res.c * z - res.s * y;
+		const Float wRotated = res.c * w - res.s * x;
+
+		x = xRotated;
+		y = yRotated;
+		z = zRotated;
+		w = wRotated;
+	}
+
+	// Iterative diagonalization of the symmetric matrix transpose( A ) * A by repeated
+	// approximate Givens rotations. The accumulated rotation is kept as the (unnormalized)
+	// quaternion q; ata holds the progressively diagonalized matrix.
+	// Usage: Init( a ), Iterate( n ), then read ata and/or ComputeV().
+	template <typename Float >
+	class SvdIterator
+	{
+	public:
+		SvdIterator(){}
+		
+		Quaternion < Float > q;
+		SymMatrix3<Float> ata;
+
+		// Resets the rotation to identity and loads transpose( a ) * a.
+		// Takes the matrix by const reference: this avoids copying nine Floats and avoids
+		// passing an aggregate of aligned SIMD values by value.
+		void Init( const Matrix3<Float> &a )
+		{
+			q.SetIdentity();
+			ata = AtA( a );
+		}
+
+
+		// Runs up to nIterations sweeps. Each sweep rotates the three 2x2 sub-blocks
+		// ( 00, 10, 11 ), ( 11, 21, 22 ) and ( 22, 20, 00 ) in turn, applying the double-angle
+		// rotation to ata and the returned rotation to q.
+		// A sweep's error is the sum of the squared sines of its three rotations; iteration
+		// stops early once that error is <= flEpsilon (in every lane, for SIMD types).
+		void Iterate( int nIterations, float flEpsilon = 0.0f )
+		{
+			SinCos< Float> r;
+
+			//SymMatrix3<Float> inv0 = QAQt( QuaternionMatrix( q ), ata ), origAta = AtA( a );
+
+			for ( int i = 0; i < nIterations; ++i )
+			{
+				r = ApproximateGivensRotation( ata.m[ ata.a00 ], ata.m[ ata.a10 ], ata.m[ ata.a11 ] );
+				Float sumErrors = r.s * r.s;
+				PerformGivensRotation3x3( r.DoubleAngle(), ata.m[ ata.a00 ], ata.m[ ata.a10 ], ata.m[ ata.a11 ], ata.m[ ata.a20 ], ata.m[ ata.a21 ] );
+				PerformGivensRotationQuaternion( r, q.z, q.x, q.y, q.w );
+				//SymMatrix3<Float> inv1 = QAQt( QuaternionMatrix( q ), ata );
+
+				r = ApproximateGivensRotation( ata.m[ ata.a11 ], ata.m[ ata.a21 ], ata.m[ ata.a22 ] );
+				sumErrors += r.s * r.s;
+				PerformGivensRotation3x3( r.DoubleAngle(), ata.m[ ata.a11 ], ata.m[ ata.a21 ], ata.m[ ata.a22 ], ata.m[ ata.a01 ], ata.m[ ata.a02 ] );
+				PerformGivensRotationQuaternion( r, q.x, q.y, q.z, q.w );
+				//SymMatrix3<Float> inv2 = QAQt( QuaternionMatrix( q ), ata );
+
+				r = ApproximateGivensRotation( ata.m[ ata.a22 ], ata.m[ ata.a20 ], ata.m[ ata.a00 ] );
+				sumErrors += r.s * r.s;
+				PerformGivensRotation3x3( r.DoubleAngle(), ata.m[ ata.a22 ], ata.m[ ata.a02 ], ata.m[ ata.a00 ], ata.m[ ata.a12 ], ata.m[ ata.a10 ] );
+				PerformGivensRotationQuaternion( r, q.y, q.z, q.x, q.w );
+				//SymMatrix3<Float> inv3 = QAQt( QuaternionMatrix( q ), ata );
+				if ( AllLe( sumErrors, Replicate<Float>( flEpsilon ) ) )
+					break; // early out
+			}
+		}
+
+		// Rotation matrix of the accumulated quaternion, normalized to unit length first.
+		Matrix3< Float > ComputeV()const { return QuaternionMatrix( q * Rsqrt( q.LengthSqr() ) ); }
+	};
+
+	// Scalar pseudo-inverse: 0 for values whose magnitude is below FLT_EPSILON, 1 / fl otherwise.
+	inline float PseudoInverse( float fl )
+	{
+		if ( fabsf( fl ) < FLT_EPSILON )
+			return 0;
+		return 1.0f / fl;
+	}
+
+	// Pseudo-inverse of a symmetric 3x3 matrix: diagonalizes it with five SvdIterator
+	// sweeps, pseudo-inverts the three diagonal entries individually and transforms the
+	// result back with the accumulated rotation ( V * D+ * transpose( V ) ).
+	inline SymMatrix3< float > PseudoInverse( const SymMatrix3< float > &cov )
+	{
+		SvdIterator< float > solver;
+		solver.ata = cov;
+		solver.q.SetIdentity();
+		solver.Iterate( 5 );
+
+		// after iterating, the diagonal of solver.ata is read as the eigenvalue estimates
+		SymMatrix3< float > &diagonalized = solver.ata;
+		DiagMatrix3< float > invDiag;
+		invDiag.m[ 0 ] = PseudoInverse( diagonalized.m00() );
+		invDiag.m[ 1 ] = PseudoInverse( diagonalized.m11() );
+		invDiag.m[ 2 ] = PseudoInverse( diagonalized.m22() );
+
+		return QAQt( solver.ComputeV(), invDiag );
+	}
+}
--- a/public/mathlib/transform.h
+++ b/public/mathlib/transform.h
@@ -0,0 +1,401 @@
+//====== Copyright 1996-2005, Valve Corporation, All rights reserved. =======//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//===========================================================================//
+
+#ifndef TRANSFORM_H
+#define TRANSFORM_H
+
+#ifdef COMPILER_MSVC
+#pragma once
+#endif
+
+#include "tier0/memalloc.h"
+#include "mathlib/vector.h"
+#include "mathlib/mathlib.h"
+
+//-----------------------------------------------------------------------------
+// Matrix 3x4_t
+//-----------------------------------------------------------------------------
+class CTransformUnaligned;
+
+
+//-----------------------------------------------------------------------------
+// Represents a position + orientation using quaternions
+//-----------------------------------------------------------------------------
+// Rigid transform stored as a position vector plus an orientation quaternion.
+// The class is declared ALIGN16 and uses the aligned vector/quaternion types.
+// The default constructor leaves both members uninitialized.
+class ALIGN16 CTransform
+{
+public:
+	CTransform() {}
+	CTransform( const Vector &v, const Quaternion &q ) : m_vPosition(v), m_orientation(q) {}
+	// Builds the orientation from Euler angles via AngleQuaternion.
+	CTransform( const Vector &v, const QAngle &a ) : m_vPosition(v)
+	{
+		AngleQuaternion( a, m_orientation );
+	}
+
+	VectorAligned m_vPosition;
+	QuaternionAligned m_orientation;
+
+	// True when both the position and the orientation report themselves valid.
+	bool IsValid() const
+	{
+		return m_vPosition.IsValid() && m_orientation.IsValid();
+	}
+
+	bool operator==(const CTransform& v) const;					///< exact equality check
+	bool operator!=(const CTransform& v) const;
+
+	// for API compatibility with matrix3x4_t
+	inline void InitFromQAngles( const QAngle &angles, const Vector &vPosition = vec3_origin );
+	inline void InitFromMatrix( const matrix3x4_t &transform );
+	inline void InitFromQuaternion( const Quaternion &orientation, const Vector &vPosition = vec3_origin );
+
+	// Conversions to the other orientation/transform representations.
+	inline Quaternion ToQuaternion() const;
+	inline QAngle ToQAngle() const;
+	inline matrix3x4_t ToMatrix() const;
+
+	inline void SetToIdentity();
+
+	inline void SetOrigin( Vector const &vPos ) { m_vPosition = vPos; }
+	inline void SetAngles( QAngle const &vAngles );
+	inline Vector GetOrigin( void ) const { return m_vPosition; }
+
+	// NOTE(review): "FLU" presumably stands for forward/left/up, matching the parameter names.
+	inline void GetBasisVectorsFLU( Vector *pForward, Vector *pLeft, Vector *pUp ) const;
+	inline Vector GetForward() const;
+	inline Vector TransformVector( const Vector &v0 ) const;
+	inline Vector RotateVector( const Vector &v0 ) const;
+	inline Vector TransformVectorByInverse( const Vector &v0 ) const;
+	inline Vector RotateVectorByInverse( const Vector &v0 ) const;
+	inline Vector RotateExtents( const Vector &vBoxExtents ) const; // these are extents and must remain positive/symmetric after rotation
+	inline void TransformAABB( const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut ) const;
+	inline void TransformAABBByInverse( const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut ) const;
+	inline void RotateAABB( const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut ) const;
+	inline void RotateAABBByInverse( const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut ) const;
+	//inline void TransformPlane( const cplane_t &inPlane, cplane_t &outPlane ) const;
+	//inline void InverseTransformPlane( const cplane_t &inPlane, cplane_t &outPlane ) const;
+
+	/// Computes an inverse.  Uses the 'TR' naming to be consistent with the same method in matrix3x4_t (which only works with orthonormal matrices) 
+	inline void InverseTR( CTransform &out ) const;
+
+public:
+	// Copies position and orientation from the unaligned representation.
+	CTransform &operator=( const CTransformUnaligned &i );
+} ALIGN16_POST;
+
+
+extern const CTransform g_TransformIdentity;
+
+
+//-----------------------------------------------------------------------------
+// Represents an unaligned position + orientation using quaternions,
+// used only for copying data around
+//-----------------------------------------------------------------------------
+// Same data as CTransform (position + orientation quaternion) but built from the plain
+// Vector/Quaternion types, without the ALIGN16 requirement.
+// The default constructor leaves both members uninitialized.
+class CTransformUnaligned
+{
+public:
+	CTransformUnaligned() {}
+	CTransformUnaligned( const Vector &v, const Quaternion &q ) : m_vPosition(v), m_orientation(q) {}
+	// Implicit conversion from the aligned transform (member-wise copy).
+	CTransformUnaligned( const CTransform &transform ) : m_vPosition(transform.m_vPosition), m_orientation(transform.m_orientation) {}
+	// Returns an aligned copy of this transform.
+	CTransform AsTransform() const { return CTransform( m_vPosition, m_orientation ); }
+
+	Vector m_vPosition;
+	Quaternion m_orientation;
+
+	// True when both the position and the orientation report themselves valid.
+	bool IsValid() const
+	{
+		return m_vPosition.IsValid() && m_orientation.IsValid();
+	}
+
+public:
+	// Copies position and orientation from the aligned representation.
+	CTransformUnaligned &operator=( const CTransform &i );
+};
+
+
+//-----------------------------------------------------------------------------
+// Inline methods
+//-----------------------------------------------------------------------------
+// Assignment from the unaligned representation: copies orientation and position member-wise.
+inline CTransform &CTransform::operator=( const CTransformUnaligned &i )
+{
+	m_orientation = i.m_orientation;
+	m_vPosition = i.m_vPosition;
+	return *this;
+}
+
+// Assignment from the aligned representation: copies orientation and position member-wise.
+inline CTransformUnaligned &CTransformUnaligned::operator=( const CTransform &i )
+{
+	m_orientation = i.m_orientation;
+	m_vPosition = i.m_vPosition;
+	return *this;
+}
+
+
+//-----------------------------------------------------------------------------
+// Other methods
+//-----------------------------------------------------------------------------
+void ConcatTransforms( const CTransform &in1, const CTransform &in2, CTransform &out );
+void TransformSlerp( const CTransform &p, const CTransform &q, float t, CTransform &qt );
+void TransformLerp( const CTransform &p, const CTransform &q, float t, CTransform &qt );
+void TransformMatrix( const CTransform &in, matrix3x4_t &out );
+void TransformMatrix( const CTransform &in, const Vector &vScaleIn, matrix3x4_t &out );
+
+// Uniform-scale variant: expands flScale into a scale vector with the same value on all
+// three axes and forwards to QuaternionMatrix together with the orientation and position.
+inline void TransformMatrix( const CTransform &in, float flScale, matrix3x4_t &out )
+{
+	const Vector vUniformScale( flScale, flScale, flScale );
+	QuaternionMatrix( in.m_orientation, in.m_vPosition, vUniformScale, out );
+}
+
+// Normalizes the orientation quaternion in place and returns whatever QuaternionNormalize
+// returns. The position is not touched.
+inline float TransformNormalize( CTransform &in )
+{
+	const float flResult = QuaternionNormalize( in.m_orientation );
+	return flResult;
+}
+
+void TransformMatrix( const CTransformUnaligned &in, matrix3x4_t &out );
+void MatrixTransform( const matrix3x4_t &in, CTransform &out );
+void MatrixTransform( const matrix3x4_t &in, CTransformUnaligned &out );
+void MatrixTransform( const matrix3x4_t &in, CTransform &out, Vector &vScaleOut );
+
+// Single-float scale variant: runs the per-axis overload and reports the largest
+// component of the extracted scale vector as flScale.
+inline void MatrixTransform( const matrix3x4_t &in, CTransform &out, float &flScale )
+{
+	Vector vAxisScale;
+	MatrixTransform( in, out, vAxisScale );
+	flScale = vAxisScale.LargestComponentValue();
+}
+
+void AngleTransform( const QAngle &angles, const Vector &origin, CTransform &out );
+void SetIdentityTransform( CTransform &out );
+void TransformVectorsFLU( const CTransform &in, Vector* pForward, Vector *pLeft, Vector *pUp );
+void TransformVectorsForward( const CTransform &in, Vector* pForward );
+
+// Return-by-value convenience wrapper around SetIdentityTransform.
+inline const CTransform GetIdentityTransform()
+{
+	CTransform identity;
+	SetIdentityTransform( identity );
+	return identity;
+}
+
+// Return-by-value convenience wrapper around MatrixTransform( in, out ).
+inline const CTransform MatrixTransform( const matrix3x4_t &in )
+{
+	CTransform result;
+	MatrixTransform( in, result );
+	return result;
+}
+
+// Return-by-value convenience wrapper around TransformMatrix( in, out ) for aligned transforms.
+inline const matrix3x4_t TransformMatrix( const CTransform &in )
+{
+	matrix3x4_t result;
+	TransformMatrix( in, result );
+	return result;
+}
+// Return-by-value convenience wrapper around TransformMatrix( in, out ) for unaligned transforms.
+inline const matrix3x4_t TransformMatrix( const CTransformUnaligned &in )
+{
+	matrix3x4_t result;
+	TransformMatrix( in, result );
+	return result;
+}
+
+// Return-by-value convenience wrapper around ConcatTransforms( in1, in2, out ).
+inline const CTransform ConcatTransforms( const CTransform &in1, const CTransform &in2 )
+{
+	CTransform combined;
+	ConcatTransforms( in1, in2, combined );
+	return combined;
+}
+
+
+void TransformInvert( const CTransform &in, CTransform &out );
+void AxisAngleTransform( const Vector &vecAxis, float flAngleDegrees, CTransform &out );
+void VectorIRotate( const Vector &v, const CTransform &t, Vector &out );
+void VectorITransform( const Vector &v, const CTransform &t, Vector &out );
+
+// Transforms point p by tm: rotates it by the orientation quaternion (with the rotation
+// matrix written out in quaternion form, one row per output component) and adds the position.
+// The quaternion is used as-is; no normalization is performed here.
+inline Vector TransformPoint( const CTransformUnaligned & tm, const Vector & p )
+{
+	const Quaternion &q = tm.m_orientation;
+	const Vector &t = tm.m_vPosition;
+	return Vector(
+		t.x + ( 1.0f - 2.0f * q.y * q.y - 2.0f * q.z * q.z ) * p.x + ( 2.0f * q.x * q.y - 2.0f * q.w * q.z ) * p.y + ( 2.0f * q.x * q.z + 2.0f * q.w * q.y ) * p.z,
+		t.y + ( 2.0f * q.x * q.y + 2.0f * q.w * q.z ) * p.x + ( 1.0f - 2.0f * q.x * q.x - 2.0f * q.z * q.z ) * p.y + ( 2.0f * q.y * q.z - 2.0f * q.w * q.x ) * p.z,
+		t.z + ( 2.0f * q.x * q.z - 2.0f * q.w * q.y ) * p.x + ( 2.0f * q.y * q.z + 2.0f * q.w * q.x ) * p.y + ( 1.0f - 2.0f * q.x * q.x - 2.0f * q.y * q.y ) * p.z
+	);
+}
+
+// Transforms point p by tm: rotates it by the orientation quaternion (with the rotation
+// matrix written out in quaternion form, one row per output component) and adds the position.
+// The quaternion is used as-is; no normalization is performed here.
+// TODO: implement in SIMD?
+inline Vector TransformPoint( const CTransform & tm, const Vector & p )
+{
+	const QuaternionAligned &q = tm.m_orientation;
+	const VectorAligned &t = tm.m_vPosition;
+	return Vector(
+		t.x + ( 1.0f - 2.0f * q.y * q.y - 2.0f * q.z * q.z ) * p.x + ( 2.0f * q.x * q.y - 2.0f * q.w * q.z ) * p.y + ( 2.0f * q.x * q.z + 2.0f * q.w * q.y ) * p.z,
+		t.y + ( 2.0f * q.x * q.y + 2.0f * q.w * q.z ) * p.x + ( 1.0f - 2.0f * q.x * q.x - 2.0f * q.z * q.z ) * p.y + ( 2.0f * q.y * q.z - 2.0f * q.w * q.x ) * p.z,
+		t.z + ( 2.0f * q.x * q.z - 2.0f * q.w * q.y ) * p.x + ( 2.0f * q.y * q.z + 2.0f * q.w * q.x ) * p.y + ( 1.0f - 2.0f * q.x * q.x - 2.0f * q.y * q.y ) * p.z
+		);
+}
+
+
+// Transforms point p by tm (any type exposing m_vPosition and m_orientation) and writes
+// the result to out: rotation by the orientation quaternion followed by adding the position.
+// All three components are computed before out is written, so out may safely alias p
+// (or tm.m_vPosition). Writing out.x first used to corrupt the inputs of out.y and out.z
+// in that case.
+template < class T >
+inline void TransformPoint( const T & tm, const Vector & p, Vector & out )
+{
+	const Vector vResult(
+		tm.m_vPosition.x + ( 1.0f - 2.0f * tm.m_orientation.y * tm.m_orientation.y - 2.0f * tm.m_orientation.z * tm.m_orientation.z ) * p.x + ( 2.0f * tm.m_orientation.x * tm.m_orientation.y - 2.0f * tm.m_orientation.w * tm.m_orientation.z ) * p.y + ( 2.0f * tm.m_orientation.x * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.y ) * p.z,
+		tm.m_vPosition.y + ( 2.0f * tm.m_orientation.x * tm.m_orientation.y + 2.0f * tm.m_orientation.w * tm.m_orientation.z ) * p.x + ( 1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.z * tm.m_orientation.z ) * p.y + ( 2.0f * tm.m_orientation.y * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.x ) * p.z,
+		tm.m_vPosition.z + ( 2.0f * tm.m_orientation.x * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.y ) * p.x + ( 2.0f * tm.m_orientation.y * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.x ) * p.y + ( 1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.y * tm.m_orientation.y ) * p.z
+	);
+	out = vResult;
+}
+
+// Rotates point p by the orientation quaternion of tm (any type exposing m_orientation)
+// and writes the result to out. The position of tm is not applied.
+// All three components are computed before out is written, so out may safely alias p.
+// Writing out.x first used to corrupt the inputs of out.y and out.z in that case.
+template < class T >
+inline void RotatePoint( const T & tm, const Vector & p, Vector & out )
+{
+	const Vector vResult(
+		( 1.0f - 2.0f * tm.m_orientation.y * tm.m_orientation.y - 2.0f * tm.m_orientation.z * tm.m_orientation.z ) * p.x + ( 2.0f * tm.m_orientation.x * tm.m_orientation.y - 2.0f * tm.m_orientation.w * tm.m_orientation.z ) * p.y + ( 2.0f * tm.m_orientation.x * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.y ) * p.z,
+		( 2.0f * tm.m_orientation.x * tm.m_orientation.y + 2.0f * tm.m_orientation.w * tm.m_orientation.z ) * p.x + ( 1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.z * tm.m_orientation.z ) * p.y + ( 2.0f * tm.m_orientation.y * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.x ) * p.z,
+		( 2.0f * tm.m_orientation.x * tm.m_orientation.z - 2.0f * tm.m_orientation.w * tm.m_orientation.y ) * p.x + ( 2.0f * tm.m_orientation.y * tm.m_orientation.z + 2.0f * tm.m_orientation.w * tm.m_orientation.x ) * p.y + ( 1.0f - 2.0f * tm.m_orientation.x * tm.m_orientation.x - 2.0f * tm.m_orientation.y * tm.m_orientation.y ) * p.z
+	);
+	out = vResult;
+}
+
+
+// Return-by-value convenience wrapper around TransformInvert( in, out ).
+inline const CTransform TransformInvert( const CTransform &in )
+{
+	CTransform inverse;
+	TransformInvert( in, inverse );
+	return inverse;
+}
+
+// Transform equality test
+bool TransformsAreEqual( const CTransform &src1, const CTransform &src2, float flPosTolerance = 1e-2, float flRotTolerance = 1e-1f );
+
+// Computes world-space transforms given local-space transforms + parent info
+// The start of the pTransforms array (nRootTransformCount # of transforms) must be filled with 
+// the root transforms which have no parent. The end of the pTransforms array (nTransformCount # of transforms)
+// must be filled with local-space transforms which are relative to other transforms, including possibly the
+// root transforms. Therefore, (nRootTransformCount + nTransformCount) # of transforms must be passed into pTransforms.
+// Only nTransformCount parent indices should be passed in. 
+// Parent indices are relative to the entire array, so a parent index of 0 indicates the first element
+// of the array, which is always a root transform. -1 parent index is *illegal*
+// Parent indices must always be sorted so that each parent index refers to a transform earlier in the array.
+// The transforms are modified in-place.
+void TransformToWorldSpace( int nRootTransformCount, int nTransformCount, const int *pParentIndices, CTransform *pTransforms );
+void TransformToParentSpace( int nRootTransformCount, int nTransformCount, const int *pParentIndices, CTransform *pTransforms );
+
+
+// Sets the position to vPosition and fills the orientation by passing 'angles'
+// through AngleQuaternion.
+inline void CTransform::InitFromQAngles( const QAngle &angles, const Vector &vPosition )
+{
+	m_vPosition = vPosition;
+	AngleQuaternion( angles, m_orientation );
+}
+
+// Initializes this transform from a 3x4 matrix: the position comes from the matrix
+// origin and the orientation from MatrixQuaternion.
+inline void CTransform::InitFromMatrix( const matrix3x4_t &transform )
+{
+	m_vPosition = transform.GetOrigin();
+	m_orientation = MatrixQuaternion( transform );
+}
+
+// Copies the supplied orientation and position straight into this transform.
+inline void CTransform::InitFromQuaternion( const Quaternion &orientation, const Vector &vPosition )
+{
+	m_vPosition = vPosition;
+	m_orientation = orientation;
+}
+
+// Replaces only the orientation (via AngleQuaternion); the position is left untouched.
+inline void CTransform::SetAngles( QAngle const &vAngles )
+{
+	AngleQuaternion( vAngles, this->m_orientation );
+}
+
+// Returns a copy of the stored orientation.
+inline Quaternion CTransform::ToQuaternion() const
+{
+	return this->m_orientation;
+}
+// Returns the orientation as a QAngle, converted with QuaternionAngles.
+inline QAngle CTransform::ToQAngle() const
+{
+	QAngle vAngles;
+	QuaternionAngles( m_orientation, vAngles );
+	return vAngles;
+}
+
+// Returns this transform as a matrix3x4_t, built by TransformMatrix.
+inline matrix3x4_t CTransform::ToMatrix() const
+{
+	const CTransform &self = *this;
+	return TransformMatrix( self );
+}
+
+// Resets the transform: orientation becomes quat_identity, position becomes vec3_origin.
+inline void CTransform::SetToIdentity()
+{
+	m_orientation = quat_identity;
+	m_vPosition = vec3_origin;
+}
+
+// Forwards to TransformVectorsFLU to fill the forward / left / up output vectors.
+// NOTE(review): whether null output pointers are accepted depends on TransformVectorsFLU.
+inline void CTransform::GetBasisVectorsFLU( Vector *pForward, Vector *pLeft, Vector *pUp ) const
+{
+	const CTransform &self = *this;
+	TransformVectorsFLU( self, pForward, pLeft, pUp );
+}
+
+// Returns the forward vector of this transform as produced by TransformVectorsForward.
+inline Vector CTransform::GetForward() const
+{
+	Vector vResult;
+	TransformVectorsForward( *this, &vResult );
+	return vResult;
+}
+
+// Returns v0 transformed by this transform; the work is delegated to TransformPoint.
+inline Vector CTransform::TransformVector( const Vector &v0 ) const
+{
+	const CTransform &self = *this;
+	return TransformPoint( self, v0 );
+}
+
+// Returns v0 rotated by this transform's orientation (RotatePoint); the position is not used.
+inline Vector CTransform::RotateVector( const Vector &v0 ) const
+{
+	Vector vRotated;
+	RotatePoint( *this, v0, vRotated );
+	return vRotated;
+}
+
+// Returns v0 run through VectorITransform with this transform (the inverse transform
+// counterpart of TransformVector).
+inline Vector CTransform::TransformVectorByInverse( const Vector &v0 ) const
+{
+	Vector vResult;
+	VectorITransform( v0, *this, vResult );
+	return vResult;
+}
+
+// Returns v0 run through VectorIRotate with this transform (the inverse rotation
+// counterpart of RotateVector).
+inline Vector CTransform::RotateVectorByInverse( const Vector &v0 ) const
+{
+	Vector vResult;
+	VectorIRotate( v0, *this, vResult );
+	return vResult;
+}
+
+// Exact equality: true only when both the positions and the orientations compare equal.
+// No tolerance is applied here.
+inline bool CTransform::operator==(const CTransform& t) const
+{
+	if ( !( t.m_vPosition == m_vPosition ) )
+		return false;
+	return t.m_orientation == m_orientation;
+}
+
+// Exact inequality: true when either the positions or the orientations differ.
+inline bool CTransform::operator!=(const CTransform& t) const
+{
+	if ( t.m_vPosition != m_vPosition )
+		return true;
+	return t.m_orientation != m_orientation;
+}
+
+// PERFORMANCE: No native versions of these but implement them on matrix for convenient access
+// Converts this transform to a matrix and lets matrix3x4_t::TransformAABB compute the
+// output box from the input box.
+inline void CTransform::TransformAABB( const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut ) const
+{
+	matrix3x4_t mat = ToMatrix();
+	mat.TransformAABB( vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut );
+}
+
+// Converts this transform to a matrix and forwards to matrix3x4_t::TransformAABBByInverse.
+inline void CTransform::TransformAABBByInverse( const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut ) const
+{
+	matrix3x4_t mat = ToMatrix();
+	mat.TransformAABBByInverse( vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut );
+}
+
+// Converts this transform to a matrix and forwards to matrix3x4_t::RotateAABB.
+inline void CTransform::RotateAABB( const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut ) const
+{
+	matrix3x4_t mat = ToMatrix();
+	mat.RotateAABB( vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut );
+}
+// Converts this transform to a matrix and forwards to matrix3x4_t::RotateAABBByInverse.
+inline void CTransform::RotateAABBByInverse( const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut ) const
+{
+	matrix3x4_t mat = ToMatrix();
+	mat.RotateAABBByInverse( vecMinsIn, vecMaxsIn, vecMinsOut, vecMaxsOut );
+}
+
+// Writes the inverse of this transform to 'out'. The inversion is done in matrix form:
+// convert to matrix3x4_t, call its InverseTR(), then convert the result back.
+inline void CTransform::InverseTR( CTransform &out ) const
+{
+	matrix3x4_t matThis = ToMatrix();
+	out = matThis.InverseTR().ToCTransform();
+}
+
+
+// transform conversion operators on matrix3x4_t
+// Overwrites this matrix with the matrix form of 'transform' (via TransformMatrix).
+inline void matrix3x4_t::InitFromCTransform( const CTransform &transform )
+{
+	matrix3x4_t &self = *this;
+	TransformMatrix( transform, self );
+}
+// Returns this matrix converted to a CTransform (via MatrixTransform).
+inline CTransform matrix3x4_t::ToCTransform() const
+{
+	const matrix3x4_t &self = *this;
+	return MatrixTransform( self );
+}
+
+
+#endif // TRANSFORM
--- a/public/mathlib/vector.h
+++ b/public/mathlib/vector.h
--- a/public/mathlib/vector2d.h
+++ b/public/mathlib/vector2d.h
@@ -0,0 +1,695 @@
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef VECTOR2D_H
+#define VECTOR2D_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include <math.h>
+#include <float.h>
+
+// For vec_t, put this somewhere else?
+#include "tier0/basetypes.h"
+
+// For RandomFloat()
+#include "vstdlib/random.h"
+
+#include "tier0/dbg.h"
+#include "mathlib/math_pfns.h"
+
+#ifndef M_PI
+	#define M_PI		3.14159265358979323846	// matches value in gcc v2 math.h
+#endif
+
+#ifndef M_PI_F
+	#define M_PI_F		((float)(M_PI))
+#endif
+
+#ifndef DEG2RAD
+	#define DEG2RAD( x  )  ( (float)(x) * (float)(M_PI_F / 180.f) )
+#endif
+
+extern void inline SinCos( float radians, float * RESTRICT sine, float * RESTRICT cosine );
+
+//=========================================================
+// 2D Vector2D
+//=========================================================
+
+// Two-component vector of vec_t (x, y). Most members are defined inline later in this
+// header. The value-returning ("slow") operators and the public copy constructor are
+// only available when VECTOR_NO_SLOW_OPERATIONS is not defined.
+class Vector2D					
+{
+public:
+	// Members
+	vec_t x, y;
+
+	// Construction/destruction
+	// The default constructor only initializes the components (to NaN) in _DEBUG builds.
+	Vector2D();
+	Vector2D(vec_t X, vec_t Y);
+	// Reads pFloat[0] and pFloat[1].
+	explicit Vector2D(const float *pFloat);
+
+	// Initialization
+	void Init(vec_t ix=0.0f, vec_t iy=0.0f);
+
+	// Got any nasty NAN's? (true when both components pass IsFinite)
+	bool IsValid() const;
+
+	// array access... (index 0 is x, index 1 is y)
+	vec_t operator[](int i) const;
+	vec_t& operator[](int i);
+
+	// Base address...
+	vec_t* Base();
+	vec_t const* Base() const;
+
+	// Initialization methods
+	// Sets each component to RandomFloat( minVal, maxVal ).
+	void Random( float minVal, float maxVal );
+
+	// equality (exact, component-wise)
+	bool operator==(const Vector2D& v) const;
+	bool operator!=(const Vector2D& v) const;	
+
+	// arithmetic operations (component-wise, in place)
+	Vector2D&	operator+=(const Vector2D &v);			
+	Vector2D&	operator-=(const Vector2D &v);		
+	Vector2D&	operator*=(const Vector2D &v);			
+	Vector2D&	operator*=(float s);
+	Vector2D&	operator/=(const Vector2D &v);		
+	Vector2D&	operator/=(float s);					
+
+	// negate the Vector2D components
+	void	Negate(); 
+
+	// Get the Vector2D's magnitude.
+	vec_t	Length() const;
+
+	// Get the Vector2D's magnitude squared.
+	vec_t	LengthSqr(void) const;
+
+	// return true if this vector is (0,0) within tolerance
+	// (each component must lie strictly between -tolerance and +tolerance)
+	bool IsZero( float tolerance = 0.01f ) const
+	{
+		return (x > -tolerance && x < tolerance &&
+				y > -tolerance && y < tolerance);
+	}
+
+	// Normalize in place and return the old length.
+	vec_t	NormalizeInPlace();
+
+	// Compare length (implemented on squared lengths, so no square root is taken).
+	bool	IsLengthGreaterThan( float val ) const;
+	bool	IsLengthLessThan( float val ) const;
+
+	// Get the distance from this Vector2D to the other one.
+	vec_t	DistTo(const Vector2D &vOther) const;
+
+	// Get the distance from this Vector2D to the other one squared.
+	vec_t	DistToSqr(const Vector2D &vOther) const;		
+
+	// Copy (writes x to rgfl[0] and y to rgfl[1])
+	void	CopyToArray(float* rgfl) const;	
+
+	// Multiply, add, and assign to this (ie: *this = a + b * scalar). This
+	// is about 12% faster than the actual Vector2D equation (because it's done per-component
+	// rather than per-Vector2D).
+	void	MulAdd(const Vector2D& a, const Vector2D& b, float scalar);	
+
+	// Dot product.
+	vec_t	Dot(const Vector2D& vOther) const;			
+
+	// assignment
+	Vector2D& operator=(const Vector2D &vOther);
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+	// copy constructors
+	Vector2D(const Vector2D &vOther);
+
+	// arithmetic operations (these return a new Vector2D by value)
+	Vector2D	operator-(void) const;
+				
+	Vector2D	operator+(const Vector2D& v) const;	
+	Vector2D	operator-(const Vector2D& v) const;	
+	Vector2D	operator*(const Vector2D& v) const;	
+	Vector2D	operator/(const Vector2D& v) const;	
+	Vector2D	operator*(float fl) const;
+	Vector2D	operator/(float fl) const;			
+	
+	// Cross product between two vectors.
+	// NOTE(review): no inline definition of Cross is visible in this header; confirm it
+	// is defined elsewhere before relying on it.
+	Vector2D	Cross(const Vector2D &vOther) const;		
+
+	// Returns a Vector2D with the min or max in X and Y.
+	Vector2D	Min(const Vector2D &vOther) const;
+	Vector2D	Max(const Vector2D &vOther) const;
+
+#else
+
+private:
+	// No copy constructors allowed if we're in optimal mode
+	Vector2D(const Vector2D& vOther);
+#endif
+};
+
+//-----------------------------------------------------------------------------
+
+// Shared constants: the zero vector, and a sentinel whose components are both FLT_MAX.
+const Vector2D vec2_origin( 0.0f, 0.0f );
+const Vector2D vec2_invalid( FLT_MAX, FLT_MAX );
+
+//-----------------------------------------------------------------------------
+// Vector2D related operations
+//-----------------------------------------------------------------------------
+
+// Vector2D clear
+void Vector2DClear( Vector2D& a );
+
+// Copy
+void Vector2DCopy( const Vector2D& src, Vector2D& dst );
+
+// Vector2D arithmetic
+void Vector2DAdd( const Vector2D& a, const Vector2D& b, Vector2D& result );
+void Vector2DSubtract( const Vector2D& a, const Vector2D& b, Vector2D& result );
+void Vector2DMultiply( const Vector2D& a, vec_t b, Vector2D& result );
+void Vector2DMultiply( const Vector2D& a, const Vector2D& b, Vector2D& result );
+void Vector2DDivide( const Vector2D& a, vec_t b, Vector2D& result );
+void Vector2DDivide( const Vector2D& a, const Vector2D& b, Vector2D& result );
+void Vector2DMA( const Vector2D& start, float s, const Vector2D& dir, Vector2D& result );
+
+// Store the min or max of each of x and y into the result.
+void Vector2DMin( const Vector2D &a, const Vector2D &b, Vector2D &result );
+void Vector2DMax( const Vector2D &a, const Vector2D &b, Vector2D &result );
+
+#define Vector2DExpand( v ) (v).x, (v).y
+
+// Normalization
+vec_t Vector2DNormalize( Vector2D& v );
+
+// Length
+vec_t Vector2DLength( const Vector2D& v );
+
+// Dot Product
+vec_t DotProduct2D(const Vector2D& a, const Vector2D& b);
+
+// Linearly interpolate between two vectors
+void Vector2DLerp(const Vector2D& src1, const Vector2D& src2, vec_t t, Vector2D& dest );
+
+
+//-----------------------------------------------------------------------------
+//
+// Inlined Vector2D methods
+//
+//-----------------------------------------------------------------------------
+
+
+//-----------------------------------------------------------------------------
+// constructors
+//-----------------------------------------------------------------------------
+
+// Default constructor. Release builds leave x and y uninitialized; _DEBUG builds
+// fill them with VEC_T_NAN so that use-before-set trips the IsValid() asserts.
+inline Vector2D::Vector2D()
+{
+#ifdef _DEBUG
+	x = VEC_T_NAN;
+	y = VEC_T_NAN;
+#endif
+}
+
+// Component-wise constructor; asserts that the resulting vector is valid.
+inline Vector2D::Vector2D(vec_t X, vec_t Y)
+	: x( X ), y( Y )
+{
+	Assert( IsValid() );
+}
+
+// Constructs from an array: x = pFloat[0], y = pFloat[1].
+// pFloat must be non-null (asserted) and the result must be valid (asserted).
+inline Vector2D::Vector2D(const float *pFloat)
+{
+	Assert( pFloat );
+	x = pFloat[0];
+	y = pFloat[1];
+	Assert( IsValid() );
+}
+
+
+//-----------------------------------------------------------------------------
+// copy constructor
+//-----------------------------------------------------------------------------
+
+// Copy constructor; asserts that the source is valid before copying both components.
+inline Vector2D::Vector2D(const Vector2D &vOther)
+{
+	Assert( vOther.IsValid() );
+	x = vOther.x;
+	y = vOther.y;
+}
+
+//-----------------------------------------------------------------------------
+// initialization
+//-----------------------------------------------------------------------------
+
+// Sets both components, then asserts that the vector is valid.
+inline void Vector2D::Init( vec_t ix, vec_t iy )
+{
+	x = ix;
+	y = iy;
+	Assert( IsValid() );
+}
+
+#if !defined(__SPU__)
+// Fills x, then y, with RandomFloat( minVal, maxVal ). Not compiled when __SPU__ is defined.
+inline void Vector2D::Random( float minVal, float maxVal )
+{
+	// Draw order is x first, then y; keep it so the random sequence is consumed identically.
+	const float flRandX = RandomFloat( minVal , maxVal );
+	const float flRandY = RandomFloat( minVal , maxVal );
+	x = flRandX;
+	y = flRandY;
+}
+#endif
+
+// Zeroes both components of a.
+inline void Vector2DClear( Vector2D& a )
+{
+	a.y = 0.0f;
+	a.x = 0.0f;
+}
+
+//-----------------------------------------------------------------------------
+// assignment
+//-----------------------------------------------------------------------------
+
+// Assignment; asserts that the source is valid, copies both components and returns *this.
+inline Vector2D& Vector2D::operator=(const Vector2D &vOther)
+{
+	Assert( vOther.IsValid() );
+	x = vOther.x;
+	y = vOther.y;
+	return *this;
+}
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+
+// Mutable component access by index (0 = x, 1 = y); the index range is asserted.
+inline vec_t& Vector2D::operator[](int i)
+{
+	Assert( (i >= 0) && (i < 2) );
+	vec_t *pComponents = (vec_t*)this;
+	return pComponents[i];
+}
+
+// Read-only component access by index (0 = x, 1 = y); the index range is asserted.
+// Goes through the const Base() overload so that constness is not cast away.
+inline vec_t Vector2D::operator[](int i) const
+{
+	Assert( (i >= 0) && (i < 2) );
+	return Base()[i];
+}
+
+//-----------------------------------------------------------------------------
+// Base address...
+//-----------------------------------------------------------------------------
+
+// Returns the address of this object viewed as an array of vec_t (x first, then y).
+inline vec_t* Vector2D::Base()
+{
+	return reinterpret_cast< vec_t* >( this );
+}
+
+// Const overload: address of this object viewed as a read-only array of vec_t.
+inline vec_t const* Vector2D::Base() const
+{
+	return reinterpret_cast< vec_t const* >( this );
+}
+
+//-----------------------------------------------------------------------------
+// IsValid?
+//-----------------------------------------------------------------------------
+
+// True when both components pass IsFinite.
+inline bool Vector2D::IsValid() const
+{
+	if ( !IsFinite(x) )
+		return false;
+	return IsFinite(y);
+}
+
+//-----------------------------------------------------------------------------
+// comparison
+//-----------------------------------------------------------------------------
+
+// Exact component-wise equality (no tolerance). Both operands are asserted valid.
+inline bool Vector2D::operator==( const Vector2D& src ) const
+{
+	Assert( src.IsValid() && IsValid() );
+	if ( src.x != x )
+		return false;
+	return src.y == y;
+}
+
+// Exact component-wise inequality: true when either component differs.
+inline bool Vector2D::operator!=( const Vector2D& src ) const
+{
+	Assert( src.IsValid() && IsValid() );
+	if ( src.x != x )
+		return true;
+	return src.y != y;
+}
+
+
+//-----------------------------------------------------------------------------
+// Copy
+//-----------------------------------------------------------------------------
+
+// Copies src into dst component by component; src is asserted valid.
+inline void Vector2DCopy( const Vector2D& src, Vector2D& dst )
+{
+	Assert( src.IsValid() );
+	dst.y = src.y;
+	dst.x = src.x;
+}
+
+// Writes x to rgfl[0] and y to rgfl[1]. rgfl must be non-null (asserted) and have
+// room for two floats.
+inline void	Vector2D::CopyToArray(float* rgfl) const
+{
+	Assert( IsValid() );
+	Assert( rgfl );
+	rgfl[0] = x;
+	rgfl[1] = y;
+}
+
+//-----------------------------------------------------------------------------
+// standard math operations
+//-----------------------------------------------------------------------------
+
+// Flips the sign of both components in place.
+inline void Vector2D::Negate()
+{
+	Assert( IsValid() );
+	x = -x;
+	y = -y;
+}
+
+// Component-wise in-place addition; both operands are asserted valid beforehand.
+inline Vector2D& Vector2D::operator+=(const Vector2D& v)
+{
+	Assert( IsValid() && v.IsValid() );
+	x += v.x;
+	y += v.y;
+	return *this;
+}
+
+// Component-wise in-place subtraction; both operands are asserted valid beforehand.
+inline Vector2D& Vector2D::operator-=(const Vector2D& v)
+{
+	Assert( IsValid() && v.IsValid() );
+	x -= v.x;
+	y -= v.y;
+	return *this;
+}
+
+// Scales both components by fl in place; the result is asserted valid.
+inline Vector2D& Vector2D::operator*=(float fl)
+{
+	x = x * fl;
+	y = y * fl;
+	Assert( IsValid() );
+	return *this;
+}
+
+// Component-wise in-place multiplication by v; the result is asserted valid.
+inline Vector2D& Vector2D::operator*=(const Vector2D& v)
+{
+	x = x * v.x;
+	y = y * v.y;
+	Assert( IsValid() );
+	return *this;
+}
+
+// Divides both components by fl in place. Implemented as a multiply by the reciprocal
+// (one division instead of two). fl must be non-zero (asserted).
+inline Vector2D& Vector2D::operator/=(float fl)
+{
+	Assert( fl != 0.0f );
+	const float flInverse = 1.0f / fl;
+	x *= flInverse;
+	y *= flInverse;
+	Assert( IsValid() );
+	return *this;
+}
+
+// Component-wise in-place division by v. Neither component of v may be zero (asserted).
+inline Vector2D& Vector2D::operator/=(const Vector2D& v)
+{
+	Assert( v.x != 0.0f && v.y != 0.0f );
+	x = x / v.x;
+	y = y / v.y;
+	Assert( IsValid() );
+	return *this;
+}
+
+// c = a + b, component-wise. The components are independent, so c may be the same
+// object as a or b.
+inline void Vector2DAdd( const Vector2D& a, const Vector2D& b, Vector2D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	c.y = a.y + b.y;
+	c.x = a.x + b.x;
+}
+
+// c = a - b, component-wise. The components are independent, so c may be the same
+// object as a or b.
+inline void Vector2DSubtract( const Vector2D& a, const Vector2D& b, Vector2D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	c.y = a.y - b.y;
+	c.x = a.x - b.x;
+}
+
+// c = a * b for a scalar b; a must be valid and b finite (asserted).
+inline void Vector2DMultiply( const Vector2D& a, vec_t b, Vector2D& c )
+{
+	Assert( a.IsValid() && IsFinite(b) );
+	c.y = a.y * b;
+	c.x = a.x * b;
+}
+
+// c = a * b, component-wise; both inputs are asserted valid.
+inline void Vector2DMultiply( const Vector2D& a, const Vector2D& b, Vector2D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	c.y = a.y * b.y;
+	c.x = a.x * b.x;
+}
+
+
+// c = a / b for a scalar b, computed as a multiply by 1/b. b must be non-zero (asserted).
+inline void Vector2DDivide( const Vector2D& a, vec_t b, Vector2D& c )
+{
+	Assert( a.IsValid() );
+	Assert( b != 0.0f );
+	const vec_t flInverse = 1.0f / b;
+	c.x = a.x * flInverse;
+	c.y = a.y * flInverse;
+}
+
+// c = a / b, component-wise. Neither component of b may be zero (asserted).
+inline void Vector2DDivide( const Vector2D& a, const Vector2D& b, Vector2D& c )
+{
+	Assert( a.IsValid() );
+	Assert( (b.x != 0.0f) && (b.y != 0.0f) );
+	c.y = a.y / b.y;
+	c.x = a.x / b.x;
+}
+
+// Rotates vIn by flDegrees (converted to radians with DEG2RAD) and stores the result in vOut:
+//   vOut.x = vIn.x*cos - vIn.y*sin
+//   vOut.y = vIn.x*sin + vIn.y*cos
+// Both components are computed before either is stored, so vOut may be the same
+// object as vIn (in-place rotation).
+inline void Vector2DRotate( const Vector2D& vIn, float flDegrees, Vector2D& vOut )
+{
+	float c, s;
+	SinCos( DEG2RAD( flDegrees ), &s, &c );
+
+	const float flRotatedX = vIn.x*c - vIn.y*s;
+	const float flRotatedY = vIn.x*s + vIn.y*c;
+
+	vOut.x = flRotatedX;
+	vOut.y = flRotatedY;
+}
+
+// Multiply-add: result = start + s * dir, component-wise. All inputs are asserted
+// valid / finite first.
+inline void Vector2DMA( const Vector2D& start, float s, const Vector2D& dir, Vector2D& result )
+{
+	Assert( start.IsValid() && IsFinite(s) && dir.IsValid() );
+	result.y = start.y + s*dir.y;
+	result.x = start.x + s*dir.x;
+}
+
+// FIXME: Remove
+// For backwards compatibility
+// Sets *this = a + b * scalar, component-wise. No validity asserts are performed here.
+inline void	Vector2D::MulAdd(const Vector2D& a, const Vector2D& b, float scalar)
+{
+	y = a.y + b.y * scalar;
+	x = a.x + b.x * scalar;
+}
+
+// Linear interpolation: dest = src1 + (src2 - src1) * t, component-wise.
+// t is not clamped, so values outside [0,1] extrapolate.
+inline void Vector2DLerp(const Vector2D& src1, const Vector2D& src2, vec_t t, Vector2D& dest )
+{
+	dest.x = src1.x + (src2.x - src1.x) * t;
+	dest.y = src1.y + (src2.y - src1.y) * t;
+}
+
+//-----------------------------------------------------------------------------
+// dot, cross
+//-----------------------------------------------------------------------------
+// Returns the dot product a.x*b.x + a.y*b.y; both inputs are asserted valid.
+inline vec_t DotProduct2D(const Vector2D& a, const Vector2D& b)
+{
+	Assert( a.IsValid() && b.IsValid() );
+	const vec_t flDot = a.x*b.x + a.y*b.y;
+	return flDot;
+}
+
+// for backwards compatibility
+// Member form of the dot product; forwards to DotProduct2D( *this, vOther ).
+inline vec_t Vector2D::Dot( const Vector2D& vOther ) const
+{
+	const Vector2D &self = *this;
+	return DotProduct2D( self, vOther );
+}
+
+
+//-----------------------------------------------------------------------------
+// length
+//-----------------------------------------------------------------------------
+// Returns the length of v: FastSqrt of the sum of the squared components.
+inline vec_t Vector2DLength( const Vector2D& v )
+{
+	Assert( v.IsValid() );
+	const vec_t flLengthSqr = v.x*v.x + v.y*v.y;
+	return (vec_t)FastSqrt( flLengthSqr );
+}
+
+// Returns the squared length x*x + y*y (no square root involved).
+inline vec_t Vector2D::LengthSqr(void) const
+{
+	Assert( IsValid() );
+	const vec_t flLengthSqr = x*x + y*y;
+	return flLengthSqr;
+}
+
+// Normalizes this vector via Vector2DNormalize and returns the value it reports
+// (the length before normalization).
+inline vec_t Vector2D::NormalizeInPlace()
+{
+	Vector2D &self = *this;
+	return Vector2DNormalize( self );
+}
+
+// Returns true when this vector's length is strictly greater than val.
+// The comparison is done on squared values to avoid a square root. A negative val is
+// handled explicitly because squaring it would discard its sign: every length (>= 0)
+// exceeds a negative threshold.
+inline bool Vector2D::IsLengthGreaterThan( float val ) const
+{
+	const vec_t flLengthSqr = LengthSqr();
+	return ( val < 0.0f ) || ( flLengthSqr > val*val );
+}
+
+// Returns true when this vector's length is strictly less than val.
+// The comparison is done on squared values to avoid a square root. A non-positive val
+// is handled explicitly because squaring it would discard its sign: no length (>= 0)
+// can be below zero or a negative threshold.
+inline bool Vector2D::IsLengthLessThan( float val ) const
+{
+	const vec_t flLengthSqr = LengthSqr();
+	return ( val > 0.0f ) && ( flLengthSqr < val*val );
+}
+
+// Member form of Vector2DLength: returns the magnitude of this vector.
+inline vec_t Vector2D::Length(void) const
+{
+	const Vector2D &self = *this;
+	return Vector2DLength( self );
+}
+
+
+// Stores the component-wise minimum of a and b in result. When the two components
+// compare equal (or are unordered), b's component is taken.
+inline void Vector2DMin( const Vector2D &a, const Vector2D &b, Vector2D &result )
+{
+	if ( a.x < b.x )
+		result.x = a.x;
+	else
+		result.x = b.x;
+
+	if ( a.y < b.y )
+		result.y = a.y;
+	else
+		result.y = b.y;
+}
+
+
+// Stores the component-wise maximum of a and b in result. When the two components
+// compare equal (or are unordered), b's component is taken.
+inline void Vector2DMax( const Vector2D &a, const Vector2D &b, Vector2D &result )
+{
+	if ( a.x > b.x )
+		result.x = a.x;
+	else
+		result.x = b.x;
+
+	if ( a.y > b.y )
+		result.y = a.y;
+	else
+		result.y = b.y;
+}
+
+
+//-----------------------------------------------------------------------------
+// Normalization
+//-----------------------------------------------------------------------------
+// Normalizes v in place and returns its length before normalization.
+// A zero-length vector is set to (0,0) rather than divided.
+inline vec_t Vector2DNormalize( Vector2D& v )
+{
+	Assert( v.IsValid() );
+	const vec_t flLength = v.Length();
+	if ( flLength == 0.0f )
+	{
+		// Nothing to scale; force an exact zero vector.
+		v.x = 0.0f;
+		v.y = 0.0f;
+	}
+	else
+	{
+		v /= flLength;
+	}
+	return flLength;
+}
+
+
+//-----------------------------------------------------------------------------
+// Get the distance from this Vector2D to the other one 
+//-----------------------------------------------------------------------------
+// Returns the distance between this vector and vOther: the length of (*this - vOther).
+inline vec_t Vector2D::DistTo(const Vector2D &vOther) const
+{
+	Vector2D vecOffset;
+	Vector2DSubtract( *this, vOther, vecOffset );
+	return vecOffset.Length();
+}
+
+// Returns the squared distance between this vector and vOther (no square root taken).
+inline vec_t Vector2D::DistToSqr(const Vector2D &vOther) const
+{
+	Vector2D vecOffset;
+	Vector2DSubtract( *this, vOther, vecOffset );
+	return vecOffset.LengthSqr();
+}
+
+
+//-----------------------------------------------------------------------------
+// Computes the closest point to vecTarget no farther than flMaxDist from vecStart
+//-----------------------------------------------------------------------------
+// Writes to *pResult the point closest to vecTarget that lies no farther than
+// flMaxDist from vecStart: the target itself when it is within range, otherwise the
+// point flMaxDist along the direction from vecStart toward vecTarget.
+inline void ComputeClosestPoint2D( const Vector2D& vecStart, float flMaxDist, const Vector2D& vecTarget, Vector2D *pResult )
+{
+	Vector2D vecToTarget;
+	Vector2DSubtract( vecTarget, vecStart, vecToTarget );
+	const float flTargetDistSqr = vecToTarget.LengthSqr();
+
+	// Target already within range: it is its own closest point.
+	if ( flTargetDistSqr <= flMaxDist * flMaxDist )
+	{
+		*pResult = vecTarget;
+		return;
+	}
+
+	// Out of range: normalize the offset and step flMaxDist along it.
+	vecToTarget /= FastSqrt( flTargetDistSqr );
+	Vector2DMA( vecStart, flMaxDist, vecToTarget, *pResult );
+}
+
+
+
+//-----------------------------------------------------------------------------
+//
+// Slow methods
+//
+//-----------------------------------------------------------------------------
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+//-----------------------------------------------------------------------------
+// Returns a Vector2D with the min or max in X and Y.
+//-----------------------------------------------------------------------------
+
+// Returns a new vector holding the component-wise minimum of *this and vOther.
+inline Vector2D Vector2D::Min(const Vector2D &vOther) const
+{
+	const vec_t flMinX = ( x < vOther.x ) ? x : vOther.x;
+	const vec_t flMinY = ( y < vOther.y ) ? y : vOther.y;
+	return Vector2D( flMinX, flMinY );
+}
+
+// Returns a new vector holding the component-wise maximum of *this and vOther.
+inline Vector2D Vector2D::Max(const Vector2D &vOther) const
+{
+	const vec_t flMaxX = ( x > vOther.x ) ? x : vOther.x;
+	const vec_t flMaxY = ( y > vOther.y ) ? y : vOther.y;
+	return Vector2D( flMaxX, flMaxY );
+}
+
+
+//-----------------------------------------------------------------------------
+// arithmetic operations
+//-----------------------------------------------------------------------------
+
+// Unary minus: returns a new vector with both components negated.
+inline Vector2D Vector2D::operator-(void) const
+{
+	return Vector2D( -x, -y );
+}
+
+// Returns *this + v as a new vector (computed by Vector2DAdd).
+inline Vector2D Vector2D::operator+(const Vector2D& v) const
+{
+	Vector2D vecSum;
+	Vector2DAdd( *this, v, vecSum );
+	return vecSum;
+}
+
+// Returns *this - v as a new vector (computed by Vector2DSubtract).
+inline Vector2D Vector2D::operator-(const Vector2D& v) const
+{
+	Vector2D vecDifference;
+	Vector2DSubtract( *this, v, vecDifference );
+	return vecDifference;
+}
+
+// Returns *this scaled by fl as a new vector (computed by Vector2DMultiply).
+inline Vector2D Vector2D::operator*(float fl) const
+{
+	Vector2D vecScaled;
+	Vector2DMultiply( *this, fl, vecScaled );
+	return vecScaled;
+}
+
+// Returns the component-wise product of *this and v as a new vector.
+inline Vector2D Vector2D::operator*(const Vector2D& v) const
+{
+	Vector2D vecProduct;
+	Vector2DMultiply( *this, v, vecProduct );
+	return vecProduct;
+}
+
+// Returns *this divided by the scalar fl as a new vector (computed by Vector2DDivide,
+// which asserts that fl is non-zero).
+inline Vector2D Vector2D::operator/(float fl) const
+{
+	Vector2D vecQuotient;
+	Vector2DDivide( *this, fl, vecQuotient );
+	return vecQuotient;
+}
+
+// Returns the component-wise quotient of *this and v as a new vector (computed by
+// Vector2DDivide, which asserts that neither component of v is zero).
+inline Vector2D Vector2D::operator/(const Vector2D& v) const
+{
+	Vector2D vecQuotient;
+	Vector2DDivide( *this, v, vecQuotient );
+	return vecQuotient;
+}
+
+// Scalar-on-the-left multiplication; forwards to Vector2D::operator*( float ).
+inline Vector2D operator*(float fl, const Vector2D& v)
+{
+	return v.operator*( fl );
+}
+
+#endif //slow
+
+#endif // VECTOR2D_H
+
--- a/public/mathlib/vector4d.h
+++ b/public/mathlib/vector4d.h
@@ -0,0 +1,771 @@
+//========= Copyright 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef VECTOR4D_H
+#define VECTOR4D_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include <math.h>
+#include <float.h>
+#if !defined( PLATFORM_PPC ) && !defined( _PS3 )
+#include <xmmintrin.h>	// for sse
+#endif
+#include "tier0/basetypes.h"	// For vec_t, put this somewhere else?
+#include "tier0/dbg.h"
+#include "mathlib/math_pfns.h"
+#include "mathlib/vector.h"
+#include "vstdlib/random.h"
+// forward declarations
+class Vector;
+class Vector2D;
+
+//=========================================================
+// 4D Vector4D
+//=========================================================
+
+// Four-component vector of vec_t (x, y, z, w). The copy constructor and copy
+// assignment are private when VECTOR_NO_SLOW_OPERATIONS is defined and public otherwise.
+class Vector4D					
+{
+public:
+	// Members
+	vec_t x, y, z, w;
+
+	// Construction/destruction
+	// The default constructor only initializes the components (to NaN) in _DEBUG builds.
+	Vector4D();
+	Vector4D(vec_t X, vec_t Y, vec_t Z, vec_t W);
+	// Reads pFloat[0] through pFloat[3].
+	explicit Vector4D(const float *pFloat);
+
+	// Initialization
+	void Init(vec_t ix=0.0f, vec_t iy=0.0f, vec_t iz=0.0f, vec_t iw=0.0f);
+	// Takes x, y, z from src and w from iw.
+	void Init( const Vector& src, vec_t iw=0.0f );
+
+	// Got any nasty NAN's? (true when all four components pass IsFinite)
+	bool IsValid() const;
+
+	// array access... (index 0..3 maps to x, y, z, w)
+	vec_t operator[](int i) const;
+	vec_t& operator[](int i);
+
+	// Base address...
+	inline vec_t* Base();
+	inline vec_t const* Base() const;
+
+	// Cast to Vector and Vector2D...
+	// These reinterpret the storage of this object; no copy is made.
+	Vector& AsVector3D();
+	Vector const& AsVector3D() const;
+
+	Vector2D& AsVector2D();
+	Vector2D const& AsVector2D() const;
+
+	// Initialization methods
+	// Sets each component to RandomFloat( minVal, maxVal ).
+	void Random( vec_t minVal, vec_t maxVal );
+
+	// equality (exact, component-wise)
+	bool operator==(const Vector4D& v) const;
+	bool operator!=(const Vector4D& v) const;	
+
+	// arithmetic operations
+	Vector4D&	operator+=(const Vector4D &v);			
+	Vector4D&	operator-=(const Vector4D &v);		
+	Vector4D&	operator*=(const Vector4D &v);			
+	Vector4D&	operator*=(float s);
+	Vector4D&	operator/=(const Vector4D &v);		
+	Vector4D&	operator/=(float s);					
+
+	Vector4D	operator-( void ) const;
+	Vector4D	operator*( float fl ) const;
+	Vector4D	operator/( float fl ) const;
+	Vector4D	operator*( const Vector4D& v ) const;
+	Vector4D	operator+( const Vector4D& v ) const;
+	Vector4D	operator-( const Vector4D& v ) const;
+
+	// negate the Vector4D components
+	void	Negate(); 
+
+	// Get the Vector4D's magnitude.
+	vec_t	Length() const;
+
+	// Get the Vector4D's magnitude squared.
+	vec_t	LengthSqr(void) const;
+
+	// return true if this vector is (0,0,0,0) within tolerance
+	// (each component must lie strictly between -tolerance and +tolerance)
+	bool IsZero( float tolerance = 0.01f ) const
+	{
+		return (x > -tolerance && x < tolerance &&
+			y > -tolerance && y < tolerance &&
+			z > -tolerance && z < tolerance &&
+			w > -tolerance && w < tolerance);
+	}
+
+	// Get the distance from this Vector4D to the other one.
+	vec_t	DistTo(const Vector4D &vOther) const;
+
+	// Get the distance from this Vector4D to the other one squared.
+	vec_t	DistToSqr(const Vector4D &vOther) const;		
+
+	// Copy (writes x, y, z, w to rgfl[0] through rgfl[3])
+	void	CopyToArray(float* rgfl) const;	
+
+	// Multiply, add, and assign to this (ie: *this = a + b * scalar). This
+	// is about 12% faster than the actual Vector4D equation (because it's done per-component
+	// rather than per-Vector4D).
+	void	MulAdd(Vector4D const& a, Vector4D const& b, float scalar);	
+
+	// Dot product.
+	vec_t	Dot(Vector4D const& vOther) const;			
+
+	// No copy constructors allowed if we're in optimal mode
+	// (the access specifier chosen here applies to both declarations that follow)
+#ifdef VECTOR_NO_SLOW_OPERATIONS
+private:
+#else
+public:
+#endif
+	Vector4D(Vector4D const& vOther);
+
+	// No assignment operators either...
+	Vector4D& operator=( Vector4D const& src );
+};
+
+// Shared constants: the zero vector, and a sentinel whose four components are all FLT_MAX.
+const Vector4D vec4_origin( 0.0f, 0.0f, 0.0f, 0.0f );
+const Vector4D vec4_invalid( FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX );
+
+//-----------------------------------------------------------------------------
+// SSE optimized routines
+//-----------------------------------------------------------------------------
+
+// Vector4D variant declared with the ALIGN16 / ALIGN16_POST alignment macros so that
+// its four floats can be accessed as a single __m128.
+class ALIGN16 Vector4DAligned : public Vector4D
+{
+public:
+	// Default constructor adds nothing of its own; the Vector4D base constructor runs.
+	Vector4DAligned(void) {}
+	Vector4DAligned( vec_t X, vec_t Y, vec_t Z, vec_t W );
+
+	inline void Set( vec_t X, vec_t Y, vec_t Z, vec_t W );
+	inline void InitZero( void );
+
+	// Reinterpret the storage starting at x as an SSE register value (no copy is made).
+	inline __m128 &AsM128() { return *(__m128*)&x; }
+	inline const __m128 &AsM128() const { return *(const __m128*)&x; } 
+
+private:
+	// No copy constructors allowed if we're in optimal mode
+	// NOTE(review): declared private here unconditionally; no definition is visible in
+	// this part of the header.
+	Vector4DAligned( Vector4DAligned const& vOther );
+
+	// No assignment operators either...
+	Vector4DAligned& operator=( Vector4DAligned const& src );
+} ALIGN16_POST;
+
+//-----------------------------------------------------------------------------
+// Vector4D related operations
+//-----------------------------------------------------------------------------
+
+// Vector4D clear
+void Vector4DClear( Vector4D& a );
+
+// Copy
+void Vector4DCopy( Vector4D const& src, Vector4D& dst );
+
+// Vector4D arithmetic
+void Vector4DAdd( Vector4D const& a, Vector4D const& b, Vector4D& result );
+void Vector4DSubtract( Vector4D const& a, Vector4D const& b, Vector4D& result );
+void Vector4DMultiply( Vector4D const& a, vec_t b, Vector4D& result );
+void Vector4DMultiply( Vector4D const& a, Vector4D const& b, Vector4D& result );
+void Vector4DDivide( Vector4D const& a, vec_t b, Vector4D& result );
+void Vector4DDivide( Vector4D const& a, Vector4D const& b, Vector4D& result );
+void Vector4DMA( Vector4D const& start, float s, Vector4D const& dir, Vector4D& result );
+
+// Vector4DAligned arithmetic
+void Vector4DMultiplyAligned( Vector4DAligned const& a, vec_t b, Vector4DAligned& result );
+
+
+#define Vector4DExpand( v ) (v).x, (v).y, (v).z, (v).w
+
+// Normalization
+vec_t Vector4DNormalize( Vector4D& v );
+
+// Length
+vec_t Vector4DLength( Vector4D const& v );
+
+// Dot Product
+vec_t DotProduct4D(Vector4D const& a, Vector4D const& b);
+
+// Linearly interpolate between two vectors
+void Vector4DLerp(Vector4D const& src1, Vector4D const& src2, vec_t t, Vector4D& dest );
+
+
+//-----------------------------------------------------------------------------
+//
+// Inlined Vector4D methods
+//
+//-----------------------------------------------------------------------------
+
+
+//-----------------------------------------------------------------------------
+// constructors
+//-----------------------------------------------------------------------------
+
+// Default constructor. Release builds leave the components uninitialized; _DEBUG builds
+// fill them with VEC_T_NAN so that use-before-set trips the IsValid() asserts.
+inline Vector4D::Vector4D()
+{
+#ifdef _DEBUG
+	w = VEC_T_NAN;
+	z = VEC_T_NAN;
+	y = VEC_T_NAN;
+	x = VEC_T_NAN;
+#endif
+}
+
+// Component-wise constructor; asserts that the resulting vector is valid.
+inline Vector4D::Vector4D(vec_t X, vec_t Y, vec_t Z, vec_t W )
+	: x( X ), y( Y ), z( Z ), w( W )
+{
+	Assert( IsValid() );
+}
+
+// Constructs from an array: x, y, z, w = pFloat[0..3].
+// pFloat must be non-null (asserted) and the result must be valid (asserted).
+inline Vector4D::Vector4D(const float *pFloat)
+{
+	Assert( pFloat );
+	x = pFloat[0];
+	y = pFloat[1];
+	z = pFloat[2];
+	w = pFloat[3];
+	Assert( IsValid() );
+}
+
+
+//-----------------------------------------------------------------------------
+// copy constructor
+//-----------------------------------------------------------------------------
+
+// Copy constructor; asserts that the source is valid before copying all four components.
+inline Vector4D::Vector4D(const Vector4D &vOther)
+{
+	Assert( vOther.IsValid() );
+	x = vOther.x;
+	y = vOther.y;
+	z = vOther.z;
+	w = vOther.w;
+}
+
+//-----------------------------------------------------------------------------
+// initialization
+//-----------------------------------------------------------------------------
+// Sets all four components, then asserts that the vector is valid.
+inline void Vector4D::Init( vec_t ix, vec_t iy, vec_t iz, vec_t iw )
+{
+	x = ix;
+	y = iy;
+	z = iz;
+	w = iw;
+	Assert( IsValid() );
+}
+
+// Takes x, y and z from src and w from iw, then asserts that the vector is valid.
+inline void Vector4D::Init( const Vector& src, vec_t iw )
+{
+	x = src.x;
+	y = src.y;
+	z = src.z;
+	w = iw;
+	Assert( IsValid() );
+}
+
+#if !defined(__SPU__)
+// Fills x, y, z, w (in that order) with RandomFloat( minVal, maxVal ).
+// Not compiled when __SPU__ is defined.
+inline void Vector4D::Random( vec_t minVal, vec_t maxVal )
+{
+	// Draw order matters for reproducing the same random sequence: x, y, z, then w.
+	const vec_t flRandX = RandomFloat( minVal , maxVal );
+	const vec_t flRandY = RandomFloat( minVal , maxVal );
+	const vec_t flRandZ = RandomFloat( minVal , maxVal );
+	const vec_t flRandW = RandomFloat( minVal , maxVal );
+	x = flRandX;
+	y = flRandY;
+	z = flRandZ;
+	w = flRandW;
+}
+#endif
+
+// Zeroes all four components of a.
+inline void Vector4DClear( Vector4D& a )
+{
+	a.w = 0.0f;
+	a.z = 0.0f;
+	a.y = 0.0f;
+	a.x = 0.0f;
+}
+
+//-----------------------------------------------------------------------------
+// assignment
+//-----------------------------------------------------------------------------
+
+// Assignment; asserts that the source is valid, copies all four components and
+// returns *this.
+inline Vector4D& Vector4D::operator=(const Vector4D &vOther)
+{
+	Assert( vOther.IsValid() );
+	x = vOther.x;
+	y = vOther.y;
+	z = vOther.z;
+	w = vOther.w;
+	return *this;
+}
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+
+// Mutable component access by index (0..3 maps to x, y, z, w); the index range is asserted.
+inline vec_t& Vector4D::operator[](int i)
+{
+	Assert( (i >= 0) && (i < 4) );
+	vec_t *pComponents = (vec_t*)this;
+	return pComponents[i];
+}
+
+// Read-only component access by index (0..3 maps to x, y, z, w); the index range is asserted.
+// Goes through the const Base() overload so that constness is not cast away.
+inline vec_t Vector4D::operator[](int i) const
+{
+	Assert( (i >= 0) && (i < 4) );
+	return Base()[i];
+}
+
+//-----------------------------------------------------------------------------
+// Cast to Vector and Vector2D...
+//-----------------------------------------------------------------------------
+
+// Reinterprets this object's storage as a Vector; no copy is made.
+// NOTE(review): presumably relies on Vector being laid out as three leading vec_t -- confirm.
+inline Vector& Vector4D::AsVector3D()
+{
+	return *reinterpret_cast< Vector* >( this );
+}
+
+// Const overload: reinterprets this object's storage as a read-only Vector.
+inline Vector const& Vector4D::AsVector3D() const
+{
+	return *reinterpret_cast< Vector const* >( this );
+}
+
+// Reinterprets this object's storage as a Vector2D; no copy is made.
+inline Vector2D& Vector4D::AsVector2D()
+{
+	return *reinterpret_cast< Vector2D* >( this );
+}
+
+// Const overload: reinterprets this object's storage as a read-only Vector2D.
+inline Vector2D const& Vector4D::AsVector2D() const
+{
+	return *reinterpret_cast< Vector2D const* >( this );
+}
+
+//-----------------------------------------------------------------------------
+// Base address...
+//-----------------------------------------------------------------------------
+
+// Returns the address of this object viewed as an array of vec_t (x, y, z, w).
+inline vec_t* Vector4D::Base()
+{
+	return reinterpret_cast< vec_t* >( this );
+}
+
+// Const overload: this object's address viewed as a pointer to const vec_t.
+inline vec_t const* Vector4D::Base() const
+{
+	return reinterpret_cast< const vec_t * >( this );
+}
+
+//-----------------------------------------------------------------------------
+// IsValid?
+//-----------------------------------------------------------------------------
+
+// True only when IsFinite() accepts every component. Components are tested in
+// x, y, z, w order and testing stops at the first failure.
+inline bool Vector4D::IsValid() const
+{
+	if ( !IsFinite(x) )
+		return false;
+	if ( !IsFinite(y) )
+		return false;
+	if ( !IsFinite(z) )
+		return false;
+	if ( !IsFinite(w) )
+		return false;
+	return true;
+}
+
+//-----------------------------------------------------------------------------
+// comparison
+//-----------------------------------------------------------------------------
+
+// Exact (bitwise-value, no epsilon) equality of all four components.
+// Both operands are asserted valid first.
+inline bool Vector4D::operator==( Vector4D const& src ) const
+{
+	Assert( src.IsValid() && IsValid() );
+
+	if ( src.x != x )
+		return false;
+	if ( src.y != y )
+		return false;
+	if ( src.z != z )
+		return false;
+	if ( src.w != w )
+		return false;
+	return true;
+}
+
+// True when at least one component differs exactly (no epsilon).
+// Both operands are asserted valid first.
+inline bool Vector4D::operator!=( Vector4D const& src ) const
+{
+	Assert( src.IsValid() && IsValid() );
+
+	if ( src.x != x )
+		return true;
+	if ( src.y != y )
+		return true;
+	if ( src.z != z )
+		return true;
+	if ( src.w != w )
+		return true;
+	return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Copy
+//-----------------------------------------------------------------------------
+
+// Copies src into dst component by component; src is asserted valid.
+inline void Vector4DCopy( Vector4D const& src, Vector4D& dst )
+{
+	Assert( src.IsValid() );
+	dst.x = src.x;	dst.y = src.y;
+	dst.z = src.z;	dst.w = src.w;
+}
+
+// Writes x, y, z, w into rgfl[0..3]. The vector is asserted valid and rgfl is
+// asserted non-null; the caller must supply room for four floats.
+inline void Vector4D::CopyToArray( float* rgfl ) const
+{
+	Assert( IsValid() );
+	Assert( rgfl );
+
+	rgfl[0] = x;
+	rgfl[1] = y;
+	rgfl[2] = z;
+	rgfl[3] = w;
+}
+
+//-----------------------------------------------------------------------------
+// standard math operations
+//-----------------------------------------------------------------------------
+
+// Flips the sign of every component in place; the vector is asserted valid first.
+inline void Vector4D::Negate()
+{
+	Assert( IsValid() );
+	x = -x;
+	y = -y;
+	z = -z;
+	w = -w;
+}
+
+// In-place component-wise addition; both operands are asserted valid first.
+inline Vector4D& Vector4D::operator+=( const Vector4D& v )
+{
+	Assert( IsValid() && v.IsValid() );
+	x += v.x;
+	y += v.y;
+	z += v.z;
+	w += v.w;
+	return *this;
+}
+
+// In-place component-wise subtraction; both operands are asserted valid first.
+inline Vector4D& Vector4D::operator-=( const Vector4D& v )
+{
+	Assert( IsValid() && v.IsValid() );
+	x -= v.x;
+	y -= v.y;
+	z -= v.z;
+	w -= v.w;
+	return *this;
+}
+
+// In-place uniform scale by fl; the result (not the input) is asserted valid.
+inline Vector4D& Vector4D::operator*=( float fl )
+{
+	x *= fl;	y *= fl;
+	z *= fl;	w *= fl;
+	Assert( IsValid() );
+	return *this;
+}
+
+// In-place component-wise product with v; the result is asserted valid.
+inline Vector4D& Vector4D::operator*=( Vector4D const& v )
+{
+	x *= v.x;	y *= v.y;
+	z *= v.z;	w *= v.w;
+	Assert( IsValid() );
+	return *this;
+}
+
+// Unary minus: returns a new vector with every component negated.
+inline Vector4D Vector4D::operator-( void ) const
+{
+	Vector4D negated( -x, -y, -z, -w );
+	return negated;
+}
+
+// Returns the component-wise sum of this vector and v (via Vector4DAdd).
+inline Vector4D Vector4D::operator+( const Vector4D& v ) const
+{
+	Vector4D sum;
+	Vector4DAdd( *this, v, sum );
+	return sum;
+}
+
+// Returns the component-wise difference this - v (via Vector4DSubtract).
+inline Vector4D Vector4D::operator-( const Vector4D& v ) const
+{
+	Vector4D difference;
+	Vector4DSubtract( *this, v, difference );
+	return difference;
+}
+
+
+// Returns this vector scaled uniformly by fl (via Vector4DMultiply).
+inline Vector4D Vector4D::operator*( float fl ) const
+{
+	Vector4D scaled;
+	Vector4DMultiply( *this, fl, scaled );
+	return scaled;
+}
+
+// Returns the component-wise product of this vector and v (via Vector4DMultiply).
+inline Vector4D Vector4D::operator*( const Vector4D& v ) const
+{
+	Vector4D product;
+	Vector4DMultiply( *this, v, product );
+	return product;
+}
+
+// Returns this vector divided uniformly by fl (via Vector4DDivide).
+inline Vector4D Vector4D::operator/( float fl ) const
+{
+	Vector4D quotient;
+	Vector4DDivide( *this, fl, quotient );
+	return quotient;
+}
+
+// Scalar-on-the-left multiply: yields the same result as v * fl.
+inline Vector4D operator*( float fl, const Vector4D& v )
+{
+	Vector4D scaled;
+	Vector4DMultiply( v, fl, scaled );
+	return scaled;
+}
+
+// In-place uniform divide. fl is asserted non-zero; the division is performed
+// as a multiply by the reciprocal 1/fl, and the result is asserted valid.
+inline Vector4D& Vector4D::operator/=( float fl )
+{
+	Assert( fl != 0.0f );
+
+	const float flReciprocal = 1.0f / fl;
+	x *= flReciprocal;	y *= flReciprocal;
+	z *= flReciprocal;	w *= flReciprocal;
+
+	Assert( IsValid() );
+	return *this;
+}
+
+// In-place component-wise divide. Every component of v is asserted non-zero
+// and the result is asserted valid.
+inline Vector4D& Vector4D::operator/=( Vector4D const& v )
+{
+	Assert( v.x != 0.0f && v.y != 0.0f && v.z != 0.0f && v.w != 0.0f );
+	x /= v.x;	y /= v.y;
+	z /= v.z;	w /= v.w;
+	Assert( IsValid() );
+	return *this;
+}
+
+// c = a + b, component by component. a and b are asserted valid.
+inline void Vector4DAdd( Vector4D const& a, Vector4D const& b, Vector4D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	c.x = a.x + b.x;	c.y = a.y + b.y;
+	c.z = a.z + b.z;	c.w = a.w + b.w;
+}
+
+// c = a - b, component by component. a and b are asserted valid.
+inline void Vector4DSubtract( Vector4D const& a, Vector4D const& b, Vector4D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	c.x = a.x - b.x;	c.y = a.y - b.y;
+	c.z = a.z - b.z;	c.w = a.w - b.w;
+}
+
+// c = a * b for a scalar b. a is asserted valid and b is asserted finite.
+inline void Vector4DMultiply( Vector4D const& a, vec_t b, Vector4D& c )
+{
+	Assert( a.IsValid() && IsFinite(b) );
+	c.x = a.x * b;	c.y = a.y * b;
+	c.z = a.z * b;	c.w = a.w * b;
+}
+
+// c = a * b, component by component. a and b are asserted valid.
+inline void Vector4DMultiply( Vector4D const& a, Vector4D const& b, Vector4D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	c.x = a.x * b.x;	c.y = a.y * b.y;
+	c.z = a.z * b.z;	c.w = a.w * b.w;
+}
+
+// c = a / b for a scalar b, computed as a multiply by the reciprocal 1/b.
+// a is asserted valid and b is asserted non-zero.
+inline void Vector4DDivide( Vector4D const& a, vec_t b, Vector4D& c )
+{
+	Assert( a.IsValid() );
+	Assert( b != 0.0f );
+
+	const vec_t flReciprocal = 1.0f / b;
+	c.x = a.x * flReciprocal;	c.y = a.y * flReciprocal;
+	c.z = a.z * flReciprocal;	c.w = a.w * flReciprocal;
+}
+
+// c = a / b, component by component. a is asserted valid and every component
+// of b is asserted non-zero.
+inline void Vector4DDivide( Vector4D const& a, Vector4D const& b, Vector4D& c )
+{
+	Assert( a.IsValid() );
+	Assert( (b.x != 0.0f) && (b.y != 0.0f) && (b.z != 0.0f) && (b.w != 0.0f) );
+	c.x = a.x / b.x;	c.y = a.y / b.y;
+	c.z = a.z / b.z;	c.w = a.w / b.w;
+}
+
+// Multiply-add: result = start + s * dir, component by component.
+// start and dir are asserted valid and s is asserted finite.
+inline void Vector4DMA( Vector4D const& start, float s, Vector4D const& dir, Vector4D& result )
+{
+	Assert( start.IsValid() && IsFinite(s) && dir.IsValid() );
+	result.x = start.x + s*dir.x;	result.y = start.y + s*dir.y;
+	result.z = start.z + s*dir.z;	result.w = start.w + s*dir.w;
+}
+
+// FIXME: Remove
+// Kept for backwards compatibility.
+// Sets this vector to a + b * scalar, component by component. Unlike the
+// neighbouring helpers, no validity asserts are performed here.
+inline void Vector4D::MulAdd( Vector4D const& a, Vector4D const& b, float scalar )
+{
+	x = a.x + b.x * scalar;	y = a.y + b.y * scalar;
+	z = a.z + b.z * scalar;	w = a.w + b.w * scalar;
+}
+
+// Linear interpolation: dest = src1 + (src2 - src1) * t for each of the four
+// components (t == 0 gives src1). t is not clamped.
+inline void Vector4DLerp( const Vector4D& src1, const Vector4D& src2, vec_t t, Vector4D& dest )
+{
+	for ( int nComponent = 0; nComponent < 4; ++nComponent )
+	{
+		dest[nComponent] = src1[nComponent] + (src2[nComponent] - src1[nComponent]) * t;
+	}
+}
+
+//-----------------------------------------------------------------------------
+// dot, cross
+//-----------------------------------------------------------------------------
+
+// Four-component dot product of a and b; both are asserted valid.
+inline vec_t DotProduct4D( const Vector4D& a, const Vector4D& b )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
+}
+
+// Kept for backwards compatibility: member form of DotProduct4D( *this, vOther ).
+inline vec_t Vector4D::Dot( Vector4D const& vOther ) const
+{
+	const Vector4D &self = *this;
+	return DotProduct4D( self, vOther );
+}
+
+
+//-----------------------------------------------------------------------------
+// length
+//-----------------------------------------------------------------------------
+
+// Length of v: FastSqrt of the sum of squared components. v is asserted valid.
+inline vec_t Vector4DLength( Vector4D const& v )
+{
+	Assert( v.IsValid() );
+	return static_cast< vec_t >( FastSqrt( v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w ) );
+}
+
+// Squared length (sum of squared components); the vector is asserted valid.
+inline vec_t Vector4D::LengthSqr( void ) const
+{
+	Assert( IsValid() );
+	return x*x + y*y + z*z + w*w;
+}
+
+// Member form of Vector4DLength( *this ).
+inline vec_t Vector4D::Length( void ) const
+{
+	const Vector4D &self = *this;
+	return Vector4DLength( self );
+}
+
+
+//-----------------------------------------------------------------------------
+// Normalization
+//-----------------------------------------------------------------------------
+
+// FIXME: Can't use until we're un-macroed in mathlib.h
+// Scales v to unit length in place and returns the length it had beforehand.
+// A vector whose length is exactly zero is set to all zeros rather than divided.
+inline vec_t Vector4DNormalize( Vector4D& v )
+{
+	Assert( v.IsValid() );
+
+	const vec_t flLength = v.Length();
+	if ( flLength == 0.0f )
+	{
+		// Nothing to scale; avoid the divide by zero.
+		v.x = v.y = v.z = v.w = 0.0f;
+		return flLength;
+	}
+
+	v /= flLength;
+	return flLength;
+}
+
+//-----------------------------------------------------------------------------
+// Get the distance from this Vector4D to the other one 
+//-----------------------------------------------------------------------------
+
+// Distance to vOther: the length of ( *this - vOther ).
+inline vec_t Vector4D::DistTo( const Vector4D &vOther ) const
+{
+	Vector4D vOffset;
+	Vector4DSubtract( *this, vOther, vOffset );
+	return vOffset.Length();
+}
+
+// Squared distance to vOther: the squared length of ( *this - vOther ).
+inline vec_t Vector4D::DistToSqr( const Vector4D &vOther ) const
+{
+	Vector4D vOffset;
+	Vector4DSubtract( *this, vOther, vOffset );
+	return vOffset.LengthSqr();
+}
+
+
+//-----------------------------------------------------------------------------
+// Vector4DAligned routines
+//-----------------------------------------------------------------------------
+
+// Constructs from four components; Set() stores them and asserts validity.
+inline Vector4DAligned::Vector4DAligned( vec_t X, vec_t Y, vec_t Z, vec_t W )
+{
+	Set( X, Y, Z, W );
+}
+
+// Stores the four components, then asserts the result is valid.
+inline void Vector4DAligned::Set( vec_t X, vec_t Y, vec_t Z, vec_t W )
+{
+	x = X;
+	y = Y;
+	z = Z;
+	w = W;
+	Assert( IsValid() );
+}
+
+// Zeroes the vector by writing through its AsM128() view in a single
+// assignment. The value written is chosen per platform.
+inline void Vector4DAligned::InitZero( void )
+{ 
+#if !defined( PLATFORM_PPC )
+	// Non-PPC: SSE broadcast of 0.0f to all four lanes.
+	this->AsM128() = _mm_set1_ps( 0.0f );
+#elif defined(_PS3)
+	// PS3: VMX_ZERO is defined elsewhere - presumably an all-zero vector constant (confirm).
+	this->AsM128() =VMX_ZERO;
+#else
+	// Other PPC targets: splat of the integer 0 across the vector.
+	this->AsM128() = __vspltisw( 0 );
+#endif
+	Assert( IsValid() );
+}
+
+// c = a * b, component by component, for aligned vectors.
+// a and b are asserted valid. Non-PPC builds use plain scalar multiplies;
+// PPC builds issue one platform vector-multiply intrinsic on the AsM128() views.
+inline void Vector4DMultiplyAligned( Vector4DAligned const& a, Vector4DAligned const& b, Vector4DAligned& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+#if !defined( PLATFORM_PPC )
+	c.x = a.x * b.x;
+	c.y = a.y * b.y;
+	c.z = a.z * b.z;
+	c.w = a.w * b.w;
+#elif defined(_PS3)
+	c.AsM128() = __vec_mul( a.AsM128(), b.AsM128());
+#else
+	c.AsM128() = __vmulfp( a.AsM128(), b.AsM128() );
+#endif
+}
+
+// Weighted accumulate on two vector pairs at once:
+//     vOutA += vInA * w
+//     vOutB += vInB * w
+// Both inputs are asserted valid and w is asserted finite. vOutA / vOutB are
+// read as well as written, so they must hold meaningful values on entry.
+// Non-PPC builds use scalar arithmetic; PPC builds replicate w into a vector
+// and use the platform multiply-add intrinsic.
+inline void Vector4DWeightMAD( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB )
+{
+	Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) );
+
+#if !defined( PLATFORM_PPC )
+	vOutA.x += vInA.x * w;
+	vOutA.y += vInA.y * w;
+	vOutA.z += vInA.z * w;
+	vOutA.w += vInA.w * w;
+
+	vOutB.x += vInB.x * w;
+	vOutB.y += vInB.y * w;
+	vOutB.z += vInB.z * w;
+	vOutB.w += vInB.w * w;
+#elif defined(_PS3)
+	// The way w is turned into a vector depends on the exact GCC version.
+#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )
+	// GCC 4.1.1
+	__m128 temp=vec_splats(w);
+#else //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1
+	__m128 temp=__m128(w);
+#endif //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1
+
+	vOutA.AsM128() = vec_madd( vInA.AsM128(), temp, vOutA.AsM128() );
+	vOutB.AsM128() = vec_madd( vInB.AsM128(), temp, vOutB.AsM128() );
+#else
+	__vector4 temp;
+
+	// Load w, then splat element 0 across the vector.
+	temp = __lvlx( &w, 0 );
+	temp = __vspltw( temp, 0 );
+
+	vOutA.AsM128() = __vmaddfp( vInA.AsM128(), temp, vOutA.AsM128() );
+	vOutB.AsM128() = __vmaddfp( vInB.AsM128(), temp, vOutB.AsM128() );
+#endif
+}
+
+// Same contract as Vector4DWeightMAD (vOutA += vInA * w, vOutB += vInB * w;
+// inputs asserted valid, w asserted finite), but the non-PPC path uses SSE
+// intrinsics on the AsM128() views instead of scalar arithmetic. The two PPC
+// branches are the same code as in Vector4DWeightMAD.
+inline void Vector4DWeightMADSSE( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB )
+{
+	Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) );
+
+#if !defined( PLATFORM_PPC )
+	// Replicate scalar float out to 4 components
+	__m128 packed = _mm_set1_ps( w );
+
+	// 4D SSE Vector MAD
+	vOutA.AsM128() = _mm_add_ps( vOutA.AsM128(), _mm_mul_ps( vInA.AsM128(), packed ) );
+	vOutB.AsM128() = _mm_add_ps( vOutB.AsM128(), _mm_mul_ps( vInB.AsM128(), packed ) );
+#elif defined(_PS3)
+	// The way w is turned into a vector depends on the exact GCC version.
+#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )
+	// GCC 4.1.1
+	__m128 temp=vec_splats(w);
+#else //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1
+	__m128 temp=__m128(w);
+#endif //__GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 1
+
+	vOutA.AsM128() = vec_madd( vInA.AsM128(), temp, vOutA.AsM128() );
+	vOutB.AsM128() = vec_madd( vInB.AsM128(), temp, vOutB.AsM128() );
+#else
+	__vector4 temp;
+
+	// Load w, then splat element 0 across the vector.
+	temp = __lvlx( &w, 0 );
+	temp = __vspltw( temp, 0 );
+
+	vOutA.AsM128() = __vmaddfp( vInA.AsM128(), temp, vOutA.AsM128() );
+	vOutB.AsM128() = __vmaddfp( vInB.AsM128(), temp, vOutB.AsM128() );
+#endif
+}
+
+#endif // VECTOR4D_H
+
--- a/public/mathlib/vertexcolor.h
+++ b/public/mathlib/vertexcolor.h
@@ -0,0 +1,121 @@
+//==== Copyright © 1996-2008, Valve Corporation, All rights reserved. =======//
+//
+// Purpose: A color format which works on 360 + PC
+//
+//===========================================================================//
+
+#ifndef VERTEXCOLOR_H
+#define VERTEXCOLOR_H
+
+#ifdef COMPILER_MSVC
+#pragma once
+#endif
+
+#include "tier0/platform.h"
+
+
+//-----------------------------------------------------------------------------
+// The challenge here is to make a color struct that works both on the 360
+// and PC, since the 360 is big-endian vs the PC which is little endian.
+//-----------------------------------------------------------------------------
+// Four 8-bit channels (r, g, b, a) that can also be read or written as one
+// uint32 through AsUint32() / AsUint32Ptr().
+struct VertexColor_t
+{
+	// NOTE: This constructor is explicitly here to disallow initializers
+	// with initializer lists i.e. 
+	//     VertexColor_t color = { 255,   0,   0, 255 };
+	// which will totally fail on the 360.
+	// The default constructor leaves all four channels uninitialized.
+	VertexColor_t() {};
+	VertexColor_t( const VertexColor_t &src );
+	VertexColor_t( uint8 ir, uint8 ig, uint8 ib, uint8 ia );
+
+	// assign and copy by using the whole register rather than byte-by-byte copy. 
+	// (No, the compiler is not smart enough to do this for you. /FAcs if you 
+	// don't believe me.)
+	uint32 AsUint32() const; 
+	uint32 *AsUint32Ptr();
+	const uint32 *AsUint32Ptr() const; 
+
+	// assignment
+	VertexColor_t &operator=( const VertexColor_t &src );
+	VertexColor_t &operator=( const color32 &src );
+
+	// comparison (both compare the packed 32-bit value)
+	bool operator==( const VertexColor_t &src ) const;
+	bool operator!=( const VertexColor_t &src ) const;
+
+#ifdef PLATFORM_X360
+	// 360 is big endian, so the channels are declared in the reverse of the
+	// PC order below. NOTE(review): presumably this keeps the packed uint32
+	// value identical on both platforms - confirm.
+	uint8 a, b, g, r;
+#else
+	uint8 r, g, b, a;
+#endif
+};
+
+
+//-----------------------------------------------------------------------------
+// Constructors
+//-----------------------------------------------------------------------------
+// Copy constructor: transfers all four channels with one 32-bit store.
+inline VertexColor_t::VertexColor_t( const VertexColor_t &src )
+{
+	const uint32 nPacked = src.AsUint32();
+	*AsUint32Ptr() = nPacked;
+}
+
+// Constructs a color from individual red, green, blue and alpha channels.
+// The channels are assigned in the body rather than in a mem-initializer list:
+// under PLATFORM_X360 the members are declared as a, b, g, r, so a list written
+// as r, g, b, a would not match declaration order (members are always
+// initialized in declaration order, and compilers warn about the mismatch).
+inline VertexColor_t::VertexColor_t( uint8 ir, uint8 ig, uint8 ib, uint8 ia )
+{
+	r = ir;
+	g = ig;
+	b = ib;
+	a = ia;
+}
+
+
+//-----------------------------------------------------------------------------
+// Cast to int
+//-----------------------------------------------------------------------------
+// Returns the four channels read as a single 32-bit value.
+inline uint32 VertexColor_t::AsUint32() const
+{
+	const uint32 *pPacked = reinterpret_cast<const uint32*>( this );
+	return *pPacked;
+}
+
+// Returns this object's address viewed as a pointer to a 32-bit value.
+inline uint32 *VertexColor_t::AsUint32Ptr()
+{
+	uint32 *pPacked = reinterpret_cast<uint32*>( this );
+	return pPacked;
+}
+
+// Const overload: this object's address viewed as a pointer to a const 32-bit value.
+inline const uint32 *VertexColor_t::AsUint32Ptr() const
+{
+	const uint32 *pPacked = reinterpret_cast<const uint32*>( this );
+	return pPacked;
+}
+
+
+//-----------------------------------------------------------------------------
+// assignment
+//-----------------------------------------------------------------------------
+// Copy assignment: transfers all four channels with one 32-bit store.
+inline VertexColor_t &VertexColor_t::operator=( const VertexColor_t &src )
+{
+	const uint32 nPacked = src.AsUint32();
+	*AsUint32Ptr() = nPacked;
+	return *this;
+}
+
+// Assignment from a color32: copied channel by channel (by name), so the
+// result does not depend on how either type orders its bytes in memory.
+inline VertexColor_t &VertexColor_t::operator=( const color32 &src )
+{
+	r = src.r;	g = src.g;
+	b = src.b;	a = src.a;
+	return *this;
+}
+
+
+//-----------------------------------------------------------------------------
+// comparison
+//-----------------------------------------------------------------------------
+// Equal when the packed 32-bit values match.
+inline bool VertexColor_t::operator==( const VertexColor_t &src ) const
+{
+	const uint32 nMine = AsUint32();
+	const uint32 nTheirs = src.AsUint32();
+	return nMine == nTheirs;
+}
+
+// Different when the packed 32-bit values differ.
+inline bool VertexColor_t::operator!=( const VertexColor_t &src ) const
+{
+	const uint32 nMine = AsUint32();
+	const uint32 nTheirs = src.AsUint32();
+	return nMine != nTheirs;
+}
+
+
+
+#endif // VERTEXCOLOR_H
--- a/public/mathlib/vmatrix.h
+++ b/public/mathlib/vmatrix.h
--- a/public/mathlib/volumeculler.h
+++ b/public/mathlib/volumeculler.h
@@ -0,0 +1,100 @@
+//============ Copyright (c) Valve Corporation, All rights reserved. ============
+
+#ifndef VOLUME_CULLER_H
+#define VOLUME_CULLER_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include "vector.h"
+#include "vplane.h"
+#include "ssemath.h"
+
+// Holds up to three sets of culling planes (base frustum, exclusion frustum,
+// inclusion volume) plus an optional small-object threshold, and tests boxes
+// against them through the CheckBox* methods (defined elsewhere).
+class CVolumeCuller : public CAlignedNewDelete< 16 >
+{
+public:
+	// Clear() resets the flags, plane count and stats but does not touch the
+	// small-object threshold, so give the threshold a defined value here.
+	// Without this, GetSmallObjectCullVolumeThreshold() would read an
+	// uninitialized float until SetCullSmallObjects() is called.
+	inline CVolumeCuller() : m_flSmallObjectCullVolumeThreshold( 0.0f ) { Clear(); }
+
+	// Disables every plane set and small-object culling, and zeroes the stats.
+	// The stored plane data itself is left as-is; only the flags/count change.
+	inline void Clear() { m_nNumInclusionVolumePlanes = 0; m_bHasExclusionFrustum = false; m_bHasBaseFrustum = false; m_bCullSmallObjects = false; ClearCullCheckStats(); }
+
+	// True when an exclusion frustum or at least one inclusion plane is set.
+	// Note that the base frustum alone does not make the culler "valid".
+	inline bool IsValid() const { return m_bHasExclusionFrustum || ( m_nNumInclusionVolumePlanes != 0 ); }
+	
+	// Returns false if box is culled.
+	bool CheckBox( const VectorAligned &mins, const VectorAligned &maxs ) const;
+	bool CheckBox( const Vector &mins, const Vector &maxs ) const;
+	bool CheckBoxCenterHalfDiagonal( const VectorAligned &center, const VectorAligned &halfDiagonal ) const;
+						
+	enum 
+	{ 
+		cNumBaseFrustumPlanes = 6,
+
+		cNumExclusionFrustumPlanes = 6,
+		
+		// cMaxInclusionVolumePlanes must at least have enough room to hold the planes output by CSunLightManager::ComputeCullingVolumePlanes().
+		cMaxInclusionVolumePlanes = 12 
+	};
+
+	// Base frustum
+	inline bool HasBaseFrustum() const { return m_bHasBaseFrustum; }
+	// Specify NULL to disable the base frustum.
+	void SetBaseFrustumPlanes( const VPlane *pPlanes );
+
+	void GetBaseFrustumPlanes( VPlane *pBasePlanes ) const;
+	int GetNumBaseFrustumPlanes() const { return cNumBaseFrustumPlanes; }
+	
+	// Exclusion frustum
+	inline bool HasExclusionFrustum() const { return m_bHasExclusionFrustum; }
+	// Specify NULL to disable the exclusion frustum.
+	void SetExclusionFrustumPlanes( const VPlane *pPlanes );
+	
+	int GetNumExclusionFrustumPlanes() const { return cNumExclusionFrustumPlanes; }
+	const fltx4 *GetExclusionFrustumPlanes() const { return m_ExclusionFrustumPlanes; }
+	
+	// Inclusion volume
+	inline bool HasInclusionVolume() const { return m_nNumInclusionVolumePlanes != 0; }
+	// Specify NULL to disable the inclusion volume.
+	void SetInclusionVolumePlanes( const VPlane *pPlanes, uint nNumPlanes );
+	
+	int GetNumInclusionVolumePlanes() const { return m_nNumInclusionVolumePlanes; }
+	const fltx4 *GetInclusionVolumePlanes() const { return m_InclusionVolumePlanes; }
+
+	// Small-object culling settings. The threshold is 0.0f until
+	// SetCullSmallObjects() stores a value.
+	bool GetCullSmallObjects() const { return m_bCullSmallObjects; }
+	float GetSmallObjectCullVolumeThreshold() const { return m_flSmallObjectCullVolumeThreshold; }
+
+	void SetCullSmallObjects(bool bCullSmallObjects, float flCullVolumeThreshold ) { m_bCullSmallObjects = bCullSmallObjects; m_flSmallObjectCullVolumeThreshold = flCullVolumeThreshold; }
+			
+	// Counters for the two kinds of box test (total performed / total passed).
+	struct CullCheckStats_t
+	{
+		uint m_nTotalAABB;
+		uint m_nTotalAABBPassed;
+
+		uint m_nTotalCenterHalfDiagonal;
+		uint m_nTotalCenterHalfDiagonalPassed;
+	};
+
+	inline void ClearCullCheckStats() { memset( &m_Stats, 0, sizeof( m_Stats ) ); }
+	// Returns a mutable reference even on a const culler; m_Stats is declared mutable.
+	inline CullCheckStats_t &GetStats() const { return m_Stats; }
+				
+private:
+	// Objects which are not inside or touch the base planes are culled.
+	fourplanes_t	m_baseplanes[2];
+
+	// Objects totally within the exclusion frustum are culled (i.e. anything completely inside the exclusion region must be culled).
+	fltx4 m_ExclusionFrustumPlanes[cNumExclusionFrustumPlanes];
+	
+	// Objects totally outside of the inclusion region are culled (i.e. anything touching or inside the inclusion region cannot be culled).
+	fltx4 m_InclusionVolumePlanes[cMaxInclusionVolumePlanes];
+
+	uint m_nNumInclusionVolumePlanes;
+		
+	bool m_bHasBaseFrustum : 1;
+	bool m_bHasExclusionFrustum : 1;
+
+	bool m_bCullSmallObjects : 1;
+	float m_flSmallObjectCullVolumeThreshold;
+
+	mutable CullCheckStats_t m_Stats;
+};
+
+#endif // VOLUME_CULLER_H
--- a/public/mathlib/vplane.h
+++ b/public/mathlib/vplane.h
@@ -0,0 +1,182 @@
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $Workfile:     $
+// $Date:         $
+// $NoKeywords: $
+//=============================================================================//
+
+#ifndef VPLANE_H
+#define VPLANE_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include "mathlib/vector.h"
+
+typedef int SideType;
+
+// Used to represent sides of things like planes.
+#define	SIDE_FRONT	0
+#define	SIDE_BACK	1
+#define	SIDE_ON		2
+
+#define VP_EPSILON	0.01f
+
+
+// A plane stored as a normal and a distance: a point p lies on the plane when
+// p.Dot( m_Normal ) == m_Dist (see DistTo).
+class VPlane
+{
+public:
+				// The default constructor has an empty body and does not assign m_Dist.
+				VPlane();
+				VPlane(const Vector &vNormal, vec_t dist);
+
+	void		Init(const Vector &vNormal, vec_t dist);
+
+	// Return the distance from the point to the plane.
+	// The value is signed: vVec.Dot( m_Normal ) - m_Dist.
+	vec_t		DistTo(const Vector &vVec) const;
+
+	// Copy.
+	VPlane&		operator=(const VPlane &thePlane);
+
+	// Returns SIDE_ON, SIDE_FRONT, or SIDE_BACK.
+	// The epsilon for SIDE_ON can be passed in.
+	SideType	GetPointSide(const Vector &vPoint, vec_t sideEpsilon=VP_EPSILON) const;
+
+	// Returns SIDE_FRONT or SIDE_BACK.
+	// A point exactly on the plane (distance 0) is reported as SIDE_BACK.
+	SideType	GetPointSideExact(const Vector &vPoint) const;
+
+	// Classify the box with respect to the plane.
+	// Returns SIDE_ON, SIDE_FRONT, or SIDE_BACK
+	SideType	BoxOnPlaneSide(const Vector &vMin, const Vector &vMax) const;
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+	// Flip the plane.
+	// Returns a flipped copy (negated normal and distance); this plane is not modified.
+	VPlane		Flip();
+
+	// Get a point on the plane (normal*dist).
+	Vector		GetPointOnPlane() const;
+
+	// Snap the specified point to the plane (along the plane's normal).
+	Vector		SnapPointToPlane(const Vector &vPoint) const;
+#endif
+
+public:
+	Vector		m_Normal;
+	vec_t		m_Dist;
+
+#ifdef VECTOR_NO_SLOW_OPERATIONS
+private:
+	// No copy constructors allowed if we're in optimal mode
+	VPlane(const VPlane& vOther);
+#endif
+};
+
+
+//-----------------------------------------------------------------------------
+// Inlines.
+//-----------------------------------------------------------------------------
+// Default constructor: intentionally does nothing. m_Dist is not assigned
+// here, so call Init() (or assign the members) before using the plane.
+inline VPlane::VPlane() {}
+
+// Constructs a plane from a normal and a distance; Init() performs the same
+// two member assignments.
+inline VPlane::VPlane( const Vector &vNormal, vec_t dist )
+{
+	Init( vNormal, dist );
+}
+
+// Stores the normal and distance as given; the normal is not normalized or checked.
+inline void VPlane::Init( const Vector &vNormal, vec_t dist )
+{
+	m_Normal	= vNormal;
+	m_Dist		= dist;
+}
+
+// Signed distance of vVec from the plane: positive on the side the normal
+// points to. Only a true distance if m_Normal is unit length.
+inline vec_t VPlane::DistTo( const Vector &vVec ) const
+{
+	return ( vVec.Dot( m_Normal ) - m_Dist );
+}
+
+// Copies the normal and distance from thePlane.
+inline VPlane& VPlane::operator=( const VPlane &thePlane )
+{
+	m_Normal	= thePlane.m_Normal;
+	m_Dist		= thePlane.m_Dist;
+
+	return *this;
+}
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Returns a copy of this plane facing the opposite way (normal and distance
+// both negated). This plane itself is left unchanged.
+inline VPlane VPlane::Flip()
+{
+	VPlane flipped( -m_Normal, -m_Dist );
+	return flipped;
+}
+
+// Returns the point m_Normal * m_Dist.
+inline Vector VPlane::GetPointOnPlane() const
+{
+	Vector vOnPlane = m_Normal * m_Dist;
+	return vOnPlane;
+}
+
+// Moves vPoint along the plane normal by its signed distance to the plane and
+// returns the result; vPoint itself is not modified.
+inline Vector VPlane::SnapPointToPlane( const Vector &vPoint ) const
+{
+	const vec_t flSignedDist = DistTo( vPoint );
+	return vPoint - m_Normal * flSignedDist;
+}
+
+#endif
+
+// Classifies vPoint against the plane with a tolerance band:
+//   distance >=  sideEpsilon  -> SIDE_FRONT
+//   distance <= -sideEpsilon  -> SIDE_BACK
+//   anything in between       -> SIDE_ON
+inline SideType VPlane::GetPointSide( const Vector &vPoint, vec_t sideEpsilon ) const
+{
+	const vec_t flSignedDist = DistTo( vPoint );
+
+	if ( flSignedDist >= sideEpsilon )
+		return SIDE_FRONT;
+
+	if ( flSignedDist <= -sideEpsilon )
+		return SIDE_BACK;
+
+	return SIDE_ON;
+}
+
+// Classifies vPoint with no tolerance: strictly positive distance is
+// SIDE_FRONT, everything else (including exactly on the plane) is SIDE_BACK.
+inline SideType VPlane::GetPointSideExact( const Vector &vPoint ) const
+{
+	if ( DistTo( vPoint ) > 0.0f )
+		return SIDE_FRONT;
+
+	return SIDE_BACK;
+}
+
+
+// BUGBUG: This should either simply use the implementation in mathlib or cease to exist.
+// The mathlib implementation is much more efficient. Check that VPlane isn't used in
+// performance critical code.
+//
+// Classifies the axis-aligned box [vMin, vMax] by testing all eight corners with
+// GetPointSideExact(): if every corner lands on the same side that side is
+// returned, otherwise the box straddles the plane and SIDE_ON is returned.
+inline SideType VPlane::BoxOnPlaneSide( const Vector &vMin, const Vector &vMax ) const
+{
+	// The eight corners of the box.
+	TableVector corners[8] = 
+	{
+		{ vMin.x, vMin.y, vMin.z },
+		{ vMin.x, vMin.y, vMax.z },
+		{ vMin.x, vMax.y, vMax.z },
+		{ vMin.x, vMax.y, vMin.z },
+
+		{ vMax.x, vMin.y, vMin.z },
+		{ vMax.x, vMin.y, vMax.z },
+		{ vMax.x, vMax.y, vMax.z },
+		{ vMax.x, vMax.y, vMin.z },
+	};
+
+	// Corner 0 is the reference; any corner that disagrees means the box crosses the plane.
+	const int referenceSide = GetPointSideExact( corners[0] );
+	for ( int nCorner = 1; nCorner < 8; ++nCorner )
+	{
+		const int cornerSide = GetPointSideExact( corners[nCorner] );
+		if ( cornerSide != referenceSide )
+			return SIDE_ON;
+	}
+
+	// All eight corners agree.
+	return referenceSide;
+}
+
+
+
+
+#endif // VPLANE_H