mirror of
https://github.com/id-Software/DOOM-3-BFG.git
synced 2026-03-20 00:49:47 +01:00
Restored generic (non-SIMD) code
This commit is contained in:
@@ -72,6 +72,7 @@ void UnbindBufferObjects() {
|
||||
qglBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, 0 );
|
||||
}
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
|
||||
assert_16_byte_aligned( dst );
|
||||
@@ -109,6 +110,15 @@ void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
|
||||
_mm_sfence();
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/*
========================
CopyBuffer

Portable (non-SSE2) variant: copies numBytes bytes from src to dst with a
plain memcpy. Both pointers are asserted to be 16-byte aligned.
========================
*/
void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
	// alignment checks on destination and source
	assert_16_byte_aligned( dst );
	assert_16_byte_aligned( src );

	// memcpy takes a size_t; spell out the conversion that otherwise happens implicitly
	const size_t byteCount = static_cast< size_t >( numBytes );
	memcpy( dst, src, byteCount );
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
================================================================================================
|
||||
|
||||
@@ -258,7 +258,11 @@ idDxtEncoder::CompressImageDXT1Fast
|
||||
========================
|
||||
*/
|
||||
ID_INLINE void idDxtEncoder::CompressImageDXT1Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
	// Compile-time dispatch: the SSE2 encoder is used only when
	// ID_WIN_X86_SSE2_INTRIN is defined, otherwise the generic one.
#ifndef ID_WIN_X86_SSE2_INTRIN
	CompressImageDXT1Fast_Generic( inBuf, outBuf, width, height );
#else
	CompressImageDXT1Fast_SSE2( inBuf, outBuf, width, height );
#endif
}
|
||||
|
||||
/*
|
||||
@@ -267,7 +271,11 @@ idDxtEncoder::CompressImageDXT1AlphaFast
|
||||
========================
|
||||
*/
|
||||
ID_INLINE void idDxtEncoder::CompressImageDXT1AlphaFast( const byte *inBuf, byte *outBuf, int width, int height ) {
	// Compile-time dispatch: the SSE2 encoder is used only when
	// ID_WIN_X86_SSE2_INTRIN is defined, otherwise the generic one.
#ifndef ID_WIN_X86_SSE2_INTRIN
	CompressImageDXT1AlphaFast_Generic( inBuf, outBuf, width, height );
#else
	CompressImageDXT1AlphaFast_SSE2( inBuf, outBuf, width, height );
#endif
}
|
||||
|
||||
/*
|
||||
@@ -276,7 +284,11 @@ idDxtEncoder::CompressImageDXT5Fast
|
||||
========================
|
||||
*/
|
||||
ID_INLINE void idDxtEncoder::CompressImageDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
	// Compile-time dispatch: the SSE2 encoder is used only when
	// ID_WIN_X86_SSE2_INTRIN is defined, otherwise the generic one.
#ifndef ID_WIN_X86_SSE2_INTRIN
	CompressImageDXT5Fast_Generic( inBuf, outBuf, width, height );
#else
	CompressImageDXT5Fast_SSE2( inBuf, outBuf, width, height );
#endif
}
|
||||
|
||||
/*
|
||||
@@ -294,7 +306,11 @@ idDxtEncoder::CompressYCoCgDXT5Fast
|
||||
========================
|
||||
*/
|
||||
ID_INLINE void idDxtEncoder::CompressYCoCgDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
	// Compile-time dispatch: the SSE2 encoder is used only when
	// ID_WIN_X86_SSE2_INTRIN is defined, otherwise the generic one.
#ifndef ID_WIN_X86_SSE2_INTRIN
	CompressYCoCgDXT5Fast_Generic( inBuf, outBuf, width, height );
#else
	CompressYCoCgDXT5Fast_SSE2( inBuf, outBuf, width, height );
#endif
}
|
||||
|
||||
/*
|
||||
@@ -312,7 +328,11 @@ idDxtEncoder::CompressNormalMapDXT5Fast
|
||||
========================
|
||||
*/
|
||||
ID_INLINE void idDxtEncoder::CompressNormalMapDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
	// Compile-time dispatch: the SSE2 encoder is used only when
	// ID_WIN_X86_SSE2_INTRIN is defined, otherwise the generic one.
#ifndef ID_WIN_X86_SSE2_INTRIN
	CompressNormalMapDXT5Fast_Generic( inBuf, outBuf, width, height );
#else
	CompressNormalMapDXT5Fast_SSE2( inBuf, outBuf, width, height );
#endif
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -52,6 +52,7 @@ idDxtEncoder::NV4XHardwareBugFix
|
||||
========================
|
||||
*/
|
||||
void idDxtEncoder::NV4XHardwareBugFix( byte *minColor, byte *maxColor ) const {
|
||||
#ifdef ID_WIN_X86_ASM
|
||||
int minq = ( ( minColor[0] << 16 ) | ( minColor[1] << 8 ) | minColor[2] ) & 0x00F8FCF8;
|
||||
int maxq = ( ( maxColor[0] << 16 ) | ( maxColor[1] << 8 ) | maxColor[2] ) & 0x00F8FCF8;
|
||||
int mask = -( minq > maxq ) & 0x00FFFFFF;
|
||||
@@ -62,6 +63,13 @@ void idDxtEncoder::NV4XHardwareBugFix( byte *minColor, byte *maxColor ) const {
|
||||
min ^= max;
|
||||
*(int *)minColor = min;
|
||||
*(int *)maxColor = max;
|
||||
#else
|
||||
if ( ColorTo565( minColor ) > ColorTo565( maxColor ) ) {
|
||||
SwapValues( minColor[0], maxColor[0] );
|
||||
SwapValues( minColor[1], maxColor[1] );
|
||||
SwapValues( minColor[2], maxColor[2] );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -950,6 +958,7 @@ int idDxtEncoder::GetMinMaxNormalYHQ( const byte *colorBlock, byte *minColor, by
|
||||
return bestError;
|
||||
}
|
||||
|
||||
#if defined( ID_WIN_X86_ASM )
|
||||
ALIGN16( static float SIMD_SSE2_float_scale[4] ) = { 2.0f / 255.0f, 2.0f / 255.0f, 2.0f / 255.0f, 2.0f / 255.0f };
|
||||
ALIGN16( static float SIMD_SSE2_float_descale[4] ) = { 255.0f / 2.0f, 255.0f / 2.0f, 255.0f / 2.0f, 255.0f / 2.0f };
|
||||
ALIGN16( static float SIMD_SSE2_float_zero[4] ) = { 0.0f, 0.0f, 0.0f, 0.0f };
|
||||
@@ -961,6 +970,7 @@ ALIGN16( static float SIMD_SP_rsqrt_c1[4] ) = { -0.5f, -0.5f, -0.5f, -0.5f };
|
||||
ALIGN16( static dword SIMD_SSE2_dword_maskFirstThree[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
|
||||
ALIGN16( static dword SIMD_SSE2_dword_maskWords[4] ) = { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x00000000 };
|
||||
#define R_SHUFFLE_PS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
|
||||
#endif
|
||||
|
||||
/*
|
||||
========================
|
||||
@@ -968,6 +978,7 @@ NormalDistanceDXT1
|
||||
========================
|
||||
*/
|
||||
int NormalDistanceDXT1( const int *vector, const int *normalized ) {
|
||||
#if defined( ID_WIN_X86_ASM )
|
||||
int result;
|
||||
__asm {
|
||||
mov esi, vector
|
||||
@@ -1007,6 +1018,24 @@ int NormalDistanceDXT1( const int *vector, const int *normalized ) {
|
||||
movd result, xmm0
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
float floatNormal[3];
|
||||
byte intNormal[4];
|
||||
floatNormal[0] = vector[0] * ( 2.0f / 255.0f ) - 1.0f;
|
||||
floatNormal[1] = vector[1] * ( 2.0f / 255.0f ) - 1.0f;
|
||||
floatNormal[2] = vector[2] * ( 2.0f / 255.0f ) - 1.0f;
|
||||
float rcplen = idMath::InvSqrt( floatNormal[0] * floatNormal[0] + floatNormal[1] * floatNormal[1] + floatNormal[2] * floatNormal[2] );
|
||||
floatNormal[0] *= rcplen;
|
||||
floatNormal[1] *= rcplen;
|
||||
floatNormal[2] *= rcplen;
|
||||
intNormal[0] = idMath::Ftob( ( floatNormal[0] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f );
|
||||
intNormal[1] = idMath::Ftob( ( floatNormal[1] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f );
|
||||
intNormal[2] = idMath::Ftob( ( floatNormal[2] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f );
|
||||
int result = ( ( intNormal[ 0 ] - normalized[ 0 ] ) * ( intNormal[ 0 ] - normalized[ 0 ] ) ) +
|
||||
( ( intNormal[ 1 ] - normalized[ 1 ] ) * ( intNormal[ 1 ] - normalized[ 1 ] ) ) +
|
||||
( ( intNormal[ 2 ] - normalized[ 2 ] ) * ( intNormal[ 2 ] - normalized[ 2 ] ) );
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1015,6 +1044,7 @@ NormalDistanceDXT5
|
||||
========================
|
||||
*/
|
||||
int NormalDistanceDXT5( const int *vector, const int *normalized ) {
|
||||
#if defined( ID_WIN_X86_ASM )
|
||||
int result;
|
||||
__asm {
|
||||
mov esi, vector
|
||||
@@ -1064,6 +1094,33 @@ int NormalDistanceDXT5( const int *vector, const int *normalized ) {
|
||||
movd result, xmm0
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
#if 0 // object-space
|
||||
const int c0 = 0;
|
||||
const int c1 = 1;
|
||||
const int c2 = 3;
|
||||
#else
|
||||
const int c0 = 1;
|
||||
const int c1 = 2;
|
||||
const int c2 = 3;
|
||||
#endif
|
||||
float floatNormal[3];
|
||||
byte intNormal[4];
|
||||
floatNormal[0] = vector[c0] / 255.0f * 2.0f - 1.0f;
|
||||
floatNormal[1] = vector[c1] / 255.0f * 2.0f - 1.0f;
|
||||
floatNormal[2] = vector[c2] / 255.0f * 2.0f - 1.0f;
|
||||
float rcplen = idMath::InvSqrt( floatNormal[0] * floatNormal[0] + floatNormal[1] * floatNormal[1] + floatNormal[2] * floatNormal[2] );
|
||||
floatNormal[0] *= rcplen;
|
||||
floatNormal[1] *= rcplen;
|
||||
floatNormal[2] *= rcplen;
|
||||
intNormal[c0] = idMath::Ftob( ( floatNormal[0] + 1.0f ) / 2.0f * 255.0f + 0.5f );
|
||||
intNormal[c1] = idMath::Ftob( ( floatNormal[1] + 1.0f ) / 2.0f * 255.0f + 0.5f );
|
||||
intNormal[c2] = idMath::Ftob( ( floatNormal[2] + 1.0f ) / 2.0f * 255.0f + 0.5f );
|
||||
int result = ( ( intNormal[ c0 ] - normalized[ c0 ] ) * ( intNormal[ c0 ] - normalized[ c0 ] ) ) +
|
||||
( ( intNormal[ c1 ] - normalized[ c1 ] ) * ( intNormal[ c1 ] - normalized[ c1 ] ) ) +
|
||||
( ( intNormal[ c2 ] - normalized[ c2 ] ) * ( intNormal[ c2 ] - normalized[ c2 ] ) );
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -72,6 +72,7 @@ R_MatrixMultiply
|
||||
==========================
|
||||
*/
|
||||
void R_MatrixMultiply( const float a[16], const float b[16], float out[16] ) {
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
__m128 a0 = _mm_loadu_ps( a + 0*4 );
|
||||
__m128 a1 = _mm_loadu_ps( a + 1*4 );
|
||||
@@ -108,6 +109,41 @@ void R_MatrixMultiply( const float a[16], const float b[16], float out[16] ) {
|
||||
_mm_storeu_ps( out + 2*4, t2 );
|
||||
_mm_storeu_ps( out + 3*4, t3 );
|
||||
|
||||
#else
|
||||
|
||||
/*
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
for ( int j = 0; j < 4; j++ ) {
|
||||
out[ i * 4 + j ] =
|
||||
a[ i * 4 + 0 ] * b[ 0 * 4 + j ] +
|
||||
a[ i * 4 + 1 ] * b[ 1 * 4 + j ] +
|
||||
a[ i * 4 + 2 ] * b[ 2 * 4 + j ] +
|
||||
a[ i * 4 + 3 ] * b[ 3 * 4 + j ];
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
out[0*4+0] = a[0*4+0]*b[0*4+0] + a[0*4+1]*b[1*4+0] + a[0*4+2]*b[2*4+0] + a[0*4+3]*b[3*4+0];
|
||||
out[0*4+1] = a[0*4+0]*b[0*4+1] + a[0*4+1]*b[1*4+1] + a[0*4+2]*b[2*4+1] + a[0*4+3]*b[3*4+1];
|
||||
out[0*4+2] = a[0*4+0]*b[0*4+2] + a[0*4+1]*b[1*4+2] + a[0*4+2]*b[2*4+2] + a[0*4+3]*b[3*4+2];
|
||||
out[0*4+3] = a[0*4+0]*b[0*4+3] + a[0*4+1]*b[1*4+3] + a[0*4+2]*b[2*4+3] + a[0*4+3]*b[3*4+3];
|
||||
|
||||
out[1*4+0] = a[1*4+0]*b[0*4+0] + a[1*4+1]*b[1*4+0] + a[1*4+2]*b[2*4+0] + a[1*4+3]*b[3*4+0];
|
||||
out[1*4+1] = a[1*4+0]*b[0*4+1] + a[1*4+1]*b[1*4+1] + a[1*4+2]*b[2*4+1] + a[1*4+3]*b[3*4+1];
|
||||
out[1*4+2] = a[1*4+0]*b[0*4+2] + a[1*4+1]*b[1*4+2] + a[1*4+2]*b[2*4+2] + a[1*4+3]*b[3*4+2];
|
||||
out[1*4+3] = a[1*4+0]*b[0*4+3] + a[1*4+1]*b[1*4+3] + a[1*4+2]*b[2*4+3] + a[1*4+3]*b[3*4+3];
|
||||
|
||||
out[2*4+0] = a[2*4+0]*b[0*4+0] + a[2*4+1]*b[1*4+0] + a[2*4+2]*b[2*4+0] + a[2*4+3]*b[3*4+0];
|
||||
out[2*4+1] = a[2*4+0]*b[0*4+1] + a[2*4+1]*b[1*4+1] + a[2*4+2]*b[2*4+1] + a[2*4+3]*b[3*4+1];
|
||||
out[2*4+2] = a[2*4+0]*b[0*4+2] + a[2*4+1]*b[1*4+2] + a[2*4+2]*b[2*4+2] + a[2*4+3]*b[3*4+2];
|
||||
out[2*4+3] = a[2*4+0]*b[0*4+3] + a[2*4+1]*b[1*4+3] + a[2*4+2]*b[2*4+3] + a[2*4+3]*b[3*4+3];
|
||||
|
||||
out[3*4+0] = a[3*4+0]*b[0*4+0] + a[3*4+1]*b[1*4+0] + a[3*4+2]*b[2*4+0] + a[3*4+3]*b[3*4+0];
|
||||
out[3*4+1] = a[3*4+0]*b[0*4+1] + a[3*4+1]*b[1*4+1] + a[3*4+2]*b[2*4+1] + a[3*4+3]*b[3*4+1];
|
||||
out[3*4+2] = a[3*4+0]*b[0*4+2] + a[3*4+1]*b[1*4+2] + a[3*4+2]*b[2*4+2] + a[3*4+3]*b[3*4+2];
|
||||
out[3*4+3] = a[3*4+0]*b[0*4+3] + a[3*4+1]*b[1*4+3] + a[3*4+2]*b[2*4+3] + a[3*4+3]*b[3*4+3];
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -274,6 +274,7 @@ static void R_DecalPointCullStatic( byte * cullBits, const idPlane * planes, con
|
||||
assert_16_byte_aligned( cullBits );
|
||||
assert_16_byte_aligned( verts );
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||
|
||||
@@ -376,6 +377,37 @@ static void R_DecalPointCullStatic( byte * cullBits, const idPlane * planes, con
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
for ( int i = 0; i < numVerts; ) {
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for ( ; i <= nextNumVerts; i++ ) {
|
||||
const idVec3 & v = vertsODS[i].xyz;
|
||||
|
||||
const float d0 = planes[0].Distance( v );
|
||||
const float d1 = planes[1].Distance( v );
|
||||
const float d2 = planes[2].Distance( v );
|
||||
const float d3 = planes[3].Distance( v );
|
||||
const float d4 = planes[4].Distance( v );
|
||||
const float d5 = planes[5].Distance( v );
|
||||
|
||||
byte bits;
|
||||
bits = IEEE_FLT_SIGNBITNOTSET( d0 ) << 0;
|
||||
bits |= IEEE_FLT_SIGNBITNOTSET( d1 ) << 1;
|
||||
bits |= IEEE_FLT_SIGNBITNOTSET( d2 ) << 2;
|
||||
bits |= IEEE_FLT_SIGNBITNOTSET( d3 ) << 3;
|
||||
bits |= IEEE_FLT_SIGNBITNOTSET( d4 ) << 4;
|
||||
bits |= IEEE_FLT_SIGNBITNOTSET( d5 ) << 5;
|
||||
|
||||
cullBits[i] = bits;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -573,6 +605,7 @@ static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * i
|
||||
assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
|
||||
assert_16_byte_aligned( fadeColor );
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
|
||||
const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
|
||||
@@ -612,6 +645,25 @@ static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * i
|
||||
|
||||
_mm_sfence();
|
||||
|
||||
#else
|
||||
|
||||
// copy vertices and apply depth/time based fading
|
||||
for ( int i = 0; i < decal->numVerts; i++ ) {
|
||||
// NOTE: bad out-of-order write-combined write, SIMD code does the right thing
|
||||
verts[numVerts + i] = decal->verts[i];
|
||||
for ( int j = 0; j < 4; j++ ) {
|
||||
verts[numVerts + i].color[j] = idMath::Ftob( fadeColor[j] * decal->vertDepthFade[i] );
|
||||
}
|
||||
}
|
||||
|
||||
// copy indices
|
||||
assert( ( decal->numIndexes & 1 ) == 0 );
|
||||
for ( int i = 0; i < decal->numIndexes; i += 2 ) {
|
||||
assert( decal->indexes[i + 0] < decal->numVerts && decal->indexes[i + 1] < decal->numVerts );
|
||||
WriteIndexPair( &indexes[numIndexes + i], numVerts + decal->indexes[i + 0], numVerts + decal->indexes[i + 1] );
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -102,6 +102,7 @@ static void R_OverlayPointCullStatic( byte * cullBits, halfFloat_t * texCoordS,
|
||||
assert_16_byte_aligned( texCoordT );
|
||||
assert_16_byte_aligned( verts );
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||
|
||||
@@ -176,6 +177,39 @@ static void R_OverlayPointCullStatic( byte * cullBits, halfFloat_t * texCoordS,
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
for ( int i = 0; i < numVerts; ) {
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for ( ; i <= nextNumVerts; i++ ) {
|
||||
const idVec3 & v = vertsODS[i].xyz;
|
||||
|
||||
const float d0 = planes[0].Distance( v );
|
||||
const float d1 = planes[1].Distance( v );
|
||||
const float d2 = 1.0f - d0;
|
||||
const float d3 = 1.0f - d1;
|
||||
|
||||
halfFloat_t s = Scalar_FastF32toF16( d0 );
|
||||
halfFloat_t t = Scalar_FastF32toF16( d1 );
|
||||
|
||||
texCoordS[i] = s;
|
||||
texCoordT[i] = t;
|
||||
|
||||
byte bits;
|
||||
bits = IEEE_FLT_SIGNBITSET( d0 ) << 0;
|
||||
bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
|
||||
bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
|
||||
bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
|
||||
|
||||
cullBits[i] = bits;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -189,6 +223,7 @@ static void R_OverlayPointCullSkinned( byte * cullBits, halfFloat_t * texCoordS,
|
||||
assert_16_byte_aligned( texCoordT );
|
||||
assert_16_byte_aligned( verts );
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||
|
||||
@@ -263,6 +298,39 @@ static void R_OverlayPointCullSkinned( byte * cullBits, halfFloat_t * texCoordS,
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
for ( int i = 0; i < numVerts; ) {
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for ( ; i <= nextNumVerts; i++ ) {
|
||||
const idVec3 transformed = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
|
||||
|
||||
const float d0 = planes[0].Distance( transformed );
|
||||
const float d1 = planes[1].Distance( transformed );
|
||||
const float d2 = 1.0f - d0;
|
||||
const float d3 = 1.0f - d1;
|
||||
|
||||
halfFloat_t s = Scalar_FastF32toF16( d0 );
|
||||
halfFloat_t t = Scalar_FastF32toF16( d1 );
|
||||
|
||||
texCoordS[i] = s;
|
||||
texCoordT[i] = t;
|
||||
|
||||
byte bits;
|
||||
bits = IEEE_FLT_SIGNBITSET( d0 ) << 0;
|
||||
bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
|
||||
bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
|
||||
bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
|
||||
|
||||
cullBits[i] = bits;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -446,6 +514,7 @@ static void R_CopyOverlaySurface( idDrawVert * verts, int numVerts, triIndex_t *
|
||||
assert( ( ( overlay->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
|
||||
assert( ( ( overlay->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
const __m128i vector_int_clear_last = _mm_set_epi32( 0, -1, -1, -1 );
|
||||
const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
|
||||
@@ -482,6 +551,25 @@ static void R_CopyOverlaySurface( idDrawVert * verts, int numVerts, triIndex_t *
|
||||
|
||||
_mm_sfence();
|
||||
|
||||
#else
|
||||
|
||||
// copy vertices
|
||||
for ( int i = 0; i < overlay->numVerts; i++ ) {
|
||||
const overlayVertex_t &overlayVert = overlay->verts[i];
|
||||
|
||||
// NOTE: bad out-of-order write-combined write, SIMD code does the right thing
|
||||
verts[numVerts + i] = sourceVerts[overlayVert.vertexNum];
|
||||
verts[numVerts + i].st[0] = overlayVert.st[0];
|
||||
verts[numVerts + i].st[1] = overlayVert.st[1];
|
||||
}
|
||||
|
||||
// copy indexes
|
||||
for ( int i = 0; i < overlay->numIndexes; i += 2 ) {
|
||||
assert( overlay->indexes[i + 0] < overlay->numVerts && overlay->indexes[i + 1] < overlay->numVerts );
|
||||
WriteIndexPair( &indexes[numIndexes + i], numVerts + overlay->indexes[i + 0], numVerts + overlay->indexes[i + 1] );
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -32,10 +32,12 @@ If you have questions concerning this license or the applicable additional terms
|
||||
#include "tr_local.h"
|
||||
#include "Model_local.h"
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
static const __m128 vector_float_posInfinity = { idMath::INFINITY, idMath::INFINITY, idMath::INFINITY, idMath::INFINITY };
|
||||
static const __m128 vector_float_negInfinity = { -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY };
|
||||
|
||||
#endif
|
||||
|
||||
static const char *MD5_SnapshotName = "_MD5_Snapshot_";
|
||||
|
||||
@@ -501,6 +503,7 @@ idMD5Mesh::CalculateBounds
|
||||
====================
|
||||
*/
|
||||
void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds ) const {
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
__m128 minX = vector_float_posInfinity;
|
||||
__m128 minY = vector_float_posInfinity;
|
||||
@@ -534,6 +537,16 @@ void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds
|
||||
_mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) );
|
||||
_mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) );
|
||||
|
||||
#else
|
||||
|
||||
bounds.Clear();
|
||||
for ( int i = 0; i < numMeshJoints; i++ ) {
|
||||
const idJointMat & joint = entJoints[meshJoints[i]];
|
||||
bounds.AddPoint( joint.GetTranslation() );
|
||||
}
|
||||
bounds.ExpandSelf( maxJointVertDist );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1085,6 +1098,7 @@ static void TransformJoints( idJointMat *__restrict outJoints, const int numJoin
|
||||
assert_16_byte_aligned( inFloats1 );
|
||||
assert_16_byte_aligned( inFloats2 );
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
const __m128 mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) );
|
||||
|
||||
@@ -1160,6 +1174,13 @@ static void TransformJoints( idJointMat *__restrict outJoints, const int numJoin
|
||||
_mm_store_ps( outFloats + 1 * 12 + 8, ri1 );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for ( int i = 0; i < numJoints; i++ ) {
|
||||
idJointMat::Multiply( outJoints[i], inJoints1[i], inJoints2[i] );
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -87,6 +87,7 @@ static void R_ShadowVolumeCullBits( byte *cullBits, byte &totalOr, const float r
|
||||
assert_16_byte_aligned( cullBits );
|
||||
assert_16_byte_aligned( verts );
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||
|
||||
@@ -208,6 +209,54 @@ static void R_ShadowVolumeCullBits( byte *cullBits, byte &totalOr, const float r
|
||||
|
||||
totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte );
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
byte tOr = 0;
|
||||
for ( int i = 0; i < numVerts; ) {
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for ( ; i <= nextNumVerts; i++ ) {
|
||||
const idVec3 & v = vertsODS[i].xyzw.ToVec3();
|
||||
|
||||
const float d0 = planes[0].Distance( v );
|
||||
const float d1 = planes[1].Distance( v );
|
||||
const float d2 = planes[2].Distance( v );
|
||||
const float d3 = planes[3].Distance( v );
|
||||
|
||||
const float t0 = d0 + radius;
|
||||
const float t1 = d1 + radius;
|
||||
const float t2 = d2 + radius;
|
||||
const float t3 = d3 + radius;
|
||||
|
||||
const float s0 = d0 - radius;
|
||||
const float s1 = d1 - radius;
|
||||
const float s2 = d2 - radius;
|
||||
const float s3 = d3 - radius;
|
||||
|
||||
byte bits;
|
||||
bits = IEEE_FLT_SIGNBITSET( t0 ) << 0;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
|
||||
|
||||
bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
|
||||
|
||||
bits ^= 0x0F; // flip lower four bits
|
||||
|
||||
tOr |= bits;
|
||||
cullBits[i] = bits;
|
||||
}
|
||||
}
|
||||
|
||||
totalOr = tOr;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -31,6 +31,7 @@ If you have questions concerning this license or the applicable additional terms
|
||||
#include "../../../idlib/sys/sys_intrinsics.h"
|
||||
#include "../../../idlib/geometry/DrawVert_intrinsics.h"
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
static const __m128i vector_int_neg_one = _mm_set_epi32( -1, -1, -1, -1 );
|
||||
|
||||
@@ -126,6 +127,69 @@ static __forceinline __m128i TriangleCulled_SSE2( const __m128 & vert0X, const _
|
||||
return _mm_castps_si128( _mm_cmpeq_ps( b0, zero ) );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/*
|
||||
=====================
|
||||
TriangleFacing
|
||||
|
||||
Returns 255 if the triangle is facing the light origin, otherwise returns 0.
|
||||
=====================
|
||||
*/
|
||||
static byte TriangleFacing_Generic( const idVec3 & v1, const idVec3 & v2, const idVec3 & v3, const idVec3 & lightOrigin ) {
|
||||
const float sx = v2.x - v1.x;
|
||||
const float sy = v2.y - v1.y;
|
||||
const float sz = v2.z - v1.z;
|
||||
|
||||
const float tx = v3.x - v1.x;
|
||||
const float ty = v3.y - v1.y;
|
||||
const float tz = v3.z - v1.z;
|
||||
|
||||
const float normalX = ty * sz - tz * sy;
|
||||
const float normalY = tz * sx - tx * sz;
|
||||
const float normalZ = tx * sy - ty * sx;
|
||||
const float normalW = normalX * v1.x + normalY * v1.y + normalZ * v1.z;
|
||||
|
||||
const float d = lightOrigin.x * normalX + lightOrigin.y * normalY + lightOrigin.z * normalZ - normalW;
|
||||
return ( d > 0.0f ) ? 255 : 0;
|
||||
}
|
||||
|
||||
/*
|
||||
=====================
|
||||
TriangleCulled
|
||||
|
||||
Returns 255 if the triangle is culled to the light projection matrix, otherwise returns 0.
|
||||
The clip space of the 'lightProject' is assumed to be in the range [0, 1].
|
||||
=====================
|
||||
*/
|
||||
static byte TriangleCulled_Generic( const idVec3 & v1, const idVec3 & v2, const idVec3 & v3, const idRenderMatrix & lightProject ) {
|
||||
// transform the triangle
|
||||
idVec4 c[3];
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
c[0][i] = v1[0] * lightProject[i][0] + v1[1] * lightProject[i][1] + v1[2] * lightProject[i][2] + lightProject[i][3];
|
||||
c[1][i] = v2[0] * lightProject[i][0] + v2[1] * lightProject[i][1] + v2[2] * lightProject[i][2] + lightProject[i][3];
|
||||
c[2][i] = v3[0] * lightProject[i][0] + v3[1] * lightProject[i][1] + v3[2] * lightProject[i][2] + lightProject[i][3];
|
||||
}
|
||||
|
||||
// calculate the culled bits
|
||||
int bits = 0;
|
||||
for ( int i = 0; i < 3; i++ ) {
|
||||
const float minW = 0.0f;
|
||||
const float maxW = c[i][3];
|
||||
|
||||
if ( c[i][0] > minW ) { bits |= ( 1 << 0 ); }
|
||||
if ( c[i][0] < maxW ) { bits |= ( 1 << 1 ); }
|
||||
if ( c[i][1] > minW ) { bits |= ( 1 << 2 ); }
|
||||
if ( c[i][1] < maxW ) { bits |= ( 1 << 3 ); }
|
||||
if ( c[i][2] > minW ) { bits |= ( 1 << 4 ); }
|
||||
if ( c[i][2] < maxW ) { bits |= ( 1 << 5 ); }
|
||||
}
|
||||
|
||||
// if any bits weren't set, the triangle is completely off one side of the frustum
|
||||
return ( bits != 63 ) ? 255 : 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
=====================
|
||||
@@ -155,6 +219,7 @@ static int CalculateTriangleFacingCulledStatic( byte * __restrict facing, byte *
|
||||
const idVec3 lineDir = lineDelta * lineLengthRcp;
|
||||
const float lineLength = lineLengthSqr * lineLengthRcp;
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 4 * 3 > indexedVertsODS( verts, numVerts, indexes, numIndexes );
|
||||
|
||||
@@ -261,6 +326,55 @@ static int CalculateTriangleFacingCulledStatic( byte * __restrict facing, byte *
|
||||
|
||||
return _mm_cvtsi128_si32( numFrontFacing );
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 1 > indexedVertsODS( verts, numVerts, indexes, numIndexes );
|
||||
|
||||
const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0;
|
||||
|
||||
int numFrontFacing = 0;
|
||||
|
||||
for ( int i = 0, j = 0; i < numIndexes; ) {
|
||||
|
||||
const int batchStart = i;
|
||||
const int batchEnd = indexedVertsODS.FetchNextBatch();
|
||||
const int indexStart = j;
|
||||
|
||||
for ( ; i <= batchEnd - 3; i += 3, j++ ) {
|
||||
const idVec3 & v1 = indexedVertsODS[i + 0].xyz;
|
||||
const idVec3 & v2 = indexedVertsODS[i + 1].xyz;
|
||||
const idVec3 & v3 = indexedVertsODS[i + 2].xyz;
|
||||
|
||||
const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject );
|
||||
|
||||
byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin );
|
||||
|
||||
// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
|
||||
triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask );
|
||||
|
||||
culled[j] = triangleCulled;
|
||||
facing[j] = triangleFacing;
|
||||
|
||||
// count the number of facing triangles
|
||||
numFrontFacing += ( triangleFacing & 1 );
|
||||
}
|
||||
|
||||
if ( insideShadowVolume != NULL ) {
|
||||
for ( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) {
|
||||
if ( !facing[n] ) {
|
||||
if ( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, indexedVertsODS[k + 2].xyz, indexedVertsODS[k + 1].xyz, indexedVertsODS[k + 0].xyz ) ) {
|
||||
*insideShadowVolume = true;
|
||||
insideShadowVolume = NULL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return numFrontFacing;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -291,6 +405,7 @@ static int CalculateTriangleFacingCulledSkinned( byte * __restrict facing, byte
|
||||
const idVec3 lineDir = lineDelta * lineLengthRcp;
|
||||
const float lineLength = lineLengthSqr * lineLengthRcp;
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
@@ -428,6 +543,74 @@ static int CalculateTriangleFacingCulledSkinned( byte * __restrict facing, byte
|
||||
|
||||
return _mm_cvtsi128_si32( numFrontFacing );
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
for ( int i = 0; i < numVerts; ) {
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for ( ; i <= nextNumVerts; i++ ) {
|
||||
tempVerts[i].ToVec3() = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
|
||||
tempVerts[i].w = 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
idODSStreamedArray< triIndex_t, 256, SBT_QUAD, 1 > indexesODS( indexes, numIndexes );
|
||||
|
||||
const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0;
|
||||
|
||||
int numFrontFacing = 0;
|
||||
|
||||
for ( int i = 0, j = 0; i < numIndexes; ) {
|
||||
|
||||
const int batchStart = i;
|
||||
const int batchEnd = indexesODS.FetchNextBatch();
|
||||
const int indexStart = j;
|
||||
|
||||
for ( ; i <= batchEnd - 3; i += 3, j++ ) {
|
||||
const int i0 = indexesODS[i + 0];
|
||||
const int i1 = indexesODS[i + 1];
|
||||
const int i2 = indexesODS[i + 2];
|
||||
|
||||
const idVec3 & v1 = tempVerts[i0].ToVec3();
|
||||
const idVec3 & v2 = tempVerts[i1].ToVec3();
|
||||
const idVec3 & v3 = tempVerts[i2].ToVec3();
|
||||
|
||||
const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject );
|
||||
|
||||
byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin );
|
||||
|
||||
// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
|
||||
triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask );
|
||||
|
||||
culled[j] = triangleCulled;
|
||||
facing[j] = triangleFacing;
|
||||
|
||||
// count the number of facing triangles
|
||||
numFrontFacing += ( triangleFacing & 1 );
|
||||
}
|
||||
|
||||
if ( insideShadowVolume != NULL ) {
|
||||
for ( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) {
|
||||
if ( !facing[n] ) {
|
||||
const int i0 = indexesODS[k + 0];
|
||||
const int i1 = indexesODS[k + 1];
|
||||
const int i2 = indexesODS[k + 2];
|
||||
if ( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, tempVerts[i2].ToVec3(), tempVerts[i1].ToVec3(), tempVerts[i0].ToVec3() ) ) {
|
||||
*insideShadowVolume = true;
|
||||
insideShadowVolume = NULL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return numFrontFacing;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -440,6 +623,7 @@ static void StreamOut( void * dst, const void * src, int numBytes ) {
|
||||
assert_16_byte_aligned( dst );
|
||||
assert_16_byte_aligned( src );
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
int i = 0;
|
||||
for ( ; i + 128 <= numBytes; i += 128 ) {
|
||||
__m128i d0 = _mm_load_si128( (const __m128i *)( (byte *)src + i + 0*16 ) );
|
||||
@@ -463,6 +647,9 @@ static void StreamOut( void * dst, const void * src, int numBytes ) {
|
||||
__m128i d = _mm_load_si128( (__m128i *)( (byte *)src + i ) );
|
||||
_mm_stream_si128( (__m128i *)( (byte *)dst + i ), d );
|
||||
}
|
||||
#else
|
||||
memcpy( dst, src, numBytes );
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -671,7 +858,9 @@ static void R_CreateShadowVolumeTriangles( triIndex_t *__restrict shadowIndices,
|
||||
|
||||
numShadowIndexesTotal = numShadowIndices;
|
||||
|
||||
#if defined( ID_WIN_X86_SSE2_INTRIN )
|
||||
_mm_sfence();
|
||||
#endif
|
||||
|
||||
#else // NOTE: this code will not work on the SPU because it tries to write directly to the destination
|
||||
|
||||
@@ -844,7 +1033,9 @@ void R_CreateLightTriangles( triIndex_t * __restrict lightIndices, triIndex_t *
|
||||
|
||||
numLightIndicesTotal = numLightIndices;
|
||||
|
||||
#if defined( ID_WIN_X86_SSE2_INTRIN )
|
||||
_mm_sfence();
|
||||
#endif
|
||||
|
||||
#else // NOTE: this code will not work on the SPU because it tries to write directly to the destination
|
||||
|
||||
|
||||
@@ -43,6 +43,7 @@ static void R_TracePointCullStatic( byte *cullBits, byte &totalOr, const float r
|
||||
assert_16_byte_aligned( cullBits );
|
||||
assert_16_byte_aligned( verts );
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||
|
||||
@@ -164,6 +165,54 @@ static void R_TracePointCullStatic( byte *cullBits, byte &totalOr, const float r
|
||||
|
||||
totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte );
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
byte tOr = 0;
|
||||
for ( int i = 0; i < numVerts; ) {
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for ( ; i <= nextNumVerts; i++ ) {
|
||||
const idVec3 & v = vertsODS[i].xyz;
|
||||
|
||||
const float d0 = planes[0].Distance( v );
|
||||
const float d1 = planes[1].Distance( v );
|
||||
const float d2 = planes[2].Distance( v );
|
||||
const float d3 = planes[3].Distance( v );
|
||||
|
||||
const float t0 = d0 + radius;
|
||||
const float t1 = d1 + radius;
|
||||
const float t2 = d2 + radius;
|
||||
const float t3 = d3 + radius;
|
||||
|
||||
const float s0 = d0 - radius;
|
||||
const float s1 = d1 - radius;
|
||||
const float s2 = d2 - radius;
|
||||
const float s3 = d3 - radius;
|
||||
|
||||
byte bits;
|
||||
bits = IEEE_FLT_SIGNBITSET( t0 ) << 0;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
|
||||
|
||||
bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
|
||||
|
||||
bits ^= 0x0F; // flip lower four bits
|
||||
|
||||
tOr |= bits;
|
||||
cullBits[i] = bits;
|
||||
}
|
||||
}
|
||||
|
||||
totalOr = tOr;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -175,6 +224,7 @@ static void R_TracePointCullSkinned( byte *cullBits, byte &totalOr, const float
|
||||
assert_16_byte_aligned( cullBits );
|
||||
assert_16_byte_aligned( verts );
|
||||
|
||||
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||
|
||||
@@ -296,6 +346,54 @@ static void R_TracePointCullSkinned( byte *cullBits, byte &totalOr, const float
|
||||
|
||||
totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte );
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
byte tOr = 0;
|
||||
for ( int i = 0; i < numVerts; ) {
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for ( ; i <= nextNumVerts; i++ ) {
|
||||
const idVec3 v = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
|
||||
|
||||
const float d0 = planes[0].Distance( v );
|
||||
const float d1 = planes[1].Distance( v );
|
||||
const float d2 = planes[2].Distance( v );
|
||||
const float d3 = planes[3].Distance( v );
|
||||
|
||||
const float t0 = d0 + radius;
|
||||
const float t1 = d1 + radius;
|
||||
const float t2 = d2 + radius;
|
||||
const float t3 = d3 + radius;
|
||||
|
||||
const float s0 = d0 - radius;
|
||||
const float s1 = d1 - radius;
|
||||
const float s2 = d2 - radius;
|
||||
const float s3 = d3 - radius;
|
||||
|
||||
byte bits;
|
||||
bits = IEEE_FLT_SIGNBITSET( t0 ) << 0;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
|
||||
|
||||
bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
|
||||
|
||||
bits ^= 0x0F; // flip lower four bits
|
||||
|
||||
tOr |= bits;
|
||||
cullBits[i] = bits;
|
||||
}
|
||||
}
|
||||
|
||||
totalOr = tOr;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user