Restored generic (non-SIMD) code

2026-03-20 09:00:25 +01:00 · 2013-05-29 13:12:13 -05:00
parent be311f42e1
commit 9c37079c16
23 changed files with 3328 additions and 24 deletions
--- a/neo/renderer/ModelOverlay.cpp
+++ b/neo/renderer/ModelOverlay.cpp
@@ -102,6 +102,7 @@ static void R_OverlayPointCullStatic( byte * cullBits, halfFloat_t * texCoordS,
 	assert_16_byte_aligned( texCoordT );
 	assert_16_byte_aligned( verts );

+#ifdef ID_WIN_X86_SSE2_INTRIN

 	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );

@@ -176,6 +177,39 @@ static void R_OverlayPointCullStatic( byte * cullBits, halfFloat_t * texCoordS,
 		}
 	}

+#else
+
+	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
+
+	for ( int i = 0; i < numVerts; ) {
+
+		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
+
+		for ( ; i <= nextNumVerts; i++ ) {
+			const idVec3 & v = vertsODS[i].xyz;
+
+			const float d0 = planes[0].Distance( v );
+			const float d1 = planes[1].Distance( v );
+			const float d2 = 1.0f - d0;
+			const float d3 = 1.0f - d1;
+
+			halfFloat_t s = Scalar_FastF32toF16( d0 );
+			halfFloat_t t = Scalar_FastF32toF16( d1 );
+
+			texCoordS[i] = s;
+			texCoordT[i] = t;
+
+			byte bits;
+			bits  = IEEE_FLT_SIGNBITSET( d0 ) << 0;
+			bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
+			bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
+			bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
+
+			cullBits[i] = bits;
+		}
+	}
+
+#endif
 }

 /*
@@ -189,6 +223,7 @@ static void R_OverlayPointCullSkinned( byte * cullBits, halfFloat_t * texCoordS,
 	assert_16_byte_aligned( texCoordT );
 	assert_16_byte_aligned( verts );

+#ifdef ID_WIN_X86_SSE2_INTRIN

 	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );

@@ -263,6 +298,39 @@ static void R_OverlayPointCullSkinned( byte * cullBits, halfFloat_t * texCoordS,
 		}
 	}

+#else
+
+	idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
+
+	for ( int i = 0; i < numVerts; ) {
+
+		const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
+
+		for ( ; i <= nextNumVerts; i++ ) {
+			const idVec3 transformed = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
+
+			const float d0 = planes[0].Distance( transformed );
+			const float d1 = planes[1].Distance( transformed );
+			const float d2 = 1.0f - d0;
+			const float d3 = 1.0f - d1;
+
+			halfFloat_t s = Scalar_FastF32toF16( d0 );
+			halfFloat_t t = Scalar_FastF32toF16( d1 );
+
+			texCoordS[i] = s;
+			texCoordT[i] = t;
+
+			byte bits;
+			bits  = IEEE_FLT_SIGNBITSET( d0 ) << 0;
+			bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
+			bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
+			bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
+
+			cullBits[i] = bits;
+		}
+	}
+
+#endif
 }

 /*
@@ -446,6 +514,7 @@ static void R_CopyOverlaySurface( idDrawVert * verts, int numVerts, triIndex_t *
 	assert( ( ( overlay->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
 	assert( ( ( overlay->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );

+#ifdef ID_WIN_X86_SSE2_INTRIN

 	const __m128i vector_int_clear_last = _mm_set_epi32( 0, -1, -1, -1 );
 	const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
@@ -482,6 +551,25 @@ static void R_CopyOverlaySurface( idDrawVert * verts, int numVerts, triIndex_t *

 	_mm_sfence();

+#else
+
+	// copy vertices
+	for ( int i = 0; i < overlay->numVerts; i++ ) {
+		const overlayVertex_t &overlayVert = overlay->verts[i];
+
+		// NOTE: each vertex is copied whole and its st is then rewritten, a bad out-of-order write to write-combined memory; the SIMD code does the right thing
+		verts[numVerts + i] = sourceVerts[overlayVert.vertexNum];
+		verts[numVerts + i].st[0] = overlayVert.st[0];
+		verts[numVerts + i].st[1] = overlayVert.st[1];
+	}
+
+	// copy indexes
+	for ( int i = 0; i < overlay->numIndexes; i += 2 ) {
+		assert( overlay->indexes[i + 0] < overlay->numVerts && overlay->indexes[i + 1] < overlay->numVerts );
+		WriteIndexPair( &indexes[numIndexes + i], numVerts + overlay->indexes[i + 0], numVerts + overlay->indexes[i + 1] );
+	}
+
+#endif
 }

 /*