01 [...] 02 // Matrix vector multiplication using SSE Intrinsics code. 03 04 // Assign matrix memory: 05 union { 06 __m128 m128[4]; // by column 07 float f[4][4]; 08 } matrix; 09 10 // Memory for vector and result: 11 union { 12 __m128 m128; 13 float f[4]; 14 } vector, result; 15 16 // Initialize matrix: 17 matrix.m128[3] = _mm_set_ps( 2.5, 1.2, 3.1, 7.8 ); // 1. Column 18 matrix.m128[2] = _mm_set_ps( 3.4, 7.7, 8.2, 0.4 ); // 2. Column 19 matrix.m128[1] = _mm_set_ps( 7.9, 3.7, 7.1, 1.2 ); // 3. Column 20 matrix.m128[0] = _mm_set_ps( 1.2, 0.5, 3.6, 5.2 ); // 4. Column 21 22 // Initialize vector: 23 vector.m128 = _mm_set_ps(1.0,1.0,1.0,1.0); 24 result.m128 = _mm_setzero_ps(); 25 26 // Compute multiplication: 27 __m128 tmp_multiplier; 28 __m128 tmp_column1; 29 __m128 tmp_column2; 30 31 tmp_multiplier = _mm_set_ps1(vector.f[0]); 32 tmp_column1 = _mm_mul_ps(matrix.m128[0], tmp_multiplier); 33 34 tmp_multiplier = _mm_set_ps1(vector.f[1]); 35 tmp_column2 = _mm_mul_ps(matrix.m128[1], tmp_multiplier); 36 tmp_column1 = _mm_add_ps(tmp_column1, tmp_column2); 37 38 tmp_multiplier = _mm_set_ps1(vector.f[2]); 39 tmp_column2 = _mm_mul_ps(matrix.m128[2], tmp_multiplier); 40 tmp_column1 = _mm_add_ps(tmp_column1, tmp_column2); 41 42 tmp_multiplier = _mm_set_ps1(vector.f[3]); 43 tmp_column2 = _mm_mul_ps(matrix.m128[3], tmp_multiplier); 44 result.m128 = _mm_add_ps(tmp_column1, tmp_column2); 45 46 result.m128 = _mm_loadr_ps(result.f); 47 }