Listing 1. Simplified Matrix Multiplication in CUDA, Using Tiled Algorithm

#include <cuda_runtime.h>

__global__ void matmulKernel( float* C, float* A, float* B,
                              int N2, int N3 )
{
    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;

    // First element of the 16-row block of A and of the
    // 16-column block of B assigned to this thread block.
    int aFirst = 16 * by * N2;
    int bFirst = 16 * bx;
    float Csub = 0;

    // Walk the shared dimension N2 one 16x16 tile at a time.
    for( int j = 0; j < N2; j += 16 ) {
        __shared__ float Atile[16][16], Btile[16][16];

        // Stage one tile of A and one tile of B in shared memory.
        Atile[ty][tx] = A[aFirst + j + N2 * ty + tx];
        Btile[ty][tx] = B[bFirst + j * N3 + N3 * ty + tx];
        __syncthreads();

        // Accumulate this tile's contribution to the dot product.
        for( int k = 0; k < 16; ++k )
            Csub += Atile[ty][k] * Btile[k][tx];
        __syncthreads();
    }

    // Write this thread's element of the 16x16 output tile of C.
    int c = N3 * 16 * by + 16 * bx;
    C[c + N3 * ty + tx] = Csub;
}

void matmul( float* A, float* B, float* C,
             size_t N1, size_t N2, size_t N3 )
{
    float *devA, *devB, *devC;
    cudaSetDevice(0);
    cudaMalloc( (void**)&devA, N1 * N2 * sizeof(float) );
    cudaMalloc( (void**)&devB, N2 * N3 * sizeof(float) );
    cudaMalloc( (void**)&devC, N1 * N3 * sizeof(float) );
    cudaMemcpy( devA, A, N1 * N2 * sizeof(float), cudaMemcpyHostToDevice );
    cudaMemcpy( devB, B, N2 * N3 * sizeof(float), cudaMemcpyHostToDevice );

    // One 16x16 thread block computes one 16x16 tile of C;
    // blockIdx.x indexes column tiles (N3), blockIdx.y row tiles (N1).
    dim3 threads( 16, 16 );
    dim3 grid( N3 / threads.x, N1 / threads.y );
    matmulKernel<<< grid, threads >>>( devC, devA, devB, (int)N2, (int)N3 );

    cudaMemcpy( C, devC, N1 * N3 * sizeof(float), cudaMemcpyDeviceToHost );
    cudaFree( devA );
    cudaFree( devB );
    cudaFree( devC );
}
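The tiling in Listing 1 means each element of A and B is fetched from global memory once per 16-wide tile rather than once per output element, cutting global-memory traffic by roughly a factor of 16 at the cost of two __syncthreads() per tile. The host driver below is a minimal sketch of how matmul could be exercised; the main() function, the concrete sizes, and the fill values are illustrative assumptions, not part of the original listing. The only requirement inherited from the kernel is that N1, N2, and N3 be multiples of the 16-element tile size.

#include <cstdio>
#include <cstdlib>

// Hypothetical driver, assumed to live in the same .cu file as Listing 1:
// multiplies a 64x32 matrix A by a 32x48 matrix B. All three dimensions
// are multiples of 16, as the tiled kernel requires.
int main()
{
    const size_t N1 = 64, N2 = 32, N3 = 48;
    float *A = (float*)malloc( N1 * N2 * sizeof(float) );
    float *B = (float*)malloc( N2 * N3 * sizeof(float) );
    float *C = (float*)malloc( N1 * N3 * sizeof(float) );

    // Constant fills make the expected result easy to check by hand.
    for( size_t i = 0; i < N1 * N2; ++i ) A[i] = 1.0f;
    for( size_t i = 0; i < N2 * N3; ++i ) B[i] = 2.0f;

    matmul( A, B, C, N1, N2, N3 );

    // Every entry of C should equal N2 * 1.0f * 2.0f = 64.
    printf( "C[0] = %f\n", C[0] );

    free( A ); free( B ); free( C );
    return 0;
}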