// NOTE(review): the header name was missing from this directive in the reviewed
// text; the libcfa array header is presumed here -- confirm the exact path
// against the installed Cforall release.
#include <collections/array.hfa>

// traditional "naive" loops

// Dense matrix product: tgt = src1 * src2.
//
// The three dimensions are inferred from the argument types:
//   src1 is M x P, src2 is P x N, tgt is M x N.
//
// Every element of tgt is overwritten: it is reset to 0 and then the P
// products src1[i][k] * src2[k][j] are accumulated into it, in increasing k.
//
// The sum is accumulated directly in tgt, so tgt should not be the same
// storage as src1 or src2 -- the inputs would be read after being partly
// overwritten.
forall( [M], [N], [P] )
void matmul( array(float, M, P) & src1,
             array(float, P, N) & src2,
             array(float, M, N) & tgt ) {
	for (i; M)
		for (j; N) {
			// start each output cell from zero before summing the dot product
			tgt[i][j] = 0;
			for (k; P)
				tgt[i][j] += src1[i][k] * src2[k][j];
		}
}