// macos: // clang++ blas_test.cc -framework Accelerate -std=c++11 -O3 -o blas_test // linux: // g++ blas_test.cc -lblas -std=c++11 -O3 -o blas_test // run: // ./blas_test 512 512 512 100 100 #ifdef __APPLE__ #include #else #include #endif #include #include #include int main(int argc, char **argv) { if (argc < 4) { std::cerr << argv[0] << " M N K [iters] [reps]\n"; return 1; } auto M = 1; auto N = 1; auto K = 1; { std::stringstream ss(argv[1]); ss >> M; } { std::stringstream ss(argv[2]); ss >> N; } { std::stringstream ss(argv[3]); ss >> K; } auto LDA = K; auto LDB = N; auto LDC = N; auto *A = (float *)malloc(sizeof(float) * M * K); auto *B = (float *)malloc(sizeof(float) * K * N); auto *C = (float *)malloc(sizeof(float) * M * N); for (auto i = 0; i < M * K; ++i) { A[i] = 0.7; } for (auto i = 0; i < N * K; ++i) { B[i] = 0.4; } auto iters = int(1e11 / (M * N * K)); auto reps = 10; if (iters < 10) { iters = 10; } if (argc > 4) { std::stringstream ss(argv[4]); ss >> iters; } if (argc > 5) { std::stringstream ss(argv[5]); ss >> reps; } for (auto _ = 0; _ < reps; ++_) { for (auto i = 0; i < iters / 10; ++i) { cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1.0, A, LDA, B, LDB, 0.0, C, LDC); } auto start = std::chrono::steady_clock::now(); for (auto i = 0; i < iters; ++i) { cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1.0, A, LDA, B, LDB, 0.0, C, LDC); } auto end = std::chrono::steady_clock::now(); std::chrono::duration elapsed_seconds = end - start; std::cout << 1.0 * M * N * K * 2 * iters / elapsed_seconds.count() / 1e9 << " gflops\n"; } free(A); free(B); free(C); }