// blas_test.cc
// macos:
// clang++ blas_test.cc -framework Accelerate -std=c++11 -O3 -o blas_test
// linux:
// g++ blas_test.cc -lblas -std=c++11 -O3 -o blas_test
// run:
// ./blas_test 512 512 512 100 100
#ifdef __APPLE__
#include <Accelerate/Accelerate.h>
#else
#include <cblas.h>
#endif
#include <chrono>
#include <cstddef>
#include <iostream>
#include <limits>
#include <sstream>
#include <vector>
// Parses `text` as a decimal integer. Returns true and stores the value in
// `out` only when extraction succeeds and the value is strictly positive;
// otherwise `out` is left untouched and false is returned.
static bool parse_positive_int(const char *text, int &out) {
  std::stringstream ss(text);
  int value = 0;
  if (!(ss >> value) || value <= 0) {
    return false;
  }
  out = value;
  return true;
}

// SGEMM throughput benchmark.
//
// Usage: blas_test M N K [iters] [reps]
//
// Multiplies a row-major M x K matrix by a K x N matrix into an M x N result
// using cblas_sgemm. For each of `reps` repetitions it runs iters / 10
// untimed warm-up calls, then times `iters` calls and prints the achieved
// rate in gflops (2 * M * N * K floating point operations per call).
//
// When `iters` is not given it defaults to roughly 1e11 / (M * N * K),
// clamped to the range [10, INT_MAX]. `reps` defaults to 10.
//
// Returns 0 on success and 1 when the arguments are missing or are not
// positive integers.
int main(int argc, char **argv) {
  // argv[0] may be null when argc == 0; never stream a null pointer.
  const char *prog = (argc > 0 && argv[0] != nullptr) ? argv[0] : "blas_test";
  if (argc < 4) {
    std::cerr << prog << " M N K [iters] [reps]\n";
    return 1;
  }

  int M = 1;
  int N = 1;
  int K = 1;
  const char *const dim_names[] = {"M", "N", "K"};
  int *const dims[] = {&M, &N, &K};
  for (int d = 0; d < 3; ++d) {
    if (!parse_positive_int(argv[d + 1], *dims[d])) {
      std::cerr << "invalid value for " << dim_names[d] << ": " << argv[d + 1]
                << " (expected a positive integer)\n"
                << prog << " M N K [iters] [reps]\n";
      return 1;
    }
  }

  // Row-major, non-transposed operands: the leading dimension is the number
  // of columns of each matrix.
  const int LDA = K;
  const int LDB = N;
  const int LDC = N;

  // Element counts are computed in size_t so large problem sizes do not
  // overflow int.
  const std::size_t rows = static_cast<std::size_t>(M);
  const std::size_t cols = static_cast<std::size_t>(N);
  const std::size_t inner = static_cast<std::size_t>(K);
  const std::vector<float> A(rows * inner, 0.7f);
  const std::vector<float> B(inner * cols, 0.4f);
  // C is only ever written: beta is 0, so its initial contents are irrelevant.
  std::vector<float> C(rows * cols);

  // Default iteration count targets about 1e11 multiply-adds per timed run.
  // The quotient is evaluated in double and clamped before the conversion,
  // because converting an out-of-range double to int is undefined behaviour
  // (e.g. 1e11 for M = N = K = 1).
  const double work = static_cast<double>(M) * N * K;
  const double target_iters = 1e11 / work;
  const int max_iters = std::numeric_limits<int>::max();
  int iters = target_iters >= static_cast<double>(max_iters)
                  ? max_iters
                  : static_cast<int>(target_iters);
  if (iters < 10) {
    iters = 10;
  }
  int reps = 10;

  if (argc > 4 && !parse_positive_int(argv[4], iters)) {
    std::cerr << "invalid value for iters: " << argv[4]
              << " (expected a positive integer)\n"
              << prog << " M N K [iters] [reps]\n";
    return 1;
  }
  if (argc > 5 && !parse_positive_int(argv[5], reps)) {
    std::cerr << "invalid value for reps: " << argv[5]
              << " (expected a positive integer)\n"
              << prog << " M N K [iters] [reps]\n";
    return 1;
  }

  // One C = 1 * A * B + 0 * C call; shared by the warm-up and timed loops.
  const auto run_gemm = [&]() {
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1.0f,
                A.data(), LDA, B.data(), LDB, 0.0f, C.data(), LDC);
  };

  const double flops_per_call = work * 2;
  for (int rep = 0; rep < reps; ++rep) {
    // Untimed warm-up before each measurement.
    for (int i = 0; i < iters / 10; ++i) {
      run_gemm();
    }

    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < iters; ++i) {
      run_gemm();
    }
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> elapsed_seconds = end - start;
    std::cout << flops_per_call * iters / elapsed_seconds.count() / 1e9
              << " gflops\n";
  }
  return 0;
}