I noticed a situation in which version 2 of the library gives me significantly degraded performance compared to version 1. Please find attached a minimal working example consisting of two functions, `minimal` and `minimal_vcl16`, which take an interleaved array and 'demux' it: first all elements with an even index are extracted, then all elements with an odd index.
#include <chrono>
#include <cstddef>  // size_t
#include <cstdlib>  // srand, rand, abort
#include <cstring>  // memset
#include <iostream>

#include <vectorclass.h>

// see https://github.com/martinus/nanobench/raw/master/src/include/nanobench.h
#define ANKERL_NANOBENCH_IMPLEMENT
#include "nanobench.h"
typedef unsigned char byte;

/// Allocate an 'alignment'-byte aligned buffer of SRC_SIZE bytes filled with
/// deterministic pseudo-random data (srand(0) reseeds on every call, so
/// repeated calls produce identical contents). Returns nullptr on allocation
/// failure instead of writing through a null pointer (UB in the original).
/// The caller owns the buffer and must release it with _mm_free().
byte *generate(size_t SRC_SIZE, int alignment = 64) {
  byte *buf = (byte *)_mm_malloc(SRC_SIZE, alignment);
  if (buf == nullptr)
    return nullptr;
  srand(0);
  for (size_t i = 0; i < SRC_SIZE; i++)
    buf[i] = (byte)(rand() % 256);
  return buf;
}
/// Allocate a zero-initialised, 'alignment'-byte aligned buffer of DST_SIZE
/// bytes. Returns nullptr on allocation failure; the original passed a
/// possibly-null pointer straight to memset, which is undefined behaviour.
/// The caller owns the buffer and must release it with _mm_free().
byte *allocate_dst(size_t DST_SIZE, int alignment = 64) {
  byte *result = (byte *)_mm_malloc(DST_SIZE, alignment);
  if (result != nullptr)
    memset(result, 0, DST_SIZE);
  return result;
}
/// Scalar reference demux: splits the interleaved 'src' buffer of
/// step * 2 bytes into two contiguous halves of 'dst' — even-indexed source
/// bytes go to dst[0..step), odd-indexed bytes to dst[step..2*step).
///
/// Fix: the loop counter was 'int' while 'size' is size_t; besides the
/// signed/unsigned comparison, a signed counter overflows (UB) once
/// step * 2 exceeds INT_MAX.
void minimal(unsigned char *src, unsigned char *dst, size_t step) {
  size_t size = step * 2;
  auto x = dst;        // destination cursor for even-indexed elements
  auto y = dst + step; // destination cursor for odd-indexed elements
  for (size_t i = 0; i < size; i += 2) {
    x[0] = src[i];
    y[0] = src[i + 1];
    ++x;
    ++y;
  }
}
/// Vectorised demux using VCL: consumes 32 interleaved source bytes per
/// iteration and de-interleaves them with two blend16 shuffles into the
/// even half (x) and odd half (y) of the destination, mirroring minimal().
///
/// Requires aligned loads/stores (load_a / store_a), so src, dst and
/// dst + step must be 16-byte aligned; step must be a multiple of 16 so
/// size is a multiple of the 32-byte stride. This is the routine whose
/// throughput regresses with VCL v2 compared to v1 (see tables below).
///
/// Fix: the loop counter was 'int' while 'size' is size_t; a signed
/// counter overflows (UB) once step * 2 exceeds INT_MAX.
void minimal_vcl16(unsigned char *src, unsigned char *dst, size_t step) {
  size_t size = step * 2;
  auto x = dst;        // even-indexed output
  auto y = dst + step; // odd-indexed output
  for (size_t pos = 0; pos < size; pos += 2 * 16) {
    Vec16uc a, b;
    a.load_a(&src[pos]);
    b.load_a(&src[pos + 16]);
    // Even source lanes 0,2,...,30 of the concatenated (a, b) pair.
    blend16<0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>(a, b)
        .store_a(x);
    // Odd source lanes 1,3,...,31.
    blend16<1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31>(a, b)
        .store_a(y);
    x += 16;
    y += 16;
  }
}
/// Verify that the scalar and the VCL demux agree, then benchmark both with
/// nanobench on fresh buffers.
int main() {
  auto config = ankerl::nanobench::Config();
  config.minEpochTime(std::chrono::milliseconds{200});
  size_t step = 800 * 800;
  size_t SIZE = step * 2;
  auto src = generate(SIZE);
  auto dst = allocate_dst(SIZE);
  auto dst_vcl = allocate_dst(SIZE);
  minimal(src, dst, step);
  minimal_vcl16(src, dst_vcl, step);
  // Check that minimal and minimal_vcl16 produce identical outputs.
  // size_t index: SIZE is size_t, so an int counter would mix signedness.
  for (size_t i = 0; i < SIZE; ++i) {
    // std::cout << i << ">>> " << (unsigned int) src[i] << ": " << (unsigned
    // int)dst[i] << " " << (unsigned int) dst_vcl[i] << std::endl;
    if (dst[i] != dst_vcl[i]) {
      abort();
    }
  }
  std::cout << "OK\n";
  _mm_free(src);
  _mm_free(dst);
  _mm_free(dst_vcl);
  // Run the benchmark on freshly allocated buffers.
  src = generate(SIZE);
  dst = allocate_dst(SIZE);
  dst_vcl = allocate_dst(SIZE);
  config.run("Minimal", [&] { minimal(src, dst, step); })
      .doNotOptimizeAway(dst);
  // Bug fix: this previously pinned 'dst' (the scalar output) instead of
  // 'dst_vcl', leaving the VCL result unprotected from dead-code
  // elimination by the optimizer.
  config.run("MinimalVCL", [&] { minimal_vcl16(src, dst_vcl, step); })
      .doNotOptimizeAway(dst_vcl);
  _mm_free(dst);
  _mm_free(dst_vcl);
  _mm_free(src);
}
| ns/op | op/s | MdAPE | ins/op | cyc/op | IPC | branches/op | missed% | benchmark
|--------------------:|--------------------:|--------:|---------------:|---------------:|-------:|---------------:|--------:|:----------------------------------------------
| 80,675.93 | 12,395.27 | 0.2% | 380,004.02 | 234,070.96 | 1.623 | 20,001.02 | 0.0% | `Minimal`
| 84,662.87 | 11,811.55 | 0.3% | 520,002.02 | 245,669.68 | 2.117 | 40,000.02 | 0.0% | `MinimalVCL`
| ns/op | op/s | MdAPE | ins/op | cyc/op | IPC | branches/op | missed% | benchmark
|--------------------:|--------------------:|--------:|---------------:|---------------:|-------:|---------------:|--------:|:----------------------------------------------
| 79,375.69 | 12,598.32 | 0.3% | 380,004.02 | 230,320.17 | 1.650 | 20,001.02 | 0.0% | `Minimal`
| 554,310.82 | 1,804.04 | 0.0% | 760,018.15 | 1,608,538.10 | 0.472 | 40,003.14 | 0.0% | `MinimalVCL`
This picture is consistent across compilers (I tested gcc 8, gcc 9.2, and clang 9.0) and on two different machines. The same regression also shows up with an analogous version using `Vec32uc` instead of `Vec16uc`. When I turn on AVX512 on my workstation, the performance improves to the expected levels.