I was inspired by a recent question to benchmark the different methods of copying data, and this is what I came up with:
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <memory>
#include <numeric>
#include <string>
#include <vector>

/// Base class for one copy strategy being benchmarked.
///
/// A derived class implements test(); run() wraps a single call to test()
/// with steady_clock timestamps and accumulates the statistics below.
class TimedTest
{
public:
    /// @param name  Label printed in the result table header.
    TimedTest(const std::string& name)
        : name{ name }, time{ 0 }, total{ 0 }, testCount{ 0 }
    {
    }

    // Instances are owned and destroyed through TimedTest pointers, so the
    // destructor must be virtual; without it that delete is undefined behaviour.
    virtual ~TimedTest() = default;

    /// Times one invocation of test() and updates time / total / testCount.
    ///
    /// @param dst    Destination buffer, at least `count` ints.
    /// @param src    Source buffer, at least `count` ints.
    /// @param count  Number of ints to copy.
    void run(int* dst, int* src, std::size_t count)
    {
        const auto begin = std::chrono::steady_clock::now();
        test(dst, src, count);
        const auto end = std::chrono::steady_clock::now();

        // A floating-point duration keeps the sub-millisecond part that
        // duration_cast<milliseconds> would truncate away.
        time = std::chrono::duration<double, std::milli>(end - begin).count();
        total += time;
        testCount += 1.0;
    }

    /// The copy strategy under test: copy `count` ints from `src` to `dst`.
    virtual void test(int* dst, int* src, std::size_t count) = 0;

    /// @return Mean duration in milliseconds over all run() calls so far,
    ///         or 0 if run() has not been called yet (avoids 0/0 -> NaN).
    double average() const
    {
        return testCount > 0.0 ? total / testCount : 0.0;
    }

public:
    std::string name;   ///< Label for the result table.
    double time;        ///< Duration of the most recent run(), in milliseconds.
    double total;       ///< Sum of all run() durations, in milliseconds.
    double testCount;   ///< Number of run() calls so far.
};

/// Copies with std::memcpy.
class TimedTestMemCpy : public TimedTest
{
    using TimedTest::TimedTest;

    void test(int* dst, int* src, std::size_t count) override
    {
        std::memcpy(dst, src, sizeof(int) * count);
    }
};

/// Copies with std::copy.
class TimedTestStdCopy : public TimedTest
{
    using TimedTest::TimedTest;

    void test(int* dst, int* src, std::size_t count) override
    {
        std::copy(src, src + count, dst);
    }
};

/// Copies with a plain indexed for-loop.
class TimedTestSimpleLoop : public TimedTest
{
    using TimedTest::TimedTest;

    void test(int* dst, int* src, std::size_t count) override
    {
        for (std::size_t i = 0; i < count; i++)
            dst[i] = src[i];
    }
};

/// Copies by walking both buffers with incrementing pointers.
class TimedTestPointerCopy : public TimedTest
{
    using TimedTest::TimedTest;

    void test(int* dst, int* src, std::size_t count) override
    {
        int* end = dst + count;
        while (dst != end)
            *dst++ = *src++;
    }
};

/// Copies with an indexed loop annotated with `#pragma omp parallel for`.
/// The pragma only has an effect when the compiler's OpenMP support is enabled.
class TimedTestOMPCopy : public TimedTest
{
    using TimedTest::TimedTest;

    void test(int* dst, int* src, std::size_t count) override
    {
        // A signed index keeps the loop in OpenMP canonical form on every
        // implementation, and ptrdiff_t (unlike int) does not narrow `count`.
        const auto n = static_cast<std::ptrdiff_t>(count);
#pragma omp parallel for
        for (std::ptrdiff_t i = 0; i < n; i++)
            dst[i] = src[i];
    }
};

/// Runs every copy strategy 100 times over a 200'000'000-int buffer and prints,
/// per iteration, the last duration and running average (ms) of each strategy.
///
/// @return 0 on success, 1 if the destination does not match the source after
///         the final copy.
int main()
{
    constexpr std::size_t length = 200'000'000;
    constexpr int iterations = 100;

    // src holds 0, 1, 2, ... exactly as the original fill loop produced.
    std::vector<int> src(length);
    std::iota(src.begin(), src.end(), 0);

    // std::vector value-initialises its elements, so every element of dst has
    // been written once before the first timed run. With an uninitialised
    // `new int[length]` the first test to run would be the first to write it.
    std::vector<int> dst(length);

    std::vector<std::unique_ptr<TimedTest>> tests;
    tests.push_back(std::make_unique<TimedTestMemCpy>("memcpy"));
    tests.push_back(std::make_unique<TimedTestStdCopy>("std::copy"));
    tests.push_back(std::make_unique<TimedTestSimpleLoop>("simpleLoop"));
    tests.push_back(std::make_unique<TimedTestPointerCopy>("pointerCopy"));
    tests.push_back(std::make_unique<TimedTestOMPCopy>("OMPCopy"));

    std::cout << std::setw(5) << "Test#";
    for (const auto& test : tests)
        std::cout << std::setw(12) << test->name << std::setw(9) << "Avg";
    std::cout << "\n";

    for (int i = 0; i < iterations; i++)
    {
        std::cout << std::setw(5) << i;
        for (const auto& test : tests)
        {
            test->run(dst.data(), src.data(), length);
            std::cout << std::setw(12) << test->time
                      << std::setw(9) << test->average();
        }
        std::cout << "\n";
    }

    // Reading dst back gives the copies an observable effect and checks that
    // the last strategy to run actually produced a correct copy.
    if (!std::equal(src.begin(), src.end(), dst.begin()))
    {
        std::cerr << "error: destination buffer does not match source\n";
        return 1;
    }

    return 0;
}

// I would appreciate any comments on the results or suggestions on improving the benchmarking / general code.