// Demonstrates keeping the cold data in the object with the fast data.
// `value` is the hot field the benchmark reads; `metadata` is the cold payload
// stored in-line next to it.
struct Together {
  std::uint32_t value;
  std::string metadata;
};
// Demonstrates "giving up" and just not storing the associated cold data.
// Only the hot `value` field remains in the object.
struct OnlyFastData {
  std::uint32_t value;
};
// Demonstrates using OutOfLine to store the cold data structWithOOL :public OutOfLine<WithOOL, std::string> { std::uint32_t value; };
// The crux of the optimization is that there's no space-overhead static_assert(sizeof(WithOOL) == sizeof(OnlyFastData));
// We synthesize the data up-front so the generation can't interfere with the
// measured behavior in any way.
//
// Returns a std::vector<Data> holding `count` elements whose `value` members
// are filled from rand(). The generator is reseeded with the same constant on
// every call, so each dataset receives the same sequence of values.
// `count` defaults to the 10,000,000 elements the benchmark uses.
template <class Data>
auto make_data(const std::size_t count = 10000000) {
  srand(20180101);  // fixed seed: identical values for every Data type
  std::vector<Data> data(count);
  for (Data& d : data) {
    d.value = rand();
  }
  return data;
}
// We time to measure throughput of touching all the fast data once in
// sequence.
//
// name  - label printed in front of the measurement.
// data  - iterable collection whose elements expose a `value` member.
// print - when false the pass still runs (e.g. as a warm-up) but nothing is
//         written to std::cout.
template <class Data>
void trial(const char* const name, const Data& data, const bool print) {
  std::uint32_t running = 0U;

  // steady_clock is monotonic, so the interval cannot be skewed by wall-clock
  // adjustments made while the loop runs.
  const auto before = std::chrono::steady_clock::now();
  for (const auto& d : data) {
    running += d.value;
  }
  const auto after = std::chrono::steady_clock::now();

  if (print) {
    // A clock's native tick is implementation-defined; convert explicitly so
    // the number really is in the nanoseconds the label promises.
    const auto elapsed_ns =
        std::chrono::duration_cast<std::chrono::nanoseconds>(after - before);
    // Printing the sum keeps the compiler from discarding the timed loop.
    std::cout << name << " took " << elapsed_ns.count() << "ns and " << running
              << " is a value I don't want optimized away" << std::endl;
  }
}
intmain(){ // We generate all the datasets up front, to keep the effects of the allocator distant constauto together = make_data<Together>(); constauto only_fast = make_data<OnlyFastData>(); constauto with_ool = make_data<WithOOL>();
// We run each trial twice, recording the results only the second time. The idea is to give a fair comparison, where // all 3 options have their caches primed and nobody thus has data fresher from the allocation. trial("With cold data in-line (original) ", together, false); trial("With cold data in-line (original) ", together, true);
trial("With cold data thrown away (best-case scenario) ", only_fast, false); trial("With cold data thrown away (best-case scenario) ", only_fast, true);
With cold data in-line (original) took 82936589ns and 3350498669 is a value I don't want optimized away
With cold data thrown away (best-case scenario) took 74243139ns and 3350498669 is a value I don't want optimized away
With OutOfLIne took 73445932ns and 3350498669 is a value I don't want optimized away