From 124498804147ad038c6da9de1f41eafde0f15110 Mon Sep 17 00:00:00 2001
From: Hui
Date: Wed, 5 Jun 2024 22:45:55 +0100
Subject: [PATCH] benchmark

---
 any_view.md                          | 332 ++++++++++++++++++++++++++-
 impl/any_view/any_view_te.hpp        |   2 +-
 impl/any_view/benchmark/micro.cpp    |  31 +++
 impl/any_view/benchmark/pipeline.cpp | 105 +++++++++
 impl/any_view/benchmark/widget.cpp   |  46 ++++
 impl/any_view/benchmark/widget.hpp   |  53 +++++
 6 files changed, 567 insertions(+), 2 deletions(-)
 create mode 100644 impl/any_view/benchmark/micro.cpp
 create mode 100644 impl/any_view/benchmark/pipeline.cpp
 create mode 100644 impl/any_view/benchmark/widget.cpp
 create mode 100644 impl/any_view/benchmark/widget.hpp

diff --git a/any_view.md b/any_view.md
index 37e0d60..4590921 100644
--- a/any_view.md
+++ b/any_view.md
@@ -183,7 +183,7 @@ enum class category
 
 template <class Ref, category Cat = category::input, class Value = remove_cvref_t<Ref>,
-          class RValueRef = add_rvalue_reference_t<Ref>,
+          class RValueRef = add_rvalue_reference_t<remove_reference_t<Ref>>,
           class Diff = ptrdiff_t>
 class any_view;
 ```
 
@@ -202,6 +202,336 @@ class any_view;
 - TODO: move ctor cannot guarantee move ctors have been called
 - TODO: view can be valueless: because strong exception guarantee + if we want to support move (or move-only)
 
+### Performance
+
+One of the major concerns about using a type-erased type is the performance cost of `virtual`/indirect function calls. With `any_view`, every step of the iteration makes three `virtual`/indirect function calls:
+
+```cpp
+++it;
+it != last;
+*it;
+```
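+
+To make it concrete where those three calls come from, here is a deliberately simplified sketch of a type-erased iterator for `int` elements. This is only an illustration of the dispatch cost, not the proposed design nor the reference implementation:
+
+```cpp
+#include <utility>
+
+// Illustration only: one possible shape of a type-erased input iterator.
+// Each of the three operations in the loop above becomes one indirect call.
+struct any_iterator_interface {
+  virtual int& deref() = 0;      // *it
+  virtual void increment() = 0;  // ++it
+  virtual bool not_equal(const any_iterator_interface&) const = 0;  // it != last
+  virtual ~any_iterator_interface() = default;
+};
+
+template <class It>
+struct any_iterator_for final : any_iterator_interface {
+  It it;
+  explicit any_iterator_for(It i) : it(std::move(i)) {}
+  int& deref() override { return *it; }
+  void increment() override { ++it; }
+  bool not_equal(const any_iterator_interface& other) const override {
+    // The sketch assumes both sides wrap the same concrete iterator type.
+    return it != static_cast<const any_iterator_for&>(other).it;
+  }
+};
+```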
+
+#### Micro benchmark: `vector` vs `any_view` on pure iteration
+
+This profiles nothing but the iteration itself, comparing a plain `std::vector<int>` with an `any_view` that wraps it:
+
+```cpp
+  std::vector<int> v =
+      std::views::iota(0, state.range(0)) | std::ranges::to<std::vector>();
+  for (auto _ : state) {
+    for (auto i : v) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+```
+
+vs
+
+```cpp
+  std::vector<int> v =
+      std::views::iota(0, state.range(0)) | std::ranges::to<std::vector>();
+  std::ranges::any_view<int&> av(std::views::all(v));
+  for (auto _ : state) {
+    for (auto i : av) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+```
+
+##### -O0
+
+```bash
+Benchmark                                    Time         CPU     Time Old    Time New    CPU Old    CPU New
+--------------------------------------------------------------------------------------------------------------------------------
+[BM_vector vs. BM_AnyView]/1024           +3.4488     +3.4487        10423       46371      10418      46347
+[BM_vector vs. BM_AnyView]/2048           +3.3358     +3.3375        21318       92432      21301      92396
+[BM_vector vs. BM_AnyView]/4096           +3.4224     +3.4237        41864      185137      41834     185061
+[BM_vector vs. BM_AnyView]/8192           +3.4665     +3.4665        83019      370802      82986     370659
+[BM_vector vs. BM_AnyView]/16384          +3.4586     +3.4574       166596      742785     166536     742319
+[BM_vector vs. BM_AnyView]/32768          +3.4413     +3.4416       333311     1480349     333151    1479723
+[BM_vector vs. BM_AnyView]/65536          +3.4166     +3.4154       667125     2946432     666900    2944657
+[BM_vector vs. BM_AnyView]/131072         +3.4295     +3.4305      1335405     5915230    1334717    5913487
+[BM_vector vs. BM_AnyView]/262144         +3.4320     +3.4329      2665004    11811264    2663916   11808776
+OVERALL_GEOMEAN                           +3.4278     +3.4281            0           0          0          0
+```
+
+##### -O2
+
+```bash
+Benchmark                                    Time         CPU     Time Old    Time New    CPU Old    CPU New
+--------------------------------------------------------------------------------------------------------------------------------
+[BM_vector vs. BM_AnyView]/1024          +14.8383    +14.8421          315        4991        315       4989
+[BM_vector vs. BM_AnyView]/2048          +14.9416    +14.9453          632       10075        632      10071
+[BM_vector vs. BM_AnyView]/4096          +15.1943    +15.2000         1231       19942       1231      19936
+[BM_vector vs. BM_AnyView]/8192          +15.1609    +15.1626         2465       39835       2464      39820
+[BM_vector vs. BM_AnyView]/16384         +13.8958    +13.8949         5386       80235       5384      80196
+[BM_vector vs. BM_AnyView]/32768         +13.8638    +13.8647        10720      159341      10714     159264
+[BM_vector vs. BM_AnyView]/65536         +13.6891    +13.6912        21772      319807      21758     319659
+[BM_vector vs. BM_AnyView]/131072        +13.5340    +13.5338        44363      644768      44335     644359
+[BM_vector vs. BM_AnyView]/262144        +13.5374    +13.5384        87600     1273476      87558    1272956
+OVERALL_GEOMEAN                          +16.0765    +16.0789            0           0          0          0
+```
+
+`any_view` is 3 to 16 times slower than `std::vector` on pure iteration. With three virtual function calls per element against `std::vector`'s raw pointer arithmetic, what else can one expect? But this benchmark is not a realistic use case: no one would create a `vector`, immediately wrap it in a type-erased `any_view`, and then iterate through it. In the same way, no one would create a lambda, immediately wrap it in a `std::function`, and then call it.
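+
+To make that analogy concrete, the `std::function` equivalent of this micro benchmark would look something like the following (illustration only, not part of the benchmark):
+
+```cpp
+#include <functional>
+
+int function_analogy() {
+  auto lambda = [](int i) { return i + 1; };
+  std::function<int(int)> f = lambda;  // type-erase immediately after creating the lambda...
+  return f(42);                        // ...and pay an indirect call for no benefit
+}
+```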
+
+#### Slightly more realistic case: A view pipeline vs `any_view`
+
+Since `any_view` is most likely used across an ABI boundary, this benchmark moves the creation of the view into a separate TU. We compare using the `view` pipeline directly with using an `any_view` that wraps the `view` pipeline.
+
+Consider the following case:
+
+```cpp
+// hpp file
+struct Widget {
+  std::string name;
+  int size;
+};
+
+struct UI {
+  std::vector<Widget> widgets_;
+
+  ??? getWidgetNames() const;
+};
+
+// cpp file
+??? UI::getWidgetNames() const {
+  return widgets_ | std::views::filter([](const Widget& widget) {
+           return widget.size > 10;
+         }) |
+         std::views::transform(&Widget::name);
+}
+```
+
+The creation of the result of `getWidgetNames` is hidden in a separate translation unit.
+
+In the first case, we use the `view` directly. It is tedious to spell the type, and it is impossible to use lambdas because the closure type is part of the result type. We end up writing something like this:
+
+```cpp
+// hpp file
+struct UI {
+  std::vector<Widget> widgets_;
+  // cannot use lambda because we need to spell the type of the view
+  struct FilterFn {
+    bool operator()(const Widget&) const;
+  };
+
+  struct TransformFn {
+    const std::string& operator()(const Widget&) const;
+  };
+  std::ranges::transform_view<
+      std::ranges::filter_view<std::ranges::ref_view<const std::vector<Widget>>,
+                               FilterFn>,
+      TransformFn>
+  getWidgetNames() const;
+};
+
+// cpp file
+bool UI::FilterFn::operator()(const Widget& widget) const {
+  return widget.size > 10;
+}
+
+const std::string& UI::TransformFn::operator()(const Widget& widget) const {
+  return widget.name;
+}
+
+std::ranges::transform_view<
+    std::ranges::filter_view<std::ranges::ref_view<const std::vector<Widget>>,
+                             UI::FilterFn>,
+    UI::TransformFn>
+UI::getWidgetNames() const {
+  return widgets_ | std::views::filter(UI::FilterFn{}) |
+         std::views::transform(UI::TransformFn{});
+}
+```
+
+With `any_view`, the interface looks much simpler:
+
+```cpp
+// hpp file
+struct UI {
+  std::vector<Widget> widgets_;
+
+  std::ranges::any_view<const std::string&> getWidgetNames() const;
+};
+
+// cpp file
+std::ranges::any_view<const std::string&> UI::getWidgetNames() const {
+  return widgets_ | std::views::filter([](const Widget& widget) {
+           return widget.size > 10;
+         }) |
+         std::views::transform(&Widget::name);
+}
+```
+
+##### -O0
+
+```bash
+Benchmark                                              Time        CPU    Time Old    Time New     CPU Old     CPU New
+---------------------------------------------------------------------------------------------------------------------------------------------
+[BM_RawPipeline vs. BM_AnyViewPipeline]/1024        +0.4290    +0.4291       78469      112130       78435      112090
+[BM_RawPipeline vs. BM_AnyViewPipeline]/2048        +0.4051    +0.4050      159225      223729      159161      223625
+[BM_RawPipeline vs. BM_AnyViewPipeline]/4096        +0.3568    +0.4021      331276      449466      320471      449331
+[BM_RawPipeline vs. BM_AnyViewPipeline]/8192        +0.4022    +0.4030      639566      896817      639056      896623
+[BM_RawPipeline vs. BM_AnyViewPipeline]/16384       +0.4148    +0.4144     1267196     1792804     1266743     1791639
+[BM_RawPipeline vs. BM_AnyViewPipeline]/32768       +0.4293    +0.4287     2522849     3606022     2522004     3603164
+[BM_RawPipeline vs. BM_AnyViewPipeline]/65536       +0.4199    +0.4201     5078713     7211428     5076977     7209978
+[BM_RawPipeline vs. BM_AnyViewPipeline]/131072      +0.4170    +0.4170    10142694    14372118    10139299    14367292
+[BM_RawPipeline vs. BM_AnyViewPipeline]/262144      +0.4358    +0.4357    20386564    29270816    20381118    29260958
+OVERALL_GEOMEAN                                     +0.4120    +0.4172           0           0           0           0
+```
+
+##### -O2
+
+```bash
+Benchmark                                              Time        CPU    Time Old    Time New     CPU Old     CPU New
+---------------------------------------------------------------------------------------------------------------------------------------------
+[BM_RawPipeline vs. BM_AnyViewPipeline]/1024        +0.8066    +0.8064        3504        6330        3503        6327
+[BM_RawPipeline vs. BM_AnyViewPipeline]/2048        +0.7136    +0.7134        7339       12576        7335       12568
+[BM_RawPipeline vs. BM_AnyViewPipeline]/4096        +0.6746    +0.6748       14841       24853       14835       24846
+[BM_RawPipeline vs. BM_AnyViewPipeline]/8192        +0.6424    +0.6423       30177       49563       30163       49537
+[BM_RawPipeline vs. BM_AnyViewPipeline]/16384       +0.6538    +0.6539       60751      100468       60720      100427
+[BM_RawPipeline vs. BM_AnyViewPipeline]/32768       +0.6524    +0.6521      121345      200514      121303      200404
+[BM_RawPipeline vs. BM_AnyViewPipeline]/65536       +0.6582    +0.6579      240378      398604      240326      398440
+[BM_RawPipeline vs. BM_AnyViewPipeline]/131072      +0.6861    +0.6860      484220      816458      484055      816109
+[BM_RawPipeline vs. BM_AnyViewPipeline]/262144      +0.6234    +0.6235      991733     1609940      991406     1609560
+OVERALL_GEOMEAN                                     +0.6782    +0.6782           0           0           0           0
+```
+
+This is slightly better: `any_view` is now about 40% - 70% slower on iteration. However, the comparison is still not very realistic, because nobody spells out the concrete type of a view pipeline. It is tedious to write, and it makes the implementation inflexible: any change to the implementation changes the type that has to be spelled in the header, which defeats the purpose of hiding implementation details in a separate TU.
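+
+As a hypothetical example of that brittleness (not part of the benchmark): merely dropping empty names in the implementation, by appending one more `std::views::filter` with a new `NonEmptyFn` member function object, forces the header to declare `NonEmptyFn` and to spell an even longer return type:
+
+```cpp
+// cpp file: one more filter appended to the pipeline means the header must now
+// also declare NonEmptyFn and spell this new nested return type.
+std::ranges::filter_view<
+    std::ranges::transform_view<
+        std::ranges::filter_view<std::ranges::ref_view<const std::vector<Widget>>,
+                                 UI::FilterFn>,
+        UI::TransformFn>,
+    UI::NonEmptyFn>
+UI::getWidgetNames() const {
+  return widgets_ | std::views::filter(UI::FilterFn{}) |
+         std::views::transform(UI::TransformFn{}) |
+         std::views::filter(UI::NonEmptyFn{});
+}
+```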
+
+#### Much more realistic case: A copy of `vector` vs `any_view`
+
+For situations like this, most code bases in the wild probably make the interface return a `std::vector<std::string>`, i.e. they use `std::vector` as the type-erasure tool, at the cost of copying all the elements.
+
+```cpp
+// hpp file
+struct UI {
+  std::vector<Widget> widgets_;
+
+  std::vector<std::string> getWidgetNames() const;
+};
+
+// cpp file
+std::vector<std::string> UI::getWidgetNames() const {
+  return widgets_ | std::views::filter([](const Widget& widget) {
+           return widget.size > 10;
+         }) |
+         std::views::transform(&Widget::name) | std::ranges::to<std::vector>();
+}
+```
+
+##### -O0
+
+```bash
+Benchmark                                             Time        CPU    Time Old    Time New     CPU Old     CPU New
+--------------------------------------------------------------------------------------------------------------------------------------------
+[BM_VectorCopy vs. BM_AnyViewPipeline]/1024        -0.5376    -0.5376      238558      110316      238396      110242
+[BM_VectorCopy vs. BM_AnyViewPipeline]/2048        -0.5110    -0.5110      454350      222187      454157      222104
+[BM_VectorCopy vs. BM_AnyViewPipeline]/4096        -0.4868    -0.4869      886121      454774      885773      454530
+[BM_VectorCopy vs. BM_AnyViewPipeline]/8192        -0.4766    -0.4769     1729318      905041     1728626      904303
+[BM_VectorCopy vs. BM_AnyViewPipeline]/16384       -0.4834    -0.4834     3462454     1788737     3461093     1788080
+[BM_VectorCopy vs. BM_AnyViewPipeline]/32768       -0.4858    -0.4729     7006102     3602475     6830520     3600306
+[BM_VectorCopy vs. BM_AnyViewPipeline]/65536       -0.4777    -0.4776    13741174     7176723    13734490     7175337
+[BM_VectorCopy vs. BM_AnyViewPipeline]/131072      -0.4792    -0.4792    27501856    14321826    27494923    14318104
+[BM_VectorCopy vs. BM_AnyViewPipeline]/262144      -0.4838    -0.4835    55950048    28883803    55912545    28879083
+OVERALL_GEOMEAN                                    -0.4917    -0.4903           0           0           0           0
+```
+
+##### -O2
+
+```bash
+Benchmark                                             Time        CPU    Time Old    Time New     CPU Old     CPU New
+--------------------------------------------------------------------------------------------------------------------------------------------
+[BM_VectorCopy vs. BM_AnyViewPipeline]/1024        -0.8228    -0.8228       35350        6264       35330        6262
+[BM_VectorCopy vs. BM_AnyViewPipeline]/2048        -0.8250    -0.8250       71983       12596       71953       12590
+[BM_VectorCopy vs. BM_AnyViewPipeline]/4096        -0.8320    -0.8320      148942       25018      148873       25005
+[BM_VectorCopy vs. BM_AnyViewPipeline]/8192        -0.8276    -0.8276      291307       50234      291198       50209
+[BM_VectorCopy vs. BM_AnyViewPipeline]/16384       -0.8304    -0.8304      590026      100058      589571      100020
+[BM_VectorCopy vs. BM_AnyViewPipeline]/32768       -0.8301    -0.8300     1175121      199685     1174459      199614
+[BM_VectorCopy vs. BM_AnyViewPipeline]/65536       -0.8297    -0.8298     2363963      402634     2363007      402209
+[BM_VectorCopy vs. BM_AnyViewPipeline]/131072      -0.8340    -0.8340     4841300      803467     4838717      803175
+[BM_VectorCopy vs. BM_AnyViewPipeline]/262144      -0.8463    -0.8463    10412999     1600341    10410152     1600078
+OVERALL_GEOMEAN                                    -0.8310    -0.8310           0           0           0           0
+```
+
+Boom! `any_view` is 50% - 80% faster. In the test cases, about 10% of the `Widget`s are filtered out by the filter in the pipeline, and each `name` string's length is a random number between 1 and 30, so some of the `string`s fit in the small buffer and some are heap-allocated. This code pattern is very common: making the code simple and clean at the cost of copying data, even though most of the callers do not need ownership at all.
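+
+To illustrate that last point, here is a hypothetical caller (not part of the benchmark). Because it only iterates over the names and never needs to own them, the exact same code compiles against both the copying `std::vector<std::string>` interface and the `std::ranges::any_view<const std::string&>` interface, but only the latter avoids materialising a temporary vector:
+
+```cpp
+#include <iostream>
+
+// Hypothetical caller: identical for either return type of getWidgetNames().
+void printWidgetNames(const UI& ui) {
+  for (const std::string& name : ui.getWidgetNames()) {
+    std::cout << name << '\n';
+  }
+}
+```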
+
+#### Some optimisations in the wild: `vector` vs `any_view`
+
+People who care about performance to some degree (and who, at the same time, need to keep the ABI boundary) sometimes make the interface return a `std::vector<std::reference_wrapper<const std::string>>` to save the copies of those `string`s:
+
+```cpp
+// hpp file
+struct UI {
+  std::vector<Widget> widgets_;
+
+  std::vector<std::reference_wrapper<const std::string>> getWidgetNames() const;
+};
+
+// cpp file
+std::vector<std::reference_wrapper<const std::string>> UI::getWidgetNames()
+    const {
+  return widgets_ | std::views::filter([](const Widget& widget) {
+           return widget.size > 10;
+         }) |
+         std::views::transform(
+             [](const Widget& widget) { return std::cref(widget.name); }) |
+         std::ranges::to<std::vector>();
+}
+```
+
+##### -O0
+
+```bash
+Benchmark                                                   Time        CPU    Time Old    Time New     CPU Old     CPU New
+--------------------------------------------------------------------------------------------------------------------------------------------------
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/1024        -0.3744    -0.3744      183525      114814      183467      114768
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/2048        -0.3757    -0.3759      368639      230131      368529      229985
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/4096        -0.3689    -0.3691      736658      464898      736390      464614
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/8192        -0.3821    -0.3820     1497364      925176     1496487      924814
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/16384       -0.3930    -0.3929     3004560     1823724     3002700     1823008
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/32768       -0.3841    -0.3841     5914763     3642657     5911948     3641151
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/65536       -0.3848    -0.3849    11823432     7273326    11820603     7270880
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/131072      -0.3864    -0.3863    23783665    14592934    23776966    14591191
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/262144      -0.3834    -0.3834    47455488    29259513    47445000    29253042
+OVERALL_GEOMEAN                                          -0.3815    -0.3815           0           0           0           0
+```
+
+##### -O2
+
+```bash
+Benchmark                                                   Time        CPU    Time Old    Time New     CPU Old     CPU New
+--------------------------------------------------------------------------------------------------------------------------------------------------
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/1024        +1.7685    +1.7690        2175        6022        2174        6020
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/2048        +1.7366    +1.7368        4476       12248        4474       12244
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/4096        +1.4270    +1.4270       10039       24363       10034       24354
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/8192        +0.8925    +0.8927       25901       49018       25888       48999
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/16384       +0.6378    +0.6378       59587       97590       59567       97558
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/32768       +0.5174    +0.5179      129966      197216      129883      197145
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/65536       +0.4826    +0.4826      265071      392994      264940      392799
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/131072      +0.4483    +0.4484      549000      795131      548800      794858
+[BM_VectorRefWrapper vs. BM_AnyViewPipeline]/262144      +0.2827    +0.2827     1240717     1591496     1240390     1590989
+OVERALL_GEOMEAN                                          +0.8370    +0.8371           0           0           0           0
+```
+
+This set of results is less consistent: depending on the optimisation level and the input size, `any_view` ranges from roughly 40% faster (at `-O0`) to roughly 80% slower on average (at `-O2`).
+
+#### Conclusion?
+
+In the cases where type erasure is needed, the performance of `any_view` is not bad at all, and it is sometimes even better than that of the most common solution used today: copying the data into a `std::vector`.
+
 # Implementation Experience
 
 - Reference implementation in the repo
diff --git a/impl/any_view/any_view_te.hpp b/impl/any_view/any_view_te.hpp
index 6e671d5..5923b2b 100644
--- a/impl/any_view/any_view_te.hpp
+++ b/impl/any_view/any_view_te.hpp
@@ -46,7 +46,7 @@ constexpr auto operator<=>(category lhs, category rhs) noexcept {
 }  // namespace __any_view
 
 template <class Ref, category Cat = category::input, class Value = remove_cvref_t<Ref>,
-          class RValueRef = add_rvalue_reference_t<Ref>,
+          class RValueRef = add_rvalue_reference_t<remove_reference_t<Ref>>,
           class Diff = ptrdiff_t>
 class any_view {
  public:
diff --git a/impl/any_view/benchmark/micro.cpp b/impl/any_view/benchmark/micro.cpp
new file mode 100644
index 0000000..5329be6
--- /dev/null
+++ b/impl/any_view/benchmark/micro.cpp
@@ -0,0 +1,31 @@
+#include <benchmark/benchmark.h>
+
+#include <ranges>
+#include <vector>
+
+#include "any_view.hpp"
+
+static void BM_vector(benchmark::State& state) {
+  std::vector<int> v =
+      std::views::iota(0, state.range(0)) | std::ranges::to<std::vector>();
+  for (auto _ : state) {
+    for (auto i : v) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+}
+// Register the function as a benchmark
+BENCHMARK(BM_vector)->RangeMultiplier(2)->Range(1 << 10, 1 << 18);
+
+// Define another benchmark
+static void BM_AnyView(benchmark::State& state) {
+  std::vector<int> v =
+      std::views::iota(0, state.range(0)) | std::ranges::to<std::vector>();
+  std::ranges::any_view<int&> av(std::views::all(v));
+  for (auto _ : state) {
+    for (auto i : av) {
+      benchmark::DoNotOptimize(i);
+    }
+  }
+}
+BENCHMARK(BM_AnyView)->RangeMultiplier(2)->Range(1 << 10, 1 << 18);
"abcdefghijklmnopqrstuvwxyz"; + std::random_device char_dev; + std::mt19937 char_rng(char_dev()); + std::uniform_int_distribution char_dist( + 0, sizeof(alphanum) - 1); + + std::random_device len_dev; + std::mt19937 len_rng(len_dev()); + std::uniform_int_distribution len_dist(1, 30); + + auto gen_next_str = [&]() { + int len = len_dist(len_rng); + std::string tmp_s; + tmp_s.reserve(len); + + for (int i = 0; i < len; ++i) { + tmp_s.push_back(alphanum[char_dist(char_rng)]); + } + + return tmp_s; + }; + + std::random_device w_dev; + std::mt19937 w_rng(w_dev()); + std::uniform_int_distribution w_dist(0, 100); + + auto gen_size = [&] { return w_dist(w_rng); }; + + for (auto i = 0; i < MaxSize; ++i) { + widgets.push_back(lib::Widget{gen_next_str(), gen_size()}); + } + return widgets; +} + +const auto global_widgets = generate_random_widgets(); +} // namespace + +using namespace lib; + +static void BM_AnyViewPipeline(benchmark::State& state) { + lib::UI1 ui1{global_widgets | std::views::take(state.range(0)) | + std::ranges::to()}; + for (auto _ : state) { + for (auto const& name : ui1.getWidgetNames()) { + benchmark::DoNotOptimize(const_cast(name)); + } + } +} +BENCHMARK(BM_AnyViewPipeline)->RangeMultiplier(2)->Range(1 << 10, 1 << 18); + +static void BM_RawPipeline(benchmark::State& state) { + lib::UI2 ui2{global_widgets | std::views::take(state.range(0)) | + std::ranges::to()}; + for (auto _ : state) { + for (const auto& name : ui2.getWidgetNames()) { + benchmark::DoNotOptimize(const_cast(name)); + } + } +} +// Register the function as a benchmark +BENCHMARK(BM_RawPipeline)->RangeMultiplier(2)->Range(1 << 10, 1 << 18); + +static void BM_VectorCopy(benchmark::State& state) { + lib::UI3 ui3{global_widgets | std::views::take(state.range(0)) | + std::ranges::to()}; + for (auto _ : state) { + for (const auto& name : ui3.getWidgetNames()) { + benchmark::DoNotOptimize(const_cast(name)); + } + } +} +// Register the function as a benchmark +BENCHMARK(BM_VectorCopy)->RangeMultiplier(2)->Range(1 << 10, 1 << 18); + +static void BM_VectorRefWrapper(benchmark::State& state) { + lib::UI4 ui4{global_widgets | std::views::take(state.range(0)) | + std::ranges::to()}; + for (auto _ : state) { + for (const auto& nameRef : ui4.getWidgetNames()) { + benchmark::DoNotOptimize(const_cast(nameRef.get())); + } + } +} +// Register the function as a benchmark +BENCHMARK(BM_VectorRefWrapper)->RangeMultiplier(2)->Range(1 << 10, 1 << 18); diff --git a/impl/any_view/benchmark/widget.cpp b/impl/any_view/benchmark/widget.cpp new file mode 100644 index 0000000..e1755db --- /dev/null +++ b/impl/any_view/benchmark/widget.cpp @@ -0,0 +1,46 @@ +#include "widget.hpp" + +namespace lib { + +std::ranges::any_view UI1::getWidgetNames() const { + return widgets_ | std::views::filter([](const Widget& widget) { + return widget.size > 10; + }) | + std::views::transform(&Widget::name); +} + +bool UI2::FilterFn::operator()(const Widget& widget) const { + return widget.size > 10; +} + +const std::string& UI2::TransformFn::operator()(const Widget& widget) const { + return widget.name; +} + +std::ranges::transform_view< + std::ranges::filter_view>, + UI2::FilterFn>, + UI2::TransformFn> +UI2::getWidgetNames() const { + return widgets_ | std::views::filter(UI2::FilterFn{}) | + std::views::transform(UI2::TransformFn{}); +} + +std::vector UI3::getWidgetNames() const { + return widgets_ | std::views::filter([](const Widget& widget) { + return widget.size > 10; + }) | + std::views::transform(&Widget::name) | std::ranges::to(); +} + +std::vector> 
diff --git a/impl/any_view/benchmark/widget.cpp b/impl/any_view/benchmark/widget.cpp
new file mode 100644
index 0000000..e1755db
--- /dev/null
+++ b/impl/any_view/benchmark/widget.cpp
@@ -0,0 +1,46 @@
+#include "widget.hpp"
+
+namespace lib {
+
+std::ranges::any_view<const std::string&> UI1::getWidgetNames() const {
+  return widgets_ | std::views::filter([](const Widget& widget) {
+           return widget.size > 10;
+         }) |
+         std::views::transform(&Widget::name);
+}
+
+bool UI2::FilterFn::operator()(const Widget& widget) const {
+  return widget.size > 10;
+}
+
+const std::string& UI2::TransformFn::operator()(const Widget& widget) const {
+  return widget.name;
+}
+
+std::ranges::transform_view<
+    std::ranges::filter_view<std::ranges::ref_view<const std::vector<Widget>>,
+                             UI2::FilterFn>,
+    UI2::TransformFn>
+UI2::getWidgetNames() const {
+  return widgets_ | std::views::filter(UI2::FilterFn{}) |
+         std::views::transform(UI2::TransformFn{});
+}
+
+std::vector<std::string> UI3::getWidgetNames() const {
+  return widgets_ | std::views::filter([](const Widget& widget) {
+           return widget.size > 10;
+         }) |
+         std::views::transform(&Widget::name) | std::ranges::to<std::vector>();
+}
+
+std::vector<std::reference_wrapper<const std::string>> UI4::getWidgetNames()
+    const {
+  return widgets_ | std::views::filter([](const Widget& widget) {
+           return widget.size > 10;
+         }) |
+         std::views::transform(
+             [](const Widget& widget) { return std::cref(widget.name); }) |
+         std::ranges::to<std::vector>();
+}
+
+}  // namespace lib
diff --git a/impl/any_view/benchmark/widget.hpp b/impl/any_view/benchmark/widget.hpp
new file mode 100644
index 0000000..457c222
--- /dev/null
+++ b/impl/any_view/benchmark/widget.hpp
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <functional>
+#include <ranges>
+#include <string>
+#include <vector>
+
+#include "any_view.hpp"
+
+namespace lib {
+
+struct Widget {
+  std::string name;
+  int size;
+};
+
+struct UI1 {
+  std::vector<Widget> widgets_;
+
+  std::ranges::any_view<const std::string&> getWidgetNames() const;
+};
+
+struct UI2 {
+  std::vector<Widget> widgets_;
+  // cannot use lambda because we need to spell the type of the view
+  struct FilterFn {
+    bool operator()(const Widget&) const;
+  };
+
+  struct TransformFn {
+    const std::string& operator()(const Widget&) const;
+  };
+  std::ranges::transform_view<
+      std::ranges::filter_view<std::ranges::ref_view<const std::vector<Widget>>,
+                               FilterFn>,
+      TransformFn>
+  getWidgetNames() const;
+};
+
+struct UI3 {
+  std::vector<Widget> widgets_;
+
+  std::vector<std::string> getWidgetNames() const;
+};
+
+struct UI4 {
+  std::vector<Widget> widgets_;
+
+  std::vector<std::reference_wrapper<const std::string>> getWidgetNames() const;
+};
+
+}  // namespace lib