From 6b148d59a068d97cfe709f70638ec8965fbc0991 Mon Sep 17 00:00:00 2001 From: Yanglin Zhang <56758695+lucky9-cyou@users.noreply.github.com> Date: Fri, 22 Dec 2023 14:42:07 +0800 Subject: [PATCH 1/3] feat: add clifford simulator(#1) * [feat]: init add qfvm clifford * [feat]: add simulate circuit using clifford * [fix]: fix compile bugs * [feat]: init add python * [feat]: add naive final measure * style: format code * fix: ms aligned malloc * fix: apple align malloc * ci: update macos os version to latest * ci: update macos os version to latest * ci: update eigen version * ci: update macos version min version * ci: update macos version min * ci: update pyproject * ci: update pyproject * feat: fix bugs, add clifford test --- .github/workflows/wheels.yml | 4 +- CMakeLists.txt | 10 +- pyproject.toml | 1 + quafu/results/results.py | 9 +- quafu/simulators/simulator.py | 13 + src/qfvm/qfvm.cpp | 87 ++++ src/qfvm/simulator.hpp | 46 ++ src/qfvm_clifford/bit.h | 55 +++ src/qfvm_clifford/bit_word.h | 512 ++++++++++++++++++++++ src/qfvm_clifford/circuit.h | 179 ++++++++ src/qfvm_clifford/clifford_simulator.h | 185 ++++++++ src/qfvm_clifford/gate_list.h | 246 +++++++++++ src/qfvm_clifford/gate_macro.h | 65 +++ src/qfvm_clifford/packed_bit_word.h | 350 +++++++++++++++ src/qfvm_clifford/packed_bit_word_slice.h | 307 +++++++++++++ src/qfvm_clifford/pauli.h | 176 ++++++++ src/qfvm_clifford/pauli_slice.h | 145 ++++++ src/qfvm_clifford/span_ref.h | 331 ++++++++++++++ src/qfvm_clifford/table.h | 347 +++++++++++++++ src/qfvm_clifford/tableau.h | 434 ++++++++++++++++++ src/qfvm_clifford/utils.h | 40 ++ tests/quafu/simulator/basis_test.py | 79 ++++ 22 files changed, 3614 insertions(+), 7 deletions(-) create mode 100644 src/qfvm_clifford/bit.h create mode 100644 src/qfvm_clifford/bit_word.h create mode 100644 src/qfvm_clifford/circuit.h create mode 100644 src/qfvm_clifford/clifford_simulator.h create mode 100644 src/qfvm_clifford/gate_list.h create mode 100644 src/qfvm_clifford/gate_macro.h 
create mode 100644 src/qfvm_clifford/packed_bit_word.h create mode 100644 src/qfvm_clifford/packed_bit_word_slice.h create mode 100644 src/qfvm_clifford/pauli.h create mode 100644 src/qfvm_clifford/pauli_slice.h create mode 100644 src/qfvm_clifford/span_ref.h create mode 100644 src/qfvm_clifford/table.h create mode 100644 src/qfvm_clifford/tableau.h create mode 100644 src/qfvm_clifford/utils.h diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 860c3ec..032a0a9 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -27,9 +27,9 @@ jobs: - os-arch: win_amd64 os: windows-2019 - os-arch: macosx_x86_64 - os: macos-11 + os: macos-13 - os-arch: macosx_arm64 - os: macos-11 + os: macos-13 runs-on: ${{ matrix.os }} env: diff --git a/CMakeLists.txt b/CMakeLists.txt index ee1910a..9595c4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,8 +9,9 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CUDA_ARCHITECTURES 70;75;80;90) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if(SKBUILD) execute_process( @@ -46,7 +47,7 @@ include(ExternalProject) ExternalProject_Add(Eigen3 PREFIX ${EIGEN3_ROOT} GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git - GIT_TAG 3.3.9 + GIT_TAG 3.4 CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -56,10 +57,11 @@ ExternalProject_Add(Eigen3 ) list (APPEND PRJ_INCLUDE_DIRS ${EIGEN3_INCLUDE_DIR}) - find_package(pybind11 CONFIG) list (APPEND PRJ_INCLUDE_DIRS ${PYBIND11_INCLUDE_DIR}) +set(MACOS_VERSION_MIN 10.14) + #SIMD if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64" OR CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "amd64") if(MSVC) @@ -74,7 +76,7 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_HOST_SYSTEM_PROCESSOR endif() endif() -list (APPEND PRJ_INCLUDE_DIRS src/qfvm) +list (APPEND PRJ_INCLUDE_DIRS src/qfvm src/qfvm_clifford) pybind11_add_module(${PROJECT_NAME} MODULE 
src/${PROJECT_NAME}/${PROJECT_NAME}.cpp) add_dependencies(${PROJECT_NAME} Eigen3) #must add dependence for ninja target_compile_options(${PROJECT_NAME} PUBLIC ${PRJ_COMPILE_OPTIONS}) diff --git a/pyproject.toml b/pyproject.toml index 6679926..1b10e0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ archs = ["x86_64"] [tool.cibuildwheel.macos] +environment = {MACOSX_DEPLOYMENT_TARGET = "13.6"} archs = ["x86_64", "arm64"] repair-wheel-command = [ diff --git a/quafu/results/results.py b/quafu/results/results.py index 1bcc211..d913d55 100644 --- a/quafu/results/results.py +++ b/quafu/results/results.py @@ -91,7 +91,11 @@ class SimuResult(Result): """ def __init__(self, input, input_form, count_dict: dict = None): - self.num = int(np.log2(input.shape[0])) + if input_form != "count_dict": + self.num = int(np.log2(input.shape[0])) + else: + # input is num qubits + self.num = input if input_form == "density_matrix": self.rho = np.array(input) self.probabilities = np.diag(input) @@ -99,6 +103,9 @@ def __init__(self, input, input_form, count_dict: dict = None): self.probabilities = input elif input_form == "state_vector": self.state_vector = input + elif input_form == "count_dict": + # do nothing, only count dict + pass # come form c++ simulator # TODO: add count for py_simu if count_dict is not None: diff --git a/quafu/simulators/simulator.py b/quafu/simulators/simulator.py index e4a6a10..4be6e19 100644 --- a/quafu/simulators/simulator.py +++ b/quafu/simulators/simulator.py @@ -32,6 +32,7 @@ def simulate( shots: int = 100, use_gpu: bool = False, use_custatevec: bool = False, + use_clifford: bool = False, ) -> SimuResult: """Simulate quantum circuit Args: @@ -86,6 +87,7 @@ def simulate( if use_gpu: if qc.executable_on_backend == False: raise QuafuError("classical operation only support for `qfvm_qasm`") + if use_custatevec: try: from .qfvm import simulate_circuit_custate @@ -101,6 +103,14 @@ def simulate( else: count_dict, psi = simulate_circuit(qc, psi, 
shots) + elif simulator == "qfvm_clifford": + try: + from .qfvm import simulate_circuit_clifford + except ImportError: + raise QuafuError("you are not using the clifford version of pyquafu") + + count_dict = simulate_circuit_clifford(qc, shots) + elif simulator == "py_simu": if qc.executable_on_backend == False: raise QuafuError("classical operation only support for `qfvm_qasm`") @@ -129,6 +139,9 @@ def simulate( elif output == "state_vector": return SimuResult(psi, output, count_dict) + elif output == "count_dict": + return SimuResult(max(qc.used_qubits) + 1, output, count_dict) + else: raise ValueError( "output should in be 'density_matrix', 'probabilities', or 'state_vector'" diff --git a/src/qfvm/qfvm.cpp b/src/qfvm/qfvm.cpp index f3abd10..913262d 100644 --- a/src/qfvm/qfvm.cpp +++ b/src/qfvm/qfvm.cpp @@ -3,6 +3,8 @@ #include #include #include +#include + #ifdef _USE_GPU #include #endif @@ -11,6 +13,12 @@ #include #endif +#ifdef USE_SIMD +constexpr size_t _word_size = 256; +#else +constexpr size_t _word_size = 64; +#endif + namespace py = pybind11; template @@ -115,6 +123,82 @@ simulate_circuit(py::object const& pycircuit, return std::make_pair(outcount, np_inputstate); } +std::map simulate_circuit_clifford(py::object const& pycircuit, + const int& shots) { + + auto circuit = Circuit(pycircuit); + + // If measure all at the end, simulate once + uint actual_shots = shots; + + // qbit, cbit + vector> measures = circuit.measure_vec(); + std::map cbit_measured; + for (auto& pair : measures) { + cbit_measured[pair.second] = true; + } + + // Store outcome's count + std::map outcount; + + circuit_simulator<_word_size> cs(circuit.qubit_num()); + + for (uint i = 0; i < actual_shots; i++) { + + simulate(circuit, cs); + uint outcome = 0; + + if (!circuit.final_measure()) { + // qubit, cbit, measure result + auto measure_results = cs.current_measurement_record(); + + // make sure the order is the same with other simulators + std::sort( + measure_results.begin(), 
measure_results.end(), + [](auto& a, auto& b) { return std::get<1>(a) < std::get<1>(b); }); + + for (auto& measure_result : measure_results) { + outcome *= 2; + outcome += std::get<2>(measure_result); + } + + } else if (circuit.final_measure() && !measures.empty()) { + for (auto& measure : measures) { + cs.do_circuit_instruction( + {"measure", std::vector{measure.first}, + std::vector{static_cast(measure.second)}}); + } + + // qubit, cbit, measure result + auto measure_results = cs.current_measurement_record(); + + // make sure the order is the same with other simulators + std::sort( + measure_results.begin(), measure_results.end(), + [](auto& a, auto& b) { return std::get<1>(a) < std::get<1>(b); }); + + for (auto& measure_result : measure_results) { + outcome *= 2; + outcome += std::get<2>(measure_result); + } + } + + if (measures.empty()) { + continue; + } + + if (outcount.find(outcome) != outcount.end()) + outcount[outcome]++; + else + outcount[outcome] = 1; + + cs.reset_tableau(); + cs.sim_record.clear(); + } + + return outcount; +} + #ifdef _USE_GPU py::object simulate_circuit_gpu(py::object const& pycircuit, py::array_t>& np_inputstate) { @@ -164,6 +248,9 @@ PYBIND11_MODULE(qfvm, m) { py::arg("circuit"), py::arg("inputstate") = py::array_t>(0), py::arg("shots")); + m.def("simulate_circuit_clifford", &simulate_circuit_clifford, + "Simulate with circuit using clifford", py::arg("circuit"), + py::arg("shots")); #ifdef _USE_GPU m.def("simulate_circuit_gpu", &simulate_circuit_gpu, "Simulate with circuit", diff --git a/src/qfvm/simulator.hpp b/src/qfvm/simulator.hpp index 1e2663d..9e88fb6 100644 --- a/src/qfvm/simulator.hpp +++ b/src/qfvm/simulator.hpp @@ -1,7 +1,12 @@ #pragma once #include "circuit.hpp" +#include "clifford_simulator.h" +#include "qasm.hpp" #include "statevector.hpp" +#include "types.hpp" +#include +#include void apply_op(QuantumOperator& op, StateVector& state) { bool matched = false; @@ -114,3 +119,44 @@ void simulate(Circuit const& circuit, 
StateVector& state) { apply_op(op, state); } } + +template +void apply_measure(circuit_simulator& cs, const vector& qbits, + const vector& cbits) { + for (size_t i = 0; i < qbits.size(); i++) { + cs.do_circuit_instruction( + {"measure", std::vector{qbits[i]}, + std::vector{static_cast(cbits[i])}}); + } +} + +template +void apply_op(QuantumOperator& op, circuit_simulator& cs) { + // TODO: support args + switch (OPMAP[op.name()]) { + case Opname::measure: + apply_measure(cs, op.qbits(), op.cbits()); + break; + case Opname::reset: + for (auto qubit : op.qbits()) { + cs.do_circuit_instruction( + {"reset", std::vector{static_cast(qubit)}}); + } + break; + default: + auto qubits = op.positions(); + cs.do_circuit_instruction( + {op.name(), std::vector(qubits.begin(), qubits.end())}); + } +} + +template +void simulate(Circuit const& circuit, circuit_simulator& cs) { + // skip measure and handle it in qfvm.cpp + bool skip_measure = circuit.final_measure(); + for (auto op : circuit.instructions()) { + if (skip_measure == true && op.name() == "measure") + continue; + apply_op(op, cs); + } +} diff --git a/src/qfvm_clifford/bit.h b/src/qfvm_clifford/bit.h new file mode 100644 index 0000000..1be3467 --- /dev/null +++ b/src/qfvm_clifford/bit.h @@ -0,0 +1,55 @@ +#ifndef BIT_H_ +#define BIT_H_ + +#include +#include + +// bit in byte +struct bit { + uint8_t* byte; + uint8_t byte_index; + + bit(void* ptr, size_t offset) + : byte(((uint8_t*)ptr + (offset / 8))), byte_index(offset & 7) {} + + // copy assignment for bit in byte + inline bit& operator=(bool value) { + // make bit be 0 + *byte &= ~((uint8_t)1 << byte_index); + // assignment + *byte |= uint8_t(value) << byte_index; + return *this; + } + + inline bit& operator=(const bit& other) { + *this = bool(other); + return *this; + } + + // bit operator + inline bit& operator^=(bool value) { + *byte ^= uint8_t(value) << byte_index; + return *this; + } + + inline bit& operator&=(bool value) { + *byte &= (uint8_t(value) << byte_index) | 
~(uint8_t(1) << byte_index); + return *this; + } + + inline bit& operator|=(bool value) { + *byte |= uint8_t(value) << byte_index; + return *this; + } + + // conversion operator + inline operator bool() const { return (*byte >> byte_index) & 1; } + + void swap(bit other) { + bool b = bool(other); + other = bool(*this); + *this = b; + } +}; + +#endif diff --git a/src/qfvm_clifford/bit_word.h b/src/qfvm_clifford/bit_word.h new file mode 100644 index 0000000..e832964 --- /dev/null +++ b/src/qfvm_clifford/bit_word.h @@ -0,0 +1,512 @@ +#ifndef TABLEAU_WORD_H_ +#define TABLEAU_WORD_H_ + +#include "utils.h" +#include +#include +#include +#include + +#ifdef USE_SIMD +#ifdef _MSC_VER +#include +#else +#include +#endif +#endif + +#include +#include + +// A bit_word is a bag of bits, which can be operated by individual CPU +// instructions. +// This template is necessary due to the varying interfaces between native types +// such as uint64_t and intrinsics like __m256i across different architectures +// and operating systems. In certain contexts, operators can be used on __m256i +// values (e.g. a ^= b), while in others this is not possible. The bitword +// template implementations establish a standard set of methods that are +// essential for Clifford's operation, allowing the same code to be compiled +// using either 256-bit or 64-bit registers depending on what is appropriate. 
+template struct bit_word; + +/* =================================================== */ +/* ================ bit_word operation =============== */ + +template +inline bool operator==(const bit_word& left, + const bit_word& right) { + return left.to_u64_array() == right.to_u64_array(); +} + +template +inline bool operator!=(const bit_word& left, + const bit_word& right) { + return !(left == right); +} + +template +inline bool operator<(const bit_word& left, + const bit_word& right) { + auto array1 = left.to_u64_array(); + auto array2 = right.to_u64_array(); + + for (size_t i = 0; i < array1.size(); ++i) { + if (array1[i] != array2[i]) { + return array1[i] < array2[i]; + } + } + return false; +} + +template +inline bool operator==(const bit_word& left, int right) { + return left == bit_word(right); +} + +template +inline bool operator!=(const bit_word& left, int right) { + return left != right; +} + +template +inline bool operator==(const bit_word& left, uint64_t right) { + return left == bit_word(right); +} + +template +inline bool operator!=(const bit_word& left, uint64_t right) { + return left != right; +} + +template +inline bool operator==(const bit_word& left, int64_t right) { + return left == bit_word(right); +} + +template +inline bool operator!=(const bit_word& left, int64_t right) { + return left != right; +} + +// output 1 for bit 1, . 
for bit 0 +template +std::ostream& operator<<(std::ostream& os, const bit_word& word) { + os << "bit_word<" << word_size << ">{"; + auto array1 = word.to_u64_array(); + for (size_t i = 0; i < array1.size(); ++i) { + for (size_t j = 0; j < 64; ++j) { + if ((i | j) && (j & 7) == 0) { + os << ' '; + } + // ".1" is a char array + os << ".1"[(array1[i] >> j) & 1]; + } + } + os << "}"; + return os; +} + +template +inline bit_word operator<<(const bit_word& word, + int offset) { + return word.shift(offset); +} + +template +inline bit_word operator>>(const bit_word& word, + int offset) { + return word.shift(-offset); +} + +template +inline bit_word operator<<=(bit_word& word, int offset) { + return word = word.shift(offset); +} + +template +inline bit_word operator>>=(bit_word& word, int offset) { + return word = word.shift(-offset); +} + +template +inline bit_word operator<<(const bit_word& word, + uint64_t offset) { + return word.shift((int)offset); +} + +template +inline bit_word operator>>(const bit_word& word, + uint64_t offset) { + return word.shift(-(int)offset); +} + +template +inline bit_word operator<<=(bit_word& word, + uint64_t offset) { + return word = word.shift((int)offset); +} + +template +inline bit_word operator>>=(bit_word& word, + uint64_t offset) { + return word = word.shift(-(int)offset); +} + +template +inline bit_word operator&(const bit_word& word, + uint64_t mask) { + return word & bit_word(mask); +} + +template +inline bit_word operator|(const bit_word& word, + uint64_t mask) { + return word | bit_word(mask); +} + +template +inline bit_word operator^(const bit_word& word, + uint64_t mask) { + return word ^ bit_word(mask); +} + +template +inline bit_word operator&(const bit_word& word, + int64_t mask) { + return word & bit_word(mask); +} + +template +inline bit_word operator|(const bit_word& word, + int64_t mask) { + return word | bit_word(mask); +} + +template +inline bit_word operator^(const bit_word& word, + int64_t mask) { + return word ^ 
bit_word(mask); +} + +template +inline bit_word operator&(const bit_word& word, + int mask) { + return word & bit_word(mask); +} + +template +inline bit_word operator|(const bit_word& word, + int mask) { + return word | bit_word(mask); +} + +template +inline bit_word operator^(const bit_word& word, + int mask) { + return word ^ bit_word(mask); +} + +/* =================================================== */ +/* ================ 64 bit version =================== */ +template <> struct bit_word<64> { + constexpr static size_t WORD_SIZE = 64; + constexpr static size_t BIT_POW = 6; + + union { + uint8_t u8[8]; + uint64_t u64[1]; + }; + + inline constexpr bit_word<64>() : u64{} {} + inline constexpr bit_word<64>(uint64_t v) : u64{v} {} + inline constexpr bit_word<64>(int64_t v) : u64{(uint64_t)v} {} + inline constexpr bit_word<64>(int v) : u64{(uint64_t)v} {} + + inline operator bool() const { return bool(u64[0]); } + inline operator int64_t() const { return int64_t(u64[0]); } + inline operator int() const { return int64_t(*this); } + inline operator uint64_t() const { return u64[0]; } + + inline bit_word<64>& operator^=(const bit_word<64>& other) { + u64[0] ^= other.u64[0]; + return *this; + } + + inline bit_word<64>& operator&=(const bit_word<64>& other) { + u64[0] &= other.u64[0]; + return *this; + } + + inline bit_word<64>& operator|=(const bit_word<64>& other) { + u64[0] |= other.u64[0]; + return *this; + } + + inline bit_word<64> operator^(const bit_word<64>& other) const { + return bit_word<64>(u64[0] ^ other.u64[0]); + } + + inline bit_word<64> operator&(const bit_word<64>& other) const { + return bit_word<64>(u64[0] & other.u64[0]); + } + + inline bit_word<64> operator|(const bit_word<64>& other) const { + return bit_word<64>(u64[0] | other.u64[0]); + } + + inline bit_word<64> andnot(const bit_word<64>& other) const { + return bit_word<64>(~u64[0] & other.u64[0]); + } + + // convert bit word to string + operator std::string() const { + std::stringstream ss; + 
ss << *this; + return ss.str(); + } + + std::array to_u64_array() const { + return std::array{u64[0]}; + } + + inline bit_word<64> shift(int offset) const { + auto array1 = u64[0]; + if (64 <= offset || -64 >= offset) { + return 0; + } else if (offset > 0) { + return bit_word<64>(array1 << offset); + } else { + return bit_word<64>(array1 >> -offset); + } + } + + inline uint16_t count() const { return count_uint64_bits(u64[0]); } + + static void* aligned_malloc(size_t bits) { return malloc(bits); } + + static void aligned_free(void* ptr) { free(ptr); } + + template + static void inplace_transpose_64_step(uint64_t* data, size_t stride) { + for (size_t k = 0; k < 64; k++) { + if (k & shift) + continue; + uint64_t& x = data[stride * k]; + uint64_t& y = data[stride * (k + shift)]; + uint64_t a = x & mask; + uint64_t b = x & ~mask; + uint64_t c = y & mask; + uint64_t d = y & ~mask; + x = a | (c << shift); + y = (b >> shift) | d; + } + } + + static void inplace_transpose_square(bit_word<64>* block_start, + size_t stride) { + inplace_transpose_64_step<0x5555555555555555ull, 1>((uint64_t*)block_start, + stride); + inplace_transpose_64_step<0x3333333333333333ull, 2>((uint64_t*)block_start, + stride); + inplace_transpose_64_step<0x0F0F0F0F0F0F0F0Full, 4>((uint64_t*)block_start, + stride); + inplace_transpose_64_step<0x00FF00FF00FF00FFull, 8>((uint64_t*)block_start, + stride); + inplace_transpose_64_step<0x0000FFFF0000FFFFull, 16>((uint64_t*)block_start, + stride); + inplace_transpose_64_step<0x00000000FFFFFFFFull, 32>((uint64_t*)block_start, + stride); + } +}; + +#ifdef USE_SIMD +/* =================================================== */ +/* ================ 256 bit version ================== */ +template <> struct bit_word<256> { + constexpr static size_t WORD_SIZE = 256; + constexpr static size_t BIT_POW = 8; + + union { + uint8_t u8[32]; + uint64_t u64[4]; + __m256i m256; + }; + + inline constexpr bit_word<256>() : m256(__m256i{}) {} + inline constexpr bit_word<256>(__m256i 
v) : m256(v) {} + inline bit_word<256>(uint64_t v) : m256{_mm256_set_epi64x(0, 0, 0, v)} {} + inline bit_word<256>(int64_t v) + : m256{_mm256_set_epi64x(-(v < 0), -(v < 0), -(v < 0), v)} {} + inline bit_word<256>(int v) + : m256{_mm256_set_epi64x(-(v < 0), -(v < 0), -(v < 0), v)} {} + + inline operator bool() const { + return bool(u64[0] | u64[1] | u64[2] | u64[3]); + } + inline operator int64_t() const { + auto words = to_u64_array(); + // x86 is little endian default + int64_t result = int64_t(words[0]); + uint64_t expected = result < 0 ? uint64_t(-1) : uint64_t(0); + if (words[1] != expected || words[2] != expected || words[3] != expected) { + throw std::runtime_error("int64_t overflow"); + } + return result; + } + inline operator int() const { return int64_t(*this); } + inline operator uint64_t() const { + if (u64[1] || u64[2] || u64[3]) { + throw std::runtime_error("uint64_t overflow"); + } + return u64[0]; + } + + inline bit_word<256>& operator^=(const bit_word<256>& other) { + m256 = _mm256_xor_si256(m256, other.m256); + return *this; + } + + inline bit_word<256>& operator&=(const bit_word<256>& other) { + m256 = _mm256_and_si256(m256, other.m256); + return *this; + } + + inline bit_word<256>& operator|=(const bit_word<256>& other) { + m256 = _mm256_or_si256(m256, other.m256); + return *this; + } + + inline bit_word<256> operator^(const bit_word<256>& other) const { + return bit_word<256>(_mm256_xor_si256(m256, other.m256)); + } + + inline bit_word<256> operator&(const bit_word<256>& other) const { + return bit_word<256>(_mm256_and_si256(m256, other.m256)); + } + + inline bit_word<256> operator|(const bit_word<256>& other) const { + return bit_word<256>(_mm256_or_si256(m256, other.m256)); + } + + inline bit_word<256> andnot(const bit_word<256>& other) const { + return bit_word<256>(_mm256_andnot_si256(m256, other.m256)); + } + + // convert bit word to string + operator std::string() const { + std::stringstream ss; + ss << *this; + return ss.str(); + } + + 
std::array to_u64_array() const { + return std::array{u64[0], u64[1], u64[2], u64[3]}; + } + + inline bit_word<256> shift(int offset) const { + auto array = to_u64_array(); + while (offset <= -64) { + array[0] = array[1]; + array[1] = array[2]; + array[2] = array[3]; + array[3] = 0; + offset += 64; + } + + while (offset >= 64) { + array[3] = array[2]; + array[2] = array[1]; + array[1] = array[0]; + array[0] = 0; + offset -= 64; + } + + __m256i low2high; + __m256i high2low; + if (offset < 0) { + low2high = _mm256_set_epi64x(0, array[3], array[2], array[1]); + high2low = _mm256_set_epi64x(array[3], array[2], array[1], array[0]); + offset += 64; + } else { + low2high = _mm256_set_epi64x(array[3], array[2], array[1], array[0]); + high2low = _mm256_set_epi64x(array[2], array[1], array[0], 0); + } + + uint64_t mask = (uint64_t{1} << offset) - 1; + low2high = _mm256_slli_epi64(low2high, offset); + high2low = _mm256_srli_epi64(high2low, 64 - offset); + // for offset < 0, only w[1] in lower 64 bits is used + low2high = _mm256_and_si256(low2high, _mm256_set1_epi64x(~mask)); + // for offset > 0, only w[0] in upper 64 bits is used + high2low = _mm256_and_si256(high2low, _mm256_set1_epi64x(mask)); + return _mm256_or_si256(low2high, high2low); + } + + inline uint16_t count() const { + return count_uint64_bits(u64[0]) + count_uint64_bits(u64[1]) + + count_uint64_bits(u64[2]) + count_uint64_bits(u64[3]); + } + + static void* aligned_malloc(size_t bits) { + return _mm_malloc(bits, sizeof(__m256i)); + } + + static void aligned_free(void* ptr) { _mm_free(ptr); } + + template + static void inplace_transpose_256_step(__m256i mask, __m256i* data, + size_t stride) { + for (std::size_t k = 0; k < 256; k++) { + if (k & shift) + continue; + + __m256i& x = data[stride * k]; + __m256i& y = data[stride * (k + shift)]; + __m256i a = _mm256_and_si256(x, mask); + __m256i b = _mm256_andnot_si256(mask, x); + __m256i c = _mm256_and_si256(y, mask); + __m256i d = _mm256_andnot_si256(mask, y); + + x = 
_mm256_or_si256(a, _mm256_slli_epi64(c, shift)); + y = _mm256_or_si256(_mm256_srli_epi64(b, shift), d); + } + } + + static void inplace_transpose_64_and_128_step(bit_word<256>* data, + size_t stride) { + uint64_t* u64_ptr = (uint64_t*)data; + stride <<= 2; + for (std::size_t k = 0; k < 64; k++) { + std::swap(u64_ptr[stride * (k + 64 * 0) + 1], + u64_ptr[stride * (k + 64 * 1) + 0]); + std::swap(u64_ptr[stride * (k + 64 * 0) + 2], + u64_ptr[stride * (k + 64 * 2) + 0]); + std::swap(u64_ptr[stride * (k + 64 * 0) + 3], + u64_ptr[stride * (k + 64 * 3) + 0]); + std::swap(u64_ptr[stride * (k + 64 * 1) + 2], + u64_ptr[stride * (k + 64 * 2) + 1]); + std::swap(u64_ptr[stride * (k + 64 * 1) + 3], + u64_ptr[stride * (k + 64 * 3) + 1]); + std::swap(u64_ptr[stride * (k + 64 * 2) + 3], + u64_ptr[stride * (k + 64 * 3) + 2]); + } + } + + static void inplace_transpose_square(bit_word<256>* data, size_t stride) { + inplace_transpose_256_step<1>(_mm256_set1_epi8(0x55), (__m256i*)data, + stride); + inplace_transpose_256_step<2>(_mm256_set1_epi8(0x33), (__m256i*)data, + stride); + inplace_transpose_256_step<4>(_mm256_set1_epi8(0x0F), (__m256i*)data, + stride); + inplace_transpose_256_step<8>(_mm256_set1_epi16(0x00FF), (__m256i*)data, + stride); + inplace_transpose_256_step<16>(_mm256_set1_epi32(0x0000FFFF), + (__m256i*)data, stride); + inplace_transpose_256_step<32>(_mm256_set1_epi64x(0x00000000FFFFFFFFull), + (__m256i*)data, stride); + } +}; +#endif + +#endif diff --git a/src/qfvm_clifford/circuit.h b/src/qfvm_clifford/circuit.h new file mode 100644 index 0000000..958bad6 --- /dev/null +++ b/src/qfvm_clifford/circuit.h @@ -0,0 +1,179 @@ +#ifndef CIRCUIT_H_ +#define CIRCUIT_H_ + +#include "span_ref.h" +#include +#include +#include +#include + +// gate instruction, include gate name, target qubits and arguments +struct circuit_instruction { + std::string gate; + span_ref targets; + span_ref args; + + circuit_instruction() = delete; + circuit_instruction(std::string gate, span_ref targets, 
+ span_ref args = {}) + : gate(gate), targets(targets), args(args) {} + + // TODO: add validation for instruction + void validate() {} +}; + +inline std::ostream& operator<<(std::ostream& os, + const circuit_instruction& instr) { + os << instr.gate << " " + << "targets: "; + for (auto& target : instr.targets) { + os << target << " "; + } + + if (instr.args.size() > 0) { + os << "args: "; + for (auto& arg : instr.args) { + os << arg << " "; + } + } + os << "\n"; + return os; +} + +// quantum circuit, include a list of gate instructions, and buffer for targets +// and args +struct quantum_circuit { + std::vector instr_list; + monotonic_buffer targets_buf; + monotonic_buffer args_buf; + + quantum_circuit() : instr_list(), targets_buf(), args_buf() {} + quantum_circuit(const std::string& gate, const std::vector& targets, + const std::vector& args = {}) + : instr_list(), targets_buf(), args_buf() { + _append(gate, targets, args); + } + + // copy constructor + quantum_circuit(const quantum_circuit& other) + : instr_list(other.instr_list), + targets_buf(other.targets_buf.total_allocated()), + args_buf(other.args_buf.total_allocated()) { + + // take copy of targets and args + for (auto& instr : instr_list) { + instr.targets = targets_buf.take_copy(instr.targets); + instr.args = args_buf.take_copy(instr.args); + } + }; + + // move constructor + quantum_circuit(quantum_circuit&& other) noexcept + : instr_list(std::move(other.instr_list)), + targets_buf(std::move(other.targets_buf)), + args_buf(std::move(other.args_buf)) {} + + // copy assignment + quantum_circuit& operator=(const quantum_circuit& other) { + instr_list = other.instr_list; + targets_buf = monotonic_buffer(other.targets_buf.total_allocated()); + args_buf = monotonic_buffer(other.args_buf.total_allocated()); + for (auto& instr : instr_list) { + instr.targets = targets_buf.take_copy(instr.targets); + instr.args = args_buf.take_copy(instr.args); + } + return *this; + } + + // move assignment + quantum_circuit& 
operator=(quantum_circuit&& other) noexcept { + instr_list = std::move(other.instr_list); + targets_buf = std::move(other.targets_buf); + args_buf = std::move(other.args_buf); + return *this; + } + + // concatenate two quantum circuits + quantum_circuit& operator+=(const quantum_circuit& other) { + span_ref other_instrs(other.instr_list); + + if (&other == this) { + instr_list.insert(instr_list.end(), other_instrs.begin(), + other_instrs.end()); + return *this; + } + + for (auto& instr : other_instrs) { + auto instr_targets = targets_buf.take_copy(instr.targets); + auto instr_args = args_buf.take_copy(instr.args); + instr_list.push_back({instr.gate, instr_targets, instr_args}); + } + + return *this; + } + + // concatenate two quantum circuits + quantum_circuit operator+(const quantum_circuit& other) { + quantum_circuit result(*this); + result += other; + return result; + } + + friend std::ostream& operator<<(std::ostream& os, const quantum_circuit& qc) { + for (auto& instr : qc.instr_list) { + os << instr; + } + return os; + } + + operator std::string() const { return str(); } + + std::string str() const { + std::stringstream ss; + ss << *this; + return ss.str(); + } + + quantum_circuit& append(const std::string& gate, + const std::vector& targets, + const std::vector& args = {}) { + + _append(gate, targets, args); + + return *this; + } + + quantum_circuit& _append(const std::string& gate, + span_ref targets, + span_ref args) { + circuit_instruction instr(gate, targets, args); + instr.validate(); + + // TODO: fuse some instr + instr.targets = targets_buf.take_copy(targets); + instr.args = args_buf.take_copy(args); + instr_list.push_back(instr); + + return *this; + } + + size_t max_qubit() const { + size_t max_qubit = 0; + for (auto& instr : instr_list) { + for (auto& target : instr.targets) { + max_qubit = std::max(max_qubit, target); + } + } + + return max_qubit; + } + + template + void for_each_circuit_instruction(const func& callback) const { + for (auto& instr 
: instr_list) { + callback(instr); + } + } +}; + +#endif diff --git a/src/qfvm_clifford/clifford_simulator.h b/src/qfvm_clifford/clifford_simulator.h new file mode 100644 index 0000000..cc01cf6 --- /dev/null +++ b/src/qfvm_clifford/clifford_simulator.h @@ -0,0 +1,185 @@ +#ifndef SIMULATOR_H_ +#define SIMULATOR_H_ + +#include "circuit.h" +#include "span_ref.h" +#include "table.h" +#include "tableau.h" +#include +#include +#include +#include + +// strore measurement result +struct measurement_record { + using result = std::tuple; + std::vector storage; + + void record(size_t qubit, size_t cbit, bool result) { + storage.push_back({qubit, cbit, result}); + } + auto operator[](size_t index) { return storage[index]; } + auto const operator[](size_t index) const { return storage[index]; } + auto begin() { return storage.begin(); } + auto end() { return storage.end(); } + auto begin() const { return storage.begin(); } + auto end() const { return storage.end(); } + auto cbegin() const { return storage.cbegin(); } + auto cend() const { return storage.cend(); } + auto size() const { return storage.size(); } + void clear() { storage.clear(); } +}; + +// quantum circuit simulator, include original tableau, measurement record and a +// random number generator +template struct circuit_simulator { + + tableau sim_tableau; + measurement_record sim_record; + std::mt19937_64 rng; + + // constructor + // The number of qubits is specified by num_qubits. And the random number + // generator is used to generate random numbers for error gate and + // measurement. 
+ explicit circuit_simulator(size_t num_qubits, size_t seed = 42) + : sim_tableau(num_qubits), sim_record(), rng() { + rng.seed(seed); + } + + // do a quantum gate except measurement gate + template + result unpack_vector(auto f, tableau& t, auto& vec, + std::index_sequence) { + + // TODO: maybe can be better + return std::get(f)(t, vec[S]...); + } + + template + result unpack_vector(auto f, tableau& t, auto& vec) { + if (vec.size() != vec_size) + throw std::runtime_error("wrong number of qubits for gate"); + return unpack_vector(f, t, vec, + std::make_index_sequence()); + } + + // do a measurement gate + template + result unpack_vector(auto f, tableau& t, auto& vec, + std::mt19937_64& rng, std::index_sequence) { + + // TODO: maybe can be better + return std::get(f)(t, rng, vec[S]...); + } + + template + result unpack_vector(auto f, tableau& t, auto& vec, + std::mt19937_64& rng) { + if (vec.size() != vec_size) + throw std::runtime_error("wrong number of qubits for gate"); + return unpack_vector(f, t, vec, rng, std::make_index_sequence()); + } + + // do a quantum circuit, check the max qubit of the quantum circuit, if it is + // larger than the number of qubits of the tableau, expand the tableau + void do_circuit(const quantum_circuit& qc) { + if (qc.max_qubit() >= sim_tableau.num_qubits) { + sim_tableau.expand(qc.max_qubit(), 1.0); + std::cerr << "WARNING: expanding tableau to " << qc.max_qubit() + << " qubits\n"; + } + + // according quantum circuit instruction type to do the instruction + qc.for_each_circuit_instruction([&](const circuit_instruction& ci) { + if (!gate_map.contains(ci.gate + "_gate")) { + throw std::runtime_error("unknown gate"); + } + + auto pair = gate_map[ci.gate + "_gate"]; + if (SINGLE_QUBIT_GATE == pair.first) { + unpack_vector<1>(pair.second, sim_tableau, ci.targets); + } else if (TWO_QUBIT_GATE == pair.first) { + unpack_vector<2>(pair.second, sim_tableau, ci.targets); + } else if (COLLAPSING_GATE == pair.first) { + auto record = + 
unpack_vector<1>(pair.second, sim_tableau, ci.targets, rng); + if (record.has_value()) + sim_record.record(ci.targets[0], static_cast(ci.args[0]), + record.value()); + } else if (ERROR_QUBIT_GATE == pair.first) { + std::bernoulli_distribution d(ci.args[0]); + if (d(rng)) + unpack_vector<1>(pair.second, sim_tableau, ci.targets); + } else { + throw std::runtime_error("unknown gate"); + } + }); + } + + // do a quantum circuit instruction + void do_circuit_instruction(const circuit_instruction& ci) { + + if (!gate_map.contains(ci.gate + "_gate")) { + throw std::runtime_error("unknown gate"); + } + + auto pair = gate_map[ci.gate + "_gate"]; + + if (SINGLE_QUBIT_GATE == pair.first) { + unpack_vector<1>(pair.second, sim_tableau, ci.targets); + } else if (TWO_QUBIT_GATE == pair.first) { + unpack_vector<2>(pair.second, sim_tableau, ci.targets); + } else if (COLLAPSING_GATE == pair.first) { + auto record = unpack_vector<1>(pair.second, sim_tableau, ci.targets, rng); + if (record.has_value()) + sim_record.record(ci.targets[0], static_cast(ci.args[0]), + record.value()); + } else if (ERROR_QUBIT_GATE == pair.first) { + std::bernoulli_distribution d(ci.args[0]); + if (d(rng)) + unpack_vector<1>(pair.second, sim_tableau, ci.targets); + } else { + throw std::runtime_error("unknown gate"); + } + } + + // sample the quantum circuit, after each iteration, reset the tableau to + // identity + void sample(const quantum_circuit& qc, size_t num_samples) { + for (size_t i = 0; i < num_samples; i++) { + reset_tableau(); + do_circuit(qc); + } + } + + // reset the tableau to identity + void reset_tableau() { sim_tableau.reset(); } + + // z-basis reset + void reset(size_t qubit) { sim_tableau.reset(rng, qubit); } + + // // x-basis reset + // void reset_x(size_t qubit) { sim_tableau.reset_x(rng, qubit); } + + // // y-basis reset + // void reset_y(size_t qubit) { sim_tableau.reset_y(rng, qubit); } + + // measurement record size + size_t record_size() { return sim_record.size(); } + + // get 
the measurement record + auto current_measurement_record() const { return sim_record; } + + auto measure_all() { + measurement_record mr; + for (size_t i = 0; i < sim_tableau.num_qubits; i++) { + auto record = sim_tableau.m_gate(rng, i); + if (record.has_value()) + mr.record(record.value()); + } + return mr; + } +}; + +#endif diff --git a/src/qfvm_clifford/gate_list.h b/src/qfvm_clifford/gate_list.h new file mode 100644 index 0000000..3d74a18 --- /dev/null +++ b/src/qfvm_clifford/gate_list.h @@ -0,0 +1,246 @@ +#include "gate_macro.h" +#include "tableau.h" +#include + +SINGLE_QUBIT_GATE(h, { + t.distabilizer[qubit].swap(t.stabilizer[qubit]); + return {}; +}) + +SINGLE_QUBIT_GATE(i, { return {}; }) + +SINGLE_QUBIT_GATE(x, { + t.stabilizer[qubit].sign ^= true; + return {}; +}) + +SINGLE_QUBIT_GATE(y, { + t.distabilizer[qubit].sign ^= true; + t.stabilizer[qubit].sign ^= true; + return {}; +}) + +SINGLE_QUBIT_GATE(z, { + t.distabilizer[qubit].sign ^= true; + return {}; +}) + +ERROR_QUBIT_GATE(x_error, { + t.stabilizer[qubit].sign ^= true; + return {}; +}) + +ERROR_QUBIT_GATE(y_error, { + t.distabilizer[qubit].sign ^= true; + t.stabilizer[qubit].sign ^= true; + return {}; +}) + +ERROR_QUBIT_GATE(z_error, { + t.distabilizer[qubit].sign ^= true; + return {}; +}) + +SINGLE_QUBIT_GATE(h_yz, { + t.stabilizer[qubit].mul_ignore_anti_commute(t.distabilizer[qubit]); + t.z_gate(qubit); + return {}; +}) + +SINGLE_QUBIT_GATE(s_dag, { + t.distabilizer[qubit].mul_ignore_anti_commute(t.stabilizer[qubit]); + return {}; +}) + +SINGLE_QUBIT_GATE(s, { + t.s_dag_gate(qubit); + t.z_gate(qubit); + return {}; +}) + +// control, target +TWO_QUBIT_GATE(cnot, { + t.stabilizer[qubit2] *= t.stabilizer[qubit1]; + t.distabilizer[qubit1] *= t.distabilizer[qubit2]; + return {}; +}) + +// control, target +TWO_QUBIT_GATE(cx, { + t.stabilizer[qubit2] *= t.stabilizer[qubit1]; + t.distabilizer[qubit1] *= t.distabilizer[qubit2]; + return {}; +}) + +TWO_QUBIT_GATE(swap, { + 
t.stabilizer[qubit1].swap(t.stabilizer[qubit2]); + t.distabilizer[qubit1].swap(t.distabilizer[qubit2]); + return {}; +}) + +// Z-basis measurement. Projects each target qubit into |0> or |1> and reports +// its value (false=|0>, true=|1>). +COLLAPSING_GATE(m, { + if (!t.is_deterministic_z(qubit)) { + + tableau_trans t_trans(t); + t.collapse_qubit_along_z(t_trans, qubit, rng); + } + + return t.stabilizer.signs[qubit]; +}) + +// Z-basis measurement. Projects each target qubit into |0> or |1> and reports +// its value (false=|0>, true=|1>). +COLLAPSING_GATE(measure, { + if (!t.is_deterministic_z(qubit)) { + + tableau_trans t_trans(t); + t.collapse_qubit_along_z(t_trans, qubit, rng); + } + + return t.stabilizer.signs[qubit]; +}) + +// Z-basis measurement. Projects each target qubit into |0> or |1> and reports +// its value (false=|0>, true=|1>). +COLLAPSING_GATE(mz, { + if (!t.is_deterministic_z(qubit)) { + + tableau_trans t_trans(t); + t.collapse_qubit_along_z(t_trans, qubit, rng); + } + + return t.stabilizer.signs[qubit]; +}) + +// Y-basis measurement. Projects each target qubit into |i> or |-i> and reports +// its value (false=|i>, true=|-i>). +COLLAPSING_GATE(my, { + if (!t.is_deterministic_y(qubit)) { + + t.h_gate(qubit); + tableau_trans t_trans(t); + t.collapse_qubit_along_z(t_trans, qubit, rng); + t.h_gate(qubit); + } + + return t.eval_y_obs(qubit).sign; +}) + +// X-basis measurement. Projects each target qubit into |+> or |-> and reports +// its value (false=|+>, true=|->). +COLLAPSING_GATE(mx, { + if (!t.is_deterministic_x(qubit)) { + + t.h_yz_gate(qubit); + tableau_trans t_trans(t); + t.collapse_qubit_along_z(t_trans, qubit, rng); + t.h_yz_gate(qubit); + } + + return t.distabilizer.signs[qubit]; +}) + +// Z-basis reset. Forces each target qubit into the |0> state by silently +// measuring it in the Z basis and applying an X gate if it ended up in the |1> +// state. +COLLAPSING_GATE(r, { + // Collapse the qubits to be reset. 
+ if (!t.is_deterministic_z(qubit)) { + + tableau_trans t_trans(t); + t.collapse_qubit_along_z(t_trans, qubit, rng); + } + + // Force the collapsed qubits into the ground state. + t.distabilizer.signs[qubit] = false; + t.stabilizer.signs[qubit] = false; + + return {}; +}) + +// Z-basis reset. Forces each target qubit into the |0> state by silently +// measuring it in the Z basis and applying an X gate if it ended up in the |1> +// state. +COLLAPSING_GATE(reset, { + // Collapse the qubits to be reset. + if (!t.is_deterministic_z(qubit)) { + + tableau_trans t_trans(t); + t.collapse_qubit_along_z(t_trans, qubit, rng); + } + + // Force the collapsed qubits into the ground state. + t.distabilizer.signs[qubit] = false; + t.stabilizer.signs[qubit] = false; + + return {}; +}) + +// // Z-basis reset. Forces each target qubit into the |0> state by silently +// // measuring it in the Z basis and applying an X gate if it ended up in the +// |1> +// // state. +// COLLAPSING_GATE(rz, { +// // Collapse the qubits to be reset. +// if (!t.is_deterministic_z(qubit)) { + +// tableau_trans t_trans(t); +// t.collapse_qubit_along_z(t_trans, qubit, rng); +// } + +// // Force the collapsed qubits into the ground state. +// t.distabilizer.signs[qubit] = false; +// t.stabilizer.signs[qubit] = false; + +// return {}; +// }) + +// // X-basis reset. Forces each target qubit into the |+> state by silently +// // measuring it in the X basis and applying a Z gate if it ended up in the +// |-> +// // state. +// COLLAPSING_GATE(rx, { +// // Collapse the qubits to be reset. +// if (!t.is_deterministic_x(qubit)) { + +// t.h_yz_gate(qubit); +// tableau_trans t_trans(t); +// t.collapse_qubit_along_z(t_trans, qubit, rng); +// t.h_yz_gate(qubit); +// } + +// // Force the collapsed qubits into the ground state. +// t.distabilizer.signs[qubit] = false; +// t.stabilizer.signs[qubit] = false; + +// return {}; +// }) + +// // Y-basis reset. 
Forces each target qubit into the |i> state by silently +// // measuring it in the Y basis and applying an X gate if it ended up in the +// |-i> +// // state. +// COLLAPSING_GATE(ry, { +// // Collapse the qubits to be reset. +// if (!t.is_deterministic_y(qubit)) { + +// t.h_gate(qubit); +// tableau_trans t_trans(t); +// t.collapse_qubit_along_z(t_trans, qubit, rng); +// t.h_gate(qubit); +// } + +// // Force the collapsed qubits into the ground state. +// t.distabilizer.signs[qubit] = false; +// t.stabilizer.signs[qubit] = false; +// t.stabilizer.signs[qubit] ^= t.eval_y_obs(qubit).sign; + +// return {}; +// }) + +#undef SINGLE_QUBIT_GATE +#undef TWO_QUBIT_GATE +#undef COLLAPSING_GATE +#undef ERROR_QUBIT_GATE diff --git a/src/qfvm_clifford/gate_macro.h b/src/qfvm_clifford/gate_macro.h new file mode 100644 index 0000000..4544bea --- /dev/null +++ b/src/qfvm_clifford/gate_macro.h @@ -0,0 +1,65 @@ +#include + +#ifdef FUNCTION_REGISTRATION +#define SINGLE_QUBIT_GATE(GATE_NAME, ...) \ + template \ + result GATE_NAME##_gate(tableau& t, const size_t qubit) \ + __VA_ARGS__ + +#define TWO_QUBIT_GATE(GATE_NAME, ...) \ + template \ + result GATE_NAME##_gate(tableau& t, const size_t qubit1, \ + const size_t qubit2) __VA_ARGS__ + +#define COLLAPSING_GATE(GATE_NAME, ...) \ + template \ + result GATE_NAME##_gate(tableau& t, std::mt19937_64& rng, \ + const size_t qubit) __VA_ARGS__ + +#define ERROR_QUBIT_GATE(GATE_NAME, ...) \ + template \ + result GATE_NAME##_gate(tableau& t, const size_t qubit) \ + __VA_ARGS__ +#endif + +#ifdef STRUCT_FUNCTION_REGISTRATION +#define SINGLE_QUBIT_GATE(GATE_NAME, ...) \ + result GATE_NAME##_gate(const size_t qubit) { \ + tableau& t = *this; \ + __VA_ARGS__ \ + } + +#define TWO_QUBIT_GATE(GATE_NAME, ...) \ + result GATE_NAME##_gate(const size_t qubit1, const size_t qubit2) { \ + tableau& t = *this; \ + __VA_ARGS__ \ + } + +#define COLLAPSING_GATE(GATE_NAME, ...) 
\ + result GATE_NAME##_gate(std::mt19937_64& rng, const size_t qubit) { \ + tableau& t = *this; \ + __VA_ARGS__ \ + } + +#define ERROR_QUBIT_GATE(GATE_NAME, ...) \ + result GATE_NAME##_gate(const size_t qubit) { \ + tableau& t = *this; \ + __VA_ARGS__ \ + } +#endif + +#ifdef GATE_MAP_REGISTRATION +#define SINGLE_QUBIT_GATE(GATE_NAME, ...) \ + {STRINGIZE(GATE_NAME##_gate), \ + {SINGLE_QUBIT_GATE, GATE_NAME##_gate}}, + +#define TWO_QUBIT_GATE(GATE_NAME, ...) \ + {STRINGIZE(GATE_NAME##_gate), {TWO_QUBIT_GATE, GATE_NAME##_gate}}, + +#define COLLAPSING_GATE(GATE_NAME, ...) \ + {STRINGIZE(GATE_NAME##_gate), {COLLAPSING_GATE, GATE_NAME##_gate}}, + +#define ERROR_QUBIT_GATE(GATE_NAME, ...) \ + {STRINGIZE(GATE_NAME##_gate), \ + {ERROR_QUBIT_GATE, GATE_NAME##_gate}}, +#endif diff --git a/src/qfvm_clifford/packed_bit_word.h b/src/qfvm_clifford/packed_bit_word.h new file mode 100644 index 0000000..5cb3fab --- /dev/null +++ b/src/qfvm_clifford/packed_bit_word.h @@ -0,0 +1,350 @@ +#ifndef PACKED_BIT_WORD_H_ +#define PACKED_BIT_WORD_H_ + +#include "bit.h" +#include "bit_word.h" +#include "packed_bit_word_slice.h" +#include +#include +#include +#include +#include +#include + +// Pack bit_words, allocated with alignment and padding enabling SIMD +// instructions. Due to padding, the smallest tableaus are 256 bits. For +// performance tableau_element does not store "intendend" size. Only store the +// padded size. 
+template struct packed_bit_word { + size_t num_bit_words; + union { + uint8_t* u8; + uint64_t* u64; + bit_word* bw; + }; + + explicit packed_bit_word(size_t num_bits) + : num_bit_words(bits_to_word_padded(num_bits)), + u64(malloc_aligned_padded(num_bits)) {} + + ~packed_bit_word() { + if (nullptr != u64) { + bit_word::aligned_free(u64); + u64 = nullptr; + num_bit_words = 0; + } + } + + // copy constructor + packed_bit_word(const packed_bit_word& other) + : num_bit_words(other.num_bit_words), + u64(malloc_aligned_padded(other.num_bit_words * word_size)) { + memcpy(u8, other.u8, other.num_bit_words * word_size / 8); + } + + packed_bit_word(const packed_bit_word_slice& other) + : num_bit_words(other.num_bit_words), + u64(malloc_aligned_padded(other.num_bit_words * word_size)) { + memcpy(u8, other.u8, other.num_bit_words * word_size / 8); + } + + // move constructor, is not allowed to throw generally + packed_bit_word(packed_bit_word&& other) noexcept + : num_bit_words(other.num_bit_words), u64(other.u64) { + other.u64 = nullptr; + other.num_bit_words = 0; + } + + // copy assignment, deep copy + packed_bit_word& operator=(const packed_bit_word& other) { + return *this = packed_bit_word_slice(other); + } + + packed_bit_word& + operator=(const packed_bit_word_slice& other) { + if (num_bit_words == other.num_bit_words) { + // avoid re-allocating memory + packed_bit_word_slice(*this) = other; + return *this; + } + + this->~packed_bit_word(); + new (this) packed_bit_word(other); + return *this; + } + + // move assignment + packed_bit_word& operator=(const packed_bit_word&& other) noexcept { + this->~packed_bit_word(); + new (this) packed_bit_word(std::move(other)); + return *this; + } + + // equality + bool operator==(const packed_bit_word& other) const { + return num_bit_words == other.num_bit_words && + memcmp(bw, other.bw, word_size * num_bit_words / 8) == 0; + } + + bool operator==(const packed_bit_word_slice& other) const { + return num_bit_words == 
other.num_bit_words && + memcmp(bw, other.bw, word_size * num_bit_words / 8) == 0; + } + + // inequality + bool operator!=(const packed_bit_word& other) const { + return !(*this == other); + } + + bool operator!=(const packed_bit_word_slice& other) const { + return !(*this == other); + } + + // convert packed_bit_word to packed_bit_word_slice + operator packed_bit_word_slice() { + return packed_bit_word_slice(bw, num_bit_words); + } + + operator const packed_bit_word_slice() const { + return packed_bit_word_slice(bw, num_bit_words); + } + + // index operation + bit operator[](size_t index) { return bit(u64, index); } + const bit operator[](size_t index) const { return bit(u64, index); } + + // assignment + packed_bit_word& + operator^=(const packed_bit_word& other) { + packed_bit_word_slice(*this) ^= + packed_bit_word_slice(other); + return *this; + } + + packed_bit_word + operator&=(const packed_bit_word& other) { + packed_bit_word_slice(*this) &= + packed_bit_word_slice(other); + return *this; + } + + packed_bit_word + operator|=(const packed_bit_word& other) { + packed_bit_word_slice(*this) |= + packed_bit_word_slice(other); + return *this; + } + + packed_bit_word& + operator+=(const packed_bit_word& other) { + size_t num_u64 = (num_bit_words * word_size) >> 6; + for (size_t i = 0; i < num_u64 - 1; ++i) { + u64[i] += other.u64[i]; + // carry + u64[i + 1] += (u64[i] < other.u64[i]); + } + u64[num_u64 - 1] += other.u64[num_u64 - 1]; + return *this; + } + + // compare operation + bool operator<(const packed_bit_word& other) const { + return packed_bit_word_slice(*this) < + packed_bit_word_slice(other); + } + + // shift operator + packed_bit_word& operator>>=(int offset) { + packed_bit_word_slice(*this) >>= offset; + return *this; + } + + packed_bit_word& operator<<=(int offset) { + packed_bit_word_slice(*this) <<= offset; + return *this; + } + + // convert packed_bit_word to string + operator std::string() const { + std::stringstream ss; + ss << *this; + return 
ss.str(); + } + + // swap two packed_bit_word + void swap(packed_bit_word other) { + packed_bit_word_slice(*this).swap( + packed_bit_word_slice(other)); + } + + // slice operator, offset and num_bit_words should be the number of bit words + inline packed_bit_word_slice slice(size_t offset, + size_t num_bit_words) { + return packed_bit_word_slice(bw + offset, num_bit_words); + } + + // slice operator, offset and num_bit_words should be the number of bit words + inline const packed_bit_word_slice + slice(size_t offset, size_t num_bit_words) const { + return packed_bit_word_slice(bw + offset, num_bit_words); + } + + // determine the list of bit_word is not all zero + bool is_not_all_zero() const { + return packed_bit_word_slice(*this).is_not_all_zero(); + } + + // generate random packed_bit_word + static packed_bit_word random(size_t num_bits, + std::mt19937_64& rng) { + packed_bit_word result(num_bits); + result.randomize(num_bits, rng); + return result; + } + + // if num_bits != this.num_bit_words * word_size, this function will change + // data from low to high + void randomize(size_t num_bits, std::mt19937_64& rng) { + auto num_u64 = num_bits >> 6; + for (size_t k = 0; k < num_u64; ++k) + u64[k] = rng(); + + auto remaining_bits = num_bits & 63; + if (remaining_bits) { + auto mask = (uint64_t(1) << remaining_bits) - 1; + u64[num_u64] &= ~mask; + u64[num_u64] |= rng() & mask; + } + } + + void truncated_overwrite_from(packed_bit_word_slice& other, + size_t num_bits) { + size_t num_u8 = num_bits >> 3; + memcpy(u8, other.u8, num_u8); + auto remaining_bits = num_bits & 7; + if (remaining_bits) { + auto mask = (uint8_t(1) << remaining_bits) - 1; + u8[num_u8] &= ~mask; + u8[num_u8] |= other.u8[num_u8] & mask; + } + } + + // count the number of 1s + size_t count() { return packed_bit_word_slice(*this).count(); } + + // malloc aligned padded + uint64_t* malloc_aligned_padded(size_t bits) { + size_t num_u8 = bits_to_bits_padded(bits); + void* result = 
bit_word::aligned_malloc(num_u8); + memset(result, 0, num_u8); + return reinterpret_cast(result); + } +}; + +// bit operations +template +packed_bit_word +operator^(const packed_bit_word_slice& left, + const packed_bit_word_slice& right) { + assert(left.num_bit_words == right.num_bit_words); + packed_bit_word result(left.num_bit_words); + packed_bit_word_slice(result).for_each_word( + left, right, [](auto& a, auto& b, auto& c) { a = b ^ c; }); + return result; +} + +template +packed_bit_word operator^(const packed_bit_word& left, + const packed_bit_word& right) { + return packed_bit_word_slice(left) ^ + packed_bit_word_slice(right); +} + +template +packed_bit_word +operator^(const packed_bit_word& left, + const packed_bit_word_slice& right) { + return packed_bit_word_slice(left) ^ right; +} + +template +packed_bit_word +operator^(const packed_bit_word_slice& left, + const packed_bit_word& right) { + return left ^ packed_bit_word_slice(right); +} + +template +packed_bit_word +operator&(const packed_bit_word_slice& left, + const packed_bit_word_slice& right) { + assert(left.num_bit_words == right.num_bit_words); + packed_bit_word result(left.num_bit_words); + packed_bit_word_slice(result).for_each_word( + left, right, [](auto& a, auto& b, auto& c) { a = b & c; }); + return result; +} + +template +packed_bit_word operator&(const packed_bit_word& left, + const packed_bit_word& right) { + return packed_bit_word_slice(left) & + packed_bit_word_slice(right); +} + +template +packed_bit_word +operator&(const packed_bit_word& left, + const packed_bit_word_slice& right) { + return packed_bit_word_slice(left) & right; +} + +template +packed_bit_word +operator&(const packed_bit_word_slice& left, + const packed_bit_word& right) { + return left & packed_bit_word_slice(right); +} + +template +packed_bit_word +operator|(const packed_bit_word_slice& left, + const packed_bit_word_slice& right) { + assert(left.num_bit_words == right.num_bit_words); + packed_bit_word 
result(left.num_bit_words); + packed_bit_word_slice(result).for_each_word( + left, right, [](auto& a, auto& b, auto& c) { a = b | c; }); + return result; +} + +template +packed_bit_word operator|(const packed_bit_word& left, + const packed_bit_word& right) { + return packed_bit_word_slice(left) | + packed_bit_word_slice(right); +} + +template +packed_bit_word +operator|(const packed_bit_word& left, + const packed_bit_word_slice& right) { + return packed_bit_word_slice(left) | right; +} + +template +packed_bit_word +operator|(const packed_bit_word_slice& left, + const packed_bit_word& right) { + return left | packed_bit_word_slice(right); +} + +template +std::ostream& operator<<(std::ostream& os, + const packed_bit_word word) { + for (size_t i = 0; i < word.num_bit_words * word_size; ++i) + os << "_1"[word[i]]; + + return os; +} + +#endif diff --git a/src/qfvm_clifford/packed_bit_word_slice.h b/src/qfvm_clifford/packed_bit_word_slice.h new file mode 100644 index 0000000..6041225 --- /dev/null +++ b/src/qfvm_clifford/packed_bit_word_slice.h @@ -0,0 +1,307 @@ +#ifndef PACKED_BIT_WORD_SLICE_H_ +#define PACKED_BIT_WORD_SLICE_H_ + +#include "bit.h" +#include "bit_word.h" +#include "utils.h" +#include +#include +#include +#include +#include +#include + +// reference to a slice of a packed bit word +template struct packed_bit_word_slice { + const size_t num_bit_words; + + union { + uint8_t* u8; + uint64_t* u64; + bit_word* bw; + }; + + packed_bit_word_slice(bit_word* bw, size_t num_bit_words) + : num_bit_words(num_bit_words), bw(bw) {} + + // assignment operator + packed_bit_word_slice& + operator=(const packed_bit_word_slice& other) { + memcpy(u8, other.u8, num_bit_words * word_size / 8); + return *this; + } + + packed_bit_word_slice& + operator^=(const packed_bit_word_slice& other) { + for_each_word(other, [](auto& a, auto& b) { a ^= b; }); + return *this; + } + + packed_bit_word_slice& + operator&=(const packed_bit_word_slice& other) { + for_each_word(other, [](auto& a, 
auto& b) { a &= b; }); + return *this; + } + + packed_bit_word_slice& + operator|=(const packed_bit_word_slice& other) { + for_each_word(other, [](auto& a, auto& b) { a |= b; }); + return *this; + } + + packed_bit_word_slice& + operator+=(const packed_bit_word_slice& other) { + size_t num_u64 = (num_bit_words * word_size) >> 6; + for (size_t i = 0; i < num_u64 - 1; ++i) { + u64[i] += other.u64[i]; + // carry + u64[i + 1] += (u64[i] < other.u64[i]); + } + u64[num_u64 - 1] += other.u64[num_u64 - 1]; + return *this; + } + + packed_bit_word_slice& operator>>=(int offset) { + uint64_t incoming_word; + uint64_t current_word; + + if (0 == offset) + return *this; + + // move right every u64 by offset + while (64 <= offset) { + incoming_word = 0; + for (int w = ((num_bit_words * word_size) >> 6) - 1; w >= 0; w--) { + current_word = u64[w]; + u64[w] = incoming_word; + incoming_word = current_word; + } + offset -= 64; + } + + if (0 == offset) + return *this; + + incoming_word = 0; + for (int w = ((num_bit_words * word_size) >> 6) - 1; w >= 0; w--) { + current_word = u64[w]; + // move right + u64[w] >>= offset; + // add high bits + u64[w] |= incoming_word << (64 - offset); + // update next incoming word + incoming_word = current_word & ((uint64_t(1) << offset) - 1); + } + + return *this; + } + + packed_bit_word_slice& operator<<=(int offset) { + uint64_t incoming_word; + uint64_t current_word; + + if (0 == offset) + return *this; + + // move left every u64 by offset + while (64 <= offset) { + incoming_word = 0; + for (uint64_t w = 0; w < (num_bit_words * word_size) >> 6; w++) { + current_word = u64[w]; + u64[w] = incoming_word; + incoming_word = current_word; + } + offset -= 64; + } + + if (0 == offset) + return *this; + + incoming_word = 0; + for (uint64_t w = 0; w < (num_bit_words * word_size) >> 6; w++) { + current_word = u64[w]; + // move left + u64[w] <<= offset; + // add low bits + u64[w] |= incoming_word; + // update next incoming word + incoming_word = current_word >> 
(64 - offset); + } + + return *this; + } + + // equality operator + bool operator==(const packed_bit_word_slice& other) const { + return num_bit_words == other.num_bit_words && + 0 == memcmp(bw, other.bw, num_bit_words * word_size / 8); + } + + bool operator!=(const packed_bit_word_slice& other) const { + return !(*this == other); + } + + // index operator + inline bit operator[](size_t index) { return bit(u8, index); } + + inline const bit operator[](size_t index) const { return bit(u8, index); } + + // convert packed bit word slice to string + operator std::string() const { + std::stringstream ss; + ss << *this; + return ss.str(); + } + + void swap(packed_bit_word_slice other) { + for_each_word(other, [](auto& a, auto& b) { std::swap(a, b); }); + } + + // slice operator, offset and num_bit_words should be the number of bit words + inline packed_bit_word_slice slice(size_t offset, + size_t num_bit_words) { + return packed_bit_word_slice(bw + offset, num_bit_words); + } + + inline const packed_bit_word_slice + slice(size_t offset, size_t num_bit_words) const { + return packed_bit_word_slice(bw + offset, num_bit_words); + } + + // determine the list of bit_word is not all zero + bool is_not_all_zero() const { + bit_word res{}; + for_each_word([&res](auto& a) { res |= a; }); + return bool(res); + } + + // if num_bits != this.num_bit_words * word_size, this function will change + // data from low to high + void randomize(size_t num_bits, std::mt19937_64& rng) { + auto num_u64 = num_bits >> 6; + for (size_t k = 0; k < num_u64; ++k) + u64[k] = rng(); + + auto remaining_bits = num_bits & 63; + if (remaining_bits) { + auto mask = (uint64_t(1) << remaining_bits) - 1; + u64[num_u64] &= ~mask; + u64[num_u64] |= rng() & mask; + } + } + + // write the truncated data from other to this + void truncated_overwrite_from(packed_bit_word_slice other, + size_t num_bits) { + size_t num_u8 = num_bits >> 3; + memcpy(u8, other.u8, num_u8); + auto remaining_bits = num_bits & 7; + if 
(remaining_bits) { + auto mask = (uint8_t(1) << remaining_bits) - 1; + u8[num_u8] &= ~mask; + u8[num_u8] |= other.u8[num_u8] & mask; + } + } + + // count the number of 1s + template size_t count() const { + auto end = u64 + (num_bit_words * word_size >> 6); + size_t result = 0; + for (const uint64_t* p = u64; p != end; p++) { + result += count_uint64_bits(*p); + } + return result; + } + + template inline void for_each_word(func f) const { + auto* bw_start = bw; + auto* bw_end = bw + num_bit_words; + while (bw_start != bw_end) { + f(*bw_start); + ++bw_start; + } + } + + template + inline void for_each_word(packed_bit_word_slice other, + func f) const { + auto* bw_start = bw; + auto* bw_end = bw + num_bit_words; + auto* other_bw_start = other.bw; + while (bw_start != bw_end) { + f(*bw_start, *other_bw_start); + ++bw_start; + ++other_bw_start; + } + } + + template + inline void for_each_word(packed_bit_word_slice other1, + packed_bit_word_slice other2, + func f) const { + auto* bw_start = bw; + auto* bw_end = bw + num_bit_words; + auto* other1_bw_start = other1.bw; + auto* other2_bw_start = other2.bw; + while (bw_start != bw_end) { + f(*bw_start, *other1_bw_start, *other2_bw_start); + ++bw_start; + ++other1_bw_start; + ++other2_bw_start; + } + } + + template + inline void for_each_word(packed_bit_word_slice other1, + packed_bit_word_slice other2, + packed_bit_word_slice other3, + func f) const { + auto* bw_start = bw; + auto* bw_end = bw + num_bit_words; + auto* other1_bw_start = other1.bw; + auto* other2_bw_start = other2.bw; + auto* other3_bw_start = other3.bw; + while (bw_start != bw_end) { + f(*bw_start, *other1_bw_start, *other2_bw_start, *other3_bw_start); + ++bw_start; + ++other1_bw_start; + ++other2_bw_start; + ++other3_bw_start; + } + } + + template + inline void for_each_word(packed_bit_word_slice other1, + packed_bit_word_slice other2, + packed_bit_word_slice other3, + packed_bit_word_slice other4, + func f) const { + auto* bw_start = bw; + auto* bw_end = 
bw + num_bit_words; + auto* other1_bw_start = other1.bw; + auto* other2_bw_start = other2.bw; + auto* other3_bw_start = other3.bw; + auto* other4_bw_start = other4.bw; + while (bw_start != bw_end) { + f(*bw_start, *other1_bw_start, *other2_bw_start, *other3_bw_start, + *other4_bw_start); + ++bw_start; + ++other1_bw_start; + ++other2_bw_start; + ++other3_bw_start; + ++other4_bw_start; + } + } +}; + +// output operator for packed bit word slice +template +std::ostream& operator<<(std::ostream& os, + const packed_bit_word_slice& word) { + for (size_t i = 0; i < word.num_bit_words * word_size; ++i) + os << "_1"[word[i]]; + + return os; +} + +#endif diff --git a/src/qfvm_clifford/pauli.h b/src/qfvm_clifford/pauli.h new file mode 100644 index 0000000..4e7604a --- /dev/null +++ b/src/qfvm_clifford/pauli.h @@ -0,0 +1,176 @@ +#ifndef PAULI_H_ +#define PAULI_H_ + +#include "bit.h" +#include "packed_bit_word.h" +#include "pauli_slice.h" +#include "table.h" +#include +#include +#include +#include +#include +#include +#include + +// a pauli string is a product of pauli operators (I, X, Y, Z) on n qubits +template struct pauli_string { + // the length of the pauli string + size_t num_qubits; + + // whether the pauli string is a negative, true if negative and false if + // positive + bool sign; + + // the paulis in the pauli string, paulis are encoded by xz (I=00, X=10, Y=11, + // Z=01) + packed_bit_word xs, zs; + + explicit pauli_string(size_t n) : num_qubits(n), sign(false), xs(n), zs(n) {} + + explicit pauli_string(std::string& str) + : num_qubits(0), sign(false), xs(0), zs(0) { + *this = std::move(from_cstr(str.c_str())); + } + + // copy constructor + pauli_string(const pauli_string& other) + : num_qubits(other.num_qubits), sign(other.sign), xs(other.xs), + zs(other.zs) {} + + pauli_string(const pauli_string_slice& other) + : num_qubits(other.num_qubits), sign(other.sign), xs(other.xs), + zs(other.zs) {} + + // move constructor + pauli_string(pauli_string&& other) noexcept + 
: num_qubits(other.num_qubits), sign(other.sign), xs(std::move(other.xs)), + zs(std::move(other.zs)) {} + + // copy assignment + pauli_string& operator=(const pauli_string& other) { + this->num_qubits = other.num_qubits, this->sign = other.sign; + this->xs = other.xs, this->zs = other.zs; + + return *this; + } + + pauli_string& + operator=(const pauli_string_slice& other) { + this->num_qubits = other.num_qubits, this->sign = other.sign; + this->xs = other.xs, this->zs = other.zs; + + return *this; + } + + // move assignment + pauli_string& operator=(pauli_string&& other) { + this->~pauli_string(); + new (this) pauli_string(std::move(other)); + + return *this; + } + + // equality operator + bool operator==(const pauli_string& other) const { + return num_qubits == other.num_qubits && bool(sign) == bool(other.sign) && + xs == other.xs && zs == other.zs; + } + + bool operator==(const pauli_string_slice& other) const { + return num_qubits == other.num_qubits && bool(sign) == bool(other.sign) && + xs == other.xs && zs == other.zs; + } + + bool operator!=(const pauli_string& other) const { + return !(*this == other); + } + + bool operator!=(const pauli_string_slice& other) const { + return !(*this == other); + } + + // convert to pauli string slice + operator const pauli_string_slice() const { + return pauli_string_slice(num_qubits, bit((void*)&sign, 0), xs, + zs); + } + + operator pauli_string_slice() { + return pauli_string_slice(num_qubits, bit(&sign, 0), xs, zs); + } + + // convert pauli string to string + std::string str() const { return std::string(*this); } + + operator std::string() const { + std::stringstream ss; + ss << *this; + return ss.str(); + } + + // generate a random pauli string + static pauli_string random(size_t num_qubits, + std::mt19937_64& rng) { + auto result = pauli_string(num_qubits); + result.xs.randomize(num_qubits, rng); + result.zs.randomize(num_qubits, rng); + result.sign ^= rng() & 1; + return result; + } + + // parse char one by one + 
static pauli_string + parse_cstr(size_t num_qubits, bool sign, + const std::function& func) { + pauli_string ret(num_qubits); + ret.sign = sign; + + for (size_t i = 0; i < num_qubits; i++) { + bool x, z; + switch (func(i)) { + case 'X': + x = true, z = false; + break; + case 'Y': + x = true, z = true; + break; + case 'Z': + x = false, z = true; + break; + case 'I': + x = false, z = false; + break; + default: + throw std::invalid_argument("can not parse character: " + + std::to_string(func(i))); + } + + ret.xs.u64[i / 64] ^= (uint64_t)x << (i & 63); + ret.zs.u64[i / 64] ^= (uint64_t)z << (i & 63); + } + + return ret; + } + + // convert c style string to pauli string + static pauli_string from_cstr(const char* cstr) { + // default is positive + auto sign = cstr[0] == '-'; + if ('-' == cstr[0] || '+' == cstr[0]) + cstr++; + + return parse_cstr(strlen(cstr), sign, [&](size_t i) { return cstr[i]; }); + } +}; + +template +std::ostream& operator<<(std::ostream& os, const pauli_string& str) { + os << (str.sign ? 
'-' : '+'); + for (size_t i = 0; i < str.num_qubits; i++) { + os << "IXZY"[str.xs[i] | (str.zs[i] << 1)]; + } + return os; +} + +#endif diff --git a/src/qfvm_clifford/pauli_slice.h b/src/qfvm_clifford/pauli_slice.h new file mode 100644 index 0000000..eef0cc0 --- /dev/null +++ b/src/qfvm_clifford/pauli_slice.h @@ -0,0 +1,145 @@ +#ifndef PAULI_SLICE_H_ +#define PAULI_SLICE_H_ + +#include "bit.h" +#include "packed_bit_word_slice.h" +#include +#include +#include +#include +#include + +// reference to a slice of a pauli string +template struct pauli_string_slice { + size_t num_qubits; + bit sign; + packed_bit_word_slice xs, zs; + + // num_qubits should be the same with the number of padded bit_words, that + // means, (num_qubits + word_size - 1 / word_size) == xs.num_bit_words == + // zs.num_bit_words + pauli_string_slice(size_t num_qubits, bit sign, + packed_bit_word_slice xs, + packed_bit_word_slice zs) + : num_qubits(num_qubits), sign(sign), xs(xs), zs(zs) {} + + // assign operator + pauli_string_slice& + operator=(const pauli_string_slice& other) { + num_qubits = other.num_qubits; + sign = other.sign; + xs = other.xs; + zs = other.zs; + + return *this; + } + + // mulitply a commuting pauli string + pauli_string_slice& + operator*=(const pauli_string_slice& other) { + auto res = inplace_right_mul(other); + // must be commute + assert((res & 1) == 0); + // if the result phase is positive, then compute excluse-or of the signs + sign ^= res & 2; + return *this; + } + + // mulitply a pauli string, ignore anti-commuting terms + pauli_string_slice& + mul_ignore_anti_commute(const pauli_string_slice& other) { + auto res = inplace_right_mul(other); + // if the result phase is positive, then compute excluse-or of the signs + sign ^= res & 2; + return *this; + } + + // euqality operator + bool operator==(const packed_bit_word_slice& other) { + return num_qubits == other.num_qubits && sign == other.sign && + xs == other.xs && zs == other.zs; + } + + bool operator!=(const 
packed_bit_word_slice& other) { + return !(*this == other); + } + + // convert pauli string to string + operator std::string() const { + std::stringstream ss; + ss << *this; + return ss.str(); + } + + // swap operator, swap signs, x matrix and z matrix + void swap(pauli_string_slice other) { + sign.swap(other.sign); + xs.swap(other.xs); + zs.swap(other.zs); + } + + // Intuitively, this functionreturns the exponent to which the imaginary + // number i is raised when the Pauli matrices represented by x1z1 and x2z2 + // are multiplied together. For example, if x1 = z2 = 0 and z1 = x2 = 1 then + // Definition 2 shows that x1z1 and x2z2 represent Z and X, respectively. + // Multiplying Z and X together gives ZX = iY . Since the exponent on i is 1, + // the result of this function is 1. + // Returns: + // 0 if the product is 1 + // 1 if the product is i + // 2 if the product is -1 + // 3 if the product is -i + uint8_t + inplace_right_mul(const pauli_string_slice& other) noexcept { + bit_word count1{}; + bit_word count2{}; + + xs.for_each_word( + zs, other.xs, other.zs, + [&count1, &count2](auto& x1, auto& z1, auto& x2, auto& z2) { + // accumulate anti-commutation (+i or -i) counts + auto x1z2 = x1 & z2; + auto anti_commutes = (x2 & z1) ^ x1z2; + + // update left side pauli + x1 ^= x2; + z1 ^= z2; + + // accumulate anti-commutation (+i or -i) counts + count2 ^= (count1 ^ x1 ^ z1 ^ x1z2) & anti_commutes; + count1 ^= anti_commutes; + }); + + // combine final anti-commutation phase tally (mod 4) + auto s = count1.count(); + s ^= count2.count() << 1; + s ^= other.sign << 1; + return s & 3; + } + + // determines if the pauli string commutes with the given pauli string + bool commutes(const pauli_string_slice& other) const noexcept { + if (num_qubits > other.num_qubits) + return other.commutes(*this); + + bit_word count{}; + xs.for_each_word(zs, other.xs, other.zs, + [&count](auto& x1, auto& z1, auto& x2, auto& z2) { + count ^= (x1 & z2) ^ (x2 & z1); + }); + return 
(count.count() & 1) == 0; + } +}; + +template +std::ostream& operator<<(std::ostream& os, + const pauli_string_slice str) { + os << "+-"[str.sign]; + for (size_t i = 0; i < str.num_qubits; i++) { + os << "IXZY"[str.xs[i] | (str.zs[i] << 1)]; + } + + return os; +} + +#endif diff --git a/src/qfvm_clifford/span_ref.h b/src/qfvm_clifford/span_ref.h new file mode 100644 index 0000000..8214087 --- /dev/null +++ b/src/qfvm_clifford/span_ref.h @@ -0,0 +1,331 @@ +#ifndef SPAN_REF_H_ +#define SPAN_REF_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// A significant distinction between the semantics of this class and the +// std::span class introduced in C++20 is that this class defines equality and +// ordering operators based on *the content being pointed to* rather than the +// values of the pointers themselves. Two range references are not considered +// equal simply because they have identical pointers; they are deemed equal +// because they point to ranges with matching contents. In essence, this class +// behaves more like a *reference* rather than a pointer. + +template struct span_ref { + T* ptr_start; + T* ptr_end; + + span_ref() : ptr_start(nullptr), ptr_end(nullptr) {} + span_ref(T* begin, T* end) : ptr_start(begin), ptr_end(end) {} + + // Implicit conversions. 
+ span_ref(T* singleton) : ptr_start(singleton), ptr_end(singleton + 1) {} + + span_ref(const span_ref::type>& other) + : ptr_start(other.ptr_start), ptr_end(other.ptr_end) {} + + span_ref(std::vector& items) + : ptr_start(items.data()), ptr_end(items.data() + items.size()) {} + + span_ref(const std::vector::type>& items) + : ptr_start(items.data()), ptr_end(items.data() + items.size()) {} + + template + span_ref(std::array& items) + : ptr_start(items.data()), ptr_end(items.data() + items.size()) {} + + template + span_ref(const std::array::type, K>& items) + : ptr_start(items.data()), ptr_end(items.data() + items.size()) {} + + span_ref sub(size_t start_offset, size_t end_offset) const { + return span_ref(ptr_start + start_offset, ptr_start + end_offset); + } + + size_t size() const { return ptr_end - ptr_start; } + + const T* begin() const { return ptr_start; } + + const T* end() const { return ptr_end; } + + const T& back() const { return *(ptr_end - 1); } + + const T& front() const { return *ptr_start; } + + bool empty() const { return ptr_end == ptr_start; } + + T* begin() { return ptr_start; } + + T* end() { return ptr_end; } + + T& back() { return *(ptr_end - 1); } + + T& front() { return *ptr_start; } + + const T& operator[](size_t index) const { return ptr_start[index]; } + + T& operator[](size_t index) { return ptr_start[index]; } + + bool operator==(const span_ref& other) const { + size_t n = size(); + if (n != other.size()) { + return false; + } + for (size_t k = 0; k < n; k++) { + if (ptr_start[k] != other[k]) { + return false; + } + } + return true; + } + + bool + operator==(const span_ref::type>& other) const { + return span_ref(ptr_start, ptr_end) == + span_ref(other.ptr_start, other.ptr_end); + } + + bool operator!=(const span_ref& other) const { + return !(*this == other); + } + + bool + operator!=(const span_ref::type>& other) const { + return !(*this == other); + } + + std::string str() const { + std::stringstream ss; + ss << *this; + return 
// Wraps an iterable so that its values are printed with a separator string
// between consecutive elements (no leading or trailing separator).
template <typename t_iter> struct seperator {
  const t_iter& iter;
  const char* sep;

  // Render the separated sequence to a string via operator<<.
  std::string str() const {
    std::stringstream out;
    out << *this;
    return out.str();
  }
};

// Convenience factory: seperate(v) prints v's items comma-separated.
template <typename t_iter>
seperator<t_iter> seperate(const t_iter& v, const char* sep = ", ") {
  return seperator<t_iter>{v, sep};
}

template <typename t_iter>
std::ostream& operator<<(std::ostream& out, const seperator<t_iter>& v) {
  // Print an empty prefix before the first element and the separator
  // before every later one; equivalent to the usual first-element flag.
  const char* prefix = "";
  for (const auto& item : v.iter) {
    out << prefix << item;
    prefix = v.sep;
  }
  return out;
}
Committing permanently preserves the data +// (until clearing or deconstructing the monotonic buffer) and ensures that it +// will not move so that pointers to it can be stored. + +// The current region is a continuous block of memory where the tail is being +// written. If the tail grows beyond this region and triggers an allocation, +// then this current region becomes an old region, while newly allocated memory +// becomes the new current region. Each subsequent current region will have at +// least double size compared to its predecessor. + +// Old regions are finalized memory segments that will be retained until +// clearing or deconstructing of the buffer occurs. + +// some ref: https://zhuanlan.zhihu.com/p/96089089 +template struct monotonic_buffer { + + // Contiguous memory that is being appended to, but has not yet been + // committed. + span_ref tail; + + // The current contiguous memory region with a mix of committed, staged, and + // unused memory. + span_ref cur; + + // Old contiguous memory regions that have been committed and now need to be + // kept. + std::vector> old_areas; + + // Constructs an empty monotonic buffer. + monotonic_buffer() : tail(), cur(), old_areas() {} + + // Constructs an empty monotonic buffer with initial capacity for its current + // region. 
+ monotonic_buffer(size_t reserve) { ensure_available(reserve); } + + void _soft_clear() { + cur.ptr_start = nullptr; + cur.ptr_end = nullptr; + tail.ptr_start = nullptr; + tail.ptr_end = nullptr; + old_areas.clear(); + } + + void _hard_clear() { + for (auto old : old_areas) { + free(old.ptr_start); + } + if (nullptr != cur.ptr_start) { + free(cur.ptr_start); + } + } + + ~monotonic_buffer() { _hard_clear(); } + + monotonic_buffer(monotonic_buffer&& other) noexcept + : tail(other.tail), cur(other.cur), + old_areas(std::move(other.old_areas)) { + other._soft_clear(); + } + + monotonic_buffer(const monotonic_buffer& other) = delete; + + monotonic_buffer& operator=(monotonic_buffer&& other) noexcept { + _hard_clear(); + cur = other.cur; + tail = other.tail; + old_areas = std::move(other.old_areas); + other._soft_clear(); + return *this; + } + + // Invalidates all previous data and resets the class into a clean state. + // Happens to keep the current contiguous memory region and free old regions. + void clear() { + for (auto old : old_areas) { + free(old.ptr_start); + } + old_areas.clear(); + tail.ptr_end = tail.ptr_start = cur.ptr_start; + } + + // Returns the size of memory allocated and held by this monotonic buffer (in + // units of sizeof(T)). + size_t total_allocated() const { + size_t result = cur.size(); + for (auto old : old_areas) { + result += old.size(); + } + return result; + } + + // Appends and commits data. + // Requires the tail to be empty, to avoid bugs where previously staged data + // is committed. + span_ref take_copy(span_ref data) { + assert(tail.size() == 0); + append_tail(data); + return commit_tail(); + } + + // Adds a staged data item. + void append_tail(T item) { + ensure_available(1); + *tail.ptr_end = item; + tail.ptr_end++; + } + + // Adds staged data. 
+ void append_tail(span_ref data) { + ensure_available(data.size()); + std::copy(data.begin(), data.end(), tail.ptr_end); + tail.ptr_end += data.size(); + } + + // Throws away staged data, so its memory can be re-used. + void discard_tail() { tail.ptr_end = tail.ptr_start; } + + // Changes staged data into committed data that will be kept until the buffer + // is cleared or deconstructed. + span_ref commit_tail() { + span_ref result(tail); + tail.ptr_start = tail.ptr_end; + return result; + } + + // Ensures it is possible to stage at least `min_required` more items without + // more reallocations. + void ensure_available(size_t min_required) { + size_t available = cur.ptr_end - tail.ptr_end; + if (available >= min_required) { + return; + } + + size_t alloc_count = std::max(min_required, cur.size() << 1); + if (nullptr != cur.ptr_start) { + old_areas.push_back(cur); + } + cur.ptr_start = (T*)malloc(alloc_count * sizeof(T)); + cur.ptr_end = cur.ptr_start + alloc_count; + + // Staged data is not complete yet; keep it contiguous by copying it to the + // new larger memory region. 
+ size_t tail_size = tail.size(); + if (tail_size) { + std::move(tail.ptr_start, tail.ptr_end, cur.ptr_start); + } + + tail = {cur.ptr_start, cur.ptr_start + tail_size}; + } +}; + +#endif diff --git a/src/qfvm_clifford/table.h b/src/qfvm_clifford/table.h new file mode 100644 index 0000000..e06fba4 --- /dev/null +++ b/src/qfvm_clifford/table.h @@ -0,0 +1,347 @@ +#ifndef TABLE_H_ +#define TABLE_H_ + +#include "bit.h" +#include "bit_word.h" +#include "packed_bit_word.h" +#include "packed_bit_word_slice.h" +#include "utils.h" +#include +#include +#include +#include +#include +#include + +// row-major table, padded and aligned to make table more efficient +// major represents the row (not contiguous in memory), minor represents the +// column (contiguous in memory), the smallest table is word_size * word_size +template struct table { + size_t num_bit_words_major; + size_t num_bit_words_minor; + + packed_bit_word data; + + table(size_t min_bits_major, size_t min_bits_minor) + : num_bit_words_major(bits_to_word_padded(min_bits_major)), + num_bit_words_minor(bits_to_word_padded(min_bits_minor)), + data(bits_to_bits_padded(min_bits_major) * + bits_to_bits_padded(min_bits_minor)) {} + + // index operator, major index should be the number of rows + inline packed_bit_word_slice operator[](size_t major_index) { + return data.slice(major_index * num_bit_words_minor, num_bit_words_minor); + } + + // index operator, major index should be the number of rows + inline const packed_bit_word_slice + operator[](size_t major_index) const { + return data.slice(major_index * num_bit_words_minor, num_bit_words_minor); + } + + // equality operator + bool operator==(const table& other) const { + return num_bit_words_major == other.num_bit_words_major && + num_bit_words_minor == other.num_bit_words_minor && + data == other.data; + } + + bool operator!=(const table& other) const { + return !(*this == other); + } + + // convert stablizer tableau to string + std::string str() const { return 
std::string(*this); } + + // for better printing + std::string str(size_t n) const { + std::stringstream ss; + for (size_t i = 0; i < n; i++) { + if (i) + ss << "\n"; + for (size_t j = 0; j < n; j++) + ss << "_1"[(*this)[i][j]]; + } + + return ss.str(); + } + + operator std::string() const { + std::stringstream ss; + ss << *this; + return ss.str(); + } + + // major_index is the index of bit_word in the row, minor_index is the index + // of bit_word in the column, major_index_sub is the index of bit in the + // bit_word + inline size_t get_index_bit_word(const size_t major_index, + const size_t minor_index, + const size_t major_index_sub) const { + auto index = + (major_index << bit_word::BIT_POW) + major_index_sub; + return index * num_bit_words_minor + minor_index; + } + + // transpose the table + table transpose() const { + table result(num_bit_words_minor * word_size, + num_bit_words_major * word_size); + + for (size_t major_word = 0; major_word < num_bit_words_major; + major_word++) { + for (size_t minor_word = 0; minor_word < num_bit_words_minor; + minor_word++) { + for (size_t major_word_sub = 0; major_word_sub < word_size; + major_word_sub++) { + size_t src_index = + get_index_bit_word(major_word, minor_word, major_word_sub); + size_t dst_index = + result.get_index_bit_word(minor_word, major_word, major_word_sub); + result.data.bw[dst_index] = data.bw[src_index]; + } + } + } + + // transpose the bit word block, the shape of the block is (word_size, + // word_size) + for (size_t major_word = 0; major_word < result.num_bit_words_major; + major_word++) { + for (size_t minor_word = 0; minor_word < result.num_bit_words_minor; + minor_word++) { + size_t block_start = + result.get_index_bit_word(major_word, minor_word, 0); + bit_word::inplace_transpose_square( + result.data.bw + block_start, result.num_bit_words_minor); + } + } + + return result; + } + + // inplace transpose, only for square matrix + table& inplace_transpose() { + + // transpose the bit word block, 
the shape of the block is (word_size, + // word_size) + for (size_t major_word = 0; major_word < num_bit_words_major; + major_word++) { + for (size_t minor_word = 0; minor_word < num_bit_words_minor; + minor_word++) { + size_t block_start = get_index_bit_word(major_word, minor_word, 0); + bit_word::inplace_transpose_square(data.bw + block_start, + num_bit_words_minor); + } + } + + // transpose the table + for (size_t major_word = 0; major_word < num_bit_words_major; major_word++) + for (size_t minor_word = major_word + 1; minor_word < num_bit_words_minor; + minor_word++) + for (size_t major_word_sub = 0; major_word_sub < word_size; + major_word_sub++) + std::swap(data.bw[get_index_bit_word(major_word, minor_word, + major_word_sub)], + data.bw[get_index_bit_word(minor_word, major_word, + major_word_sub)]); + + return *this; + } + + // square matrix multiplication (assuming row indexing) + table square_matrix_mul(const table& right, + size_t n) const { + auto tmp = right.transpose(); + + table result(n, n); + for (std::size_t i = 0; i < n; i++) { + for (std::size_t j = 0; j < n; j++) { + bit_word accumulater{}; + (*this)[i].for_each_word( + tmp[j], [&](auto& a, auto& b) { accumulater ^= a & b; }); + result[i][j] = accumulater.count() & 1; + } + } + + return result; + } + + // sqaure matrix inverse for lower triangular matrix + table inverse_for_lower_triangular_matrix(size_t n) const { + table result = table::identity(n); + packed_bit_word tmp(num_bit_words_minor * word_size); + + for (size_t i = 0; i < n; i++) { + tmp = (*this)[i]; + // pivot + for (size_t j = 0; j < i; j++) { + if (tmp[j]) { + tmp ^= (*this)[j]; + result[i] ^= result[j]; + } + } + } + + return result; + } + + // concatenate four tables + static table + concatenate_four(size_t n, const table& upper_left, + const table& upper_right, + const table& lower_left, + const table& lower_right) { + table result(n << 1, n << 1); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + 
// Sample from the quantum Mallows distribution: produce a hadamard-layer
// bit string h and a qubit permutation S (arXiv:2003.09412, Algorithm 1).
// TODO: fix -Wstringop-overflow in test when n is 1
static inline std::pair<std::vector<bool>, std::vector<size_t>>
sample_quantum_mallows(size_t n, std::mt19937_64& rng) {
  std::uniform_real_distribution<double> unit(0, 1);
  std::vector<bool> hads;
  std::vector<size_t> perm;

  // Qubit labels not yet placed into the permutation.
  std::vector<size_t> remaining;
  remaining.reserve(n);
  for (size_t q = 0; q < n; q++)
    remaining.push_back(q);

  while (!remaining.empty()) {
    const size_t m = remaining.size();
    const double r = unit(rng);
    const double eps = pow(4, -int(m));
    // Inverse-CDF draw of the Mallows index k in [0, 2m).
    size_t k = size_t(-ceil(log2(r + (1 - r) * eps)));
    hads.push_back(k < m);
    // Indices in the upper half mirror back into [0, m).
    if (k >= m)
      k = 2 * m - k - 1;
    perm.push_back(remaining[k]);
    remaining.erase(remaining.begin() + k);
  }

  return {hads, perm};
}
+ // reference: Generation of random Clifford operators in + // https://arxiv.org/pdf/2003.09412.pdf + static table random_valid_stabilizer_table(size_t n, + std::mt19937_64& rng) { + auto h_S = sample_quantum_mallows(n, rng); + + const auto& h = h_S.first; + const auto& S = h_S.second; + + table symmetric(n, n); + for (size_t i = 0; i < n; i++) { + symmetric[i].randomize(i + 1, rng); + for (size_t j = 0; j < i; j++) + symmetric[j][i] = symmetric[i][j]; + } + + table symmetric_m(n, n); + for (size_t i = 0; i < n; i++) { + symmetric_m[i].randomize(i + 1, rng); + symmetric_m[i][i] &= h[i]; + for (size_t j = 0; j < i; j++) { + bool b = h[i] && h[j]; + b |= h[i] > h[j] && S[i] < S[j]; + b |= h[i] < h[j] && S[i] > S[j]; + symmetric_m[i][j] &= b; + symmetric_m[j][i] = symmetric_m[i][j]; + } + } + + auto lower = table::identity(n); + for (size_t i = 0; i < n; i++) + lower[i].randomize(i, rng); + + auto lower_m = table::identity(n); + for (size_t i = 0; i < n; i++) { + lower_m[i].randomize(i, rng); + for (size_t j = 0; j < i; j++) { + bool b = h[i] < h[j]; + b |= h[i] && h[j] && S[i] > S[j]; + b |= !h[i] && !h[j] && S[i] < S[j]; + lower_m[i][j] &= b; + } + } + + // a normalized probability distribution, P_n(h, S) is the fraction of + // n-qubit Clifford operators U such that the canonical form of U defined in + // Theorem 1 contains a layer of h gates labeled by h and a qubit + // permutation S. + auto prod = symmetric.square_matrix_mul(lower, n); + auto prod_m = symmetric_m.square_matrix_mul(lower_m, n); + + auto inv = lower.inverse_for_lower_triangular_matrix(n); + auto inv_m = lower_m.inverse_for_lower_triangular_matrix(n); + + inv.inplace_transpose(); + inv_m.inplace_transpose(); + + // the first n columns represent Pauli operators Fx_iF^{-1} (ignoring the + // phase) and the last n columns represent Fz_iF^{−1} . 
Stabilizer tableau + // of the Hadamard stage and qubit permutation layers in the canonical form + auto fused = table::concatenate_four( + n, lower, table(n, n), prod, inv); + auto fused_m = table::concatenate_four( + n, lower_m, table(n, n), prod_m, inv_m); + + table u(2 * n, 2 * n); + for (size_t i = 0; i < n; i++) { + u[i] = fused[S[i]]; + u[i + n] = fused[S[i] + n]; + } + + // hadamards + for (size_t i = 0; i < n; i++) + if (h[i]) + u[i].swap(u[i + n]); + + return fused_m.square_matrix_mul(u, 2 * n); + } +}; + +template +std::ostream& operator<<(std::ostream& os, const table& table) { + for (size_t i = 0; i < table.num_bit_words_major; i++) { + if (i) + os << "\n"; + os << table[i]; + } + + return os; +} + +#endif diff --git a/src/qfvm_clifford/tableau.h b/src/qfvm_clifford/tableau.h new file mode 100644 index 0000000..7cb880e --- /dev/null +++ b/src/qfvm_clifford/tableau.h @@ -0,0 +1,434 @@ +#ifndef TABLEAU_H_ +#define TABLEAU_H_ + +#include "gate_macro.h" +#include "packed_bit_word.h" +#include "pauli.h" +#include "pauli_slice.h" +#include "table.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// tableau trans is the transpose of the tableau +template struct tableau_trans; + +// tableau is the main class of the clifford simulator +template struct tableau; + +// quantum gate type +// SINGLE_QUBIT_GATE: single qubit gate +// TWO_QUBIT_GATE: two qubit gate +// COLLAPSING_GATE: measurement gate +// ERROR_QUBIT_GATE: error gate, which has some probability to apply the gate +enum gate_type { + SINGLE_QUBIT_GATE, + TWO_QUBIT_GATE, + COLLAPSING_GATE, + ERROR_QUBIT_GATE +}; + +// quantum gate function return type +// std::nullopt: The return value of non-measurement quantum gate +// (COLLASPING_GATE) bool: The return value of measurement quantum gate +using result = std::optional; + +// quantum gate function type +// The type is used to represent the all the quantum gate function type, which 
+// is for store gate map. The first type is for single qubit gate and error +// qubit gate, the second type is for two qubit gate, the third type is for +// collasping gate +template +using func_type = std::variant< + std::function& t, const size_t qubit)>, + std::function& t, const size_t qubit1, + const size_t qubit2)>, + std::function& t, std::mt19937_64& rng, + const size_t qubit)>>; + +// the implementation of the quantum gate function +#define FUNCTION_REGISTRATION +#include "gate_list.h" +#undef FUNCTION_REGISTRATION + +// the gate map, which is used to store all the quantum gate function, the index +// is the gate name +template +std::unordered_map>> + gate_map = { +#define GATE_MAP_REGISTRATION +#include "gate_list.h" +#undef GATE_MAP_REGISTRATION +}; + +// Inner struct of the tableau, which is used to store the the distabilizer and +// stabilizer of the tableau. +// +// Reference: https://arxiv.org/pdf/quant-ph/0406196.pdf +template struct _tableau { + size_t num_qubits; + + // The stabilizer tableau is represented by two bit tables, one for X and one + // for Z. 
+ table xs_t, zs_t; + + // The signs of the tableau + packed_bit_word signs; + + // constructor, n is the number of qubits + _tableau(size_t n) : num_qubits(n), xs_t(n, n), zs_t(n, n), signs(n) {} + + // index operator + pauli_string_slice operator[](size_t qubit) { + // if padding, the number of real words is not the same with the number of + // padded words + size_t num_words = (num_qubits + word_size - 1) / word_size; + return pauli_string_slice(num_qubits, signs[qubit], + xs_t[qubit].slice(0, num_words), + zs_t[qubit].slice(0, num_words)); + } + + const pauli_string_slice operator[](size_t qubit) const { + size_t num_words = (num_qubits + word_size - 1) / word_size; + return pauli_string_slice(num_qubits, signs[qubit], + xs_t[qubit].slice(0, num_words), + zs_t[qubit].slice(0, num_words)); + } +}; + +// A Clifford operation is a unitary quantum operation that conjugates Pauli +// products into Pauli products. C is Clifford if, for all pauli products P, it +// is the case that C^*PC is also a Pauli product. In fact, a Clifford operation +// can be uniquely identified (up to global phase) by how it conjugates Pauli +// products. +// A stabilizer tableau is a representation of a Clifford operation +// that simply directly stores how the Clifford operation conjugates each +// generator of the Pauli group. +template struct tableau { + + size_t num_qubits; + + // n distabilizer generators which are Pauli operators that together with the + // stabilizer generators generate the full Pauli group + _tableau distabilizer; + _tableau stabilizer; + + // constructor + explicit tableau(size_t num_qubits) + : num_qubits(num_qubits), distabilizer(num_qubits), + stabilizer(num_qubits) { + // Initialize identity elements along the diagonal. 
The state is |0...0> + for (size_t q = 0; q < num_qubits; q++) { + distabilizer.xs_t[q][q] = true; + stabilizer.zs_t[q][q] = true; + } + } + + // convert tableau to string + operator std::string() const { + std::stringstream ss; + ss << *this; + return ss.str(); + } + +// quantum gate function registration +#define STRUCT_FUNCTION_REGISTRATION +#include "gate_list.h" +#undef STRUCT_FUNCTION_REGISTRATION + + std::string str() const { return std::string(*this); } + + // return identity tableau, that means the state is |0...0> + static tableau identity(size_t num_qubits) { + return tableau(num_qubits); + } + + void reset() { *this = tableau(num_qubits); } + void reset(std::mt19937_64& rng, size_t qubit) { r_gate(rng, qubit); } + // void reset_x(std::mt19937_64& rng, size_t qubit) { rx_gate(rng, qubit); } + // void reset_y(std::mt19937_64& rng, size_t qubit) { ry_gate(rng, qubit); } + + // expand current tableau to new_num_qubits + // args: + // new_num_qubits: the new number of qubits + // resize_pad_factor: the resize pad factor, which means leave more space + // for future storage of more quantum bits. + void expand(size_t new_num_qubits, double resize_pad_factor) { + + assert(new_num_qubits >= num_qubits); + assert(resize_pad_factor >= 1); + + if (new_num_qubits <= distabilizer.xs_t.num_bit_words_major * word_size) { + size_t old_num_qubits = num_qubits; + num_qubits = new_num_qubits; + distabilizer.num_qubits = new_num_qubits; + stabilizer.num_qubits = new_num_qubits; + + // Initialize identity elements along the diagonal. 
+ for (size_t k = old_num_qubits; k < new_num_qubits; k++) { + distabilizer.xs_t[k][k] = true; + stabilizer.zs_t[k][k] = true; + } + + return; + } + + size_t old_num_bit_words = distabilizer.xs_t.num_bit_words_major; + size_t old_num_qubits = num_qubits; + tableau old_state = std::move(*this); + *this = tableau((size_t)(new_num_qubits * resize_pad_factor)); + this->num_qubits = new_num_qubits; + this->distabilizer.num_qubits = new_num_qubits; + this->stabilizer.num_qubits = new_num_qubits; + + // Copy stored state back into new larger space. + auto partial_copy = [=](packed_bit_word_slice dst, + packed_bit_word_slice src) { + dst.slice(0, old_num_bit_words) = src; + }; + partial_copy(distabilizer.signs, old_state.distabilizer.signs); + partial_copy(stabilizer.signs, old_state.stabilizer.signs); + for (size_t k = 0; k < old_num_qubits; k++) { + partial_copy(distabilizer[k].xs, old_state.distabilizer[k].xs); + partial_copy(distabilizer[k].zs, old_state.distabilizer[k].zs); + partial_copy(stabilizer[k].xs, old_state.stabilizer[k].xs); + partial_copy(stabilizer[k].zs, old_state.stabilizer[k].zs); + } + } + + // transpose each table of the tableau + void inplace_transpose() { + stabilizer.xs_t.inplace_transpose(); + stabilizer.zs_t.inplace_transpose(); + distabilizer.xs_t.inplace_transpose(); + distabilizer.zs_t.inplace_transpose(); + } + + // Clifford state measurements only have three probabilities: (p0, p1) = (0.5, + // 0.5), (1, 0), or (0, 1) The random case happens if there is a row + // anti-commuting with Z[qubit] + bool is_deterministic_z(size_t target_qubit) const { + return !stabilizer[target_qubit].xs.is_not_all_zero(); + } + + bool is_deterministic_x(size_t target_qubit) const { + return !distabilizer[target_qubit].xs.is_not_all_zero(); + } + + bool is_deterministic_y(size_t target_qubit) const { + return distabilizer[target_qubit].xs == stabilizer[target_qubit].xs; + } + + pauli_string eval_y_obs(size_t qubit) const { + pauli_string result = 
distabilizer[qubit]; + uint8_t log_i = pauli_string_slice(result).inplace_right_mul( + stabilizer[qubit]); + log_i++; + assert((log_i & 1) == 0); + if (log_i & 2) { + result.sign ^= true; + } + return result; + } + + // collapse the qubit along z axis + // args: + // t_trans: the transpose of the tableau + // target_qubit: the target qubit + // rng: the random number generator + size_t collapse_qubit_along_z(tableau_trans& t_trans, + size_t target_qubit, std::mt19937_64& rng) { + + size_t pivot = 0; + + // search for any generator that anti-commutes with the measurement + // observable + while (pivot < num_qubits && + !t_trans.t.stabilizer.xs_t[pivot][target_qubit]) + pivot++; + + // Such an p does not exist. In this case the outcome is determinate, so + // measuring the state will not change it; the only task is to determine + // whether 0 or 1 is observed. + if (pivot == num_qubits) + return SIZE_MAX; + + // perform partial gaussian elimination over the stabilizer generators that + // anti-commute with the measurement. do this by introducing + // no-effect-because-control-is-zero CNOT at the beginning of time. 
+ for (size_t k = pivot + 1; k < num_qubits; k++) + if (t_trans.t.stabilizer.xs_t[k][target_qubit]) + t_trans.cnot_gate(pivot, k); + + // swap the non-isolated anti-commuting stablizer generator for one that + // commutes with the measurement + if (t_trans.t.stabilizer.zs_t[pivot][target_qubit]) { + t_trans.h_yz_gate(pivot); + } else { + t_trans.h_gate(pivot); + } + + // assign measure result + bool result_if_measured = rng() & 1; + if (stabilizer.signs[target_qubit] != result_if_measured) { + t_trans.x_gate(pivot); + }; + + return pivot; + } + + // random valid stablizer tableau + // reference: https://arxiv.org/abs/2003.09412 + static tableau + random_valid_stabilizer_tableau(size_t num_qubits, std::mt19937_64& rng) { + auto raw = table::random_valid_stabilizer_table(num_qubits, rng); + tableau result(num_qubits); + for (size_t row = 0; row < num_qubits; row++) { + for (size_t col = 0; col < num_qubits; col++) { + result.distabilizer[row].xs[col] = raw[row][col]; + result.distabilizer[row].zs[col] = raw[row][col + num_qubits]; + result.stabilizer[row].xs[col] = raw[row + num_qubits][col]; + result.stabilizer[row].zs[col] = + raw[row + num_qubits][col + num_qubits]; + } + } + + result.distabilizer.signs.randomize(num_qubits, rng); + result.stabilizer.signs.randomize(num_qubits, rng); + return result; + } + + // check whether the tableau satisfy the invariants, the tableau need to + // preserve commutativity + // everything must commute, except for X_k anticommuting with Z_k for each k. 
+ bool satisfy_invariants() const { + for (size_t q1 = 0; q1 < num_qubits; q1++) { + auto x1 = distabilizer[q1]; + auto z1 = stabilizer[q1]; + + if (x1.commutes(z1)) + return false; + + for (size_t q2 = q1 + 1; q2 < num_qubits; q2++) { + auto x2 = distabilizer[q2]; + auto z2 = stabilizer[q2]; + + if (!x1.commutes(x2) || !x1.commutes(z2) || !z1.commutes(x2) || + !z1.commutes(z2)) + return false; + } + } + + return true; + } +}; + +template +std::ostream& operator<<(std::ostream& out, const tableau& t) { + out << "+-"; + for (size_t k = 0; k < t.num_qubits; k++) { + out << "xz-"; + } + out << "+\n|"; + for (size_t k = 0; k < t.num_qubits; k++) { + out << ' ' << "+-"[t.distabilizer[k].sign] << "+-"[t.stabilizer[k].sign]; + } + for (size_t q = 0; q < t.num_qubits; q++) { + out << " |\n|"; + for (size_t k = 0; k < t.num_qubits; k++) { + out << ' ' + << "IXZY"[t.distabilizer[k].xs[q] | t.distabilizer[k].zs[q] << 1] + << "IXZY"[t.stabilizer[k].xs[q] | t.stabilizer[k].zs[q] << 1]; + } + } + out << " |"; + return out; +} + +// reference to the tableau, transpose the tableau at the construction, after +// some computation, transpose back at the deconstruction +template struct tableau_trans { + // referece to the tableau + tableau& t; + + // constructor + explicit tableau_trans(tableau& t_in) : t(t_in) { + t.inplace_transpose(); + }; + tableau_trans() = delete; + + // copt and move constructor + tableau_trans(const tableau_trans& t) = delete; + tableau_trans(tableau_trans&& t) = delete; + + // deconstructor + ~tableau_trans() { t.inplace_transpose(); } + + // Iterates over the Paulis in a row of the tableau. + // + // args + // q: The row to iterate over. + // body: A function taking X, Z, and SIGN words. The X and Z words are + // chunks of xz-encoded Paulis from the row. The SIGN word is the + // corresponding chunk of sign bits from the sign row. 
+ template + inline void for_each_trans_obs(const size_t q, FUNC body) { + for (size_t k = 0; k < 2; k++) { + _tableau& h = k == 0 ? t.distabilizer : t.stabilizer; + pauli_string_slice p = h[q]; + p.xs.for_each_word(p.zs, h.signs, body); + } + } + + template + inline void for_each_trans_obs(const size_t q1, const size_t q2, FUNC body) { + for (size_t k = 0; k < 2; k++) { + _tableau& h = k == 0 ? t.distabilizer : t.stabilizer; + pauli_string_slice p1 = h[q1]; + pauli_string_slice p2 = h[q2]; + p1.xs.for_each_word(p1.zs, p2.xs, p2.zs, h.signs, body); + } + } + + tableau_trans& x_gate(const size_t qubit) { + for_each_trans_obs(qubit, [](auto& x, auto& z, auto& s) { s ^= z; }); + return *this; + } + + tableau_trans& h_gate(const size_t qubit) { + for_each_trans_obs(qubit, [](auto& x, auto& z, auto& s) { + std::swap(x, z); + s ^= x & z; + }); + return *this; + } + + tableau_trans& h_yz_gate(const size_t qubit) { + for_each_trans_obs(qubit, [](auto& x, auto& z, auto& s) { + s ^= z.andnot(x); + x ^= z; + }); + return *this; + } + + tableau_trans& cnot_gate(const size_t control_qubit, + const size_t target_qubit) { + for_each_trans_obs(control_qubit, target_qubit, + [](auto& cx, auto& cz, auto& tx, auto& tz, auto& s) { + s ^= (cz ^ tx).andnot(cx & tz); + cz ^= tz; + tx ^= cx; + }); + return *this; + } +}; +#endif diff --git a/src/qfvm_clifford/utils.h b/src/qfvm_clifford/utils.h new file mode 100644 index 0000000..3ec0a27 --- /dev/null +++ b/src/qfvm_clifford/utils.h @@ -0,0 +1,40 @@ +#ifndef TABLEAU_ELEMENT_UTILS_H_ +#define TABLEAU_ELEMENT_UTILS_H_ + +#include +#include + +template constexpr size_t bits_to_bits_padded(size_t bits) { + return (bits + (word_size - 1)) & ~(word_size - 1); +} + +template constexpr size_t bits_to_word_padded(size_t bits) { + return bits_to_bits_padded(bits) / word_size; +} + +// reference: +// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel +inline uint8_t count_uint64_bits(uint64_t value) { + value = value - ((value 
>> 1) & 0x5555555555555555ull); + value = + (value & 0x3333333333333333ull) + ((value >> 2) & 0x3333333333333333ull); + return (((value + (value >> 4)) & 0xF0F0F0F0F0F0F0Full) * + 0x101010101010101ull) >> + 56; +} + +// Concatenate preprocessor tokens A and B without expanding macro definitions +// (however, if invoked from a macro, macro arguments are expanded). +#define PPCAT_NX(A, B) A##B + +// Concatenate preprocessor tokens A and B after macro-expanding them. +#define PPCAT(A, B) PPCAT_NX(A, B) + +// Turn A into a string literal without expanding macro definitions (however, if +// invoked from a macro, macro arguments are expanded). +#define STRINGIZE_NX(A) #A + +// Turn A into a string literal after macro-expanding it. +#define STRINGIZE(A) STRINGIZE_NX(A) + +#endif diff --git a/tests/quafu/simulator/basis_test.py b/tests/quafu/simulator/basis_test.py index 4afa8f4..6521845 100644 --- a/tests/quafu/simulator/basis_test.py +++ b/tests/quafu/simulator/basis_test.py @@ -263,3 +263,82 @@ def test_after_measure(self): [diff_00, diff_11], [2, 0] ) self.assertTrue(success) + + +class TestCliffordSimulatorBasis(BaseTest): + """Test C++ Clifford simulator""" + + circuit = None + assertEqual = unittest.TestCase.assertEqual + assertAlmostEqual = unittest.TestCase.assertAlmostEqual + assertDictEqual = unittest.TestCase.assertDictEqual + assertListEqual = unittest.TestCase.assertListEqual + assertTrue = unittest.TestCase.assertTrue + + @pytest.mark.skipif( + sys.platform == "darwin", reason="Avoid error on MacOS arm arch." 
+ ) + def test_simulate(self): + print("test_simulate") + self.circuit = BellCircuits.bell_no_measure() + result = simulate( + qc=self.circuit, simulator="qfvm_clifford", output="count_dict" + ) + count = result.count + self.assertDictAlmostEqual(count, {}) + + def test_singleQgate_measure_atlast(self): + self.circuit = BasicCircuits.singleQgate_measure_atlast() + result = simulate( + qc=self.circuit, shots=1, simulator="qfvm_clifford", output="count_dict" + ) + counts = result.count + self.assertDictAlmostEqual(counts, {"11": 1}) + + def test_singleQgate_no_measure(self): + self.circuit = BasicCircuits.singleQgate_no_measure() + result = simulate( + qc=self.circuit, shots=1, simulator="qfvm_clifford", output="count_dict" + ) + counts = result.count + self.assertDictAlmostEqual(counts, {}) + + def test_singleQgate_measure_normal(self): + self.circuit = BasicCircuits.singleQgate_measure_normal() + result = simulate( + qc=self.circuit, shots=10, simulator="qfvm_clifford", output="count_dict" + ) + counts = result.count + self.assertDictAlmostEqual(counts, {"11": 10}) + + def test_multiQgate_measure_atlast(self): + self.circuit = BasicCircuits.multiQgate_measure_atlast() + result = simulate( + qc=self.circuit, shots=10, simulator="qfvm_clifford", output="count_dict" + ) + counts = result.count + self.assertDictAlmostEqual(counts, {"11": 10}) + + def test_multiQgate_no_measure(self): + self.circuit = BasicCircuits.multiQgate_no_measure() + result = simulate( + qc=self.circuit, shots=1, simulator="qfvm_clifford", output="count_dict" + ) + counts = result.count + self.assertDictAlmostEqual(counts, {}) + + def test_multiQgate_measure_normal(self): + self.circuit = BasicCircuits.multiQgate_measure_normal() + result = simulate( + qc=self.circuit, shots=10, simulator="qfvm_clifford", output="count_dict" + ) + counts = result.count + self.assertDictAlmostEqual(counts, {"11": 10}) + + def test_anycbit_measure(self): + self.circuit = BasicCircuits.any_cbit_measure() + result = 
simulate( + qc=self.circuit, shots=10, simulator="qfvm_clifford", output="count_dict" + ) + counts = result.count + self.assertDictAlmostEqual(counts, {"0101": 10}) From 099cb01110eea99537544ead4e3a0220a3a060bf Mon Sep 17 00:00:00 2001 From: Yanglin Zhang <56758695+lucky9-cyou@users.noreply.github.com> Date: Fri, 22 Dec 2023 14:45:14 +0800 Subject: [PATCH 2/3] build: delete useless cmake code --- CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9595c4c..ab2d4e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,8 +60,6 @@ list (APPEND PRJ_INCLUDE_DIRS ${EIGEN3_INCLUDE_DIR}) find_package(pybind11 CONFIG) list (APPEND PRJ_INCLUDE_DIRS ${PYBIND11_INCLUDE_DIR}) -set(MACOS_VERSION_MIN 10.14) - #SIMD if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64" OR CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "amd64") if(MSVC) From a951126f4f67cac2ebe2c304c96868824fffb4d7 Mon Sep 17 00:00:00 2001 From: ylin <17835344407@163.com> Date: Fri, 22 Dec 2023 14:47:45 +0800 Subject: [PATCH 3/3] feat: delete useless python code --- quafu/simulators/simulator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/quafu/simulators/simulator.py b/quafu/simulators/simulator.py index 4be6e19..609b282 100644 --- a/quafu/simulators/simulator.py +++ b/quafu/simulators/simulator.py @@ -32,7 +32,6 @@ def simulate( shots: int = 100, use_gpu: bool = False, use_custatevec: bool = False, - use_clifford: bool = False, ) -> SimuResult: """Simulate quantum circuit Args: