diff --git a/README.md b/README.md
index c80b528..b73d566 100644
--- a/README.md
+++ b/README.md
@@ -74,7 +74,7 @@ If you want:
 
 * To compile the **tests**, add `-D XFAC_BUILD_TEST=ON` to  the cmake line above.
 
-* To generate the **python** library, add `-D XFAC_BUILD_PYTHON=ON`
+* To generate the **python** library, add `-D XFAC_BUILD_PYTHON=ON`. If you find any problem try with an older numpy `pip install "numpy<2.0"`.
 
 * To **install** the library in specific **local directory**, add `-D CMAKE_INSTALL_PREFIX=<my_local_install_directory>`
 
diff --git a/include/xfac/matrix/mat_decomp.h b/include/xfac/matrix/mat_decomp.h
index 4c95f6b..572a87f 100644
--- a/include/xfac/matrix/mat_decomp.h
+++ b/include/xfac/matrix/mat_decomp.h
@@ -2,7 +2,6 @@
 #define MAT_DECOMP_H
 
 #include "xfac/index_set.h"
-#include<memory>
 #define ARMA_DONT_USE_OPENMP
 #include<armadillo>
 
@@ -96,6 +95,37 @@ struct MatSVDFixedTol: public MatDecompFixedTol<SVDDecomp<T>> {
 
 //--------------------------------------- rank-revealing LU decomposition -----------------------
 
+
+/// Given the pivot matrix P already in LU form, update the C columns according to P.
+template<class T>
+void apply_LU_on_cols(arma::Mat<T>& C, arma::Mat<T> const& P, bool leftOrthogonal)
+{
+    if (P.n_rows != C.n_rows)
+        throw std::invalid_argument("apply_LU_on_Cols: incompatible matrices");
+    for(auto k=0u; k<P.n_rows; k++) {
+        auto rows=arma::span(k+1,C.n_rows-1);
+        if (!leftOrthogonal)
+            C.row(k) *= 1.0/P(k,k);
+        if (k+1<P.n_rows)
+            C.rows(rows) -= P(rows,k)*C.row(k) ;
+    }
+}
+
+/// Given the pivot matrix P already in LU form, update the R rows according to P.
+template<class T>
+void apply_LU_on_rows(arma::Mat<T>& R, arma::Mat<T> const& P, bool leftOrthogonal)
+{
+    if (P.n_cols != R.n_cols)
+        throw std::invalid_argument("apply_LU_on_Rows: incompatible matrices");
+    for(auto k=0u; k<P.n_rows; k++) {
+        auto cols=arma::span(k+1,R.n_cols-1);
+        if (leftOrthogonal)
+            R.col(k) *= 1.0/P(k,k);
+        if (k+1<P.n_rows)
+            R.cols(cols) -= R.col(k)*P(k,cols) ;
+    }
+}
+
 template<class T>
 struct RRLUDecomp {
     using value_type=T;
@@ -108,11 +138,11 @@ struct RRLUDecomp {
 
     RRLUDecomp()=default;
 
-    RRLUDecomp(arma::Mat<T> const& A, bool leftOrthogonal_=true, double reltol=0, int rankMax=0)
+    RRLUDecomp(arma::Mat<T> const& A, bool leftOrthogonal_=true, double reltol=0, int rankMax=0, bool fast=false)
         : leftOrthogonal(leftOrthogonal_)
         , Iset (iota(A.n_rows))
         , Jset (iota(A.n_cols))
-    { calculate(A, reltol, rankMax); }
+    { if (fast) calculateFast(A,reltol,rankMax); else calculate(A, reltol, rankMax); }
 
     void calculate(arma::Mat<T> A, double reltol, int rankMax)
     {
@@ -148,6 +178,47 @@ struct RRLUDecomp {
         readLU(A);
     }
 
+    void calculateFast(arma::Mat<T> A, double reltol, int rankMax)
+    {
+        npivot= std::min(A.n_rows, A.n_cols);
+        if (reltol==0) reltol=npivot*std::abs(std::numeric_limits<T>::epsilon());
+        if (rankMax>0 && rankMax<npivot) npivot=rankMax;
+
+        arma::uvec J; // find column pivots by rrQR
+        {
+            arma::Mat<T> Q,R;
+            arma::qr(Q, R, J, A, "vector");
+            arma::vec sv=arma::abs(R.diag());
+            int rank=arma::find(sv>reltol*sv[0]).eval().size();
+            if (rank<npivot) npivot=rank;
+            if (npivot<std::min(A.n_rows, A.n_cols)) error=std::abs(sv.at(npivot));
+        }
+
+        arma::uvec I(A.n_rows); // find row pivots by P*A(:,J)==L*U
+        {
+            arma::Mat<T> P;
+            lu(L, U, P, A.cols(J.head_rows(npivot)));
+            for(auto i=0u; i<P.n_rows; i++)
+                I[i]=arma::find(P.row(i),1).eval().at(0);
+        }
+
+        arma::Col<T> D=U.diag().eval();
+        if (!leftOrthogonal) {
+            L = L*arma::diagmat(D);
+            U = arma::diagmat(1.0/D)*U;
+        }
+        if (npivot<A.n_cols){ // compute the second part of U
+            auto U2=A.submat(I.head_rows(npivot),J.tail_rows(J.size()-npivot)).eval();
+            arma::Mat<T> P=L.head_rows(npivot)+U; // pivot matrix in LU form
+            P.diag()=D;
+            apply_LU_on_cols(U2,P,leftOrthogonal);
+            U=arma::join_horiz(U,U2).eval();
+        }
+
+        Iset=arma::conv_to<vector<int>>::from(I);
+        Jset=arma::conv_to<vector<int>>::from(J);
+    }
+
     vector<int> row_pivots() const { return {Iset.begin(), Iset.begin()+npivot}; }
     vector<int> col_pivots() const { return {Jset.begin(), Jset.begin()+npivot}; }
     arma::Mat<T> PivotMatrixTri() const
@@ -191,35 +262,6 @@ struct RRLUDecomp {
     }
 };
 
-/// Given the pivot matrix P already in LU form, update the C columns according to P.
-template<class T>
-void apply_LU_on_cols(arma::Mat<T>& C, arma::Mat<T> const& P, bool leftOrthogonal)
-{
-    if (P.n_rows != C.n_rows)
-        throw std::invalid_argument("apply_LU_on_Cols: incompatible matrices");
-    for(auto k=0u; k<P.n_rows; k++) {
-        auto rows=arma::span(k+1,C.n_rows-1);
-        if (!leftOrthogonal)
-            C.row(k) *= 1.0/P(k,k);
-        if (k+1<P.n_rows)
-            C.rows(rows) -= P(rows,k)*C.row(k) ;
-    }
-}
-
-/// Given the pivot matrix P already in LU form, update the R rows according to P.
-template<class T>
-void apply_LU_on_rows(arma::Mat<T>& R, arma::Mat<T> const& P, bool leftOrthogonal)
-{
-    if (P.n_cols != R.n_cols)
-        throw std::invalid_argument("apply_LU_on_Rows: incompatible matrices");
-    for(auto k=0u; k<P.n_rows; k++) {
-        auto cols=arma::span(k+1,R.n_cols-1);
-        if (leftOrthogonal)
-            R.col(k) *= 1.0/P(k,k);
-        if (k+1<P.n_rows)
-            R.cols(cols) -= R.col(k)*P(k,cols) ;
-    }
-}
 
 template<class T>
 struct MatRRLUFixedTol: public MatDecompFixedTol<RRLUDecomp<T>> {
@@ -244,7 +286,7 @@ struct ARRLUParam {
 };
 
 template<class T>
-struct ARRLUDecomp: public RRLUDecomp<T> {
+struct ARRLUDecomp: public RRLUDecomp<T> { //TODO: try to use the fast rrlu
     using RRLUDecomp<T>::Iset;
     using RRLUDecomp<T>::Jset;
     using RRLUDecomp<T>::npivot;
@@ -258,7 +300,7 @@ struct ARRLUDecomp: public RRLUDecomp<T> {
         RRLUDecomp<T>::Iset=iota(fA.n_rows);
         RRLUDecomp<T>::Jset=iota(fA.n_cols);
 
-        if (param.fullPiv) { this->calculate(fA.submat(Iset,Jset), param.reltol, param.bondDim); return; }
+        if (param.fullPiv) { this->calculateFast(fA.submat(Iset,Jset), param.reltol, param.bondDim); return; }
 
         int rankMax=std::min(fA.n_rows,fA.n_cols);
         if (param.bondDim>0 && param.bondDim<rankMax) rankMax=param.bondDim;
@@ -320,8 +362,8 @@ struct CURDecomp: public ARRLUDecomp<T> {
 
     const arma::Mat<T> C, R;
 
-    CURDecomp(arma::Mat<T> const& A_, bool leftOrthogonal_=true, double reltol=1e-14, int rankMax=0)
-        : ARRLUDecomp<T>(A_,leftOrthogonal_,reltol,rankMax)
+    CURDecomp(arma::Mat<T> const& A_, bool leftOrthogonal_=true, double reltol=1e-14, int rankMax=0, bool fast=false)
+        : ARRLUDecomp<T>(A_,leftOrthogonal_,reltol,rankMax,fast)
         , C(A_.cols(arma::conv_to<arma::uvec>::from(col_pivots())))
         , R(A_.rows(arma::conv_to<arma::uvec>::from(row_pivots())))
     {}
diff --git a/python/python_bindings.cpp b/python/python_bindings.cpp
index 5a8b6ee..8ecf1d5 100644
--- a/python/python_bindings.cpp
+++ b/python/python_bindings.cpp
@@ -24,11 +24,18 @@ using cmpx=complex<double>;
 
 template<typename T>
 void declare_TensorCI(py::module &m, std::string typestr) {
+
+    m.def(("decomposeCI"s+typestr).c_str(),[](arma::Mat<T> A,bool isleft,double reltol,int maxBondDim,bool fast)
+    {
+        auto ci=CURDecomp<T>(A,isleft,reltol,maxBondDim,fast);
+        return std::make_pair(ci.left(),ci.right());
+    }, "A"_a, "leftCanonic"_a=true, "reltol"_a=1e-12, "maxBondDim"_a=0, "fast"_a=true);
+
     using TensorTraind=TensorTrain<T>;
     py::class_<TensorTraind>(m, ("TensorTrain"s+typestr).c_str())
             .def(py::init<size_t>(), "len"_a)
             .def_readonly("core",  &TensorTraind::M)
-            .def("setCoreAt",[](TensorTraind &tt, int i, py::array const& Mi){ tt.M.at(i)=carma::arr_to_cube<T>(Mi); }, "i"_a, "Mi"_a)
+            .def("setCoreAt",[](TensorTraind &tt, int i, arma::Cube<T> Mi){ tt.M.at(i)=Mi; }, "i"_a, "Mi"_a)
             .def("eval", &TensorTraind::eval)
             .def("sum", &TensorTraind::sum)
             .def("sum1", &TensorTraind::sum1)
diff --git a/test/test_main.cpp b/test/test_main.cpp
index afeca98..f5725ae 100644
--- a/test/test_main.cpp
+++ b/test/test_main.cpp
@@ -1,2 +1,20 @@
 #define CATCH_CONFIG_MAIN  // This tells Catch to provide a main() - only do this in one cpp file
 #include <catch2/catch.hpp>
+#include <armadillo>
+
+
+TEST_CASE("Test arma") {
+    using namespace arma;
+    mat A(120,10000,fill::randn), L, U, P;
+    lu(L, U, P, A);
+
+    REQUIRE(norm(P*A-L*U)<1e-14*norm(A));
+    std::cout << norm(L.diag()-1.0) << std::endl;
+
+    auto D=U.diag().eval();
+    L = L*arma::diagmat(D);
+    U = arma::diagmat(1.0/D)*U;
+
+    REQUIRE(norm(P*A-L*U)<1e-14*norm(A));
+    std::cout << norm(U.diag()-1.0) << std::endl;
+}
diff --git a/test/test_matrix_ci.cpp b/test/test_matrix_ci.cpp
index 10f381a..23c5367 100644
--- a/test/test_matrix_ci.cpp
+++ b/test/test_matrix_ci.cpp
@@ -2,6 +2,7 @@
 #include "xfac/tensor/tensor_ci.h"
 #include "xfac/grid.h"
 #include "xfac/matrix/mat_decomp.h"
+#include <chrono>
 
 using namespace arma;
 using namespace xfac;
@@ -179,6 +180,33 @@ TEST_CASE("cur")
         CURDecomp<double> sol(A);
         REQUIRE(arma::norm(A-sol.left()*sol.right())<tol);
     }
+
+    SECTION("timing") {
+        using namespace std;
+        cout<<"n time_svd time_rrlu time_rrlu_new\n";
+        for(int n=500; n<=1500; n+=500) {
+            mat A1(n,n/2,arma::fill::randu);
+            mat A2(n/2,n,arma::fill::randu);
+            mat A=A1*A2;
+            auto t0 = std::chrono::high_resolution_clock::now();
+            cout<<n;
+            {
+                SVDDecomp<double> sol(A,true,0.0,0);
+                auto t1 = std::chrono::high_resolution_clock::now();
+                auto dt=std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count();
+                cout<<" "<<dt*1e-3;
+                t0=t1;
+            }
+            for(bool fast:{false,true}) {
+                CURDecomp<double> sol(A,true,0.0,0,fast);
+                auto t1 = std::chrono::high_resolution_clock::now();
+                auto dt=std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count();
+                cout<<" "<<dt*1e-3;
+                t0=t1;
+            }
+            cout<<endl;
+        }
+    }
 }