diff --git a/README.md b/README.md
index c7768d163..f554ca2cf 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@
 speech recognition system with multiple decoding passes including lattice
 rescoring and confidence estimation. We hope k2 will have many other
 applications as well.
-One of the key algorithms that we want to make efficient in the short term is
+One of the key algorithms that we have implemented is
 pruned composition of a generic FSA with a "dense" FSA (i.e. one that
 corresponds to log-probs of symbols at the output of a neural network).
 This can be used as a fast implementation of decoding for ASR, and for CTC and
@@ -78,46 +78,21 @@
 general and extensible framework to allow further development of ASR
 technology.
 
 ## Current state of the code
 
-  A lot of the code is still unfinished (Sep 11, 2020).
-  We finished the CPU versions of many algorithms and this code is in `k2/csrc/host/`;
-  however, after that we figured out how to implement things on the GPU and decided
-  to change the interfaces so the CPU and GPU code had a more unified interface.
-  Currently in `k2/csrc/` we have more GPU-oriented implementations (although
-  these algorithms will also work on CPU). We had almost finished the Python
-  wrapping for the older code, in the `k2/python/` subdirectory, but we decided not to
-  release code with that wrapping because it would have had to be reworked to be compatible
-  with our GPU algorithms. Instead we will use the interfaces drafted in `k2/csrc/`
-  e.g. the Context object (which encapsulates things like memory managers from external
-  toolkits) and the Tensor object which can be used to wrap tensors from external toolkits;
-  and wrap those in Python (using pybind11). The code in host/ will eventually
-  be either deprecated, rewritten or wrapped with newer-style interfaces.
-
-## Plans for initial release
-
-  We hope to get the first version working in early October. The current
-  short-term aim is to finish the GPU implementation of pruned composition of a
-  normal FSA with a dense FSA, which is the same as decoder search in speech
-  recognition and can be used to implement CTC training and lattice-free
-  MMI (LF-MMI) training. The proof-of-concept that we will release initially
-  is something that's like CTC but allowing more general supervisions (general
-  FSAs rather than linear sequences). This will work on GPU. The same
-  underlying code will support LF-MMI so that would be easy to implement soon
-  after. We plan to put example code in a separate repository.
+  We have wrapped all the C++ code to Python with [pybind11](https://github.com/pybind/pybind11)
+  and have finished the integration with [PyTorch](https://github.com/pytorch/pytorch).
+
+  We are currently writing speech recognition recipes using k2, which are hosted in a
+  separate repository. Please see .
 
 ## Plans after initial release
 
-  We will then gradually implement more algorithms in a way that's compatible
-  with the interfaces in `k2/csrc/`. Some of them will be CPU-only to start
-  with. The idea is to eventually have very rich capabilities for operating on
-  collections of sequences, including methods to convert from a lattice to a
-  collection of linear sequences and back again (for purposes of neural language
-  model rescoring, neural confidence estimation and the like).
+  We are currently trying to make k2 ready for production use (see the branch
+  [v2.0-pre](https://github.com/k2-fsa/k2/tree/v2.0-pre)).
 
 ## Quick start
 
 Want to try it out without installing anything? We have setup a
 [Google Colab][1].
-
-Caution: k2 is not nearly ready for actual use! We are still coding the core
-algorithms, and hope to have an early version working by early October.
+You can find more Colab notebooks using k2 in speech recognition at
+.
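The "pruned composition of a generic FSA with a dense FSA" mentioned in the README text above is, conceptually, a beam-pruned Viterbi search over the frames of neural-network log-probs. The following is a pure-Python sketch of that idea under simplifying assumptions (a dict-based toy graph, made-up probabilities, no epsilon or final arcs, and a hypothetical `beam` parameter); it is not k2's API or implementation, which operates on GPU-friendly arrays:

```python
import math

# A toy acceptor: state -> list of (next_state, symbol) arcs.
# The graph and its symbols are illustrative assumptions.
arcs = {
    0: [(0, 'a'), (1, 'b')],
    1: [(1, 'b'), (2, 'a')],
}
final_state = 2

# "Dense FSA": per-frame log-probs for each symbol, as would come
# from a neural network (values are made up).
dense = [
    {'a': math.log(0.6), 'b': math.log(0.4)},
    {'a': math.log(0.3), 'b': math.log(0.7)},
    {'a': math.log(0.8), 'b': math.log(0.2)},
]

def pruned_intersect(arcs, dense, beam=5.0):
    """Beam-pruned Viterbi composition of the acceptor with the dense FSA."""
    active = {0: 0.0}  # state -> best partial log-score so far
    for frame in dense:
        nxt = {}
        for state, score in active.items():
            for dst, sym in arcs.get(state, []):
                s = score + frame[sym]
                if s > nxt.get(dst, -math.inf):
                    nxt[dst] = s
        if not nxt:
            return -math.inf
        best = max(nxt.values())
        # Prune states whose score falls more than `beam` below the best.
        active = {st: sc for st, sc in nxt.items() if sc >= best - beam}
    return active.get(final_state, -math.inf)

# Best path consumes a, b, a; total score log(0.6 * 0.7 * 0.8).
print(round(pruned_intersect(arcs, dense), 4))
```

Pruning keeps only states whose partial score is within `beam` of the current best, which bounds the work per frame regardless of graph size; that is what makes the composition usable as a fast ASR decoder.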
 [1]: https://colab.research.google.com/drive/1qbHUhNZUX7AYEpqnZyf29Lrz2IPHBGlX?usp=sharing
diff --git a/cmake/cub.cmake b/cmake/cub.cmake
index c65f8f6d2..c4d6e1c94 100644
--- a/cmake/cub.cmake
+++ b/cmake/cub.cmake
@@ -20,8 +20,9 @@ function(download_cub)
 
   include(FetchContent)
 
-  set(cub_URL "https://github.com/NVlabs/cub/archive/1.10.0.tar.gz")
-  set(cub_HASH "SHA256=8531e09f909aa021125cffa70a250761dfc247f960d7a1a12f65e6651ffb6477")
+  set(cub_URL "https://github.com/NVlabs/cub/archive/1.15.0.tar.gz")
+  set(cub_HASH "SHA256=1781ee5eb7f00acfee5bff88e3acfc67378f6b3c24281335e18ae19e1f2ff685")
+
   FetchContent_Declare(cub
     URL ${cub_URL}
diff --git a/docs/source/core_concepts/index.rst b/docs/source/core_concepts/index.rst
index 1fde45269..e67a3d569 100644
--- a/docs/source/core_concepts/index.rst
+++ b/docs/source/core_concepts/index.rst
@@ -193,13 +193,13 @@ In k2, you would use the following code to compute it:
 
   fsa = k2.Fsa.from_str(s)
   fsa.draw('fsa2.svg')
   fsa = k2.create_fsa_vec([fsa])
-  total_scores = k2.get_tot_scores(fsa, log_semiring=False, use_double_scores=False)
+  total_scores = fsa.get_tot_scores(log_semiring=False, use_double_scores=False)
   print(total_scores)
   # It prints: tensor([0.2000])
 
 .. HINT::
 
-  :func:`k2.get_tot_scores` takes a vector of FSAs as input,
+  :func:`k2.Fsa.get_tot_scores` takes a vector of FSAs as input,
   so we use :func:`k2.create_fsa_vec` to turn an FSA into a
   vector of FSAs.
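The tropical-semiring call above prints `tensor([0.2000])`, and the corresponding log-semiring example later in this file prints `tensor([0.8444])`. Assuming the example FSA has exactly two complete paths with total scores 0.2 and 0.1 (an assumption, but one consistent with both printed values), the two semirings can be checked by hand:

```python
import math

# Hypothetical per-path total scores, chosen to be consistent with the
# printed outputs; the actual FSA definition is in the surrounding docs.
path_scores = [0.2, 0.1]

# Tropical semiring: the total score is the best (maximum) path score.
tropical_total = max(path_scores)

# Log semiring: the total score is the log-sum-exp over all paths.
log_total = math.log(sum(math.exp(s) for s in path_scores))

print(round(tropical_total, 4))  # 0.2
print(round(log_total, 4))       # 0.8444
```

This is why the same FSA yields different totals depending on `log_semiring`: the tropical result tracks only the single best path, while the log result accumulates probability mass from every path.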
 Most operations in k2 take a vector of FSAs as input and process them
@@ -230,7 +230,7 @@ The code in k2 looks like:
 
   '''
   fsa = k2.Fsa.from_str(s)
   fsa = k2.create_fsa_vec([fsa])
-  total_scores = k2.get_tot_scores(fsa, log_semiring=True, use_double_scores=False)
+  total_scores = fsa.get_tot_scores(log_semiring=True, use_double_scores=False)
   print(total_scores)
   # It prints: tensor([0.8444])
@@ -319,7 +319,7 @@ the FSA given in :numref:`autograd example`:
 
   fsa.scores = nnet_output
   fsa.draw('autograd_tropical.svg')
   fsa_vec = k2.create_fsa_vec([fsa])
-  total_scores = k2.get_tot_scores(fsa_vec, log_semiring=False, use_double_scores=False)
+  total_scores = fsa_vec.get_tot_scores(log_semiring=False, use_double_scores=False)
   total_scores.backward()
   print(nnet_output.grad)
@@ -366,11 +366,11 @@ Example 2: Autograd in log semiring
 
 For the log semiring, we just change::
 
-  total_scores = k2.get_tot_scores(fsa_vec, log_semiring=False, use_double_scores=False)
+  total_scores = fsa_vec.get_tot_scores(log_semiring=False, use_double_scores=False)
 
 to::
 
-  total_scores = k2.get_tot_scores(fsa_vec, log_semiring=True, use_double_scores=False)
+  total_scores = fsa_vec.get_tot_scores(log_semiring=True, use_double_scores=False)
 
 For completeness and ease of reference, we repost the code below.
@@ -392,7 +392,7 @@
 
   fsa.scores = nnet_output
   fsa.draw('autograd_log.svg')
   fsa_vec = k2.create_fsa_vec([fsa])
-  total_scores = k2.get_tot_scores(fsa_vec, log_semiring=True, use_double_scores=False)
+  total_scores = fsa_vec.get_tot_scores(log_semiring=True, use_double_scores=False)
   total_scores.backward()
   print(nnet_output.grad)
diff --git a/docs/source/python_tutorials/fsa_algo/code/invert1.py b/docs/source/python_tutorials/fsa_algo/code/invert1.py
index c7333c3f6..4d045034c 100644
--- a/docs/source/python_tutorials/fsa_algo/code/invert1.py
+++ b/docs/source/python_tutorials/fsa_algo/code/invert1.py
@@ -4,7 +4,7 @@
 1 2 -1 -1 0.2
 2
 '''
-fsa = k2.Fsa.from_str(s)
+fsa = k2.Fsa.from_str(s, acceptor=False)
 inverted_fsa = k2.invert(fsa)
 fsa.draw('before_invert.svg', title='before invert')
 inverted_fsa.draw('after_invert.svg', title='after invert')
diff --git a/docs/source/python_tutorials/fsa_algo/code/invert2.py b/docs/source/python_tutorials/fsa_algo/code/invert2.py
index 86e80b30d..fb96ea655 100644
--- a/docs/source/python_tutorials/fsa_algo/code/invert2.py
+++ b/docs/source/python_tutorials/fsa_algo/code/invert2.py
@@ -5,7 +5,7 @@
 2
 '''
 fsa = k2.Fsa.from_str(s)
-fsa.aux_labels = k2.RaggedInt('[ [10 20] [-1] ]')
+fsa.aux_labels = k2.RaggedTensor('[ [10 20] [-1] ]')
 inverted_fsa = k2.invert(fsa)
 fsa.draw('before_invert_aux.svg', title='before invert with ragged tensors as aux_labels')
diff --git a/k2/csrc/cub.h b/k2/csrc/cub.h
index 55331f8b2..d1df56f32 100644
--- a/k2/csrc/cub.h
+++ b/k2/csrc/cub.h
@@ -30,14 +30,40 @@
 // that k2 and PyTorch use a different copy
 // of CUB.
+#ifdef CUB_NS_PREFIX
+#undef CUB_NS_PREFIX
+#endif
+
+#ifdef CUB_NS_POSTFIX
+#undef CUB_NS_POSTFIX
+#endif
+
+#ifdef CUB_NS_QUALIFIER
+#undef CUB_NS_QUALIFIER
+#endif
+
+// see
+// https://github.com/NVIDIA/cub/commit/6631c72630f10e370d93814a59146b12f7620d85
+// The above commit replaced "thrust" with "THRUST_NS_QUALIFIER"
+#ifndef THRUST_NS_QUALIFIER
+#define THRUST_NS_QUALIFIER thrust
+#endif
+
 #define CUB_NS_PREFIX namespace k2 {
 #define CUB_NS_POSTFIX }
 
+// See
+// https://github.com/NVIDIA/cub/commit/6631c72630f10e370d93814a59146b12f7620d85
+// and
+// https://github.com/NVIDIA/cub/pull/350
+#define CUB_NS_QUALIFIER ::k2::cub
+
 #ifdef K2_WITH_CUDA
 #include "cub/cub.cuh"  // NOLINT
 #endif
 
 #undef CUB_NS_PREFIX
 #undef CUB_NS_POSTFIX
+#undef CUB_NS_QUALIFIER
 
 #endif  // K2_CSRC_CUB_H_
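Returning to the `invert1.py` change above: `acceptor=False` makes `k2.Fsa.from_str` parse each arc line as a transducer arc carrying both a label and an aux_label, which is what `k2.invert` swaps. A plain-Python sketch of what inversion means (the tuple representation and the first arc's values are illustrative assumptions, not k2's internals; the second arc mirrors the `1 2 -1 -1 0.2` line in the tutorial snippet):

```python
# Arcs as (src, dst, label, aux_label, score) tuples -- an illustrative
# stand-in for k2's Fsa; k2.invert is the real API.
arcs = [
    (0, 1, 2, 3, 0.1),
    (1, 2, -1, -1, 0.2),
]

def invert(arcs):
    """Swap the input labels and output (aux) labels of a transducer."""
    return [(src, dst, aux, label, score)
            for (src, dst, label, aux, score) in arcs]

for arc in invert(arcs):
    print(arc)
```

Inverting twice returns the original transducer. k2's `invert` additionally handles ragged aux_labels (a single arc may emit several output symbols), which is why `invert2.py` stores them as a `k2.RaggedTensor`.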