Revert change in comment regarding seq_len+1

k2-fsa · danpovey · Jul 30, 2021 · Jul 13, 2021 · Jul 13, 2021 · Jul 13, 2021
commit bf20c119a14e031f9f3972e6918b442d83a827f7
diff --git a/k2/csrc/nbest.cu b/k2/csrc/nbest.cu
@@ -179,7 +179,7 @@ void CreateLcpIntervalArray(ContextPtr c,
                             Array1<T> *lcp_intervals_order,
                             Array1<T> *leaf_parent_intervals) {
 
-  //
+
   *lcp_intervals = Array1<LcpInterval<T> >(c, seq_len);
   LcpInterval<T> *lcp_intervals_data = lcp_intervals->Data();
 
@@ -216,6 +216,8 @@ void CreateLcpIntervalArray(ContextPtr c,
   stack.push_back({0, 0, seq_len, next++ });
   lcp_intervals_data[0] = stack.back();
   // We are using zero-based indexing so the code is not quite the same as our reference.
+  // Also, http://www.mi.fu-berlin.de/wiki/pub/ABI/RnaSeqP4/enhanced-suffix-array.pdf
+  // seems to be expecting a suffix array of size seq_len + 1, not seq_len.
   for (T i = 0; i < seq_len; ++i) {
     T lb = i, lcp_array_i = lcp_array[i];
     leaf_stack.push_back(lb);

diff --git a/k2/csrc/nbest.h b/k2/csrc/nbest.h
@@ -68,14 +68,16 @@ namespace k2 {
             must be longer than this by at least 3, for termination.)
             Require seq_len >= 0
     @param [out] suffix_array   A pre-allocated array of length
-             `seq_len + 1`.  At exit it will contain a permutation of
-             the list [ 0, 1, ... seq_len ], interpreted
-             as the start indexes of suffixes of `text_array`,
+             `seq_len`.  At exit it will contain a permutation of
+             the list [ 0, 1, ... seq_len - 1 ], interpreted
+             as the start indexes of the nonempty suffixes of `text_array`,
              with the property that the sub-arrays of `text_array`
              starting at these positions are lexicographically sorted.
              For example, as a trivial case, if seq_len = 3
              and text_array contains [ 3, 2, 1, 0, 0, 0 ], then
              `suffix_array` would contain [ 2, 1, 0 ] at exit.
+             CAUTION: there is some literature on suffix arrays
+             that expects the suffix_array size to be n + 1, not n.
     @param [in] max_symbol  A number that must be >= the largest
              number that might be in `text_array`.  The work done
              is O(seq_len + max_symbol), so it is not advisable