
Commit f232bde

Merge commit with 2 parents: cce4e89 + 3df30ae

6 files changed: +8792 −13 lines

README.md (+4)

```diff
@@ -3,6 +3,8 @@
 * (1/29/23) We have merged a new index updater feature and support for additional Hugging Face models! These are in beta so please give us feedback as you try them out.
 * (1/24/23) If you're looking for the **DSP** framework for composing ColBERTv2 and LLMs, it's at: https://github.com/stanfordnlp/dsp
 
+[<img align="center" src="https://colab.research.google.com/assets/colab-badge.svg" />](https://colab.research.google.com/github/stanford-futuredata/ColBERT/blob/main/docs/intro2new.ipynb)
+
 # ColBERT (v2)
 
 ### ColBERT is a _fast_ and _accurate_ retrieval model, enabling scalable BERT-based search over large text collections in tens of milliseconds.
@@ -66,6 +68,8 @@ Below, we illustrate these steps via an example run on the MS MARCO Passage Rank
 
 ## API Usage Notebook
 
+**NEW**: We have an experimental notebook on [Google Colab](https://colab.research.google.com/github/stanford-futuredata/ColBERT/blob/main/docs/intro2new.ipynb) that you can use with free GPUs. Indexing 10,000 passages on the free Colab T4 GPU takes six minutes.
+
 This Jupyter notebook **[docs/intro.ipynb notebook](docs/intro.ipynb)** illustrates using the key features of ColBERT with the new Python API.
 
 It includes how to download the ColBERTv2 model checkpoint trained on MS MARCO Passage Ranking and how to download our new LoTTE benchmark.
```
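For readers who haven't opened the notebook, the Python API it demonstrates follows an index-then-search flow. A minimal sketch under stated assumptions: `collection.tsv` is a placeholder tab-separated passage file, and `colbert-ir/colbertv2.0` is the public ColBERTv2 checkpoint on Hugging Face.

```python
# Minimal sketch of the index-then-search flow the notebook covers.
# Assumptions: collection.tsv is a local "pid \t passage" file (placeholder),
# and colbert-ir/colbertv2.0 is the public ColBERTv2 checkpoint.
from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig

if __name__ == '__main__':
    with Run().context(RunConfig(nranks=1, experiment="demo")):
        # Build a compressed ColBERTv2 index over the collection.
        indexer = Indexer(checkpoint="colbert-ir/colbertv2.0")
        indexer.index(name="demo.index", collection="collection.tsv", overwrite=True)

        # Load the index and retrieve the top-3 passages for a query.
        searcher = Searcher(index="demo.index")
        pids, ranks, scores = searcher.search("what does ColBERT stand for?", k=3)
```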

colbert/modeling/checkpoint.py (+3 −3)

```diff
@@ -40,13 +40,13 @@ def doc(self, *args, to_cpu=False, **kw_args):
 
             return D
 
-    def queryFromText(self, queries, bsize=None, to_cpu=False, context=None):
+    def queryFromText(self, queries, bsize=None, to_cpu=False, context=None, full_length_search=False):
         if bsize:
-            batches = self.query_tokenizer.tensorize(queries, context=context, bsize=bsize)
+            batches = self.query_tokenizer.tensorize(queries, context=context, bsize=bsize, full_length_search=full_length_search)
             batches = [self.query(input_ids, attention_mask, to_cpu=to_cpu) for input_ids, attention_mask in batches]
             return torch.cat(batches)
 
-        input_ids, attention_mask = self.query_tokenizer.tensorize(queries, context=context)
+        input_ids, attention_mask = self.query_tokenizer.tensorize(queries, context=context, full_length_search=full_length_search)
         return self.query(input_ids, attention_mask)
 
     def docFromText(self, docs, bsize=None, keep_dims=True, to_cpu=False, showprogress=False, return_tokens=False):
```
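At this layer the new flag is simply threaded through to the query tokenizer. A hedged sketch of exercising it directly, assuming the public `colbert-ir/colbertv2.0` checkpoint and a default `ColBERTConfig` (both assumptions, not part of the diff):

```python
from colbert.infra import ColBERTConfig
from colbert.modeling.checkpoint import Checkpoint

# Assumptions: colbert-ir/colbertv2.0 checkpoint, default ColBERTConfig.
ckpt = Checkpoint("colbert-ir/colbertv2.0", colbert_config=ColBERTConfig())

# full_length_search lifts the query_maxlen truncation; per the tokenizer
# change below, it is only supported for a single-query batch.
Q = ckpt.queryFromText(["one very long natural-language query ..."],
                       full_length_search=True)
print(Q.shape)  # (1, padded_query_length, dim)
```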

colbert/modeling/tokenization/query_tokenization.py (+21 −2)

```diff
@@ -48,14 +48,29 @@ def encode(self, batch_text, add_special_tokens=False):
 
         return ids
 
-    def tensorize(self, batch_text, bsize=None, context=None):
+    def tensorize(self, batch_text, bsize=None, context=None, full_length_search=False):
         assert type(batch_text) in [list, tuple], (type(batch_text))
 
         # add placeholder for the [Q] marker
         batch_text = ['. ' + x for x in batch_text]
 
+        # Full-length search is only available for single inference (for now);
+        # batched full-length search requires far deeper changes to the code base.
+        assert(full_length_search == False or (type(batch_text) == list and len(batch_text) == 1))
+
+        if full_length_search:
+            # Tokenize each string in the batch
+            un_truncated_ids = self.tok(batch_text, add_special_tokens=False)['input_ids']
+            # Get the longest length in the batch
+            max_length_in_batch = max(len(x) for x in un_truncated_ids)
+            # Set the max length
+            max_length = self.max_len(max_length_in_batch)
+        else:
+            # Max length is the default max length from the config
+            max_length = self.query_maxlen
+
         obj = self.tok(batch_text, padding='max_length', truncation=True,
-                       return_tensors='pt', max_length=self.query_maxlen)
+                       return_tensors='pt', max_length=max_length)
 
         ids, mask = obj['input_ids'], obj['attention_mask']
 
@@ -95,3 +110,7 @@ def tensorize(self, batch_text, bsize=None, context=None):
             print()
 
         return ids, mask
+
+    # Ensure that query_maxlen <= length <= 500 tokens
+    def max_len(self, length):
+        return min(500, max(self.query_maxlen, length))
```
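The `max_len` helper is the heart of the change: the untruncated query length is raised to at least `query_maxlen` and capped at 500 tokens. A standalone sketch of the same arithmetic, assuming the usual default `query_maxlen` of 32:

```python
QUERY_MAXLEN = 32  # assumed config default; the real value comes from the ColBERT config
HARD_CAP = 500     # upper bound enforced by max_len

def max_len(length: int) -> int:
    # Ensure that query_maxlen <= length <= 500 tokens, mirroring the commit.
    return min(HARD_CAP, max(QUERY_MAXLEN, length))

assert max_len(10) == 32     # short queries are still padded to query_maxlen
assert max_len(300) == 300   # long queries pass through untruncated
assert max_len(1200) == 500  # extreme queries are capped at 500 tokens
```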

colbert/searcher.py (+6 −6)

```diff
@@ -46,24 +46,24 @@ def __init__(self, index, checkpoint=None, collection=None, config=None):
     def configure(self, **kw_args):
         self.config.configure(**kw_args)
 
-    def encode(self, text: TextQueries):
+    def encode(self, text: TextQueries, full_length_search=False):
         queries = text if type(text) is list else [text]
         bsize = 128 if len(queries) > 128 else None
 
         self.checkpoint.query_tokenizer.query_maxlen = self.config.query_maxlen
-        Q = self.checkpoint.queryFromText(queries, bsize=bsize, to_cpu=True)
+        Q = self.checkpoint.queryFromText(queries, bsize=bsize, to_cpu=True, full_length_search=full_length_search)
 
         return Q
 
-    def search(self, text: str, k=10, filter_fn=None):
-        Q = self.encode(text)
+    def search(self, text: str, k=10, filter_fn=None, full_length_search=False):
+        Q = self.encode(text, full_length_search=full_length_search)
         return self.dense_search(Q, k, filter_fn=filter_fn)
 
-    def search_all(self, queries: TextQueries, k=10, filter_fn=None):
+    def search_all(self, queries: TextQueries, k=10, filter_fn=None, full_length_search=False):
         queries = Queries.cast(queries)
         queries_ = list(queries.values())
 
-        Q = self.encode(queries_)
+        Q = self.encode(queries_, full_length_search=full_length_search)
 
         return self._search_all_Q(queries, Q, k, filter_fn=filter_fn)
```
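End to end, the feature is opt-in at search time. A hedged usage sketch against an existing index (the index name and query text are placeholders; as the tokenizer assertion above enforces, `full_length_search=True` currently applies to single queries only):

```python
from colbert import Searcher

searcher = Searcher(index="msmarco.nbits=2")  # placeholder index name

long_query = "a question that runs well past the 32-token default " * 10

# Default behavior: the query is truncated to config.query_maxlen tokens.
pids, ranks, scores = searcher.search(long_query, k=10)

# New behavior: keep the full query, up to the 500-token cap.
pids, ranks, scores = searcher.search(long_query, k=10, full_length_search=True)
```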
