
Commit 397988d

Author: Omar Khattab
Add colbert/data

1 parent 4120feb · commit 397988d

File tree

7 files changed: +433 −4 lines

.gitignore (+12 −4)

@@ -1,7 +1,10 @@
-experiments/
-checkpoints/
-data/
-logs/
+/experiments/
+/checkpoints/
+/data/
+/logs/
+/mlruns/
+/profiler/
+/logs/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -10,6 +13,11 @@ __pycache__/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+# notebooks/
 
 # mac
 .DS_Store
+
+# Other
+.vscode
+*.tsv

colbert/data/__init__.py (new file, +5 lines)

from .collection import *
from .queries import *

from .ranking import *
from .examples import *

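A brief usage note, not part of the commit: with these wildcard imports in place, the data classes can be pulled in directly from the package. A minimal sketch, assuming the colbert package is importable (including the .ranking module referenced above):

# Hypothetical usage; assumes the colbert package and its ranking module are importable.
from colbert.data import Collection, Queries, Examples
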
colbert/data/collection.py (new file, +97 lines)


# Could be .tsv or .json. The latter always allows more customization via optional parameters.
# I think it could be worth doing some kind of parallel reads too, if the file exceeds 1 GiBs.
# Just need to use a datastructure that shares things across processes without too much pickling.
# I think multiprocessing.Manager can do that!

import os
import itertools

from colbert.evaluation.loaders import load_collection
from colbert.infra.run import Run


class Collection:
    def __init__(self, path=None, data=None):
        self.path = path
        self.data = data or self._load_file(path)

    def __iter__(self):
        # TODO: If __data isn't there, stream from disk!
        return self.data.__iter__()

    def __getitem__(self, item):
        # TODO: Load from disk the first time this is called. Unless self.data is already not None.
        return self.data[item]

    def __len__(self):
        # TODO: Load here too. Basically, let's make data a property function and, on first call, either load or get __data.
        return len(self.data)

    def _load_file(self, path):
        self.path = path
        return self._load_tsv(path) if path.endswith('.tsv') else self._load_jsonl(path)

    def _load_tsv(self, path):
        return load_collection(path)

    def _load_jsonl(self, path):
        raise NotImplementedError()

    def provenance(self):
        return self.path

    def save(self, new_path):
        assert new_path.endswith('.tsv'), "TODO: Support .json[l] too."
        assert not os.path.exists(new_path), new_path

        with Run().open(new_path, 'w') as f:
            # TODO: expects content to always be a string here; no separate title!
            for pid, content in enumerate(self.data):
                content = f'{pid}\t{content}\n'
                f.write(content)

            return f.name

    def enumerate(self, rank):
        for _, offset, passages in self.enumerate_batches(rank=rank):
            for idx, passage in enumerate(passages):
                yield (offset + idx, passage)

    def enumerate_batches(self, rank, chunksize=None):
        assert rank is not None, "TODO: Add support for the rank=None case."

        chunksize = chunksize or self.get_chunksize()

        offset = 0
        iterator = iter(self)

        for chunk_idx, owner in enumerate(itertools.cycle(range(Run().nranks))):
            L = [line for _, line in zip(range(chunksize), iterator)]

            if len(L) > 0 and owner == rank:
                yield (chunk_idx, offset, L)

            offset += len(L)

            if len(L) < chunksize:
                return

    def get_chunksize(self):
        return min(25_000, 1 + len(self) // Run().nranks)  # 25k is great, 10k allows things to reside on GPU??

    @classmethod
    def cast(cls, obj):
        if type(obj) is str:
            return cls(path=obj)

        if type(obj) is list:
            return cls(data=obj)

        if type(obj) is cls:
            return obj

        assert False, f"obj has type {type(obj)} which is not compatible with cast()"


# TODO: Look up path in some global [per-thread or thread-safe] list.

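A minimal usage sketch, not part of the commit, with made-up passages and assuming the colbert package is importable: Collection.cast accepts a .tsv path, an in-memory list of passage strings, or an existing Collection, and the resulting object supports len(), indexing, and iteration.

# Illustrative only; the passages are made-up.
from colbert.data.collection import Collection

passages = ['first passage text', 'second passage text', 'third passage text']

collection = Collection.cast(passages)    # wraps the in-memory list
collection = Collection.cast(collection)  # casting an existing Collection returns it unchanged

print(len(collection))                    # 3
print(collection[1])                      # 'second passage text'

for pid, passage in enumerate(collection):
    print(pid, passage)

# Collection.cast('path/to/collection.tsv') would instead load pid\tpassage lines via
# load_collection(). enumerate_batches(rank=...) additionally splits the collection into
# round-robin chunks across Run().nranks processes, so it is meant to run inside a Run context.
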
colbert/data/dataset.py (new file, +14 lines)



# Not just the corpus, but also an arbitrary number of query sets, indexed by name in a dictionary/dotdict.
# And also query sets with top-k PIDs.
# QAs too? TripleSets too?


class Dataset:
    def __init__(self):
        pass

    def select(self, key):
        # Select the {corpus, queryset, tripleset, rankingset} determined by uniqueness or by key and return a "unique" dataset (e.g., for key=train)
        pass

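Dataset is only a stub here; the comments above it sketch its intended shape. A purely hypothetical illustration of that shape, not the author's implementation, assuming named query sets kept in a plain dict:

# Hypothetical sketch only; names and structure are assumptions, not part of the commit.
class DatasetSketch:
    def __init__(self, collection, query_sets):
        self.collection = collection   # the shared corpus
        self.query_sets = query_sets   # e.g., {'train': Queries(...), 'dev': Queries(...)}

    def select(self, key):
        # Pick the query set named by `key` (e.g., 'train') and pair it with the corpus.
        return self.collection, self.query_sets[key]
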
colbert/data/examples.py (new file, +64 lines)

from colbert.infra.run import Run
import os
import ujson

from colbert.utils.utils import print_message


class Examples:
    def __init__(self, path=None, data=None):
        self.path = path
        self.data = data or self._load_file(path)

    def provenance(self):
        return self.path

    def _load_file(self, path):
        examples = []

        with open(path) as f:
            for line in f:
                examples.append(ujson.loads(line))

        return examples


    def tolist(self, rank=None, nranks=None):
        """
        NOTE: For distributed sampling, this isn't equivalent to perfectly uniform sampling.
        In particular, each subset is perfectly represented in every batch! However, since we never
        repeat passes over the data, we never repeat any particular triple, and the split across
        nodes is random (since the underlying file is pre-shuffled), there's no concern here.
        """

        if rank or nranks:
            assert rank in range(nranks), (rank, nranks)
            return [self.data[idx] for idx in range(rank, len(self.data), nranks)]  # if line_idx % nranks == rank

        return list(self.data)

    def save(self, new_path):
        assert 'json' in new_path.strip('/').split('/')[-1].split('.'), "TODO: Support .json[l] too."

        print_message(f"#> Writing {len(self.data) / 1000_000.0}M examples to {new_path}")

        with Run().open(new_path, 'w') as f:
            for example in self.data:
                ujson.dump(example, f)
                f.write('\n')

            return f.name
            # print_message(f"#> Saved ranking of {len(self.data)} queries and {len(self.flat_ranking)} lines to {new_path}")

    @classmethod
    def cast(cls, obj):
        if type(obj) is str:
            return cls(path=obj)

        if isinstance(obj, list):
            return cls(data=obj)

        if type(obj) is cls:
            return obj

        assert False, f"obj has type {type(obj)} which is not compatible with cast()"

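A minimal sketch of the rank-based splitting in tolist(), not part of the commit, assuming an in-memory list of made-up training triples and that the colbert package is importable:

# Illustrative only; the (qid, positive pid, negative pid) triples are made-up.
from colbert.data.examples import Examples

triples = [[qid, qid + 100, qid + 200] for qid in range(8)]
examples = Examples.cast(triples)

# Each rank takes every nranks-th triple, so the two ranks partition the list.
print(examples.tolist(rank=0, nranks=2))   # triples 0, 2, 4, 6
print(examples.tolist(rank=1, nranks=2))   # triples 1, 3, 5, 7

print(len(examples.tolist()))              # without rank/nranks: all 8 triples
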
colbert/data/queries.py (new file, +160 lines)

from colbert.infra.run import Run
import os
import ujson

from colbert.evaluation.loaders import load_queries

# TODO: Look up path in some global [per-thread or thread-safe] list.
# TODO: path could be a list of paths...? But then how can we tell it's not a list of queries..


class Queries:
    def __init__(self, path=None, data=None):
        self.path = path

        if data:
            assert isinstance(data, dict), type(data)
        self._load_data(data) or self._load_file(path)

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        return iter(self.data.items())

    def provenance(self):
        return self.path

    def _load_data(self, data):
        if data is None:
            return None

        self.data = {}
        self._qas = {}

        for qid, content in data.items():
            if isinstance(content, dict):
                self.data[qid] = content['question']
                self._qas[qid] = content
            else:
                self.data[qid] = content

        if len(self._qas) == 0:
            del self._qas

        return True

    def _load_file(self, path):
        if path.endswith('.tsv'):
            self.data = load_queries(path)
            return True

        # Load QAs
        self.data = {}
        self._qas = {}

        with open(path) as f:
            for line in f:
                qa = ujson.loads(line)

                assert qa['qid'] not in self.data
                self.data[qa['qid']] = qa['question']
                self._qas[qa['qid']] = qa

        return self.data

    def qas(self):
        return dict(self._qas)

    def __getitem__(self, key):
        return self.data[key]

    def keys(self):
        return self.data.keys()

    def values(self):
        return self.data.values()

    def items(self):
        return self.data.items()

    def save(self, new_path):
        assert new_path.endswith('.tsv')
        assert not os.path.exists(new_path), new_path

        with Run().open(new_path, 'w') as f:
            for qid, content in self.data.items():
                content = f'{qid}\t{content}\n'
                f.write(content)

            return f.name

    def save_qas(self, new_path):
        assert new_path.endswith('.json')
        assert not os.path.exists(new_path), new_path

        with open(new_path, 'w') as f:
            for qid, qa in self._qas.items():
                qa['qid'] = qid
                f.write(ujson.dumps(qa) + '\n')

    def _load_tsv(self, path):
        raise NotImplementedError

    def _load_jsonl(self, path):
        raise NotImplementedError

    @classmethod
    def cast(cls, obj):
        if type(obj) is str:
            return cls(path=obj)

        if isinstance(obj, dict) or isinstance(obj, list):
            return cls(data=obj)

        if type(obj) is cls:
            return obj

        assert False, f"obj has type {type(obj)} which is not compatible with cast()"


# class QuerySet:
#     def __init__(self, *paths, renumber=False):
#         self.paths = paths
#         self.original_queries = [load_queries(path) for path in paths]

#         if renumber:
#             self.queries = flatten([q.values() for q in self.original_queries])
#             self.queries = {idx: text for idx, text in enumerate(self.queries)}

#         else:
#             self.queries = {}

#             for queries in self.original_queries:
#                 assert len(set.intersection(set(queries.keys()), set(self.queries.keys()))) == 0, \
#                     "renumber=False requires non-overlapping query IDs"

#                 self.queries.update(queries)

#             assert len(self.queries) == sum(map(len, self.original_queries))

#     def todict(self):
#         return dict(self.queries)

#     def tolist(self):
#         return list(self.queries.values())

#     def query_sets(self):
#         return self.original_queries

#     def split_rankings(self, rankings):
#         assert type(rankings) is list
#         assert len(rankings) == len(self.queries)

#         sub_rankings = []
#         offset = 0
#         for source in self.original_queries:
#             sub_rankings.append(rankings[offset:offset+len(source)])
#             offset += len(source)

#         return sub_rankings

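A minimal usage sketch, not part of the commit, with made-up queries and assuming the colbert package is importable: Queries accepts a dict mapping a qid either to a question string or, for QA-style data, to a dict with at least a 'question' field; qas() returns the full QA records when the latter form is present.

# Illustrative only; the queries and answers are made-up.
from colbert.data.queries import Queries

data = {
    0: 'what is late interaction in colbert',                                   # plain question
    1: {'question': 'who authored this commit', 'answers': ['Omar Khattab']},   # QA-style record
}

queries = Queries.cast(data)

print(len(queries))        # 2
print(queries[1])          # 'who authored this commit'
print(queries.qas()[1])    # the full QA dict for qid 1

for qid, question in queries:
    print(qid, question)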