|
| 1 | +from colbert.infra.run import Run |
| 2 | +import os |
| 3 | +import ujson |
| 4 | + |
| 5 | +from colbert.evaluation.loaders import load_queries |
| 6 | + |
| 7 | +# TODO: Look up path in some global [per-thread or thread-safe] list. |
| 8 | +# TODO: path could be a list of paths...? But then how can we tell it's not a list of queries.. |
| 9 | + |
| 10 | + |
class Queries:
    """Mapping-like collection of query texts keyed by query ID.

    The collection is populated either from an in-memory ``dict`` (``data``)
    or from a file (``path``). Two attributes hold the state:

    * ``self.data`` maps each qid to its query text.
    * ``self._qas`` maps each qid to its full QA record (a dict that has at
      least a ``'question'`` key). It exists only when QA records were
      loaded; use :meth:`qas` for safe access.
    """

    def __init__(self, path=None, data=None):
        """Load queries from ``data`` if given, otherwise from ``path``.

        Args:
            path: File to load when ``data`` is None. A path ending in
                ``.tsv`` is handed to ``load_queries``; any other path is
                read as one JSON object per line. Also stored as the
                provenance of the collection.
            data: Dict mapping qid to either a query string or a QA dict
                with a ``'question'`` key. Takes precedence over ``path``.

        Raises:
            AssertionError: if ``data`` is given but is not a dict.
            ValueError: if neither ``path`` nor ``data`` is given.
        """
        self.path = path

        if data is not None:
            assert isinstance(data, dict), type(data)
            self._load_data(data)
        elif path is not None:
            self._load_file(path)
        else:
            # Fail with a clear message instead of an AttributeError raised
            # from calling ``None.endswith`` inside ``_load_file``.
            raise ValueError("Queries requires either `path` or `data`")

    def __len__(self):
        """Return the number of queries."""
        return len(self.data)

    def __iter__(self):
        """Iterate over ``(qid, query)`` pairs (not over bare keys)."""
        return iter(self.data.items())

    def provenance(self):
        """Return the path given at construction (None for in-memory data)."""
        return self.path

    def _load_data(self, data):
        """Populate the collection from a dict.

        Dict-valued entries are treated as QA records: their ``'question'``
        value becomes the query text and the whole record is kept in
        ``self._qas``. Any other value is stored as the query text directly.

        Returns:
            None if ``data`` is None (nothing is loaded), otherwise True.
        """
        if data is None:
            return None

        self.data = {}
        self._qas = {}

        for qid, content in data.items():
            if isinstance(content, dict):
                self.data[qid] = content['question']
                self._qas[qid] = content
            else:
                self.data[qid] = content

        # Keep the existing convention: the attribute is absent when no QA
        # records were seen. ``qas()`` and ``save_qas()`` tolerate its absence.
        if not self._qas:
            del self._qas

        return True

    def _load_file(self, path):
        """Populate the collection from ``path``.

        ``.tsv`` files are delegated to ``load_queries`` (presumably
        returning a qid -> text mapping; confirm against that helper). Any
        other file is parsed line by line as JSON objects that must carry
        ``'qid'`` and ``'question'`` keys.

        Returns:
            True once loading has finished, on both branches.

        Raises:
            AssertionError: if a qid appears more than once in a JSON file.
        """
        if path.endswith('.tsv'):
            self.data = load_queries(path)
            return True

        # Load QAs
        self.data = {}
        self._qas = {}

        with open(path) as f:
            for line in f:
                qa = ujson.loads(line)
                qid = qa['qid']

                assert qid not in self.data, qid
                self.data[qid] = qa['question']
                self._qas[qid] = qa

        return True

    def qas(self):
        """Return a shallow copy of the qid -> QA-record mapping.

        Returns an empty dict when no QA records were loaded (for example
        after loading a ``.tsv`` file or plain-string data).
        """
        return dict(getattr(self, '_qas', {}))

    def __getitem__(self, key):
        """Return the query text stored under ``key``."""
        return self.data[key]

    def keys(self):
        """Return a view of the query IDs."""
        return self.data.keys()

    def values(self):
        """Return a view of the query texts."""
        return self.data.values()

    def items(self):
        """Return a view of ``(qid, query)`` pairs."""
        return self.data.items()

    def save(self, new_path):
        """Write the queries as ``qid<TAB>query`` lines via ``Run().open``.

        Args:
            new_path: Destination; must end in ``.tsv`` and must not exist.

        Returns:
            The ``name`` attribute of the file object returned by
            ``Run().open``.

        Raises:
            AssertionError: on a wrong extension or an existing path.
        """
        assert new_path.endswith('.tsv')
        # NOTE(review): existence is checked on ``new_path`` as given, while
        # the file is opened through ``Run().open`` -- confirm both resolve
        # to the same location.
        assert not os.path.exists(new_path), new_path

        with Run().open(new_path, 'w') as f:
            for qid, content in self.data.items():
                f.write(f'{qid}\t{content}\n')

            return f.name

    def save_qas(self, new_path):
        """Write the QA records to ``new_path``, one JSON object per line.

        Each written record carries its ``'qid'``. The stored records are
        not modified; a copy is serialised instead. When there are no QA
        records an empty file is written.

        Args:
            new_path: Destination; must end in ``.json`` and must not exist.

        Raises:
            AssertionError: on a wrong extension or an existing path.
        """
        assert new_path.endswith('.json')
        assert not os.path.exists(new_path), new_path

        with open(new_path, 'w') as f:
            for qid, qa in getattr(self, '_qas', {}).items():
                # Copy so the caller's dict is not mutated; key order is the
                # same as assigning ``qa['qid'] = qid`` would produce.
                f.write(ujson.dumps({**qa, 'qid': qid}) + '\n')

    def _load_tsv(self, path):
        """Placeholder; not implemented."""
        raise NotImplementedError

    def _load_jsonl(self, path):
        """Placeholder; not implemented."""
        raise NotImplementedError

    @classmethod
    def cast(cls, obj):
        """Coerce ``obj`` into an instance of this class.

        * ``str`` is treated as a path and loaded.
        * ``dict`` / ``list`` is passed to the constructor as ``data``
          (the constructor only accepts dicts, so a list is rejected there
          with an AssertionError).
        * an existing instance is returned unchanged.

        Raises:
            AssertionError: if ``obj`` has any other type.
        """
        if isinstance(obj, str):
            return cls(path=obj)

        if isinstance(obj, (dict, list)):
            return cls(data=obj)

        if isinstance(obj, cls):
            return obj

        # Raised explicitly: a bare ``assert False`` is removed under
        # ``python -O`` and would let this method silently return None.
        raise AssertionError(f"obj has type {type(obj)} which is not compatible with cast()")
| 119 | + |
| 120 | + |
| 121 | +# class QuerySet: |
| 122 | +# def __init__(self, *paths, renumber=False): |
| 123 | +# self.paths = paths |
| 124 | +# self.original_queries = [load_queries(path) for path in paths] |
| 125 | + |
| 126 | +# if renumber: |
| 127 | +# self.queries = flatten([q.values() for q in self.original_queries]) |
| 128 | +# self.queries = {idx: text for idx, text in enumerate(self.queries)} |
| 129 | + |
| 130 | +# else: |
| 131 | +# self.queries = {} |
| 132 | + |
| 133 | +# for queries in self.original_queries: |
| 134 | +# assert len(set.intersection(set(queries.keys()), set(self.queries.keys()))) == 0, \ |
| 135 | +# "renumber=False requires non-overlapping query IDs" |
| 136 | + |
| 137 | +# self.queries.update(queries) |
| 138 | + |
| 139 | +# assert len(self.queries) == sum(map(len, self.original_queries)) |
| 140 | + |
| 141 | +# def todict(self): |
| 142 | +# return dict(self.queries) |
| 143 | + |
| 144 | +# def tolist(self): |
| 145 | +# return list(self.queries.values()) |
| 146 | + |
| 147 | +# def query_sets(self): |
| 148 | +# return self.original_queries |
| 149 | + |
| 150 | +# def split_rankings(self, rankings): |
| 151 | +# assert type(rankings) is list |
| 152 | +# assert len(rankings) == len(self.queries) |
| 153 | + |
| 154 | +# sub_rankings = [] |
| 155 | +# offset = 0 |
| 156 | +# for source in self.original_queries: |
| 157 | +# sub_rankings.append(rankings[offset:offset+len(source)]) |
| 158 | +# offset += len(source) |
| 159 | + |
| 160 | +# return sub_rankings |
0 commit comments