-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbench.py
31 lines (25 loc) · 1.51 KB
/
bench.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import rottnest
import pyarrow
from tqdm import tqdm
import polars
import sys
# Benchmark script: read a batch of indexed Parquet pages through rottnest
# and print the length of the combined result.
#
# Usage: python bench.py <row_count>
#   <row_count> is parsed with int() and used as the upper bound of the slice
#   taken from bench.parquet.

# Page descriptors to fetch. The columns read further down are "filename",
# "page_offset_right" and "page_byte_size".
bench_table = polars.read_parquet("bench.parquet")
metadata = bench_table[:int(sys.argv[1])]

# The commented-out snippet below writes metadata.parquet with the same
# "filename" / "metadata_bytes" columns that are loaded afterwards; it is kept
# for reference only (presumably the one-off generation step -- confirm).
#
# metadatas = []
# filenames = []
# for i in tqdm(range(10)):
#     filename = f"s3://redpajama-1t/c4/c4-train.0000{i}-of-01024.parquet"
#     x, y = rottnest.rottnest.get_parquet_layout("text", filename, "aws")
#     filenames.append(filename)
#     metadatas.append(y.metadata_bytes)
# polars.from_dict({"filename": filenames, "metadata_bytes": metadatas}).write_parquet("metadata.parquet")

# Lookup table mapping each "filename" value to its "metadata_bytes" value.
metadata_table = polars.read_parquet("metadata.parquet")
file_metadata = dict(zip(metadata_table["filename"], metadata_table["metadata_bytes"]))

# Build the positional arguments one by one, in the order they are passed.
relative_names = metadata["filename"].to_list()
page_urls = ["s3://redpajama-1t/" + name for name in relative_names]
# NOTE(review): the meaning of the two all-zero lists and of the trailing
# False flag is defined by rottnest.read_indexed_pages -- confirm against its
# signature before changing them.
leading_zeros = [0] * len(metadata["filename"])
page_offsets = metadata["page_offset_right"].to_list()
page_sizes = metadata["page_byte_size"].to_list()
trailing_zeros = [0] * len(metadata["filename"])

pages = rottnest.rottnest.read_indexed_pages(
    "text",
    page_urls,
    leading_zeros,
    page_offsets,
    page_sizes,
    trailing_zeros,
    "aws",
    file_metadata,
    False,
)

# Combine whatever read_indexed_pages returned into a single chunked array
# and report its length.
result = pyarrow.chunked_array(pages)
print(len(result))