diff --git a/data/openwebtext/prepare.py b/data/openwebtext/prepare.py
index 2a9b9752b7..f7ed164ecc 100644
--- a/data/openwebtext/prepare.py
+++ b/data/openwebtext/prepare.py
@@ -1,6 +1,7 @@
 # saves the openwebtext dataset to a binary file for training. following was helpful:
 # https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
+import multiprocessing
 import os
 from tqdm import tqdm
 import numpy as np
 import tiktoken
@@ -9,7 +10,7 @@
 # number of workers in .map() call
 # good number to use is ~order number of cpu cores // 2
-num_proc = 8
+num_proc = multiprocessing.cpu_count() // 2
 
 # number of workers in load_dataset() call
 # best number might be different from num_proc above as it also depends on NW speed.
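
For context, a minimal sketch of how `num_proc` is consumed downstream. The `.map()` and `load_dataset()` call sites referenced in the comments are not shown in this hunk, so the calls below (dataset name, the placeholder map function) are assumptions, not the script's actual tokenization code. One edge case worth flagging: on a single-core machine `multiprocessing.cpu_count() // 2` evaluates to 0, which `datasets` rejects as a `num_proc` value, so clamping with `max(1, ...)` may be worth folding into this change.

```python
import multiprocessing

from datasets import load_dataset

# ~half the cpu cores is a sensible default for CPU-bound preprocessing,
# but clamp to 1 so single-core machines don't pass num_proc=0 to datasets
num_proc = max(1, multiprocessing.cpu_count() // 2)

if __name__ == "__main__":
    # hypothetical call sites mirroring the comments in prepare.py;
    # the real script tokenizes with tiktoken before writing the .bin file
    dataset = load_dataset("openwebtext", num_proc=num_proc)
    lengths = dataset.map(
        lambda ex: {"n_chars": len(ex["text"])},  # stand-in for the real tokenize fn
        desc="counting characters",
        num_proc=num_proc,
    )
```

The `if __name__ == "__main__":` guard matters here: with `num_proc > 1`, `datasets` spawns worker processes, and on platforms that use the spawn start method an unguarded module body would be re-executed in each worker.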