From f63c351e3bace1f383f1080c0dc37c827a2cbcc8 Mon Sep 17 00:00:00 2001 From: David Heyman Date: Sun, 2 Apr 2017 11:07:47 -0400 Subject: [PATCH 1/3] Fill and write each array before creating the next one, to save memory. --- scripts/preprocess.py | 47 ++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/scripts/preprocess.py b/scripts/preprocess.py index 90b834b6..e3cb3076 100644 --- a/scripts/preprocess.py +++ b/scripts/preprocess.py @@ -49,29 +49,34 @@ if not args.quiet: print 'Using dtype ', dtype - # Just load data into memory ... we'll have to do something more clever - # for huge datasets but this should be fine for now - train = np.zeros(train_size, dtype=dtype) - val = np.zeros(val_size, dtype=dtype) - test = np.zeros(test_size, dtype=dtype) - splits = [train, val, test] - - # Go through the file again and write data to numpy arrays - split_idx, cur_idx = 0, 0 + # Create, fill, and store each dataset, + # one at a time to save memory with codecs.open(args.input_txt, 'r', args.encoding) as f: - for line in f: - for char in line: - splits[split_idx][cur_idx] = token_to_idx[char] - cur_idx += 1 - if cur_idx == splits[split_idx].size: - split_idx += 1 - cur_idx = 0 + with h5py.File(args.output_h5, 'w') as h: + def fill_and_store(arr_size, set_name): + """Create a one-dimensional numpy array + of the given size, fill it, + and write the result to h under the given name. + + Leaves the source file advanced as far + as it had to go to fill the array. + + If the remaining part of the file is shorter + than arr_size, the remainder of the array is + filled with zeroes. + """ + arr = np.zeros(arr_size, dtype=dtype) + for idx in xrange(arr_size): + char = f.read(1) + if not char: + break + arr[idx] = token_to_idx[char] + + h.create_dataset(set_name, data=arr) - # Write data to HDF5 file - with h5py.File(args.output_h5, 'w') as f: - f.create_dataset('train', data=train) - f.create_dataset('val', data=val) - f.create_dataset('test', data=test) + fill_and_store(train_size, 'train') + fill_and_store(val_size, 'val') + fill_and_store(test_size, 'test') # For 'bytes' encoding, replace non-ascii characters so the json dump # doesn't crash From 828780203360f406247a53bf1d2643706a5f8e00 Mon Sep 17 00:00:00 2001 From: David Heyman Date: Sun, 2 Apr 2017 11:49:48 -0400 Subject: [PATCH 2/3] allow use of dtype uint64 and uint16 in preprocessing --- scripts/preprocess.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/preprocess.py b/scripts/preprocess.py index e3cb3076..798d8ddc 100644 --- a/scripts/preprocess.py +++ b/scripts/preprocess.py @@ -45,7 +45,13 @@ # Choose the datatype based on the vocabulary size dtype = np.uint8 if len(token_to_idx) > 255: - dtype = np.uint32 + if len(token_to_idx) > 65535: + if len(token_to_idx) > 4294967295: + dtype = np.uint64 + else: + dtype = np.uint32 + else: + dtype = np.uint16 if not args.quiet: print 'Using dtype ', dtype From 7de2c74d12d89d8e06076c8d518deabfa77cbac0 Mon Sep 17 00:00:00 2001 From: David Heyman Date: Sun, 2 Apr 2017 18:58:12 -0400 Subject: [PATCH 3/3] spacing and branching cleanup as requested --- scripts/preprocess.py | 56 +++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/scripts/preprocess.py b/scripts/preprocess.py index 798d8ddc..e8ec8103 100644 --- a/scripts/preprocess.py +++ b/scripts/preprocess.py @@ -43,15 +43,15 @@ print ' Test size: %d' % test_size # Choose the datatype based on the vocabulary size - dtype = np.uint8 - if len(token_to_idx) > 255: - if len(token_to_idx) > 65535: - if len(token_to_idx) > 4294967295: - dtype = np.uint64 - else: - dtype = np.uint32 - else: - dtype = np.uint16 + if len(token_to_idx) > 4294967295: + dtype = np.uint64 + elif len(token_to_idx) > 65535: + dtype = np.uint32 + elif len(token_to_idx) > 255: + dtype = np.uint16 + else: + dtype = np.uint8 + if not args.quiet: print 'Using dtype ', dtype @@ -60,25 +60,25 @@ with codecs.open(args.input_txt, 'r', args.encoding) as f: with h5py.File(args.output_h5, 'w') as h: def fill_and_store(arr_size, set_name): - """Create a one-dimensional numpy array - of the given size, fill it, - and write the result to h under the given name. - - Leaves the source file advanced as far - as it had to go to fill the array. - - If the remaining part of the file is shorter - than arr_size, the remainder of the array is - filled with zeroes. - """ - arr = np.zeros(arr_size, dtype=dtype) - for idx in xrange(arr_size): - char = f.read(1) - if not char: - break - arr[idx] = token_to_idx[char] - - h.create_dataset(set_name, data=arr) + """Create a one-dimensional numpy array + of the given size, fill it, + and write the result to h under the given name. + + Leaves the source file advanced as far + as it had to go to fill the array. + + If the remaining part of the file is shorter + than arr_size, the remainder of the array is + filled with zeroes. + """ + arr = np.zeros(arr_size, dtype=dtype) + for idx in xrange(arr_size): + char = f.read(1) + if not char: + break + arr[idx] = token_to_idx[char] + + h.create_dataset(set_name, data=arr) fill_and_store(train_size, 'train') fill_and_store(val_size, 'val')