-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
38 lines (33 loc) · 1.16 KB
/
preprocess.py
File metadata and controls
38 lines (33 loc) · 1.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import pandas as pd
import json
from parsers import parse_sentence
def preprocess(parquet_file: str, output_path: str, max_rows: "int | None" = 2000):
    """Parse English/French sentence pairs from a parquet file into JSON Lines.

    Reads ``parquet_file``, calls ``parse_sentence`` on the ``english`` column
    (language code ``'en'``) and the ``non_english`` column (language code
    ``'fr'``) of each row, and writes one JSON object per successfully parsed
    row to ``output_path``. Each object has the keys ``en_raw``, ``en_parse``,
    ``fr_raw`` and ``fr_parse``. Rows for which either parse raises are
    reported on stdout and skipped.

    Args:
        parquet_file: Path of the parquet file to read.
        output_path: Path of the JSON Lines file to write; an existing file
            is overwritten.
        max_rows: Only the first ``max_rows`` rows are processed. Defaults to
            2000, the limit that used to be hard-coded. Pass ``None`` to
            process every row.

    Raises:
        KeyError: If the ``english`` or ``non_english`` column is missing.
    """
    print("loading data")
    df = pd.read_parquet(parquet_file)
    if max_rows is not None:
        df = df.head(max_rows)

    # Pull the two columns out before opening the output file, so a missing
    # column fails before the existing output is truncated. Iterating the
    # columns directly also avoids building a Series per row (iterrows).
    rows = zip(df.index, df["english"], df["non_english"])

    count = 0
    skipped = 0
    with open(output_path, 'w', encoding='utf-8') as file:
        for count, (label, en_text, fr_text) in enumerate(rows, start=1):
            try:
                en_parse = parse_sentence('en', en_text)
                fr_parse = parse_sentence('fr', fr_text)
            except Exception as error:
                # Deliberately broad: one unparseable sentence must not abort
                # the whole run. The row is reported and left out.
                skipped += 1
                print(f"could not parse row {label}: {error}")
            else:
                output_data = {
                    "en_raw": en_text,
                    "en_parse": en_parse,
                    "fr_raw": fr_text,
                    "fr_parse": fr_parse,
                }
                file.write(json.dumps(output_data) + '\n')

            # Progress is based on a running count rather than the index
            # label, so it works for any index type and is not lost when the
            # row at a multiple of 100 happens to be skipped.
            if count % 100 == 0:
                print(f"processed {count} rows")

    print(f"wrote {count - skipped} rows, skipped {skipped}")
    print("done!")
if __name__ == "__main__":
    # Script entry point: parse the first raw training shard and write the
    # result to the processed-data directory.
    preprocess(
        parquet_file='data/raw/train-00000-of-00002.parquet',
        output_path='data/processed/parsed_data.jsonl',
    )