-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstep3_splitting_partfiles_into_sample_files.py
47 lines (42 loc) · 1.58 KB
/
step3_splitting_partfiles_into_sample_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#############
# @authors: Roshan Bhandari, Abhijeet Amle, Abhimanyu
# This code is used to split the part files obtained from the
# map reduce into multiple sample files.
#############
import os
# Default locations used when the script is run directly.
input_dir = "/mnt/disks/localdisk/new_sampled_data/"
output_file_dir = "/mnt/disks/localdisk/new_sampled_data1/"
header_file = "/mnt/disks/localdisk/headerfile.txt"


def _load_headers(path):
    """Map each sample file name to its header row, as listed in `path`.

    Each usable line has the form ``<name>:<header>``. Every "data/" substring
    is removed from the name, and if the header part itself contains ":" the
    pieces are re-joined with ",".

    Returns a ``(headers, skipped)`` tuple, where `skipped` is the number of
    non-blank lines that had no ":" separator and were therefore ignored.
    """
    headers = {}
    skipped = 0
    with open(path) as handle:
        for raw_line in handle:
            # Drop the newline so it neither ends up in the stored header nor
            # defeats the header-vs-row comparison done when writing.
            entry = raw_line.rstrip("\n")
            if not entry.strip():
                continue
            pieces = entry.split(":")
            if len(pieces) < 2:
                skipped += 1
                continue
            headers[pieces[0].replace("data/", "")] = ",".join(pieces[1:])
    return headers, skipped


def _parse_part_line(raw_line):
    """Split one part-file line into ``(sample_name, row)``.

    A line is ``<key>\\t<value>``; the first and last character of both the
    key and the value are sliced off (presumably surrounding quotes -- verify
    against the job that produces the part files).

    Returns None for a blank line. Raises ValueError if a non-blank line has
    no tab separator.
    """
    record = raw_line.rstrip("\n")
    if not record.strip():
        return None
    # Split on the first tab only, so a value that itself contains tabs is
    # kept whole instead of breaking the two-way unpacking.
    key, tab, value = record.partition("\t")
    if not tab:
        raise ValueError(
            "part-file line has no tab separator: {!r}".format(raw_line))
    return key.strip()[1:-1], value[1:-1]


def _collect_rows(directory):
    """Group the rows of every file in `directory` by sample file name."""
    rows_by_sample = {}
    # Sorted so the row order in the output does not depend on the arbitrary
    # order in which os.listdir returns names.
    for part_name in sorted(os.listdir(directory)):
        with open(os.path.join(directory, part_name), 'r') as part:
            for raw_line in part:
                parsed = _parse_part_line(raw_line)
                if parsed is None:
                    continue
                sample_name, row = parsed
                rows_by_sample.setdefault(sample_name, []).append(row)
    return rows_by_sample


def _write_samples(rows_by_sample, headers, directory):
    """Write one file per sample: its header, then its rows.

    Rows identical to the header are dropped so the header appears only once.
    Raises KeyError if a sample has no entry in `headers`.
    """
    # Guarantee a trailing separator, then concatenate (rather than join) so a
    # sample name can never replace the output directory.
    prefix = os.path.join(directory, "")
    for sample_name, rows in rows_by_sample.items():
        # Looked up before opening, so a missing header does not leave an
        # empty, truncated output file behind.
        header = headers[sample_name]
        with open(prefix + sample_name, 'w') as out:
            out.write(header + "\n")
            out.writelines(row + "\n" for row in rows if row != header)


def main(header_path=header_file, parts_dir=input_dir,
         out_dir=output_file_dir):
    """Split the part files in `parts_dir` into per-sample files in `out_dir`.

    Headers are read from `header_path`. The number of header-file lines that
    could not be used is printed before the sample files are written.
    """
    headers, unprocessed = _load_headers(header_path)
    rows_by_sample = _collect_rows(parts_dir)
    print("Total Unprocessed Files: ", unprocessed)
    _write_samples(rows_by_sample, headers, out_dir)


if __name__ == "__main__":
    main()