-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path1 - parse_data.py
110 lines (78 loc) · 3.01 KB
/
1 - parse_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# take data from raw csv files and
# parse into a list (food ids) and numpy array (serving size, nutrition...)
import numpy as np
import csv
import _pickle as cPickle
# 1.Create list of all food ID numbers (NDB_No) and name (dicts)
food_list = []
file1 = open('Data/FOOD_DES.csv', encoding = 'utf-8')
csv_reader = csv.reader(file1,delimiter = ',')
for item in csv_reader:
food_item = {}
food_item["food_id"] = item[0]
food_item["food_name"] = item[2]
food_list.append(food_item)
print("All food items appended to food_list")
file1.close()
# 2. Go through nutrient list and for each food-nutrient combo, append to matching dict
food_nutrient_list = []
# each line of Nutrients.csv:
# ['NDB_No', 'Nutrient_Code', 'Nutrient_name', 'Derivation_Code', 'Output_value', 'Output_uom']
file2 = open('Data/NUT_DATA.csv', encoding = 'utf-8')
csv_reader = csv.reader(file2, delimiter = ',')
for item in csv_reader:
#represents one food and nutrient pair
food_nutrient = {}
# append food id, nutrient id, ammount
food_nutrient["food_id"] = item[0]
food_nutrient["nutrient_id"] = item[1]
food_nutrient["value"] = item[2]
food_nutrient_list.append(food_nutrient)
print("All food nutrient pairs appended to food_nutrient_list")
file2.close()
# 3. Create list of all nutrients
# used to store unique nutrients
all_nutrients = []
file3 = open('Data/NUTR_DEF.csv', encoding = 'utf-8')
csv_reader = csv.reader(file3, delimiter = ',')
for item in csv_reader:
nutrient = {}
nutrient["nutrient_id"] = item[0]
nutrient["unit"] = item[1]
nutrient["tag"] = item[2]
nutrient["nutrient_name"] = item[3]
all_nutrients.append(nutrient)
print("All unique nutrients identified")
# 4. Create numpy array for all nutrients, all foods
# First remove first row (labels) for each stored list
all_nutrients = all_nutrients[1:]
food_list = food_list[1:]
food_nutrient_list = food_nutrient_list[1:]
food_nutrient_mat = np.zeros([len(food_list),len(all_nutrients)])
# 5. Transfer info from dictionary to numpy list
# for each item in the food_nutrient list
count = 0
for item in food_nutrient_list:
food_id = 0
nutrient_id = 0
#iterate through all food, nutrient combos and if match, transfer to array
for i in range(0,len(food_list)):
if item["food_id"] == food_list[i]["food_id"]:
food_id = i
break
for j in range(0,len(all_nutrients)):
# if food and nutrient match the food and nutrient at j and i, respectively
if item["nutrient_id"] == all_nutrients[j]["nutrient_id"]:
nutrient_id = j
break
food_nutrient_mat[food_id,nutrient_id] = item["value"]
count = count + 1
if count % 1000 == 0:
print("On data point {}".format(count))
# 6. Create label lists for numpy array
# 7. Pickle these final data structure
# Commented for safety
#f = open("Nutrition_Data_Matrix.cpkl", 'wb')
#cPickle.dump((food_list,all_nutrients,food_nutrient_mat),f)
#f.close()
#print("All data pickled.")