#https://seaborn.pydata.org/generated/seaborn.distplot.html
import re
import front_end
import pickle
import sys
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import pos_tag, word_tokenize
from os import path
from collections import namedtuple
import solution_parser
import CMUTweetTagger
import os
import cal_calorie_given_food_name
from pprint import pprint
import nltk
import time
def load(fileName):
    # pickle files should be read/written in binary mode
    with open(fileName, 'rb') as f:
        return pickle.load(f)
def save(variable, fileName):
    with open(fileName, 'wb') as f:
        pickle.dump(variable, f)
def read_file(fileName, parser_type = None, only_files_with_solutions = False, base_accuracy_on_how_many_unique_food_items_detected = True):
write2file = ''
total_calorie = 0.0
calorie = cal_calorie_given_food_name.food_to_calorie()
#Previous versions
#foodNames = load(path.join('.', path.join('data','food_pair_dict.pickle')))
#foodNames = load('.\\data\\nltk_food_dictionary.pickle')
foodNames = load("./data/food_desc_files/food_names.pickle")
foodGroup = load("./data/food_desc_files/food_group.pickle")
langua = load("./data/food_desc_files/langua.pickle")
ark_parsed_data = ark_parser(fileName)
unique_food_names = {}
    f = open(fileName, 'r')  # open() instead of the deprecated file() builtin
current_line_number = 0
    predicted_food_labels_set = set()  # elements: (line_number, (start_index_of_food_string_on_line, end_index_of_food_string_on_line)); end indices are exclusive, as in Python slicing
solution_set_loaded = False
solution_file_path = path.join('solutions', fileName)
try:
print('loading solution set')
solution_set = solution_parser.get_solution_set_from_file(solution_file_path)
solution_set_loaded = True
except IOError:
print('no solution file found for: ' + solution_file_path)
    # if we only want files with solutions, and no solution set is found, return early so we don't need to parse the file for food words.
if only_files_with_solutions:
if not solution_set_loaded:
return "solution set not found", None
prev_time = time.time()
for line_no, i in enumerate(f): # i is the current line (a string)
calorie_text = ''
food_id_group_pairs = []
food_id_langua_pairs = []
current_line_number += 1
if i[0] == '*':
print "\n\nLine no -> ", line_no, "\n\n"
            word_char_index, word_char_index_string_format = provide_words_with_char_nos(i, line_no+1)
text = ''
i = i.lower()
print "Time taken", time.time() - prev_time
prev_time = time.time()
#i = i.split()
#for word in i:
# if word not in foodNames:
# text += word + ' '
# else:
# text += '<mark>'+word+'</mark> '
#write2file += text + '<br>'
found_at_least = 0
index_of_food_names = []
temp_i = re.sub('[^a-zA-Z0-9 \n]', ' ', i[4:])
#temp_i = i[4:]
spans_found_on_line = []
start_time_generate_pairs = time.time()
sentence_to_word_pairs = generate_pair(i)
#print "Time to generate pairs -> ", time.time() - start_time_generate_pairs
for word in foodNames:
#Generate all the pair of words of sentences
#print "test strings -> ", sentence_to_word_pairs
#pprint(sentence_to_word_pairs)
#food_pairs_edit_distance = []
start_time_generate_pairs = time.time()
edit_distance_time = time.time()
food_pairs_edit_distance = calculate_edit_distance(i, sentence_to_word_pairs, word)
edit_distance_final_time = time.time() - edit_distance_time
# if edit_distance_final_time - edit_distance_time > 0.5:
# print "Time greater than 1", food_pairs_edit_distance
#print food_pairs_edit_distance
#print "Time to Calculate edit distance -> ", time.time() - start_time_generate_pairs
#continue
if len(food_pairs_edit_distance) > 0:
#if temp_i.__contains__(' ' + word + ' '):
print "Time taken -> ", edit_distance_final_time, food_pairs_edit_distance
# print(tags)
#print word
unique_food_names[word] = 1
found_at_least = 1
# #Previous Setting
# c = i.find(word)
# index_of_food_names.append([c, c + len(word) + 1])
# #removed the plus one
# spans_found_on_line.append((c, c + len(word)))
try:
                        4/0  # intentionally raises so the calorie lookup below stays disabled
temp_calorie = calorie.cal_calorie(word)
total_calorie += temp_calorie
calorie_text += '<br><mark>'+word+"</mark>-> "+str(temp_calorie)
except:
#print sys.exc_info()
pass
#tags = pos_tag(word_tokenize(temp_i))
individual_food_words = word.split()
last_word = individual_food_words[-1]
# for word, label in tags:
# if word == last_word and check_if_noun(label):
# index_of_food_names.append([c, c + len(word) + 1])
# print('chose word: '+ word)
# pass
# else:
# continue
#Commented recently
#print(tags)
#print(individual_food_words)
# for match in re.finditer(word, i):
# food_match_indexes = match.span()
# index_of_food_names.append([food_match_indexes[0], food_match_indexes[1]])
# spans_found_on_line.append([food_match_indexes[0], food_match_indexes[1]])
for pairs in food_pairs_edit_distance:
index_of_food_names.append([pairs[2], pairs[3]])
spans_found_on_line.append([pairs[2], pairs[3]])
#Adding stuffs after reading documentation from USDA
#print ("food -> ", foodNames[word], foodGroup[foodNames[word]])
# food_id = foodNames[word]
# if food_id in foodGroup:
# food_group_for_food_id = foodGroup[food_id]
# food_id_group_pairs.append([word, food_group_for_food_id])
# if food_id in langua:
# temp_langua = langua[food_id]
# t = []
# for temp_words in temp_langua:
# t.append(temp_words)
# food_id_langua_pairs.append([word + " " + food_id, t])
# #food_id_langua_pairs =
#print("food -> ", food_id_group_pairs)
#print "word found", word, len(word), max_len, max_len_word
#print ("Temproray -> ", temp_i)
#print ("Final i -> ", i)
if found_at_least:
dic = minimum_no_meeting_rooms(index_of_food_names, len(i))
#print('dic')
#print(dic)
                for char_pos in sorted(dic):  # sort the positions so characters are emitted in line order (dict ordering is not guaranteed)
if dic[char_pos] == 1:
text += '<mark>' + i[char_pos] + '</mark>'
else:
text += i[char_pos]
text += calorie_text
tuples_list = give_largest_non_overlapping_sequences(spans_found_on_line) # filters out spans that conflict with other spans. larger spans are given priority
for tup in tuples_list:
set_elem = (current_line_number, tup) # add line number so we know where in the document we got it
predicted_food_labels_set.add(set_elem)
else:
pass
text += i[1:]
#print ("Final text ->", text)
tags = ''
            if parser_type == 'stanford_POS' and 0:  # the 'and 0' makes this branch unreachable (tagging left switched off)
# print('running stanford')
tags = pos_tag(word_tokenize(temp_i))
#Joining the tags
tags = join_tags(tags)
            elif parser_type == 'ark_tweet_parser' and 0:  # the 'and 0' makes this branch unreachable (tagging left switched off)
print('running ark')
#tags = CMUTweetTagger.runtagger_parse([temp_i])
tags = join_tags(ark_parsed_data[line_no])
#tags = ''
#tags1 = join_tags(tags)
#print("tags -> ", tags1)
#print("pairs ---> ", food_id_langua_pairs, len(food_id_langua_pairs))
#print ("pairs -> ", word_char_index)
food_tags = ''
# if len(food_id_group_pairs):
# for pairs in food_id_group_pairs:
# food_tags += "<mark>" + pairs[0] + "</mark>" + "----> " + pairs[1] + "<br>"
food_ledger_langua = ''
# if len(food_id_langua_pairs):
# for pairs in food_id_langua_pairs:
# food_name_langua = pairs[0]
# food_ledger_langua += "<mark>" + food_name_langua + "----></mark>"
# for ledger in pairs[1]:
# food_ledger_langua += ledger.lower() + ", "
# food_ledger_langua += "<br>" + "<br>"
            write2file += text + word_char_index_string_format + '<br>' + tags + '<br>' + food_tags + '<br>' + food_ledger_langua
            #Original
            #write2file += text + '<br>'
            #return  # early exit left over from debugging; it would skip the calorie total and accuracy report below
write2file += "<hr>" + "Total Calories -> " + str(total_calorie)
num_true_pos = None # give dummy values in case try fails
num_false_pos = None
num_false_neg = None
if solution_set_loaded:
print('loading solution set')
solution_set = solution_parser.get_solution_set_from_file(solution_file_path)
print('calculating')
if base_accuracy_on_how_many_unique_food_items_detected:
food_names_only_solution_set = solution_parser.convert_solution_set_to_set_of_food_names(fileName, solution_set)
food_names_only_predicted_set = solution_parser.convert_solution_set_to_set_of_food_names(fileName, predicted_food_labels_set)
precision, recall, false_pos_list, false_neg_list, true_pos_list = solution_parser.calculate_precision_and_recall(
food_names_only_solution_set, food_names_only_predicted_set)
else:
precision, recall, false_pos_list, false_neg_list, true_pos_list = solution_parser.calculate_precision_and_recall(solution_set, predicted_food_labels_set)
num_true_pos = len(true_pos_list)
num_false_pos = len(false_pos_list)
num_false_neg = len(false_neg_list)
print('file:' + fileName)
print('precision: ' + str(precision))
print('recall: ' + str(recall))
        print('true positives: ' + str(true_pos_list))
if not base_accuracy_on_how_many_unique_food_items_detected:
for line in solution_parser.get_corresponding_lines(fileName, true_pos_list):
print(line)
print('false positives: ' + str(false_pos_list))
if not base_accuracy_on_how_many_unique_food_items_detected:
for line in solution_parser.get_corresponding_lines(fileName, false_pos_list):
print(line)
print('false negatives: ' + str(false_neg_list))
if not base_accuracy_on_how_many_unique_food_items_detected:
for line in solution_parser.get_corresponding_lines(fileName, false_neg_list):
print(line)
print('# true pos: {}'.format(num_true_pos))
print('# false pos: {}'.format(num_false_pos))
print('# false neg: {}'.format(num_false_neg))
if not base_accuracy_on_how_many_unique_food_items_detected:
write2file += '<br><hr>'+"Precision: "+str(precision)+ \
"<br>Recall: "+str(recall) + "<br><hr>"
write2file += "False Positives<br>"+ str(false_pos_list)+ \
"<br>"
for line in solution_parser.get_corresponding_lines(fileName, false_pos_list):
write2file += str(line)+ " ---> <mark>" + str(line[1][line[0][1][0]:line[0][1][1]]) +"</mark><br>"
write2file += "<hr>False negatives:<br>"+str(false_neg_list) + "<br>"
for line in solution_parser.get_corresponding_lines(fileName, false_neg_list):
write2file += str(line)+ " ---> <mark>" + str(line[1][line[0][1][0]:line[0][1][1]]) +"</mark><br>"
else:
print('no solution set found')
#return write2file, unique_food_names
#namedtuple()
Accuracy = namedtuple('Accuracy',
'num_true_pos num_false_pos num_false_neg') # makes returning multiple values more clear
results = Accuracy(num_true_pos=num_true_pos, num_false_pos=num_false_pos, num_false_neg=num_false_neg)
return write2file, results
def generate_pair(sentence):
#print sentence
sentence = sentence.strip().split()
#print sentence
return_sentence = []
for range_ in xrange(1, len(sentence)):
for i in xrange(0, len(sentence)):
if i + range_ <= len(sentence):
#print sentence[i: range_]
return_sentence.append(' '.join(sentence[i:i+range_]))
return return_sentence
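# Example (minimal sketch; an illustrative helper that nothing calls): generate_pair
# enumerates every contiguous run of 1 to len(words) - 1 words from a line, so a
# three-word line yields the single words plus the two adjacent bigrams.
def _example_generate_pair():
    pairs = generate_pair("ate two eggs")
    # expected: ['ate', 'two', 'eggs', 'ate two', 'two eggs']
    return pairs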
def calculate_edit_distance(sentence, sentence_list_format, foodName, k = 0.3):
#return nltk.edit_distance(sentence, foodName)
return_list = []
for word_pairs in sentence_list_format:
if abs(len(word_pairs) - len(foodName)) <= 3:
distance = nltk.edit_distance(word_pairs, foodName)
if float(distance)/float(len(word_pairs)) < k:
start = sentence.find(word_pairs)
return_list.append([foodName, word_pairs,start, start + len(word_pairs)])
#print word_pairs, "->", foodName, distance
#print "Printing", return_list
return return_list
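# Example (minimal sketch; an illustrative helper that nothing calls): the fuzzy match
# accepts a candidate span when edit_distance / len(candidate) < k (default 0.3), so a
# one-character misspelling of a six-letter word passes. End indices are exclusive.
def _example_calculate_edit_distance():
    sentence = "i ate chiken today"
    matches = calculate_edit_distance(sentence, generate_pair(sentence), "chicken")
    # expected: [['chicken', 'chiken', 6, 12]] -> food name, matched text, start, end
    return matches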
def provide_words_with_char_nos(sentence, line_no):
temp_char = ''
start_count = 0
return_array = []
for index, char in enumerate(sentence):
if char != ' ' and char != '\t':
temp_char += char
else:
return_array.append([temp_char, start_count, index])
start_count = index + 1
            temp_char = ''  # reset for the next word (was ' ', which prefixed later words with a stray space)
#Converting to displayable format (String format)
return_string = '<br>(line->' + str(line_no)+ ") "
for word in return_array:
return_string += word[0].lower() + " ("+str(word[1])+","+str(word[2])+") "
return_string += "<br>"
return return_array, return_string
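# Example (minimal sketch; an illustrative helper that nothing calls, assuming the
# empty-string reset above): each word followed by a space or tab is recorded with its
# start/end character offsets; a trailing word followed only by a newline is not.
def _example_provide_words_with_char_nos():
    words, as_html = provide_words_with_char_nos('ate rice \n', 3)
    # expected: words   == [['ate', 0, 3], ['rice', 4, 8]]
    #           as_html == '<br>(line->3) ate (0,3) rice (4,8) <br>'
    return words, as_html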
def join_tags(sentence):
text = ' '
for i in sentence:
text += '(' + i[0] + "->" + i[1] + ") "
return text
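# Example (minimal sketch; an illustrative helper that nothing calls): join_tags flattens
# (token, tag) pairs into a single display string.
def _example_join_tags():
    text = join_tags([('egg', 'NN'), ('salad', 'NN')])
    # expected: ' (egg->NN) (salad->NN) '
    return text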
def match_word(food_key_word, sentence, value = 0):
food_key_word = food_key_word.split()
sentence = sentence.split()
for word in food_key_word:
if word not in sentence:
return 0
return 1
def minimum_no_meeting_rooms(list_of_timings, length_of_sent):
dic = defaultdict(int)
for i in xrange(1, length_of_sent):
dic[i] = 0
for meeting_schedules in list_of_timings:
for i in xrange(meeting_schedules[0], meeting_schedules[1]):
dic[i] = 1
return dic
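# Example (minimal sketch; an illustrative helper that nothing calls): despite the
# meeting-room name, this simply marks which character positions (1 .. length-1) fall
# inside any span, so the caller can wrap exactly those characters in <mark> tags.
def _example_minimum_no_meeting_rooms():
    dic = minimum_no_meeting_rooms([[2, 5], [7, 9]], 10)
    # expected: positions 2, 3, 4, 7, 8 map to 1; positions 1, 5, 6, 9 map to 0
    return dic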
def check_if_noun(tag):
if tag == 'NN' or tag == 'NNS' or tag == 'NNP' or tag == 'NNPS':
return True
return False
def give_largest_non_overlapping_sequences(list_of_start_end_tuples):
Sequence = namedtuple('Sequence', ['start', 'end', 'size'])
list_of_named_sequences = [Sequence(start = x[0], end = x[1], size = x[1] - x[0] - 1) for x in list_of_start_end_tuples] # size is -1 because the end number represents the index of the character AFTER the last character in the sequence.
sorted_by_size_sequences = sorted(list_of_named_sequences, key=lambda seq: seq.size) # smallest size is first, largest size is last
non_overlapping_sequences = []
while len(sorted_by_size_sequences) > 0:
sequence = sorted_by_size_sequences.pop() # last element in list, therefore sequence with largest size still on the list
if not conflicts_with_sequences(non_overlapping_sequences, sequence):
non_overlapping_sequences.append(sequence)
extracted_tuples = [(seq.start, seq.end) for seq in non_overlapping_sequences]
return extracted_tuples
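# Example (minimal sketch; an illustrative helper that nothing calls): spans are kept
# greedily from largest to smallest, so a short span nested inside an already-kept
# longer span is discarded.
def _example_give_largest_non_overlapping_sequences():
    kept = give_largest_non_overlapping_sequences([(0, 10), (3, 6), (12, 15)])
    # expected: [(0, 10), (12, 15)] -- (3, 6) overlaps the larger (0, 10) and is dropped
    return kept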
def conflicts_with_sequences(list_of_sequences, test_sequence):
"""Tests if test_sequence conflicts with any sequence in the list_of_sequences"""
for already_added_sequence in list_of_sequences:
if sequences_overlap(already_added_sequence, test_sequence):
return True
return False
def sequences_overlap(seq1, seq2):
"""Returns if two sequences overlap"""
if seq1.end <= seq2.start: # seq1 must end before seq2 begins. they do not overlap
return False
elif seq2.end <= seq1.start:
return False
else:
return True
def ark_parser(fileName):
final_list_of_sentences = []
    with open(fileName, "r") as f:
        file_text = f.read()
    for sentence in file_text.split('\n'):
if len(sentence) > 1:
if sentence[0] == '*':
final_list_of_sentences.append(' '.join(sentence.split()))
print final_list_of_sentences
var = CMUTweetTagger.runtagger_parse(final_list_of_sentences)
return var
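# Note (an assumption about the external tagger, not verified here): with the common
# CMUTweetTagger wrapper, runtagger_parse(...) typically returns one list per input
# sentence, each a list of (token, tag, confidence) tuples, e.g. roughly
#   [[('*', '~', 0.9), ('ate', 'V', 0.9), ('rice', 'N', 0.9)]]
# which is why join_tags only reads i[0] and i[1] from each element.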
def evaluate_all_files_in_directory(directory_path, only_files_with_solutions = False):
sum_true_pos = 0
sum_false_pos = 0
sum_false_neg = 0
for filename in os.listdir(directory_path):
file_path = directory_path + '/' + filename
print(file_path)
html_format, results = read_file(file_path, only_files_with_solutions=only_files_with_solutions)
if results is not None:
if results.num_true_pos is not None: # if it is none, a solution set was not loaded
sum_true_pos += results.num_true_pos
if results.num_false_pos is not None:
sum_false_pos += results.num_false_pos
if results.num_false_neg is not None:
sum_false_neg += results.num_false_neg
precision = sum_true_pos / float(sum_true_pos + sum_false_pos)
recall = sum_true_pos / float(sum_true_pos + sum_false_neg)
return precision, recall, sum_true_pos, sum_false_pos, sum_false_neg
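# Worked example (minimal sketch of the arithmetic above): with sum_true_pos = 8,
# sum_false_pos = 2 and sum_false_neg = 4 across a directory,
#   precision = 8 / float(8 + 2) = 0.8
#   recall    = 8 / float(8 + 4) = 0.666...
# Note the divisions assume at least one predicted span and one gold span overall;
# an empty directory would raise ZeroDivisionError.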
if __name__ == '__main__':
try:
#fileName = 'HSLLD/HV3/MT/brtmt3.cha' # coffee
#print adsasda
start = time.time()
fileName = 'HSLLD/HV1/MT/admmt1.cha'
html_format, results = read_file(fileName, 'ark_tweet_parser')
#print "HTNL Format", html_format
front_end.wrapStringInHTMLWindows(body = html_format)
print "Time Taken -> ", time.time() - start
except:
print "none"
print sys.exc_info()
print calculate_edit_distance("Hello I go to UMAss Amherst", generate_pair("Hello I go to UMAss Amherst"), "Amherst")
# fileCounts = []
# all_files = load("C:\\Users\\priti\\OneDrive\\Documents\\CCPP\\FoodMonitoring-NLP\\data\\food_files.pickle")
# c = 0
# for file_name in all_files:
# print "File ", c
# c += 1
# try:
# html_format, count = read_file(file_name)
# except:
# continue
# else:
# fileCounts.append(len(cont))
# sns.distplot(fileCounts,
# #hist = False,
# kde = False,
# #rug=False,
# norm_hist = False,
# rug_kws={"color": "g"},
# kde_kws={"color": "k", "lw": 3, "label": "KDE"},
# hist_kws={"histtype": "step", "linewidth": 3,"alpha": 1, "color": "g"})
# plt.show()