-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdict.py
101 lines (86 loc) · 3.81 KB
/
dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#written by Scott Bergstresser, Christian Okada, Cameron Mayes, and Brandon Bui
#simple machine learning exercise, takes in categorical data from a training set
# and constructs dictionaries that map specific data to specific categories.
# Program then reads in a second excel file with similar data fields
# and chooses the best category based on the given data, weighting the
# categories based on their importance.
#Accenture internship Summer 2017
from __future__ import unicode_literals
from openpyxl import load_workbook
# Counter dictionary: running score per category for the row being classified.
cd = {}

# One lookup dictionary per feature column. After training, each maps a
# category to the list of values seen for that category in its column.
(ids, office, org, carl, fun, title, tt, tloc, cor, vname,
 confname, ctype, certname, curl, ccn, act) = ({} for _ in range(16))

# Spreadsheet column letter for each dictionary in dlist, in the same order.
charstr = ['B', 'D', 'F', 'I', 'J', 'L', 'M', 'P', 'Q', 'S', 'T', 'V', 'W', 'X', 'Z', 'AB']
dlist = [ids, office, org, carl, fun, title, tt, tloc, cor, vname,
         confname, ctype, certname, curl, ccn, act]

# Reserved key under which every dictionary stores its own column letter.
KEY = '0xFF001234A117'
# Column that holds the category label.
TARGET = 'AA'

# zip() would silently truncate, so make a misaligned edit to either list loud.
if len(dlist) != len(charstr):
    raise ValueError('dlist and charstr must have the same length')

# Add universal key and column letter to each dictionary.
for dic, column in zip(dlist, charstr):
    dic[KEY] = column
# Open the training workbook and select its data sheet.
wb = load_workbook('contest-train.xlsx')
ws = wb['contest-train']

# Collect every distinct category label from the TARGET column, in order of
# first appearance, and give each one a zeroed score counter in cd.
# Row 1 is skipped (presumably a header row -- confirm against the workbook).
# The range ends at max_row + 1 so the last row is included; stopping at
# max_row would drop it and leave its category unknown to the later passes.
catlist = []
for row in range(2, ws.max_row + 1):
    category = ws[TARGET + str(row)].value
    # cd is keyed by exactly the labels already in catlist, so one dictionary
    # membership test covers both structures.
    if category not in cd:
        catlist.append(category)
        cd[category] = 0
# Prepare the lookup dictionaries: every dictionary in dlist gets one key per
# category in catlist, each starting with its own empty list. The lists are
# filled with values learned from the training set afterwards.
for lookup in dlist:
    lookup.update((category, []) for category in catlist)
#############################Populate the dictionaries####################################
# For every training row, record each feature value under the row's category
# in that feature's dictionary.
#
# Placeholder strings are compared with equality: an identity test ("is not")
# against a string literal does not reliably match equal strings read from a
# workbook. Duplicates are checked against the list being appended to.
# NOTE(review): empty cells (value None) are still recorded as learned values;
# confirm whether they should be treated as missing too.
missing_markers = ('NA', 'N/A')
for row in range(2, ws.max_row + 1):
    category = ws[TARGET + str(row)].value
    for dic in dlist:
        value = ws[dic[KEY] + str(row)].value
        if value in missing_markers:
            continue
        learned = dic[category]
        if value not in learned:
            learned.append(value)
#################Dictionaries have been created, now to parse test set###################
wb2 = load_workbook('contest-test.xlsx')
ws2 = wb2['contest-test']

# Data have different weights based on importance to categories from the
# training set. Each entry pairs a lookup dictionary with the score added when
# the test row's value in that column was seen for a category during training.
# The column letter is taken from the dictionary's KEY entry, so the test sheet
# is read from the same columns as the training sheet. ids carries no weight.
column_weights = (
    (office, 1),
    (org, 2),
    (carl, 1),
    (fun, 1),
    (title, 5),
    (tt, 3),
    (tloc, 4),
    (cor, 4),
    (vname, 2),
    (confname, 2),
    (ctype, 2),
    (certname, 3),
    (curl, 3),
    (ccn, 2),
    (act, 6),
)

for row in range(2, ws2.max_row + 1):
    # Read each weighted cell once per row instead of once per category.
    row_features = [
        (ws2[lookup[KEY] + str(row)].value, lookup, weight)
        for lookup, weight in column_weights
    ]
    for cat in catlist:
        for value, lookup, weight in row_features:
            if value in lookup[cat]:
                cd[cat] += weight
    # Highest-scoring category wins; on a tie max() keeps the first such key
    # in cd's iteration order.
    maximum = max(cd, key=cd.get)
    ws2[TARGET + str(row)].value = maximum
    # Reset the counters so the next row starts from zero.
    for key in cd:
        cd[key] = 0

# NOTE(review): indentation was lost in the copy this was rebuilt from; the
# save is placed after the loop so the workbook is written once -- confirm.
wb2.save('contest-test.xlsx')