PythonGroupProject/hw_combined.py at master · tatlchri/PythonGroupProject · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
# -*- coding: utf-8 -*-
"""
BS1819-1617
Data Structures and Algorithms
Group Assignment
Team 3:
  - Ahmad Bilal Aslam
  - Chris Ying
  - Christina Lefkothea Tatli
  - Joh B
  - Kelvin Goh
  - Selly Salkha

"""


import pandas as pd
import numpy as np
import csv

##### START OF PART 1 - calculate daily returns #####

##~~ KEY VARIABLE: "returns" stores daily returns of stocks

##### To Read company names into a dictionary
def readNamesIntoDict():
    d = dict()
    input_file = csv.DictReader(open("SP_500_firms.csv"))
    for row in input_file:
        #print(row)
        d[row['Symbol']] = [row['Name'],row['Sector']]
    return d

##### To calculate daily returns from stock prices
def returns_Stocks(priceData):
    # input is a pd.dataframe of stock prices
    # output is a pd.dataframe of daily returns
    returns = priceData.pct_change()
    # remove index 0 of returns - it's a nan value because the first period data has no daily return
    #returns = returns[1:len(returns)]
    # We had used the built-in function, the next function below is the manual calculation
    return returns

def returns_Stocks_manual_calc(priceData):
    # Manual calculation of the returns
    returns = priceData / priceData.shift(1) - 1
    # remove index 0 of returns - it's a nan value because the first period data has no daily return
    #returns = returns[1:len(returns)]
    return returns

# make sure that the manual calculation is same as built-in function
def test_returns_Stock_built_in_equals_manual(priceData):
    returns_built_in = returns_Stocks(priceData)
    returns_manual = returns_Stocks_manual_calc(priceData)
    difference = returns_built_in - returns_manual
    print("The total difference between built-in and manual way of calculating daily returns, across stocks and time period, is:", difference.sum().sum())

##### Several functions to determine which stock has max, min return; which are overall best, worst stocks; and max and min std of daily returns
def max_return(returns, namesDict = readNamesIntoDict()):
    # input: pd.dataframe of daily returns, and a dictionary of company/sector (call the readNamesIntoDict() function if the dictionary is not passed as an argument)
    # output: returns the maximum daily return along with company name and Sector
    maxDaily_byComp  = returns.max()            #Maximum returns for each company

    maxDaily = maxDaily_byComp.max()            #Overall highest daily return
    maxDaily_CompSym = maxDaily_byComp.idxmax() #Getting index of the maximum return value

    maxDaily_CompName = namesDict[maxDaily_CompSym][0]  #Company Name using its symbol
    maxDaily_Sector = namesDict[maxDaily_CompSym][1]    #Company Sector

    return maxDaily_Sector, maxDaily_CompName, maxDaily


def min_return (returns, namesDict = readNamesIntoDict()):
    # input: pd.dataframe of daily returns, and a dictionary of company/sector (call the readNamesIntoDict() function if the dictionary is not passed as an argument)
    # output: returns the minimum daily return along with company name and Sector

    minDaily_byComp  = returns.min()            #Minimum returns for each company

    minDaily = minDaily_byComp.min()            #Overall lowest daily return
    minDaily_CompSym = minDaily_byComp.idxmin() #Getting index of the minimum return value

    minDaily_CompName = namesDict[minDaily_CompSym][0]  #Company Name using its symbol
    minDaily_Sector = namesDict[minDaily_CompSym][1]    #Company Sector

    return minDaily_Sector, minDaily_CompName, minDaily


def overall_best (priceData, namesDict = readNamesIntoDict()):
    # input: pd.dataframe of price data, and a dictionary of company/sector (call the readNamesIntoDict() function if the dictionary is not passed as an argument)
    # output: returns the maximum yearly return along with company name and Sector
    overallReturn_byComp  = priceData.iloc[-1] / priceData.iloc[0] - 1   #yearly returns for each company

    overallBest = overallReturn_byComp.max()            #best yearly return
    overallBest_CompSym = overallReturn_byComp.idxmax() #Getting index of the best yearly return

     #Loading Company Symbols mapping into namesDict
    overallBest_CompName = namesDict[overallBest_CompSym][0]  #Company Name using its symbol
    overallBest_Sector = namesDict[overallBest_CompSym][1]    #Company Sector

    return overallBest_Sector, overallBest_CompName, overallBest


def overall_worst (priceData, namesDict = readNamesIntoDict()):
    # input: pd.dataframe of price data, and a dictionary of company/sector (call the readNamesIntoDict() function if the dictionary is not passed as an argument)
    # output: returns the minimum yearly return along with company name and Sector
    overallReturn_byComp  = priceData.iloc[-1] / priceData.iloc[0] - 1   #yearly returns for each company

    overallWorst = overallReturn_byComp.min()            #worst yearly return
    overallWorst_CompSym = overallReturn_byComp.idxmin() #Getting index of the worst yearly return

    overallWorst_CompName = namesDict[overallWorst_CompSym][0]  #Company Name using its symbol
    overallWorst_Sector = namesDict[overallWorst_CompSym][1]    #Company Sector

    return overallWorst_Sector, overallWorst_CompName, overallWorst


def max_std (returns, namesDict = readNamesIntoDict()):
    # input: pd.dataframe of price data, and a dictionary of company/sector (call the readNamesIntoDict() function if the dictionary is not passed as an argument)
    # output: returns the maximum std. dev along with company name and Sector
    std_byComp  = returns.std()            #std. dev of returns for each company

    maxStd = std_byComp.max()            #maximum std. dev
    maxStd_CompSym = std_byComp.idxmax() #Getting index of the maximum std. dev

    namesDict = readNamesIntoDict() #Loading Company Symbols mapping into namesDict
    maxStd_CompName = namesDict[maxStd_CompSym][0]  #Company Name using its symbol
    maxStd_Sector = namesDict[maxStd_CompSym][1]    #Company Sector

    return maxStd_Sector, maxStd_CompName, maxStd


def min_std (returns, namesDict = readNamesIntoDict()):
    # input: pd.dataframe of price data, and a dictionary of company/sector (call the readNamesIntoDict() function if the dictionary is not passed as an argument)
    # output: returns the minimum std. dev along with company name and Sector
    std_byComp  = returns.std()            #std. dev of returns for each company

    minStd = std_byComp.min()            #minimum std. dev
    minStd_CompSym = std_byComp.idxmin() #Getting index of the minimum std. dev

    minStd_CompName = namesDict[minStd_CompSym][0]  #Company Name using its symbol
    minStd_Sector = namesDict[minStd_CompSym][1]    #Company Sector

    return minStd_Sector, minStd_CompName, minStd


##### Read company names into a dictionary
namesDict = readNamesIntoDict()

##### Read Prices Data into pandas
filename = 'SP_500_close_2015.csv'
priceData = pd.read_csv(filename,index_col = 0)

##### Call the function to calculate stocks' daily returns from the price data
returns = returns_Stocks (priceData)

# test that manual and built-in calculations are the same
# uncomment the next line of code to run test
#test_returns_Stock_built_in_equals_manual(priceData)

# UNCOMMENT THIS WHEN DONE WITH TESTING
###### Print results
#a, b, c = max_return (returns, namesDict)
#print(b,", which belongs to the",a,"sector, had the maximum daily return of the year i.e.", c*100, "%.")
##http://investors.fcx.com/investor-center/news-releases/news-release-details/2015/Freeport-McMoRan-Announces-Further-Spending-Cuts-in-Response-to-Market-Conditions/default.aspx
#
#a, b, c = min_return (returns, namesDict)
#print(b,", which belongs to the",a,"sector, had the minimum daily return of the year i.e.", c*100, "%.")
##the company warned that third-quarter results wouldn't be as strong as expected.
#
#a, b, c = overall_best (priceData, namesDict)
#print(b,", which belongs to the",a,"sector, had the best overall performance of the year with a return of", c*100, "%.")
#
#a, b, c = overall_worst (priceData, namesDict)
#print(b,", which belongs to the",a,"sector, had the worst overall performance of the year with a return of", c*100, "%.")
#
#a, b, c = max_std (returns, namesDict)
#print(b,", which belongs to the",a,"sector, exhibited the most volatility in its returns during the year, with a standard deviation of", c*100, "%.")
#
#a, b, c = min_std (returns, namesDict)
#print(b,", which belongs to the",a,"sector, exhibited the least volatility in its returns during the year, with a standard deviation of", c*100, "%.")

##### END OF PART 1 #####


##### START OF PART 2 - find correlations between stocks' daily returns #####

##~~ KEY VARIABLE: "correlationTable" stores correlation results

##### Generate pairwise correlation table using panda
def corTable(returns):
    # this uses the built-in function. The manual calculation is as below.
    return returns.corr()

# store correlation results (to use as input to other functions)
correlationTable = corTable(returns)

##### read company names into pd dataframe
compData = pd.read_csv('SP_500_firms.csv', index_col = 0)

##### Print correlation between company A and B
def printCor(correlationTable, companyA, companyB):
    corr = correlationTable.loc[companyA,companyB]
    nameA = compData.loc[companyA,'Name']
    nameB = compData.loc[companyB,'Name']
    return nameA, nameB, corr

##### Compare panda method and python manual method of calculating correlations
def testCor_pairwise(correlationTable, companyA, companyB):
    print('Panda method:')
    print(correlationTable.loc[companyA,companyB])
    print('Standard data structure method')
    returns_for_manual = returns[1:len(returns)]
    a,b = np.array(returns_for_manual.get(companyA).tolist(),dtype = float),np.array(returns_for_manual.get(companyB).tolist(),dtype = float)
    print(np.sum((a - np.mean(a))/np.std(a)*(b - np.mean(b))/np.std(b))/(len(a)))

##### The above does a pairwise correlation manually
#     This chunk of code computes all pairwise correlations manually
def testCor_allprices(returns):
    # copy returns dataframe to initialise corr_matrix. Will edit the cell contents in code below.
    corr_matrix = returns.copy()
    col_names = list(returns.columns.values)
    # remove index 0 of returns - it's a nan value because the first period data has no daily return
    returns = returns[1:len(returns)]
    for i in range(len(col_names)):
        for j in range(i, len(col_names)):
            companyA = col_names[i]
            companyB = col_names[j]
            a,b = np.array(returns.get(companyA).tolist(),dtype = float),np.array(returns.get(companyB).tolist(),dtype = float)
            corr_matrix.ix[companyA, companyB] = (np.sum((a - np.mean(a))/np.std(a)*(b - np.mean(b))/np.std(b))/(len(a)))
            corr_matrix.ix[companyB, companyA] = (np.sum((a - np.mean(a))/np.std(a)*(b - np.mean(b))/np.std(b))/(len(a)))
    return corr_matrix


##### make sure that the manual calculation is same as built-in function
def test_correlation_Stock_built_in_equals_manual(returns):
    corr_built_in = corTable(returns)
    corr_manual = testCor_allprices(returns)
    difference = corr_built_in - corr_manual
    print("The total difference between built-in and manual way of calculating correlations across stocks is:", difference.sum().sum())

# To test if built-in and manual way of finding correlations are the same
# Uncomment the lines below to test it. (have tested, values returned are the same. There is a very small difference 3.1778630833582955e-12, for the manual and built-in way of computing all correlations, likely due to rounding errors)
#testCor_pairwise(correlationTable, 'GOOGL', 'FB')
#test_correlation_Stock_built_in_equals_manual(returns)


##### List top and bottom correlated companies of a company
def top_bottom_Cor(correlationTable,company):
    print('Finding the top and bottom correlated companies for ', company, ':')
    print('===================================================')
    min = correlationTable[company].sort_values()[0:5]
    max = correlationTable[company].sort_values(ascending=False)[1:6]
    list1 = []
    list2 = []
    for i in min.index:
        list1.append(compData.loc[i,'Name'])
    for i in max.index:
        list2.append(compData.loc[i,'Name'])
    min.index = list1
    max.index = list2
    print('Bottom correlated :')
    print('-----------------')
    print(min)
    print('') # break line
    print('Top correlated:')
    print('---------------')
    print(max)


# UNCOMMENT THIS WHEN DONE WITH TESTING
#top_bottom_Cor(correlationTable,'AAPL')
#print("") # break line
#top_bottom_Cor(correlationTable,'AMZN')
#print("") # break line
#top_bottom_Cor(correlationTable,'MSFT')
#print("") # break line
#top_bottom_Cor(correlationTable,'FB')
#print("") # break line
#top_bottom_Cor(correlationTable,'GOOGL')

##### END OF PART 2 #####


##### START OF PART 3 - clustering algorithm #####

##~~ KEY VARIABLE: " " stores {}

##### END OF PART 3 #####


##### START OF PART 4 part 1 - more efficient clustering algorithm #####

##~~ KEY VARIABLE: " " stores {}

##### Store all stock names into a list
all_stock_names = list(correlationTable.columns.values)
n = len(all_stock_names)
#print("length of correlationTable: ", n)
#print("length of namesDict: ", len(namesDict))
# COMMENT: The length of namesDict and all_stock_names
#          DO NOT MATCH! all_stock_names is correct,
#          there are 496 stock names in the price csv file
#          len(namesDict) shows 504, which matches
#          the firms csv file
#          So the results differ because of the csv files.
#          To check that the usage of namesDict is correct.

##### Add all pairwise correlation (upper-triangle matrix), into a list
edge_list = []
for i in range(len(all_stock_names)-1):
    for j in range(i+1, len(all_stock_names)):
        # add tuple (weight, source, destination) to list
        edge_list.append( (correlationTable.loc[all_stock_names[i], all_stock_names[j]], i, j) )


# sort them in descending order
edge_list = sorted(edge_list, reverse = True)


##### Create a node class with:
#       node_name: prints the stock's name
#       parent: nodes with the same parent will belong to the same cluster
#       rank: depth of the node. Use this for efficient merging - tries to keep tree as balanced as possible - shorter search time


#       setlist: stores the names of all the nodes that belong to the same cluster. Updates this when sets are merged
#                (the root of the tree at any time will always store the complete set of nodes belonging to the same cluster)

class Node():
    def __init__(self, node_name):
        self.name = node_name
        # maintain dual pointers
        # children contains list of node's children
        #   This is to facilitate printing of entire cluster later. Find the root node, and print all its children as belonging to same cluster
        # parent is node's parent
        #   This is to facilitate finding the root of the tree and merging of trees
        self.children = [];
        self.parent = None

    def setParent(self, node):
        self.parent = node

    def getParent(self):
        return self.parent

    def setChildren(self, node):
        self.children.append(node)

    def getChildren(self):
        return self.children

    def setRank(self, rank):
        self.rank = rank

    def getRank(self):
        return self.rank

    def getSet(self):
        return self.setlist

    # this function is to facilitate checking if node is root node (node's parent is itself)
    def equals(self, node):
        return (self.name == node.name)

    def __str__(self):
        return self.name


def MakeSet(x):
    x.setParent(x)
    x.setRank(0)

def Union(x, y):
    # Union in a way such that the depth of the tree is minimised
    # always add the shorter tree under the deeper tree so that the height doesn't increase
    xRoot = Find(x)
    yRoot = Find(y)
    if xRoot == yRoot:
        return

    # x and y are not already in same set. Merge them.
    # always merge the smaller subtree into the bigger one
    # to minimise the tree height
    if xRoot.getRank() < yRoot.getRank():
        xRoot.setParent(yRoot)
        yRoot.setChildren(xRoot)
    elif xRoot.getRank() > yRoot.getRank():
        yRoot.setParent(xRoot)
        xRoot.setChildren(yRoot)
    else:
        yRoot.setParent(xRoot)
        xRoot.setChildren(yRoot)
        xRoot.setRank(xRoot.getRank() + 1)

def Find(x):
    # Path compression - whenever you're finding the root of a node, set the parent to the root directly
    if x.getParent() != x:
        x.setParent(Find(x.getParent()))
    return x.getParent()

def link_clusters(edge_list, node_names, k):
    # edge_list is list of sorted edges in tuple form (weight, source, destination)
    # node_names is the list of nodes' names
    # k is the number of iterations

    # get number of nodes
    n = len(node_names)

    # initialise nodePointers dictionary of linked nodes
    nodeList = []

    # add all the node names as Nodes into the list and init MakeSet (each node is a set of itself at the beginning)
    for i in range(n):
        nodeList.append(Node(node_names[i]))
        MakeSet(nodeList[i])

    # loop this k times
    for i in range(k):
        # extract the k highest weights / correlations from the list
        # Negative correlated nodes should be nearer to the end of the list (i.e. stocks that are dissimiliar to each other, in opposite direction)
        # Negative weights should not affect the algorithm correctness
        weight, source, dest = edge_list[i]
        #print(weight, source, dest) # for debugging purposes
        Union(nodeList[source], nodeList[dest])

    return nodeList

# test cluster algorithm
k = 20
if (k > len(edge_list)):
    k = len(edge_list)

nodeList = link_clusters(edge_list, all_stock_names, k)

# prints result

# O(n) time to go through all nodes and
# store root nodes into a list
root_nodes_list = []
for node in nodeList:
    # use the following line if you want to know a particular node's cluster
    #print("Node", nodeList[i], "is in same set as Node ", Find(nodeList[i]), ". Cluster of nodes are:", Find(nodeList[i]).getSet())
    if node.getParent().equals(node):
        root_nodes_list.append(node)


print("After", k, "iterations, we have these clusters:")
cluster_num = 0
# for each root node
#  use a Breadth First Search to go through all its children. All its children belong to the same cluster
for root_node in root_nodes_list:
    cluster_num += 1
    cluster = []
    queue = [root_node]
    while (len(queue) > 0):
        curr_node = queue.pop(0)
        cluster.append(str(curr_node))
        queue.extend(curr_node.getChildren())

    print("Cluster #", cluster_num, ": =", cluster)

##### END OF PART 4 part 1 #####