-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
59 lines (46 loc) · 1.65 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
'''
Jason Liu, Yoonseo Song, Daniel Miau
CSE 163
Final Project
This program uses ML models to analyze the Cleveland heart disease data
set from the UCI Machine Learning Repository
'''
import pandas as pd
import numpy as np
import models
def main():
    '''
    Runs the whole analysis of the Cleveland heart disease data set.

    Downloads the processed CSV, discards every row that contains a '?'
    placeholder, then hands the data to the models module: once with no
    feature dropped (None) and once per column other than 'num'. The
    collected results are converted to a pandas DataFrame and passed to
    models.analyze_results.
    '''
    # Reads in the file
    data = pd.read_csv('https://raw.githubusercontent.com/yoonseosong/cse163-'
                       'final-project/master/processed.cleveland.csv')

    # Marks '?' data points as missing and drops the rows that contain them.
    # np.nan is the canonical spelling; the np.NaN alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    data = data.replace('?', np.nan).dropna()

    # Every column except 'num' is treated as a feature. Only the column
    # names are needed, so the dataframe itself is not copied.
    features_list = data.columns[data.columns != 'num']

    # Sets how many times to run each model
    iterations = 100

    # Stores results; handed to every create_model call below
    # (presumably create_model adds its scores to it -- confirm in models)
    all_results = {}

    # Create model using all features
    models.create_model(None, data, all_results, iterations)

    # Create models with 1 dropped feature each
    for feature in features_list:
        models.create_model(feature, data, all_results, iterations)

    # Converts results to a pandas dataframe
    all_results = pd.DataFrame.from_dict(all_results)

    # Creates a boxplot of the results and finds the p-value through ANOVA
    models.analyze_results(all_results)
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported by another module.
if __name__ == '__main__':
    main()