forked from llSourcell/prepare_dataset_challenge
-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathDataSetManager.py
More file actions
137 lines (105 loc) · 4.4 KB
/
DataSetManager.py
File metadata and controls
137 lines (105 loc) · 4.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Author: WeldFire
# Created: 12/20/2016
from pprint import pprint
import tensorflow
import random
import numpy
def _generateValidationandTrainingDataIDSets(data, validationRatio=0.1):
    """
    Creates two disjoint lists of IDs to be used in training and validation
    IN:
        data - dataset that you are wanting to train with (only its length is used)
        validationRatio - the fraction (0.0 to 1.0) of entries that you would like
                          removed from your training set and used for validation
    OUT:
        trainingDataIDs - A sorted list of indices in the original dataset to be used for training
        validationDataIDs - A sorted list of indices in the original dataset to be used for validation
    RAISES:
        ValueError - if validationRatio is not between 0.0 and 1.0 (inclusive)
    """
    #Reject impossible ratios up front with a clear message
    #(random.sample would otherwise fail with a less helpful one)
    if not 0 <= validationRatio <= 1:
        raise ValueError(
            "validationRatio must be between 0 and 1, got %r" % (validationRatio,))

    dataLength = len(data)
    #int() truncates, so small datasets may end up with zero validation entries
    entriesUsedInValidation = int(dataLength * validationRatio)

    #Get a random sample of data IDs to be used in validation
    #Sorted so that the returned list really is ordered, as documented
    validationDataIDs = sorted(
        random.sample(range(dataLength), entriesUsedInValidation))

    #Remove the validation IDs from the overall pool of IDs
    #Walking the range in order keeps the training IDs sorted as well
    validationIDLookup = set(validationDataIDs)
    trainingDataIDs = [
        dataID for dataID in range(dataLength)
        if dataID not in validationIDLookup
    ]

    return trainingDataIDs, validationDataIDs
def _normalizeData(data):
    """
    Normalizes the dataset provided (subtract the average, divide by the standard deviation)
    IN:
        data - the data that you want normalized (converted to a float32 numpy array)
    OUT:
        normalizedData - the normalized data from the input provided
        standardDeviation - the standard deviation calculated along axis 0,
                            to be reused optionally later
        average - the average calculated along axis 0, to be reused optionally later
    """
    dataArray = numpy.asarray(data, dtype=numpy.float32)
    standardDeviation = dataArray.std(axis=0)
    average = dataArray.mean(axis=0)

    #A constant column has a standard deviation of 0; dividing by it would
    #fill that column with NaN/inf. Divide by 1 instead so it normalizes to 0.
    #The real standardDeviation is still the one returned to the caller.
    safeDeviation = numpy.where(
        standardDeviation == 0, numpy.float32(1), standardDeviation)

    normalizedData = (dataArray - average) / safeDeviation
    return normalizedData, standardDeviation, average
def _precomputedDataNormalize(data, standardDeviation, average):
    """
    Normalizes the dataset provided using precomputed values
    IN:
        data - the data that you want normalized (converted to a float32 numpy array)
        standardDeviation - the precalculated standardDeviation from a previous normalization
        average - the precalculated average from a previous normalization
    OUT:
        normalizedData - the normalized data from the input provided
    """
    dataArray = numpy.asarray(data, dtype=numpy.float32)
    deviationArray = numpy.asarray(standardDeviation, dtype=numpy.float32)

    #A precomputed deviation of 0 (a constant column in the original data)
    #would turn the division into NaN/inf, so divide by 1 for those entries
    safeDeviation = numpy.where(
        deviationArray == 0, numpy.float32(1), deviationArray)

    normalizedData = (dataArray - average) / safeDeviation
    return normalizedData
def _oneHotData(data, dataSize):
    """
    Creates one hot representations for the array provided
    IN:
        data - a one dimensional sequence of integer labels that you want
               a one hot representation of
        dataSize - the width of each one hot row (the number of possible labels)
    OUT:
        oneHotData - a float32 matrix with one row per label; each row holds a
                     single 1.0 at the label's index. A label outside of
                     0..dataSize-1 produces a row of all zeros.
    """
    #Convert the provided array to a numpy integer array
    #int64 rather than uint8: uint8 silently wraps labels above 255
    #(256 would become 0) and negative labels, giving wrong one hot rows
    labelArray = numpy.asarray(data).astype(numpy.int64)

    #Compare every label (as a column) against 0..dataSize-1 (as a row);
    #broadcasting yields a boolean matrix that is True only at the label index
    labelMatches = numpy.arange(dataSize) == labelArray[:, None]

    oneHotData = labelMatches.astype(numpy.float32)
    return oneHotData
def generateDataSets(data, labels, validationRatio=0.1):
    """
    Creates two array sets: one set of training data and labels and one set of
    validation data and labels
    IN:
        data - the dataset that you are wanting to train with
        labels - the label set that you are wanting to train your data on
                 (indexed in parallel with data)
        validationRatio - the amount of validation data that you would like removed
                          from your training set
    OUT:
        trainingData - a list of normalized training data excluding validation data
        trainingLabels - a list of the (unmodified) training labels excluding validation labels
        validationData - a list of normalized validation data excluding training data
        validationLabels - a list of the (unmodified) validation labels excluding training labels
    NOTE:
        Normalization is computed over the whole dataset before it is split, and
        the entries of every returned list are in a random (shuffled) order.
    """
    #Create training data output placeholder variables
    trainingData = []
    trainingLabels = []

    #Create validation data output placeholder variables
    validationData = []
    validationLabels = []

    #Normalize our data (the deviation and average are not needed here)
    normalizedData, _, _ = _normalizeData(data)

    #Get the Data IDs that we want to use for validation
    #(everything that is not a validation ID is a training ID)
    _, validationDataIDs = \
        _generateValidationandTrainingDataIDSets(data, validationRatio)

    #A set makes every membership test below constant time
    validationIDLookup = set(validationDataIDs)

    #Shuffle our accesses for randomness
    #list() is required: random.shuffle works in place and a Python 3 range
    #object cannot be modified
    shuffledDataIndexes = list(range(len(normalizedData)))
    random.shuffle(shuffledDataIndexes)

    #For every index in our normalizedData array we want to populate our lists
    for i in shuffledDataIndexes:
        if i in validationIDLookup:
            #The index is in our validation ID set so we add the
            #corresponding data and label to the validation arrays
            validationData.append(normalizedData[i])
            validationLabels.append(labels[i])
        else:
            #Else we add the data and label to the training set!
            trainingData.append(normalizedData[i])
            trainingLabels.append(labels[i])

    return trainingData, trainingLabels, validationData, validationLabels