prepare_dataset_challenge/DataSetManager.py at master · WeldFire/prepare_dataset_challenge · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Author: WeldFire
# Created: 12/20/2016
from pprint import pprint
import tensorflow
import random
import numpy

"""
Creates a random list of IDs to be used in training and validation

IN:
data - dataset that you are wanting to train with
validationRation - the amount of validation data that you would like removed from your training set

OUT:
trainingDataIDs - A list of indicies in the original dataset to be used for training
validationDataIDs - An ordered list of indicies in the original dataset to be used for validation
"""
def _generateValidationandTrainingDataIDSets(data, validationRatio=0.1):
	dataLength = len(data)
	entriesUsedInValidation = int(dataLength * validationRatio)

	dataIDs = range(dataLength)

	#Get a random sample of data IDs to be used in validation
	validationDataIDs = random.sample(dataIDs, entriesUsedInValidation)
	#Remove the validation IDs from the overall pool of IDs
	trainingDataIDs = list(set(dataIDs)-set(validationDataIDs))

	return trainingDataIDs, validationDataIDs


"""
Normalizes the dataset provided

IN:
data - the data that you want to normalized

OUT:
normalizedData - the normalized data from the input provided
standardDeviation - the calculated standardDeviation to be reused optionally later
average - the calculated average to be reused optionally later
"""
def _normalizeData(data):
	dataArray = numpy.asarray(data, dtype=numpy.float32)

	standardDeviation = dataArray.std(axis=0)
	average = dataArray.mean(axis=0)

	normalizedData = (dataArray - average)/ (standardDeviation)

	return normalizedData, standardDeviation, average

"""
Normalizes the dataset provided using precomputed values

IN:
data - the data that you want to normalized
standardDeviation - the precalculated standardDeviation from a previous normalization
average - the precalculated average from a previous normalization

OUT:
normalizedData - the normalized data from the input provided
"""
def _precomputedDataNormalize(data, standardDeviation, average):
	dataArray = numpy.asarray(data, dtype=numpy.float32)

	normalizedData = (dataArray - average)/ (standardDeviation)

	return normalizedData


"""
Creates one hot representations for the array provided

IN:
data - the data that you want a one hot representation of
dataSize - the data size of the one hot representation

OUT:
oneHotData - the one hot data from the input provided
"""
def _oneHotData(data, dataSize):
	#Convert the provided array to a numpy array
	numpyDataArray = numpy.array(data).astype(dtype=numpy.uint8)

	#Convert the numpy array into a one hot matrix
	oneHotData = (numpy.arange(dataSize) == numpyDataArray[:, None]).astype(numpy.float32)

	return oneHotData


"""
Creates two array sets one set of training data and labels and one set of validation data and labels

IN:
data - the dataset that you are wanting to train with
labels - the label set that you are wanting to train your data on
validationRation - the amount of validation data that you would like removed from your training set

OUT:
trainingData - a list of normalized training data excluding validation data
trainingLabels - a list of normalized training labels excluding validation labels
validationData- a list of normalized validation data excluding training data
validationLabels - a list of normalized validation labels excluding training labels
"""
def generateDataSets(data, labels, validationRatio=0.1):
	#Create training data output placeholder variables
	trainingData = []
	trainingLabels = []
	#Create validation data output placeholder variables
	validationData = []
	validationLabels = []

	#Normalize our data
	normalizedData,std,avg = _normalizeData(data)
	#Get the Data IDs that we want to use for training and validation
	trainingDataIDs, validationDataIDs = \
		_generateValidationandTrainingDataIDSets(data, validationRatio)

	#Shuffle our accesses for randomness
	shuffledDataIndexes = range(len(normalizedData))
	random.shuffle(shuffledDataIndexes)

	#For every index in our normalizedData array we want to populate our lists
	for i in shuffledDataIndexes:
		#If the index is in our validation ID set
		#add the corresponding data and label to their respective arrays
		if i in validationDataIDs:
			validationData.append(normalizedData[i])
			validationLabels.append(labels[i])
		else:
			#Else we add the data and label to the training set!
			trainingData.append(normalizedData[i])
			trainingLabels.append(labels[i])

	return trainingData, trainingLabels, validationData, validationLabels