Skip to content

Commit 531c495

Browse files
committed
initial commit for library
1 parent 42641e8 commit 531c495

File tree

3,060 files changed

+1124479
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

3,060 files changed

+1124479
-0
lines changed

README.md

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# pytrain
2+
Machine learning library for Python
3+
4+
### You can test the library code with the command below
5+
6+
python pytrain_test.py
7+

__init__.py

Whitespace-only changes.

pytrain/__init__.py

Whitespace-only changes.

pytrain/dtree/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from naive_dtree import *

pytrain/dtree/naive_dtree.py

+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#
2+
# naive decision tree
3+
#
4+
# @ author becxer
5+
6+
#
7+
8+
from numpy import *
9+
from math import log
10+
import operator
11+
12+
class naive_dtree:
    """Naive decision tree that splits on the column with the best information gain.

    The fitted tree is a nested dict shaped like
    ``{column_index: {feature_value: subtree_or_label}}``.  A column index is
    relative to the row that remains after the columns used higher up in the
    tree have been removed.

    Example:
        rows   ``[[A, B], [A, C], [A, D], [A, E], [B, D]]``
        labels ``['YES', 'YES', 'YES', 'YES', 'NO']``
        tree   ``{0: {'A': 'YES', 'B': 'NO'}}``
    """

    def __init__(self, mat_data, label_data):
        """Store the training rows and their labels; no work is done until fit()."""
        self.mat_data = mat_data
        self.label_data = label_data

    def fit(self):
        """Build the tree from the stored training data and return it."""
        self.tree = self.create_tree(self.mat_data, self.label_data)
        return self.tree

    def predict(self, array_input):
        """Return the label the fitted tree assigns to ``array_input``.

        Returns the string ``"not found"`` when the input carries a feature
        value that the tree has no branch for.
        """
        return self.search_tree(self.tree, array_input)

    def search_tree(self, tree, array_input):
        """Walk ``tree`` recursively following the feature values of ``array_input``.

        A dict node means "descend"; anything else is a label and is returned.
        """
        # fit() returns a bare label when every training label is identical,
        # so the root itself may be a leaf.
        if not isinstance(tree, dict):
            return tree
        # Each node dict has exactly one key: the column it splits on.
        node_col = list(tree.keys())[0]
        node_dict = tree[node_col]
        value = array_input[node_col]
        for node_val, child in node_dict.items():
            if value == node_val:
                if isinstance(child, dict):
                    # Drop the consumed column so child indices line up.
                    # list() lets tuples and arrays be used as input too.
                    next_input = list(array_input[:node_col])
                    next_input.extend(array_input[node_col + 1:])
                    return self.search_tree(child, next_input)
                return child
        return "not found"

    def create_tree(self, mat_data, label_data):
        """Recursively build the tree, choosing the lowest-entropy split each time.

        Returns a label when the remaining rows share one label or no useful
        feature is left; otherwise returns a one-key dict node.
        """
        # All remaining rows share one label: this branch is a leaf.
        if label_data.count(label_data[0]) == len(label_data):
            return label_data[0]
        # Nothing left to split on (no columns, or one constant column):
        # fall back to the most frequent label.
        if len(mat_data[0]) == 0 or (
                len(mat_data[0]) == 1 and
                len(set([row[0] for row in mat_data])) == 1):
            return self.major_label_count(label_data)
        best_col_index = self.choose_col_to_split(mat_data, label_data)
        tree = {best_col_index: {}}
        uniq_val = set([row[best_col_index] for row in mat_data])
        for val in uniq_val:
            splitted_mat, splitted_label = self.split_data(
                mat_data, label_data, best_col_index, val)
            tree[best_col_index][val] = self.create_tree(
                splitted_mat, splitted_label)
        return tree

    def split_data(self, mat_data, label_data, axis, split_value):
        """Select the rows whose column ``axis`` equals ``split_value``.

        Returns ``(rows, labels)`` where each returned row has column ``axis``
        removed.
        """
        ret_data = []
        ret_label = []
        for index, row in enumerate(mat_data):
            if row[axis] == split_value:
                # list() so tuple/array rows work as well as list rows.
                temp = list(row[:axis])
                temp.extend(row[axis + 1:])
                ret_data.append(temp)
                ret_label.append(label_data[index])
        return ret_data, ret_label

    def choose_col_to_split(self, mat_data, label_data):
        """Return the index of the column whose split gives the best information gain.

        Ties are resolved in favour of the later column because the comparison
        is ``>=``.  Returns -1 when the rows have no columns.
        """
        num_cols = len(mat_data[0])
        base_ent = self.calc_shannon_ent(label_data)
        max_info = 0.0
        best_col = -1
        for i in range(num_cols):
            uniq_col = set([row[i] for row in mat_data])
            # Weighted entropy of the partitions produced by splitting on column i.
            new_ent = 0.0
            for val in uniq_col:
                _, splitted_label_data = self.split_data(
                    mat_data, label_data, i, val)
                prob = len(splitted_label_data) / float(len(label_data))
                new_ent += prob * self.calc_shannon_ent(splitted_label_data)
            info = base_ent - new_ent
            if info >= max_info:
                max_info = info
                best_col = i
        return best_col

    def calc_shannon_ent(self, label_data):
        """Return the Shannon entropy (base 2) of the labels in ``label_data``."""
        num_entry = len(label_data)
        label_count = {}
        for label in label_data:
            label_count[label] = label_count.get(label, 0) + 1
        shannon_ent = 0.0
        for key in label_count:
            prob = float(label_count[key]) / num_entry
            shannon_ent -= prob * log(prob, 2)
        return shannon_ent

    def major_label_count(self, label_data):
        """Return the most frequent label in ``label_data``."""
        label_count = {}
        for label in label_data:
            label_count[label] = label_count.get(label, 0) + 1
        # items() instead of iteritems() so this runs on Python 2 and 3.
        sorted_label_count = sorted(label_count.items(),
                                    key=operator.itemgetter(1), reverse=True)
        return sorted_label_count[0][0]
125+
126+

pytrain/knn/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from naive_knn import *

pytrain/knn/naive_knn.py

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#
2+
# naive k-nearest neighbors
3+
#
4+
# @ author becxer
5+
6+
#
7+
8+
from numpy import *
9+
from pytrain.ptlib import ptlib
10+
import operator
11+
12+
class naive_knn:
    """Naive k-nearest-neighbours classifier using Euclidean distance."""

    def __init__(self, mat_data, label_data, k):
        """Store the training matrix, its labels and the neighbour count ``k``.

        ``mat_data`` is converted with ``ptlib.mat2arr`` unless it is already
        a plain ``ndarray``.
        """
        if mat_data.__class__.__name__ != 'ndarray':
            mat_data = ptlib.mat2arr(mat_data)
        self.mat_data = mat_data
        self.label_data = label_data
        self.train_size = mat_data.shape[0]
        self.k = k

    def fit(self):
        """No-op: the model is simply the stored training data."""
        pass

    def predict(self, array_input):
        """Return the majority label among the ``k`` training rows nearest to ``array_input``.

        ``array_input`` is converted with ``ptlib.list2arr`` unless it is
        already a plain ``ndarray``.
        """
        if array_input.__class__.__name__ != 'ndarray':
            array_input = ptlib.list2arr(array_input)
        # Broadcasting subtracts the query from every training row at once.
        diff_mat = array_input - self.mat_data
        distances = (diff_mat ** 2).sum(axis=1) ** 0.5
        sorted_distances = distances.argsort()
        # Never look at more neighbours than there are training rows;
        # otherwise indexing sorted_distances would raise IndexError.
        k = self.k if self.k < self.train_size else self.train_size
        class_count = {}
        for i in range(k):
            kth_label = self.label_data[sorted_distances[i]]
            class_count[kth_label] = class_count.get(kth_label, 0) + 1
        # items() instead of iteritems() so this runs on Python 2 and 3.
        sorted_class_count = sorted(class_count.items(),
                                    key=operator.itemgetter(1), reverse=True)
        return sorted_class_count[0][0]
40+

pytrain/ptlib/__init__.py

Whitespace-only changes.

pytrain/ptlib/ptlib.py

+99
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#
2+
# library for data manipulation & etc.
3+
#
4+
# @ author becxer
5+
6+
#
7+
8+
from numpy import *
9+
import operator
10+
import math
11+
import sys
12+
13+
#convert file which format is
14+
#[label, feature1, feature2 ... , featureN]
15+
#to matrix_train, label_train, matrix_test, label_test
16+
#according to ho_ratio
17+
#ho_ratio is the fraction of rows to hold out as the test set
18+
def f2mat(filename, ho_ratio):
    """Load a tab-separated file of ``label, feature1, ..., featureN`` rows.

    Args:
        filename: path of the file to read.
        ho_ratio: fraction of the rows to hold out as the test set.

    Returns:
        ``(mat_train, label_train)`` when ``ho_ratio == 0``, otherwise
        ``(mat_train, label_train, mat_test, label_test)``.  Features and
        labels are returned as the strings read from the file.

    The hold-out rows are spread evenly through the file, starting with the
    first row.  When ``1 / ho_ratio`` is an integer that divides the row
    count (e.g. 0.1 on 1000 rows), this selects every (1 / ho_ratio)-th row.
    """
    with open(filename) as fr:
        lines = fr.readlines()
    total = len(lines)

    # Hold-out row count, rounded up.  The tiny offset stops float noise such
    # as 30 * 0.1 == 3.0000000000000004 from being rounded up to 4.
    num_test = 0
    if ho_ratio > 0:
        num_test = int(math.ceil(total * ho_ratio - 1e-9))
        if num_test > total:
            num_test = total
    # Integer arithmetic keeps the selection exact; a float modulo against
    # 1 / ho_ratio misses most rows when the reciprocal is not an integer.
    test_rows = set((j * total) // num_test for j in range(num_test))

    # The column count of the first line caps the width of every row.
    colmax = len(lines[0].strip().split('\t')) if lines else 0
    mat_train = []
    mat_test = []
    label_train = []
    label_test = []
    for index, line in enumerate(lines):
        fields = line.strip().split('\t')
        if index in test_rows:
            mat_test.append(fields[1:colmax])
            label_test.append(fields[0])
        else:
            mat_train.append(fields[1:colmax])
            label_train.append(fields[0])
    if ho_ratio == 0:
        return mat_train, label_train
    return mat_train, label_train, mat_test, label_test
48+
49+
def mat2arr(data_mat):
    """Convert a sequence of rows into a numpy array of floats.

    Each element is passed through ``float()``, so numeric strings are accepted.
    """
    # Explicit lists rather than map(): on Python 3 map() is lazy and array()
    # would wrap the iterator in a 0-d object array.
    return array([[float(value) for value in row] for row in data_mat])
51+
52+
def list2arr(data_list):
    """Convert a flat sequence into a 1-D numpy array of floats.

    Each element is passed through ``float()``, so numeric strings are accepted.
    """
    # Explicit list rather than map(): on Python 3 map() is lazy and array()
    # would wrap the iterator in a 0-d object array.
    return array([float(value) for value in data_list])
54+
55+
#normalize matrix feature with base-min & base-max
56+
def norm(data_mat):
    """Min-max normalise every column of ``data_mat``.

    ``data_mat`` is converted with ``mat2arr`` unless it is already a plain
    ``ndarray``.  Returns an array of the same shape holding
    ``(value - column_min) / (column_max - column_min + epsilon)``.
    """
    if data_mat.__class__.__name__ != 'ndarray':
        data_mat = mat2arr(data_mat)
    min_vals = data_mat.min(0)
    max_vals = data_mat.max(0)
    # Machine epsilon keeps a constant column from dividing by zero.
    ranges = (max_vals - min_vals) + sys.float_info.epsilon
    # Broadcasting applies the per-column offset and scale to every row.
    return (data_mat - min_vals) / ranges
68+
69+
#abstracted evaluation logic
70+
#p_module is pytrain module that you already trained
71+
def eval_predict(p_module, mat_test, label_test, log_on = True):
    """Measure the error rate of a trained module on a labelled test set.

    Args:
        p_module: object exposing ``predict(row)``.
        mat_test: sequence of test rows.
        label_test: expected label for each row of ``mat_test``.
        log_on: when true, print every prediction and the final error rate.

    Returns:
        The fraction of rows whose prediction differs from the expected
        label, or ``0.0`` when ``mat_test`` is empty.
    """
    rsize_test = len(mat_test)
    # An empty test set has no errors; this also avoids dividing by zero.
    if rsize_test == 0:
        return 0.0
    error_count = 0.0
    for i in range(rsize_test):
        res = p_module.predict(mat_test[i])
        if log_on:
            # One pre-built string prints identically on Python 2 and 3.
            print("predicted : '" + str(res) + "' --- origin : '"
                  + str(label_test[i]) + "'")
        if res != label_test[i]:
            error_count += 1.0
    error_rate = error_count / rsize_test
    if log_on:
        print("<" + p_module.__class__.__name__ + ">" +
              " error rate is " + str(error_rate))
    return error_rate
82+
83+
#saving module to file
84+
def store_module(module, filename):
    """Pickle ``module`` into the file at ``filename``, overwriting it."""
    import pickle
    # Pickle data is bytes, so the file must be opened in binary mode.
    # The with-block closes the file even if dump() raises.
    with open(filename, 'wb') as module_f:
        pickle.dump(module, module_f)
89+
90+
#loading module into object
91+
def restore_module(filename):
    """Load and return the object pickled in the file at ``filename``."""
    import pickle
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    # Binary mode is required for pickle data; the with-block closes the file.
    with open(filename, 'rb') as module_f:
        return pickle.load(module_f)
95+
96+
#test for ptlib
97+
def hello():
    """Print a greeting; used as a smoke test that ptlib imports correctly."""
    # Parenthesised single argument prints identically on Python 2 and 3.
    print("hello this is ptlib")
99+

pytrain_test.py

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#
2+
# Dev test for pytrain library
3+
#
4+
# @ author becxer
5+
6+
#
7+
8+
#import from ptlib
9+
from pytrain.ptlib import ptlib
10+
11+
#import from modules
12+
from pytrain.knn import naive_knn
13+
from pytrain.dtree import naive_dtree
14+
15+
# Every print below takes one parenthesised argument so the script produces
# the same output on Python 2 and Python 3.

# testing ptlib
ptlib.hello()

# testing ptlib.f2mat: hold out 10% of the dating sample as the test set
print("-Testing file 2 matrix")
dmat_train, dlabel_train, dmat_test, dlabel_test \
    = ptlib.f2mat("sample/dating/date_info.txt", 0.1)

print("train mat count : " + str(len(dmat_train)))
print(dmat_train[0:10])
print("train label count : " + str(len(dlabel_train)))
print(dlabel_train[0:10])

print("test mat count : " + str(len(dmat_test)))
print(dmat_test[0:10])
print("test label count : " + str(len(dlabel_test)))
print(dlabel_test[0:10])

# testing ptlib.norm
print("-Testing normalization")
normed_dmat_train = ptlib.norm(dmat_train)
normed_dmat_test = ptlib.norm(dmat_test)
print(normed_dmat_train[0:10])

# testing knn on a tiny hand-made data set
print("-Testing KNN")
simple_mat_1 = [[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]]
simple_label_1 = ['A', 'A', 'B', 'B']
knn = naive_knn(simple_mat_1, simple_label_1, 3)
print("knn predict [0.9,0.9] : " + str(knn.predict([0.9, 0.9])))
print("knn predict [0.1,0.4] : " + str(knn.predict([0.1, 0.4])))

# eval knn on the normalised dating data
knn_date = naive_knn(normed_dmat_train, dlabel_train, 3)
error_rate = ptlib.eval_predict(knn_date, normed_dmat_test, dlabel_test, False)
print("<knn> date error rate : " + str(error_rate))

# eval knn digits (disabled)
#dg_mat_train, dg_label_train = ptlib.f2mat("sample/digit/digit-train.txt",0)
#dg_mat_test, dg_label_test = ptlib.f2mat("sample/digit/digit-test.txt",0)
#knn_digit = naive_knn(dg_mat_train, dg_label_train, 3)
#error_rate = ptlib.eval_predict(knn_digit, dg_mat_test, dg_label_test)

# testing dtree on a tiny hand-made data set
print("-Testing Dtree")
simple_mat_2 = [[7, 8, 8], [8, 7, 8], [8, 8, 8], [8, 8, 8], [8, 7, 7],
                [7, 7, 8], [7, 7, 7], [7, 8, 7], [8, 8, 8]]
simple_label_2 = ['yes', 'yes', 'yes', 'no', 'no', 'yes', 'no', 'no', 'no']
tree = naive_dtree(simple_mat_2, simple_label_2)
print("tree fit : " + str(tree.fit()))
print("tree predict : " + str(tree.predict([8, 8, 8])))

# testing store & restore: round-trip the fitted tree through a pickle file
print("-Testing store & restore")
ptlib.store_module(tree, "tmp/tree_878.dat")
mod = ptlib.restore_module("tmp/tree_878.dat")
print("restored tree : " + str(mod.tree))
print("restored tree predict : " + str(mod.predict([8, 8, 7])))

# eval dtree on the lense sample, holding out 40% as the test set
lense_mat_train, lense_label_train, lense_mat_test, lense_label_test = \
    ptlib.f2mat("sample/lense/lense.txt", 0.4)
dtree_lense = naive_dtree(lense_mat_train, lense_label_train)
dtree_lense.fit()
error_rate = ptlib.eval_predict(dtree_lense, lense_mat_test, lense_label_test)
79+

0 commit comments

Comments
 (0)