becxer
diff --git a/‎README.md
+7 b/‎README.md
+7
diff --git a/‎__init__.py b/‎__init__.py
diff --git a/‎pytrain/__init__.py b/‎pytrain/__init__.py
diff --git a/‎pytrain/dtree/__init__.py
+1 b/‎pytrain/dtree/__init__.py
+1
diff --git a/‎pytrain/dtree/naive_dtree.py
+126 b/‎pytrain/dtree/naive_dtree.py
+126
diff --git a/‎pytrain/knn/__init__.py
+1 b/‎pytrain/knn/__init__.py
+1
diff --git a/‎pytrain/knn/naive_knn.py
+40 b/‎pytrain/knn/naive_knn.py
+40
diff --git a/‎pytrain/ptlib/__init__.py b/‎pytrain/ptlib/__init__.py
diff --git a/‎pytrain/ptlib/ptlib.py
+99 b/‎pytrain/ptlib/ptlib.py
+99
diff --git a/‎pytrain_test.py
+79 b/‎pytrain_test.py
+79
@@ -0,0 +1,7 @@
+# pytrain
+Machine learning library for Python
+
+### You can test the library code with the command below
+
+      python pytrain_test.py
+
@@ -0,0 +1 @@
+from naive_dtree import *
@@ -0,0 +1,126 @@
+#
+# naive decision tree
+#
+# @ author becxer
+# @ e-mail [email protected]
+#
+
+from numpy import *
+from math import log
+import operator
+
+class naive_dtree:
+	def __init__(self, mat_data, label_data):
+		self.mat_data = mat_data
+		self.label_data = label_data
+
+	#make tree with matrix_data & label_data	
+	def fit(self):
+		self.tree = self.create_tree(self.mat_data,self.label_data)
+		return self.tree
+
+	#search array_input in tree
+	def predict(self, array_input):
+		return self.search_tree(self.tree, array_input)
+
+	#search array_input's feature in tree recursively
+	#if tree node is dictionary recursive
+	#else return label data
+	def search_tree(self, tree, array_input):
+		searched_label = "not found"
+		node_col = tree.keys()[0]
+		node_dict = tree[node_col]
+		for node_val in node_dict.keys():
+			if array_input[node_col] == node_val:
+				if type(node_dict[node_val]).__name__ == 'dict':
+					next_input = array_input[:node_col]
+					next_input.extend(array_input[node_col+1:])	
+					searched_label = self.search_tree(node_dict[node_val],next_input)
+				else : searched_label = node_dict[node_val]
+		return searched_label
+
+	#create tree to lower entropy recursively
+	#when split data, calculate each feature splitted matrix entropy and compare
+	#select most lower entorpy and split
+	#Example)  matrix => label ::  [[A,B] , [A,C], [A,D], [A,E] ,[B,D]] => ['YES','YES','YES','YES',NO' ]
+	#output example tree )   { 0 , { 'A' : 'YES', 'B' : 'No'}}
+	#                          |      |      |     |     |
+	#                          |      |      |     |     |
+	#                       column  value    |    value  |
+	#                                      label        label
+	#
+	def create_tree(self, mat_data, label_data):
+		#if left data has same label, then return label
+		if label_data.count(label_data[0]) == len(label_data):
+			return label_data[0]
+		#if there is no feature to split, then return most major label
+		if len(mat_data[0]) == 0 or ( len(mat_data[0]) == 1 and \
+				len(set([row[0] for row in mat_data])) == 1 ) :
+			return self.major_label_count(label_data)
+		best_col_index = self.choose_col_to_split(mat_data, label_data)
+		tree = {best_col_index:{}}
+		best_col = [row[best_col_index] for row in mat_data]
+		uniq_val = set(best_col)
+		for val in uniq_val:
+			splitted_mat, splitted_label = self.split_data(\
+								mat_data, label_data, best_col_index, val)
+			tree[best_col_index][val] = self.create_tree(splitted_mat, splitted_label)
+		return tree
+
+	#split matrix & label data with axis and it's value
+	def split_data(self, mat_data, label_data, axis, split_value):
+		ret_data = []
+		ret_label = []
+		for index, row in enumerate(mat_data):
+			if row[axis] == split_value:
+				temp = row[:axis]
+				temp.extend(row[axis+1:])
+				ret_data.append(temp)
+				ret_label.append(label_data[index])
+		return ret_data, ret_label
+	
+	#choose column to split comparing entropy
+	def choose_col_to_split(self, mat_data, label_data):
+		num_cols = len(mat_data[0]) 
+		base_ent = self.calc_shannon_ent(label_data)
+		max_info = 0.0
+		best_col = -1
+		for i in range(num_cols):
+			col = [row[i] for row in mat_data]
+			uniq_col = set(col)
+			new_ent = 0.0
+			for val in uniq_col:
+				splitted_mat_data, splitted_label_data = \
+						 self.split_data(mat_data, label_data, i ,val)
+				prob = len(splitted_label_data) / float(len(label_data))
+				new_ent += prob * self.calc_shannon_ent(splitted_label_data)
+			info = base_ent - new_ent
+			if (info >= max_info):
+				max_info = info
+				best_col = i
+		return best_col
+
+	def calc_shannon_ent(self, label_data):
+		num_entry = len(label_data)
+		label_count = {}
+		for label in label_data:
+			if label not in label_count.keys():
+				label_count[label] = 0
+			label_count[label] += 1
+		shannon_ent = 0.0
+		for key in label_count:
+			prob = float(label_count[key]) / num_entry
+			shannon_ent -= prob * log(prob,2)
+		return shannon_ent	
+
+	def major_label_count(self, label_data):
+		label_count = {}
+		for label in label_data:
+			if label not in label_count.keys():
+				label_count[label] = 0
+			label_count[label] += 1
+		sorted_label_count = sorted(label_count.iteritems(),
+								key=operator.itemgetter(1), reverse=True)
+		return sorted_label_count[0][0]
+
+
@@ -0,0 +1 @@
+from naive_knn import *
@@ -0,0 +1,40 @@
+#
+# naive k-nearest neighbors
+#
+# @ author becxer
+# @ e-mail [email protected]
+#
+
+from numpy import *
+from pytrain.ptlib import ptlib
+import operator
+
+class naive_knn:
+	def __init__(self, mat_data,label_data,k):
+		if mat_data.__class__.__name__ != 'ndarray':
+			mat_data = ptlib.mat2arr(mat_data)	
+		self.mat_data = mat_data
+		self.label_data = label_data
+		self.train_size = mat_data.shape[0]
+		self.k = k
+
+	def fit(self):
+		pass
+
+	# compare distance from all mat_data rows and choose most closer one
+	def predict(self, array_input):
+		if array_input.__class__.__name__ != 'ndarray':
+			array_input = ptlib.list2arr(array_input)	
+		diff_mat = tile(array_input, (self.train_size,1)) - self.mat_data
+		pow_diff_mat = diff_mat ** 2
+		pow_distances = pow_diff_mat.sum(axis=1)
+		distances = pow_distances ** 0.5
+		sorted_distances = distances.argsort()
+		class_count = {}
+		for i in range(self.k):
+			kth_label = self.label_data[sorted_distances[i]]
+			class_count[kth_label] = class_count.get(kth_label, 0) + 1
+		sorted_class_count = sorted(class_count.iteritems(), 
+			key=operator.itemgetter(1),reverse=True)
+		return sorted_class_count[0][0]
+
@@ -0,0 +1,99 @@
+#
+# library for data manipulation & etc.
+#
+# @ author becxer
+# @ e-mail [email protected]
+#
+
+from numpy import *
+import operator
+import math
+import sys
+
+#convert file which format is 
+#[label, feature1, feature2 ... , featureN]
+#to matrix_train, label_train, matrix_test, label_test
+#according to ho_ratio
+#ho_ratio is test_set ratio how you want
+def f2mat(filename, ho_ratio):
+	fr = open(filename)
+	lines = fr.readlines()
+	lnum_test = math.ceil(len(lines) * ho_ratio) 
+	lnum_train = len(lines) - lnum_test
+	colmax = len(lines[0].strip().split('\t'))
+	mat_train = []
+	mat_test = [] 
+	label_train = []
+	label_test = []
+	train_index = 0
+	test_index = 0
+	split_index = 0
+	if ho_ratio != 0 :
+		split_index = 1.0 / ho_ratio
+	for line in lines:
+		line = line.strip()
+		listFromLine = line.split('\t')
+		if ho_ratio == 0 or (train_index + test_index) % split_index != 0 :
+			mat_train.append(listFromLine[1:colmax])
+			label_train.append(listFromLine[0])
+			train_index += 1
+		else :
+			mat_test.append(listFromLine[1:colmax])
+			label_test.append(listFromLine[0])
+			test_index += 1
+	if ho_ratio == 0:
+		return mat_train,label_train
+	else :
+		return mat_train, label_train, mat_test, label_test
+
+def mat2arr(data_mat):
+	return array(map(lambda x:map(float,x),data_mat))
+
+def list2arr(data_list):
+	return array(map(float,data_list))
+
+#normalize matrix feature with base-min & base-max
+def norm(data_mat):
+	if data_mat.__class__.__name__ != 'ndarray':
+		data_mat = mat2arr(data_mat)
+	min_vals = data_mat.min(0)
+	max_vals = data_mat.max(0)
+	ranges = max_vals - min_vals
+	ranges = map(lambda x : x + sys.float_info.epsilon ,ranges)
+	normed_data_mat = zeros(shape(data_mat))
+	rowsize = data_mat.shape[0]
+	normed_data_mat = data_mat - tile(min_vals, (rowsize,1))
+	normed_data_mat = normed_data_mat / tile(ranges,(rowsize,1))
+	return normed_data_mat
+
+#abstracted evaluation logic 
+#p_module is pytrain module that you already trained
+def eval_predict(p_module ,mat_test, label_test, log_on = True):
+    rsize_test = len(mat_test)
+    error_count = 0.0
+    for i in range(rsize_test):
+        res = p_module.predict(mat_test[i])
+        if log_on : print "predicted : '" + str(res) + "' --- origin : '" \
+						+ str(label_test[i]) + "'"
+        if(res != label_test[i]): error_count += 1.0
+    if log_on : print "<" + p_module.__class__.__name__ + ">" +\
+					" error rate is " + str(error_count / float(rsize_test))
+    return error_count/rsize_test
+
+#saving module to file
+def store_module(module, filename):
+	import pickle
+	module_f = open(filename, 'w')
+	pickle.dump(module,module_f)
+	module_f.close()
+
+#loading module into object
+def restore_module(filename):
+	import pickle
+	module_f = open(filename)
+	return pickle.load(module_f)
+
+#test for ptlib
+def hello():
+	print "hello this is ptlib"
+
@@ -0,0 +1,79 @@
+#
+# Dev test for pytrain library
+#
+# @ author becxer
+# @ email [email protected]
+#
+
+#import from ptlib
+from pytrain.ptlib import ptlib
+
+#import from modules
+from pytrain.knn import naive_knn
+from pytrain.dtree import naive_dtree
+
+#check that ptlib is importable by calling its greeting function
+ptlib.hello()
+
+#load the dating sample with ptlib.f2mat, using 0.1 as the test-set ratio
+print "-Testing file 2 matrix"
+dmat_train, dlabel_train, dmat_test, dlabel_test \
+	= ptlib.f2mat("sample/dating/date_info.txt", 0.1)
+
+print "train mat count : " + str(len(dmat_train))
+print dmat_train[0:10]
+print "train label count : " + str(len(dlabel_train))
+print dlabel_train[0:10]
+
+print "test mat count : " + str(len(dmat_test))
+print dmat_test[0:10]
+print "test label count : " + str(len(dlabel_test))
+print dlabel_test[0:10]
+
+#normalize the train and test matrices with ptlib.norm (each one separately)
+print "-Testing normalization"
+normed_dmat_train = ptlib.norm(dmat_train)
+normed_dmat_test = ptlib.norm(dmat_test)
+print normed_dmat_train[0:10]
+
+#run knn (k = 3) on a tiny hand-made data set
+print "-Testing KNN"
+simple_mat_1 = [[1.0,1.1] , [1.0,1.0], [0,0], [0,0.1]]
+simple_label_1 = ['A','A','B','B']
+knn = naive_knn(simple_mat_1, simple_label_1, 3)
+print "knn predict [0.9,0.9] : " + str(knn.predict([0.9,0.9]))
+print "knn predict [0.1,0.4] : " + str(knn.predict([0.1,0.4]))
+
+#evaluate knn on the normalized dating data (per-row logging switched off)
+knn_date = naive_knn(normed_dmat_train, dlabel_train, 3)
+error_rate = ptlib.eval_predict(knn_date, normed_dmat_test, dlabel_test, False)
+print "<knn> date error rate : " + str(error_rate)
+
+#eval knn digits
+#dg_mat_train, dg_label_train = ptlib.f2mat("sample/digit/digit-train.txt",0)
+#dg_mat_test, dg_label_test = ptlib.f2mat("sample/digit/digit-test.txt",0)
+#knn_digit = naive_knn(dg_mat_train, dg_label_train, 3)
+#error_rate = ptlib.eval_predict(knn_digit, dg_mat_test, dg_label_test)
+
+#fit the decision tree on a tiny hand-made data set and predict one row
+print "-Testing Dtree"
+simple_mat_2 = [[7,8,8],[8,7,8],[8,8,8],[8,8,8],[8,7,7],[7,7,8],[7,7,7],[7,8,7],[8,8,8]]
+simple_label_2 = ['yes',  'yes',  'yes', 'no',  'no',  'yes',   'no',   'no', 'no']
+tree = naive_dtree(simple_mat_2, simple_label_2)
+print "tree fit : " + str(tree.fit())
+print "tree predict : " + str(tree.predict([8,8,8]))
+
+#pickle the fitted tree to disk, load it back and predict with the copy
+#NOTE(review): assumes the tmp/ directory already exists - confirm
+print "-Testing store & restore"
+ptlib.store_module(tree,"tmp/tree_878.dat")
+mod = ptlib.restore_module("tmp/tree_878.dat")
+print "restored tree : " + str(mod.tree)
+print "restored tree predict : " + str(mod.predict([8,8,7]))
+
+#evaluate the decision tree on the lense sample, using 0.4 as the test-set ratio
+lense_mat_train, lense_label_train, lense_mat_test, lense_label_test = \
+							ptlib.f2mat("sample/lense/lense.txt", 0.4)
+dtree_lense = naive_dtree(lense_mat_train,lense_label_train)
+dtree_lense.fit()
+error_rate = ptlib.eval_predict(dtree_lense, lense_mat_test, lense_label_test)
+
+