Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify kNN #3

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 16 additions & 13 deletions AdaBoost/ROC.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,12 @@

def loadDataSet(fileName):
numFeat = len((open(fileName).readline().split('\t')))
dataMat = []; labelMat = []
dataMat = []
labelMat = []
fr = open(fileName)
for line in fr.readlines():
lineArr = []
curLine = line.strip().split('\t')
for i in range(numFeat - 1):
lineArr.append(float(curLine[i]))
lineArr = [float(curLine[i]) for i in range(numFeat - 1)]
dataMat.append(lineArr)
labelMat.append(float(curLine[-1]))

Expand All @@ -39,7 +38,7 @@ def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
Returns:
retArray - 分类结果
"""
retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1
retArray = np.ones((np.shape(dataMatrix)[0], 1)) #初始化retArray为1
if threshIneq == 'lt':
retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1
else:
Expand All @@ -59,18 +58,22 @@ def buildStump(dataArr,classLabels,D):
minError - 最小误差
bestClasEst - 最佳的分类结果
"""
dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T
dataMatrix = np.mat(dataArr)
labelMat = np.mat(classLabels).T
m,n = np.shape(dataMatrix)
numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1)))
numSteps = 10
bestStump = {}
bestClasEst = np.mat(np.zeros((m, 1)))
minError = float('inf') #最小误差初始化为正无穷大
for i in range(n): #遍历所有特征
rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值
rangeMin = dataMatrix[:, i].min()
rangeMax = dataMatrix[:, i].max() #找到特征中最小的值和最大值
stepSize = (rangeMax - rangeMin) / numSteps #计算步长
for j in range(-1, int(numSteps) + 1):
for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than
threshVal = (rangeMin + float(j) * stepSize) #计算阈值
predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果
errArr = np.mat(np.ones((m,1))) #初始化误差矩阵
errArr = np.mat(np.ones((m, 1))) #初始化误差矩阵
errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0
weightedError = D.T * errArr #计算误差
# print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError))
Expand All @@ -82,7 +85,7 @@ def buildStump(dataArr,classLabels,D):
bestStump['ineq'] = inequal
return bestStump, minError, bestClasEst

def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
"""
使用AdaBoost算法训练分类器
Parameters:
Expand All @@ -96,7 +99,7 @@ def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
weakClassArr = []
m = np.shape(dataArr)[0]
D = np.mat(np.ones((m, 1)) / m) #初始化权重
aggClassEst = np.mat(np.zeros((m,1)))
aggClassEst = np.mat(np.zeros((m, 1)))
for i in range(numIt):
bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树
# print("D:",D.T)
Expand All @@ -110,7 +113,7 @@ def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
#计算AdaBoost误差,当误差为0的时候,退出循环
aggClassEst += alpha * classEst #计算类别估计累计值
# print("aggClassEst: ", aggClassEst.T)
aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1))) #计算误差
aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m, 1))) #计算误差
errorRate = aggErrors.sum() / m
# print("total error: ", errorRate)
if errorRate == 0.0: break #误差为0,退出循环
Expand Down Expand Up @@ -145,7 +148,7 @@ def plotROC(predStrengths, classLabels):
ySum += cur[1] #高度累加
ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c = 'b') #绘制ROC
cur = (cur[0] - delX, cur[1] - delY) #更新绘制光标的位置
ax.plot([0,1], [0,1], 'b--')
ax.plot([0, 1], [0, 1], 'b--')
plt.title('AdaBoost马疝病检测系统的ROC曲线', FontProperties = font)
plt.xlabel('假阳率', FontProperties = font)
plt.ylabel('真阳率', FontProperties = font)
Expand Down
33 changes: 18 additions & 15 deletions AdaBoost/adaboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def loadSimpData():
[ 1. , 1. ],
[ 2. , 1. ]])
classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
return datMat,classLabels
return datMat, classLabels

def showDataSet(dataMat, labelMat):
"""
Expand All @@ -46,10 +46,8 @@ def showDataSet(dataMat, labelMat):
data_plus.append(dataMat[i])
else:
data_minus.append(dataMat[i])
data_plus_np = np.array(data_plus) #转换为numpy矩阵
data_minus_np = np.array(data_minus) #转换为numpy矩阵
plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1]) #正样本散点图
plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1]) #负样本散点图
plt.scatter(np.transpose(data_plus)[0], np.transpose(data_plus)[1]) #正样本散点图
plt.scatter(np.transpose(data_minus)[0], np.transpose(data_minus)[1]) #负样本散点图
plt.show()

def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
Expand All @@ -63,7 +61,7 @@ def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
Returns:
retArray - 分类结果
"""
retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1
retArray = np.ones((np.shape(dataMatrix)[0], 1)) #初始化retArray为1
if threshIneq == 'lt':
retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1
else:
Expand All @@ -83,18 +81,22 @@ def buildStump(dataArr,classLabels,D):
minError - 最小误差
bestClasEst - 最佳的分类结果
"""
dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T
dataMatrix = np.mat(dataArr)
labelMat = np.mat(classLabels).T
m,n = np.shape(dataMatrix)
numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1)))
numSteps = 10.0
bestStump = {}
bestClasEst = np.mat(np.zeros((m, 1)))
minError = float('inf') #最小误差初始化为正无穷大
for i in range(n): #遍历所有特征
rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值
rangeMin = dataMatrix[:, i].min()
rangeMax = dataMatrix[:, i].max() #找到特征中最小的值和最大值
stepSize = (rangeMax - rangeMin) / numSteps #计算步长
for j in range(-1, int(numSteps) + 1):
for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than
threshVal = (rangeMin + float(j) * stepSize) #计算阈值
predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果
errArr = np.mat(np.ones((m,1))) #初始化误差矩阵
errArr = np.mat(np.ones((m, 1))) #初始化误差矩阵
errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0
weightedError = D.T * errArr #计算误差
# print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError))
Expand All @@ -106,7 +108,7 @@ def buildStump(dataArr,classLabels,D):
bestStump['ineq'] = inequal
return bestStump, minError, bestClasEst

def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
"""
使用AdaBoost算法提升弱分类器性能
Parameters:
Expand All @@ -120,7 +122,7 @@ def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
weakClassArr = []
m = np.shape(dataArr)[0]
D = np.mat(np.ones((m, 1)) / m) #初始化权重
aggClassEst = np.mat(np.zeros((m,1)))
aggClassEst = np.mat(np.zeros((m, 1)))
for i in range(numIt):
bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树
# print("D:",D.T)
Expand All @@ -134,10 +136,11 @@ def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
#计算AdaBoost误差,当误差为0的时候,退出循环
aggClassEst += alpha * classEst #计算类别估计累计值
# print("aggClassEst: ", aggClassEst.T)
aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1))) #计算误差
aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m, 1))) #计算误差
errorRate = aggErrors.sum() / m
# print("total error: ", errorRate)
if errorRate == 0.0: break #误差为0,退出循环
if errorRate == 0.0:
break #误差为0,退出循环
return weakClassArr, aggClassEst


Expand All @@ -152,7 +155,7 @@ def adaClassify(datToClass,classifierArr):
"""
dataMatrix = np.mat(datToClass)
m = np.shape(dataMatrix)[0]
aggClassEst = np.mat(np.zeros((m,1)))
aggClassEst = np.mat(np.zeros((m, 1)))
for i in range(len(classifierArr)): #遍历所有分类器,进行分类
classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'], classifierArr[i]['thresh'], classifierArr[i]['ineq'])
aggClassEst += classifierArr[i]['alpha'] * classEst
Expand Down
30 changes: 17 additions & 13 deletions AdaBoost/horse_adaboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,12 @@

def loadDataSet(fileName):
numFeat = len((open(fileName).readline().split('\t')))
dataMat = []; labelMat = []
dataMat = []
labelMat = []
fr = open(fileName)
for line in fr.readlines():
lineArr = []
curLine = line.strip().split('\t')
for i in range(numFeat - 1):
lineArr.append(float(curLine[i]))
lineArr = [float(curLine[i]) for i in range(numFeat - 1)]
dataMat.append(lineArr)
labelMat.append(float(curLine[-1]))

Expand Down Expand Up @@ -58,18 +57,22 @@ def buildStump(dataArr,classLabels,D):
minError - 最小误差
bestClasEst - 最佳的分类结果
"""
dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T
dataMatrix = np.mat(dataArr)
labelMat = np.mat(classLabels).T
m,n = np.shape(dataMatrix)
numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1)))
numSteps = 10
bestStump = {}
bestClasEst = np.mat(np.zeros((m, 1)))
minError = float('inf') #最小误差初始化为正无穷大
for i in range(n): #遍历所有特征
rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值
rangeMin = dataMatrix[:, i].min()
rangeMax = dataMatrix[:, i].max() #找到特征中最小的值和最大值
stepSize = (rangeMax - rangeMin) / numSteps #计算步长
for j in range(-1, int(numSteps) + 1):
for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than
threshVal = (rangeMin + float(j) * stepSize) #计算阈值
predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果
errArr = np.mat(np.ones((m,1))) #初始化误差矩阵
errArr = np.mat(np.ones((m, 1))) #初始化误差矩阵
errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0
weightedError = D.T * errArr #计算误差
# print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError))
Expand All @@ -81,7 +84,7 @@ def buildStump(dataArr,classLabels,D):
bestStump['ineq'] = inequal
return bestStump, minError, bestClasEst

def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
"""
使用AdaBoost算法提升弱分类器性能
Parameters:
Expand All @@ -95,7 +98,7 @@ def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
weakClassArr = []
m = np.shape(dataArr)[0]
D = np.mat(np.ones((m, 1)) / m) #初始化权重
aggClassEst = np.mat(np.zeros((m,1)))
aggClassEst = np.mat(np.zeros((m, 1)))
for i in range(numIt):
bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树
# print("D:",D.T)
Expand All @@ -109,10 +112,11 @@ def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
#计算AdaBoost误差,当误差为0的时候,退出循环
aggClassEst += alpha * classEst #计算类别估计累计值
# print("aggClassEst: ", aggClassEst.T)
aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1))) #计算误差
aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m, 1))) #计算误差
errorRate = aggErrors.sum() / m
# print("total error: ", errorRate)
if errorRate == 0.0: break #误差为0,退出循环
if errorRate == 0.0:
break #误差为0,退出循环
return weakClassArr, aggClassEst

def adaClassify(datToClass,classifierArr):
Expand All @@ -126,7 +130,7 @@ def adaClassify(datToClass,classifierArr):
"""
dataMatrix = np.mat(datToClass)
m = np.shape(dataMatrix)[0]
aggClassEst = np.mat(np.zeros((m,1)))
aggClassEst = np.mat(np.zeros((m, 1)))
for i in range(len(classifierArr)): #遍历所有分类器,进行分类
classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'], classifierArr[i]['thresh'], classifierArr[i]['ineq'])
aggClassEst += classifierArr[i]['alpha'] * classEst
Expand Down
7 changes: 3 additions & 4 deletions AdaBoost/sklearn_adaboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,12 @@

def loadDataSet(fileName):
numFeat = len((open(fileName).readline().split('\t')))
dataMat = []; labelMat = []
dataMat = []
labelMat = []
fr = open(fileName)
for line in fr.readlines():
lineArr = []
curLine = line.strip().split('\t')
for i in range(numFeat - 1):
lineArr.append(float(curLine[i]))
lineArr = [float(curLine[i]) for i in range(numFeat - 1)]
dataMat.append(lineArr)
labelMat.append(float(curLine[-1]))

Expand Down
Loading