"""Decision-stump weak learner (DecisionStump/src/Adaboosting.py).

A decision stump classifies samples as +1.0 / -1.0 by comparing one feature
column against a threshold.  ``buildStump`` searches every feature, a grid of
thresholds and both inequality directions for the stump with the lowest
weighted error under the sample-weight vector ``D``.
"""
from numpy import *


def loadSimpData():
    """Return a small hard-coded data set.

    Returns:
        (datMat, classLabels): a 5x2 ``numpy.matrix`` of features and a list
        of five +1.0 / -1.0 labels.
    """
    datMat = matrix([[1.0, 2.1],
                     [2.0, 1.1],
                     [1.3, 1.0],
                     [1.0, 1.0],
                     [2.0, 1.0]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return datMat, classLabels


def loadDataSet(fileName):
    """Parse a tab-delimited file of floats.

    The number of fields is taken from the first non-blank line.  In every
    row the last field is the label and the preceding fields are features.
    Blank lines are skipped.

    Args:
        fileName: path of the text file to read.

    Returns:
        (dataMat, labelMat): a list of feature rows (lists of floats) and a
        list of float labels, in file order.

    Raises:
        ValueError: if a field cannot be converted to float.
        IndexError: if a row has fewer fields than the first row.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # One pass with a context manager so the handle is always closed.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank / trailing empty lines
            if numFeat is None:
                numFeat = len(line.split('\t'))  # field count of first row
            curLine = stripped.split('\t')
            dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat


def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every row by thresholding a single feature column.

    Args:
        dataMatrix: 2-D feature data (``numpy.matrix``, 2-D array or nested
            list), one row per sample.
        dimen: index of the feature column to test.
        threshVal: threshold the column is compared against.
        threshIneq: ``'lt'`` labels rows with value <= threshVal as -1.0;
            any other value labels rows with value > threshVal as -1.0.

    Returns:
        An ``(m, 1)`` float ndarray of +1.0 / -1.0 predictions.
    """
    retArray = ones((shape(dataMatrix)[0], 1))
    # Flatten the column to 1-D so the mask selects whole rows of retArray
    # regardless of whether the input is a matrix, an array or a list.
    column = asarray(dataMatrix)[:, dimen]
    if threshIneq == 'lt':
        retArray[column <= threshVal] = -1.0
    else:
        retArray[column > threshVal] = -1.0
    return retArray


def buildStump(dataArr, classLabels, D, numSteps=10.0):
    """Find the decision stump with the lowest weighted error.

    For each feature the range [min, max] is divided into ``numSteps`` steps;
    thresholds from one step below the minimum up to the maximum are tried
    with both inequality directions.  The first candidate reaching the lowest
    weighted error wins (ties do not replace the current best).

    Args:
        dataArr: 2-D feature data, one row per sample.
        classLabels: flat sequence of +1.0 / -1.0 labels, one per sample.
        D: ``(m, 1)`` column of sample weights.
        numSteps: number of threshold steps per feature (default 10.0).

    Returns:
        (bestStump, minError, bestClasEst): ``bestStump`` is a dict with keys
        ``'dim'``, ``'thresh'`` and ``'ineq'``; ``minError`` is the weighted
        error as a 1x1 matrix; ``bestClasEst`` holds the ``(m, 1)``
        predictions of the best stump.
    """
    # asmatrix rather than mat: the `mat` alias is gone in NumPy 2.0.
    dataMatrix = asmatrix(dataArr)
    labelCol = asarray(asmatrix(classLabels).T)  # (m, 1) label column
    weightsT = asmatrix(D).T                     # hoisted: loop-invariant
    m, n = shape(dataMatrix)
    bestStump = {}
    bestClasEst = asmatrix(zeros((m, 1)))
    minError = inf  # any real candidate beats +infinity
    for i in range(n):  # every feature
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / float(numSteps)
        # j == -1 puts the first threshold below every sample value.
        for j in range(-1, int(numSteps) + 1):
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1.0 where the prediction is wrong, 0.0 where it is right.
                errArr = asmatrix((predictedVals != labelCol).astype(float))
                weightedError = weightsT * errArr  # 1x1 matrix
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst


if __name__ == '__main__':
    datMat, classLabels = loadSimpData()
    numSamples = len(classLabels)
    d = asmatrix(ones((numSamples, 1)) / numSamples)  # uniform weights
    bestStump, minError, bestClasEst = buildStump(datMat, classLabels, d)
    # %-formatting prints the same text under Python 2.7 and Python 3.
    print("%s %s %s" % (bestStump, minError, bestClasEst))