[+] transformation functions implementation

dmitryikh · Mar 16, 2019 · f50d815 · f50d815
1 parent 5fc9ae7
commit f50d815
Show file tree

Hide file tree

Showing 10 changed files with 269 additions and 34 deletions.
diff --git a/leaves.go b/leaves.go
@@ -5,6 +5,8 @@ import (
 	"math"
 	"runtime"
 	"sync"
+
+	"github.com/dmitryikh/leaves/transformation"
 )
 
 // BatchSize for parallel task
@@ -23,6 +25,18 @@ type ensembleBaseInterface interface {
 // Ensemble is a common wrapper for all models
 type Ensemble struct {
 	ensembleBaseInterface
+	transform transformation.Transform
+}
+
+func (e *Ensemble) predictInnerAndTransform(fvals []float64, nEstimators int, predictions []float64, startIndex int) {
+	if e.Transformation().Type() == transformation.Raw {
+		e.predictInner(fvals, nEstimators, predictions, startIndex)
+	} else {
+		// TODO: avoid allocation here
+		rawPredictions := make([]float64, e.NRawOutputGroups())
+		e.predictInner(fvals, nEstimators, rawPredictions, 0)
+		e.transform.Transform(rawPredictions, predictions, startIndex)
+	}
 }
 
 // PredictSingle calculates prediction for single class model. If ensemble is
@@ -32,7 +46,7 @@ type Ensemble struct {
 // function transformation and etc)
 // NOTE: for multiclass prediction use Predict
 func (e *Ensemble) PredictSingle(fvals []float64, nEstimators int) float64 {
-	if e.NRawOutputGroups() != 1 {
+	if e.NOutputGroups() != 1 {
 		return 0.0
 	}
 	if e.NFeatures() > len(fvals) {
@@ -41,7 +55,7 @@ func (e *Ensemble) PredictSingle(fvals []float64, nEstimators int) float64 {
 	nEstimators = e.adjustNEstimators(nEstimators)
 	ret := [1]float64{0.0}
 
-	e.predictInner(fvals, nEstimators, ret[:], 0)
+	e.predictInnerAndTransform(fvals, nEstimators, ret[:], 0)
 	return ret[0]
 }
 
@@ -51,15 +65,15 @@ func (e *Ensemble) PredictSingle(fvals []float64, nEstimators int) float64 {
 // NOTE: for single class predictions one can use simplified function PredictSingle
 func (e *Ensemble) Predict(fvals []float64, nEstimators int, predictions []float64) error {
 	nRows := 1
-	if len(predictions) < e.NRawOutputGroups()*nRows {
-		return fmt.Errorf("predictions slice too short (should be at least %d)", e.NRawOutputGroups()*nRows)
+	if len(predictions) < e.NOutputGroups()*nRows {
+		return fmt.Errorf("predictions slice too short (should be at least %d)", e.NOutputGroups()*nRows)
 	}
 	if e.NFeatures() > len(fvals) {
 		return fmt.Errorf("incorrect number of features (%d)", len(fvals))
 	}
 	nEstimators = e.adjustNEstimators(nEstimators)
 
-	e.predictInner(fvals, nEstimators, predictions, 0)
+	e.predictInnerAndTransform(fvals, nEstimators, predictions, 0)
 	return nil
 }
 
@@ -72,8 +86,8 @@ func (e *Ensemble) Predict(fvals []float64, nEstimators int, predictions []float
 // Note, `predictions` slice should be properly allocated on call side
 func (e *Ensemble) PredictCSR(indptr []int, cols []int, vals []float64, predictions []float64, nEstimators int, nThreads int) error {
 	nRows := len(indptr) - 1
-	if len(predictions) < e.NRawOutputGroups()*nRows {
-		return fmt.Errorf("predictions slice too short (should be at least %d)", e.NRawOutputGroups()*nRows)
+	if len(predictions) < e.NOutputGroups()*nRows {
+		return fmt.Errorf("predictions slice too short (should be at least %d)", e.NOutputGroups()*nRows)
 	}
 	nEstimators = e.adjustNEstimators(nEstimators)
 	if nRows <= BatchSize || nThreads == 0 || nThreads == 1 {
@@ -136,7 +150,7 @@ func (e *Ensemble) predictCSRInner(
 				fvals[cols[j]] = vals[j]
 			}
 		}
-		e.predictInner(fvals, nEstimators, predictions, i*e.NRawOutputGroups())
+		e.predictInnerAndTransform(fvals, nEstimators, predictions, i*e.NOutputGroups())
 		e.resetFVals(fvals)
 	}
 }
@@ -156,8 +170,8 @@ func (e *Ensemble) PredictDense(
 	nThreads int,
 ) error {
 	nRows := nrows
-	if len(predictions) < e.NRawOutputGroups()*nRows {
-		return fmt.Errorf("predictions slice too short (should be at least %d)", e.NRawOutputGroups()*nRows)
+	if len(predictions) < e.NOutputGroups()*nRows {
+		return fmt.Errorf("predictions slice too short (should be at least %d)", e.NOutputGroups()*nRows)
 	}
 	if ncols == 0 || e.NFeatures() > ncols {
 		return fmt.Errorf("incorrect number of columns")
@@ -166,7 +180,7 @@ func (e *Ensemble) PredictDense(
 	if nRows <= BatchSize || nThreads == 0 || nThreads == 1 {
 		// single thread calculations
 		for i := 0; i < nRows; i++ {
-			e.predictInner(vals[i*ncols:(i+1)*ncols], nEstimators, predictions, i*e.NRawOutputGroups())
+			e.predictInnerAndTransform(vals[i*ncols:(i+1)*ncols], nEstimators, predictions, i*e.NOutputGroups())
 		}
 		return nil
 	}
@@ -190,7 +204,7 @@ func (e *Ensemble) PredictDense(
 					endIndex = nRows
 				}
 				for i := startIndex; i < endIndex; i++ {
-					e.predictInner(vals[i*int(ncols):(i+1)*int(ncols)], nEstimators, predictions, i*e.NRawOutputGroups())
+					e.predictInnerAndTransform(vals[i*int(ncols):(i+1)*int(ncols)], nEstimators, predictions, i*e.NOutputGroups())
 				}
 			}
 		}()
@@ -211,12 +225,20 @@ func (e *Ensemble) NEstimators() int {
 }
 
 // NRawOutputGroups returns number of groups (numbers) in every object
-// predictions. For example binary logistic model will give 1, but 4-class
-// prediction model will give 4 numbers per object
+// predictions before the transformation function is applied. This value is
+// provided mainly for informational purposes
 func (e *Ensemble) NRawOutputGroups() int {
 	return e.ensembleBaseInterface.NRawOutputGroups()
 }
 
+// NOutputGroups returns number of groups (numbers) in every object predictions.
+// For example binary logistic model will give 1, but 4-class prediction model
+// will give 4 numbers per object. This value is usually used to preallocate the
+// slice with prediction values
+func (e *Ensemble) NOutputGroups() int {
+	return e.transform.NOutputGroups()
+}
+
 // NFeatures returns number of features in the model
 func (e *Ensemble) NFeatures() int {
 	return e.ensembleBaseInterface.NFeatures()
@@ -226,3 +248,14 @@ func (e *Ensemble) NFeatures() int {
 func (e *Ensemble) Name() string {
 	return e.ensembleBaseInterface.Name()
 }
+
+// Transformation returns the transformation object which is applied to model outputs.
+func (e *Ensemble) Transformation() transformation.Transform {
+	return e.transform
+}
+
+// EnsembleWithRawPredictions returns ensemble instance with TransformRaw (no
+// transformation functions will be applied to the model results)
+func (e *Ensemble) EnsembleWithRawPredictions() *Ensemble {
+	return &Ensemble{e, &transformation.TransformRaw{e.NRawOutputGroups()}}
+}
diff --git a/lgensemble_io.go b/lgensemble_io.go
@@ -9,6 +9,7 @@ import (
 	"strconv"
 	"strings"
 
+	"github.com/dmitryikh/leaves/transformation"
 	"github.com/dmitryikh/leaves/util"
 )
 
@@ -48,6 +49,36 @@ type lgNodeJSON struct {
 	RightChild    interface{}
 }
 
+// lgObjective keeps parsed data from 'objective' field of lightgbm txt format
+// e.g. 'multiclass num_class:13' is parsed to
+// lgObjective{name: 'multiclass', param: 'num_class', value:13}
+type lgObjective struct {
+	name  string
+	param string
+	value int
+}
+
+func lgObjectiveParse(objective string) (lgObjective, error) {
+	tokens := strings.Split(objective, " ")
+	objectiveStruct := lgObjective{}
+	errorMsg := fmt.Errorf("unexpected objective field: '%s'", objective)
+	if len(tokens) != 2 {
+		return objectiveStruct, errorMsg
+	}
+	objectiveStruct.name = tokens[0]
+	paramTokens := strings.Split(tokens[1], ":")
+	if len(paramTokens) != 2 {
+		return objectiveStruct, errorMsg
+	}
+	objectiveStruct.param = paramTokens[0]
+	value, err := strconv.Atoi(paramTokens[1])
+	if err != nil {
+		return objectiveStruct, errorMsg
+	}
+	objectiveStruct.value = value
+	return objectiveStruct, nil
+}
+
 func convertMissingType(decisionType uint32) (uint8, error) {
 	missingTypeOrig := (decisionType >> 2) & 3
 	missingType := uint8(0)
@@ -262,10 +293,6 @@ func lgTreeFromReader(reader *bufio.Reader) (lgTree, error) {
 func LGEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensemble, error) {
 	e := &lgEnsemble{name: "lightgbm.gbdt"}
 
-	if loadTransformation {
-		return nil, fmt.Errorf("transformation functions are not supported for LightGBM models")
-	}
-
 	params, err := util.ReadParamsUntilBlank(reader)
 	if err != nil {
 		return nil, err
@@ -316,6 +343,35 @@ func LGEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensem
 		return nil, fmt.Errorf("wrong number of trees (%d) for number of class (%d)", nTrees, e.nRawOutputGroups)
 	}
 
+	var transform transformation.Transform
+	transform = &transformation.TransformRaw{e.nRawOutputGroups}
+	// NOTE: it seems that we don't need to apply transformation to random forest models
+	// TODO: check it
+	if loadTransformation && !e.averageOutput {
+		objectiveStr, err := params.ToString("objective")
+		if err != nil {
+			return nil, err
+		}
+		objectiveStruct, err := lgObjectiveParse(objectiveStr)
+		if err != nil {
+			return nil, err
+		}
+		if objectiveStruct.name == "binary" && objectiveStruct.param == "sigmoid" {
+			if objectiveStruct.value != 1 {
+				return nil, fmt.Errorf("got sigmoid with value != 1 (got %d)", objectiveStruct.value)
+			}
+			transform = &transformation.TransformLogistic{}
+		} else if objectiveStruct.name == "multiclass" && objectiveStruct.param == "num_class" {
+			if objectiveStruct.value != e.nRawOutputGroups {
+				return nil, fmt.Errorf("got multiclass num_class != %d (got %d)", e.nRawOutputGroups, objectiveStruct.value)
+			}
+			transform = &transformation.TransformSoftmax{objectiveStruct.value}
+			// multiclass num_class:13
+		} else {
+			return nil, fmt.Errorf("unknown transformation function '%s'", objectiveStr)
+		}
+	}
+
 	e.Trees = make([]lgTree, 0, nTrees)
 	for i := 0; i < nTrees; i++ {
 		tree, err := lgTreeFromReader(reader)
@@ -324,7 +380,7 @@ func LGEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensem
 		}
 		e.Trees = append(e.Trees, tree)
 	}
-	return &Ensemble{e}, nil
+	return &Ensemble{e, transform}, nil
 }
 
 // LGEnsembleFromFile reads LightGBM model from binary file
@@ -602,5 +658,5 @@ func LGEnsembleFromJSON(reader io.Reader, loadTransformation bool) (*Ensemble, e
 		}
 		e.Trees = append(e.Trees, tree)
 	}
-	return &Ensemble{e}, nil
+	return &Ensemble{e, &transformation.TransformRaw{e.nRawOutputGroups}}, nil
 }
diff --git a/skensemble_io.go b/skensemble_io.go
@@ -6,6 +6,7 @@ import (
 	"os"
 
 	"github.com/dmitryikh/leaves/internal/pickle"
+	"github.com/dmitryikh/leaves/transformation"
 )
 
 func lgTreeFromSklearnDecisionTreeRegressor(tree pickle.SklearnDecisionTreeRegressor, scale float64, base float64) (lgTree, error) {
@@ -113,7 +114,7 @@ func lgTreeFromSklearnDecisionTreeRegressor(tree pickle.SklearnDecisionTreeRegre
 }
 
 // SKEnsembleFromReader reads sklearn tree ensemble model from `reader`
-func SKEnsembleFromReader(reader *bufio.Reader) (*Ensemble, error) {
+func SKEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensemble, error) {
 	e := &lgEnsemble{name: "sklearn.ensemble.GradientBoostingClassifier"}
 	decoder := pickle.NewDecoder(reader)
 	res, err := decoder.Decode()
@@ -171,16 +172,16 @@ func SKEnsembleFromReader(reader *bufio.Reader) (*Ensemble, error) {
 			base[k] = 0.0
 		}
 	}
-	return &Ensemble{e}, nil
+	return &Ensemble{e, &transformation.TransformRaw{e.nRawOutputGroups}}, nil
 }
 
 // SKEnsembleFromFile reads sklearn tree ensemble model from pickle file
-func SKEnsembleFromFile(filename string) (*Ensemble, error) {
+func SKEnsembleFromFile(filename string, loadTransformation bool) (*Ensemble, error) {
 	reader, err := os.Open(filename)
 	if err != nil {
 		return nil, err
 	}
 	defer reader.Close()
 	bufReader := bufio.NewReader(reader)
-	return SKEnsembleFromReader(bufReader)
+	return SKEnsembleFromReader(bufReader, loadTransformation)
 }
diff --git a/transformation/logistic.go b/transformation/logistic.go
@@ -0,0 +1,30 @@
+package transformation
+
+import (
+	"fmt"
+
+	"github.com/dmitryikh/leaves/util"
+)
+
+type TransformLogistic struct{}
+
+func (t *TransformLogistic) Transform(rawPredictions []float64, outputPredictions []float64, startIndex int) error {
+	if len(rawPredictions) != 1 {
+		return fmt.Errorf("expected len(rawPredictions) = 1 (got %d)", len(rawPredictions))
+	}
+
+	outputPredictions[startIndex] = util.Sigmoid(rawPredictions[0])
+	return nil
+}
+
+func (t *TransformLogistic) NOutputGroups() int {
+	return 1
+}
+
+func (t *TransformLogistic) Type() TransformType {
+	return Logistic
+}
+
+func (t *TransformLogistic) Name() string {
+	return Logistic.Name()
+}
diff --git a/transformation/raw.go b/transformation/raw.go
@@ -0,0 +1,24 @@
+package transformation
+
+type TransformRaw struct {
+	NumOutputGroups int
+}
+
+func (t *TransformRaw) Transform(rawPredictions []float64, outputPredictions []float64, startIndex int) error {
+	for i, v := range rawPredictions {
+		outputPredictions[startIndex+i] = v
+	}
+	return nil
+}
+
+func (t *TransformRaw) NOutputGroups() int {
+	return t.NumOutputGroups
+}
+
+func (t *TransformRaw) Type() TransformType {
+	return Raw
+}
+
+func (t *TransformRaw) Name() string {
+	return Raw.Name()
+}
diff --git a/transformation/softmax.go b/transformation/softmax.go
@@ -0,0 +1,32 @@
+package transformation
+
+import (
+	"fmt"
+
+	"github.com/dmitryikh/leaves/util"
+)
+
+type TransformSoftmax struct {
+	NClasses int
+}
+
+func (t *TransformSoftmax) Transform(rawPredictions []float64, outputPredictions []float64, startIndex int) error {
+	if len(rawPredictions) != t.NClasses {
+		return fmt.Errorf("expected len(rawPredictions) = %d (got %d)", t.NClasses, len(rawPredictions))
+	}
+
+	util.SoftmaxFloat64Slice(rawPredictions, outputPredictions, startIndex)
+	return nil
+}
+
+func (t *TransformSoftmax) NOutputGroups() int {
+	return t.NClasses
+}
+
+func (t *TransformSoftmax) Type() TransformType {
+	return Softmax
+}
+
+func (t *TransformSoftmax) Name() string {
+	return Softmax.Name()
+}