Skip to content

Commit

Permalink
[+] transformation functions implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
dmitryikh committed Mar 16, 2019
1 parent 5fc9ae7 commit f50d815
Show file tree
Hide file tree
Showing 10 changed files with 269 additions and 34 deletions.
61 changes: 47 additions & 14 deletions leaves.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"math"
"runtime"
"sync"

"github.com/dmitryikh/leaves/transformation"
)

// BatchSize for parallel task
Expand All @@ -23,6 +25,18 @@ type ensembleBaseInterface interface {
// Ensemble is a common wrapper for all models. It embeds the model
// implementation and pairs it with the transformation that is applied to the
// raw model outputs.
type Ensemble struct {
// ensembleBaseInterface is the embedded model implementation; its methods
// are promoted onto Ensemble.
ensembleBaseInterface
// transform converts raw predictions into the final output values and
// supplies the value reported by NOutputGroups.
transform transformation.Transform
}

// predictInnerAndTransform computes predictions for a single object described
// by fvals using the first nEstimators estimators and stores the final values
// in predictions starting at startIndex.
//
// For the Raw transformation the model writes straight into predictions.
// Otherwise the raw model outputs are collected in a temporary slice of
// NRawOutputGroups values and then passed through the ensemble's
// transformation. The error returned by the transformation is discarded.
func (e *Ensemble) predictInnerAndTransform(fvals []float64, nEstimators int, predictions []float64, startIndex int) {
	if e.Transformation().Type() != transformation.Raw {
		// TODO: avoid allocation here
		raw := make([]float64, e.NRawOutputGroups())
		e.predictInner(fvals, nEstimators, raw, 0)
		e.transform.Transform(raw, predictions, startIndex)
		return
	}
	// No transformation requested: let the model fill the output directly.
	e.predictInner(fvals, nEstimators, predictions, startIndex)
}

// PredictSingle calculates prediction for single class model. If ensemble is
Expand All @@ -32,7 +46,7 @@ type Ensemble struct {
// function transformation and etc)
// NOTE: for multiclass prediction use Predict
func (e *Ensemble) PredictSingle(fvals []float64, nEstimators int) float64 {
if e.NRawOutputGroups() != 1 {
if e.NOutputGroups() != 1 {
return 0.0
}
if e.NFeatures() > len(fvals) {
Expand All @@ -41,7 +55,7 @@ func (e *Ensemble) PredictSingle(fvals []float64, nEstimators int) float64 {
nEstimators = e.adjustNEstimators(nEstimators)
ret := [1]float64{0.0}

e.predictInner(fvals, nEstimators, ret[:], 0)
e.predictInnerAndTransform(fvals, nEstimators, ret[:], 0)
return ret[0]
}

Expand All @@ -51,15 +65,15 @@ func (e *Ensemble) PredictSingle(fvals []float64, nEstimators int) float64 {
// NOTE: for single class predictions one can use simplified function PredictSingle
func (e *Ensemble) Predict(fvals []float64, nEstimators int, predictions []float64) error {
nRows := 1
if len(predictions) < e.NRawOutputGroups()*nRows {
return fmt.Errorf("predictions slice too short (should be at least %d)", e.NRawOutputGroups()*nRows)
if len(predictions) < e.NOutputGroups()*nRows {
return fmt.Errorf("predictions slice too short (should be at least %d)", e.NOutputGroups()*nRows)
}
if e.NFeatures() > len(fvals) {
return fmt.Errorf("incorrect number of features (%d)", len(fvals))
}
nEstimators = e.adjustNEstimators(nEstimators)

e.predictInner(fvals, nEstimators, predictions, 0)
e.predictInnerAndTransform(fvals, nEstimators, predictions, 0)
return nil
}

Expand All @@ -72,8 +86,8 @@ func (e *Ensemble) Predict(fvals []float64, nEstimators int, predictions []float
// Note, `predictions` slice should be properly allocated on call side
func (e *Ensemble) PredictCSR(indptr []int, cols []int, vals []float64, predictions []float64, nEstimators int, nThreads int) error {
nRows := len(indptr) - 1
if len(predictions) < e.NRawOutputGroups()*nRows {
return fmt.Errorf("predictions slice too short (should be at least %d)", e.NRawOutputGroups()*nRows)
if len(predictions) < e.NOutputGroups()*nRows {
return fmt.Errorf("predictions slice too short (should be at least %d)", e.NOutputGroups()*nRows)
}
nEstimators = e.adjustNEstimators(nEstimators)
if nRows <= BatchSize || nThreads == 0 || nThreads == 1 {
Expand Down Expand Up @@ -136,7 +150,7 @@ func (e *Ensemble) predictCSRInner(
fvals[cols[j]] = vals[j]
}
}
e.predictInner(fvals, nEstimators, predictions, i*e.NRawOutputGroups())
e.predictInnerAndTransform(fvals, nEstimators, predictions, i*e.NOutputGroups())
e.resetFVals(fvals)
}
}
Expand All @@ -156,8 +170,8 @@ func (e *Ensemble) PredictDense(
nThreads int,
) error {
nRows := nrows
if len(predictions) < e.NRawOutputGroups()*nRows {
return fmt.Errorf("predictions slice too short (should be at least %d)", e.NRawOutputGroups()*nRows)
if len(predictions) < e.NOutputGroups()*nRows {
return fmt.Errorf("predictions slice too short (should be at least %d)", e.NOutputGroups()*nRows)
}
if ncols == 0 || e.NFeatures() > ncols {
return fmt.Errorf("incorrect number of columns")
Expand All @@ -166,7 +180,7 @@ func (e *Ensemble) PredictDense(
if nRows <= BatchSize || nThreads == 0 || nThreads == 1 {
// single thread calculations
for i := 0; i < nRows; i++ {
e.predictInner(vals[i*ncols:(i+1)*ncols], nEstimators, predictions, i*e.NRawOutputGroups())
e.predictInnerAndTransform(vals[i*ncols:(i+1)*ncols], nEstimators, predictions, i*e.NOutputGroups())
}
return nil
}
Expand All @@ -190,7 +204,7 @@ func (e *Ensemble) PredictDense(
endIndex = nRows
}
for i := startIndex; i < endIndex; i++ {
e.predictInner(vals[i*int(ncols):(i+1)*int(ncols)], nEstimators, predictions, i*e.NRawOutputGroups())
e.predictInnerAndTransform(vals[i*int(ncols):(i+1)*int(ncols)], nEstimators, predictions, i*e.NOutputGroups())
}
}
}()
Expand All @@ -211,12 +225,20 @@ func (e *Ensemble) NEstimators() int {
}

// NRawOutputGroups returns the number of groups (numbers) in every object's
// predictions before the transformation function is applied. This value is
// provided mainly for informational purposes; the prediction functions size
// their output by NOutputGroups.
func (e *Ensemble) NRawOutputGroups() int {
return e.ensembleBaseInterface.NRawOutputGroups()
}

// NOutputGroups returns the number of groups (numbers) in every object's
// predictions, as reported by the ensemble's transformation. For example, a
// binary logistic model will give 1, but a 4-class prediction model will give
// 4 numbers per object. This value is usually used to preallocate the slice of
// prediction values.
func (e *Ensemble) NOutputGroups() int {
return e.transform.NOutputGroups()
}

// NFeatures returns number of features in the model
func (e *Ensemble) NFeatures() int {
return e.ensembleBaseInterface.NFeatures()
Expand All @@ -226,3 +248,14 @@ func (e *Ensemble) NFeatures() int {
// Name returns the name reported by the embedded model implementation.
func (e *Ensemble) Name() string {
return e.ensembleBaseInterface.Name()
}

// Transformation returns the transformation object that is applied to the
// model outputs.
func (e *Ensemble) Transformation() transformation.Transform {
return e.transform
}

// EnsembleWithRawPredictions returns an ensemble instance with TransformRaw (no
// transformation functions will be applied to the model results). The returned
// ensemble wraps the receiver, so both share the same underlying model.
func (e *Ensemble) EnsembleWithRawPredictions() *Ensemble {
	// Keyed fields keep both literals valid if either struct gains fields and
	// satisfy go vet's composites check for the imported TransformRaw type.
	return &Ensemble{
		ensembleBaseInterface: e,
		transform:             &transformation.TransformRaw{NumOutputGroups: e.NRawOutputGroups()},
	}
}
68 changes: 62 additions & 6 deletions lgensemble_io.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"strconv"
"strings"

"github.com/dmitryikh/leaves/transformation"
"github.com/dmitryikh/leaves/util"
)

Expand Down Expand Up @@ -48,6 +49,36 @@ type lgNodeJSON struct {
RightChild interface{}
}

// lgObjective keeps parsed data from the 'objective' field of the LightGBM txt
// format. For example 'multiclass num_class:13' is parsed to
// lgObjective{name: "multiclass", param: "num_class", value: 13}.
type lgObjective struct {
	name  string // objective name: the first whitespace-separated token
	param string // parameter name: the part of the second token before ':'
	value int    // parameter value: the integer after ':'
}

// lgObjectiveParse parses an objective string of the form
// '<name> <param>:<integer>' into an lgObjective.
//
// Leading, trailing and repeated whitespace around the two tokens is
// tolerated. Any other shape (wrong number of tokens, missing or repeated ':',
// non-integer value) yields the zero lgObjective together with an
// "unexpected objective field" error that quotes the original input.
func lgObjectiveParse(objective string) (lgObjective, error) {
	// The error is built lazily so the success path does not pay for it, and
	// the zero value is returned so callers never see a half-filled struct.
	fail := func() (lgObjective, error) {
		return lgObjective{}, fmt.Errorf("unexpected objective field: '%s'", objective)
	}

	tokens := strings.Fields(objective)
	if len(tokens) != 2 {
		return fail()
	}
	paramTokens := strings.Split(tokens[1], ":")
	if len(paramTokens) != 2 {
		return fail()
	}
	value, err := strconv.Atoi(paramTokens[1])
	if err != nil {
		return fail()
	}
	return lgObjective{name: tokens[0], param: paramTokens[0], value: value}, nil
}

func convertMissingType(decisionType uint32) (uint8, error) {
missingTypeOrig := (decisionType >> 2) & 3
missingType := uint8(0)
Expand Down Expand Up @@ -262,10 +293,6 @@ func lgTreeFromReader(reader *bufio.Reader) (lgTree, error) {
func LGEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensemble, error) {
e := &lgEnsemble{name: "lightgbm.gbdt"}

if loadTransformation {
return nil, fmt.Errorf("transformation functions are not supported for LightGBM models")
}

params, err := util.ReadParamsUntilBlank(reader)
if err != nil {
return nil, err
Expand Down Expand Up @@ -316,6 +343,35 @@ func LGEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensem
return nil, fmt.Errorf("wrong number of trees (%d) for number of class (%d)", nTrees, e.nRawOutputGroups)
}

var transform transformation.Transform
transform = &transformation.TransformRaw{e.nRawOutputGroups}
// NOTE: it seems that we don't need to apply transformation to random forest models
// TODO: check it
if loadTransformation && !e.averageOutput {
objectiveStr, err := params.ToString("objective")
if err != nil {
return nil, err
}
objectiveStruct, err := lgObjectiveParse(objectiveStr)
if err != nil {
return nil, err
}
if objectiveStruct.name == "binary" && objectiveStruct.param == "sigmoid" {
if objectiveStruct.value != 1 {
return nil, fmt.Errorf("got sigmoid with value != 1 (got %d)", objectiveStruct.value)
}
transform = &transformation.TransformLogistic{}
} else if objectiveStruct.name == "multiclass" && objectiveStruct.param == "num_class" {
if objectiveStruct.value != e.nRawOutputGroups {
return nil, fmt.Errorf("got multiclass num_class != %d (got %d)", e.nRawOutputGroups, objectiveStruct.value)
}
transform = &transformation.TransformSoftmax{objectiveStruct.value}
// multiclass num_class:13
} else {
return nil, fmt.Errorf("unknown transformation function '%s'", objectiveStr)
}
}

e.Trees = make([]lgTree, 0, nTrees)
for i := 0; i < nTrees; i++ {
tree, err := lgTreeFromReader(reader)
Expand All @@ -324,7 +380,7 @@ func LGEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensem
}
e.Trees = append(e.Trees, tree)
}
return &Ensemble{e}, nil
return &Ensemble{e, transform}, nil
}

// LGEnsembleFromFile reads LightGBM model from binary file
Expand Down Expand Up @@ -602,5 +658,5 @@ func LGEnsembleFromJSON(reader io.Reader, loadTransformation bool) (*Ensemble, e
}
e.Trees = append(e.Trees, tree)
}
return &Ensemble{e}, nil
return &Ensemble{e, &transformation.TransformRaw{e.nRawOutputGroups}}, nil
}
9 changes: 5 additions & 4 deletions skensemble_io.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"os"

"github.com/dmitryikh/leaves/internal/pickle"
"github.com/dmitryikh/leaves/transformation"
)

func lgTreeFromSklearnDecisionTreeRegressor(tree pickle.SklearnDecisionTreeRegressor, scale float64, base float64) (lgTree, error) {
Expand Down Expand Up @@ -113,7 +114,7 @@ func lgTreeFromSklearnDecisionTreeRegressor(tree pickle.SklearnDecisionTreeRegre
}

// SKEnsembleFromReader reads sklearn tree ensemble model from `reader`
func SKEnsembleFromReader(reader *bufio.Reader) (*Ensemble, error) {
func SKEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensemble, error) {
e := &lgEnsemble{name: "sklearn.ensemble.GradientBoostingClassifier"}
decoder := pickle.NewDecoder(reader)
res, err := decoder.Decode()
Expand Down Expand Up @@ -171,16 +172,16 @@ func SKEnsembleFromReader(reader *bufio.Reader) (*Ensemble, error) {
base[k] = 0.0
}
}
return &Ensemble{e}, nil
return &Ensemble{e, &transformation.TransformRaw{e.nRawOutputGroups}}, nil
}

// SKEnsembleFromFile reads sklearn tree ensemble model from pickle file
func SKEnsembleFromFile(filename string) (*Ensemble, error) {
func SKEnsembleFromFile(filename string, loadTransformation bool) (*Ensemble, error) {
reader, err := os.Open(filename)
if err != nil {
return nil, err
}
defer reader.Close()
bufReader := bufio.NewReader(reader)
return SKEnsembleFromReader(bufReader)
return SKEnsembleFromReader(bufReader, loadTransformation)
}
30 changes: 30 additions & 0 deletions transformation/logistic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package transformation

import (
"fmt"

"github.com/dmitryikh/leaves/util"
)

// TransformLogistic is a transformation that applies util.Sigmoid to a single
// raw prediction value.
type TransformLogistic struct{}

// Transform writes util.Sigmoid(rawPredictions[0]) into
// outputPredictions[startIndex]. It returns an error if rawPredictions does
// not contain exactly one value. The bounds of outputPredictions are not
// checked here.
func (t *TransformLogistic) Transform(rawPredictions []float64, outputPredictions []float64, startIndex int) error {
if len(rawPredictions) != 1 {
return fmt.Errorf("expected len(rawPredictions) = 1 (got %d)", len(rawPredictions))
}

outputPredictions[startIndex] = util.Sigmoid(rawPredictions[0])
return nil
}

// NOutputGroups returns 1: Transform produces a single output value.
func (t *TransformLogistic) NOutputGroups() int {
return 1
}

// Type returns the Logistic transformation type.
func (t *TransformLogistic) Type() TransformType {
return Logistic
}

// Name returns the name of the Logistic transformation type.
func (t *TransformLogistic) Name() string {
return Logistic.Name()
}
24 changes: 24 additions & 0 deletions transformation/raw.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package transformation

// TransformRaw is the identity transformation: raw predictions are passed to
// the output unchanged.
type TransformRaw struct {
	// NumOutputGroups is the value reported by NOutputGroups.
	NumOutputGroups int
}

// Transform copies every value of rawPredictions into outputPredictions,
// placing element i at position startIndex+i. It always returns nil; the
// bounds of outputPredictions are not validated here.
func (t *TransformRaw) Transform(rawPredictions []float64, outputPredictions []float64, startIndex int) error {
	for offset := 0; offset < len(rawPredictions); offset++ {
		outputPredictions[startIndex+offset] = rawPredictions[offset]
	}
	return nil
}

// NOutputGroups returns the configured NumOutputGroups.
func (t *TransformRaw) NOutputGroups() int {
	return t.NumOutputGroups
}

// Type returns the Raw transformation type.
func (t *TransformRaw) Type() TransformType {
return Raw
}

// Name returns the name of the Raw transformation type.
func (t *TransformRaw) Name() string {
return Raw.Name()
}
32 changes: 32 additions & 0 deletions transformation/softmax.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package transformation

import (
"fmt"

"github.com/dmitryikh/leaves/util"
)

// TransformSoftmax is a transformation for multiclass outputs that delegates
// to util.SoftmaxFloat64Slice.
type TransformSoftmax struct {
// NClasses is the required length of rawPredictions and the value reported
// by NOutputGroups.
NClasses int
}

// Transform checks that rawPredictions holds exactly NClasses values and then
// calls util.SoftmaxFloat64Slice with rawPredictions, outputPredictions and
// startIndex. It returns an error on a length mismatch.
// NOTE(review): presumably the helper writes the softmax of rawPredictions
// into outputPredictions starting at startIndex; confirm against util.
func (t *TransformSoftmax) Transform(rawPredictions []float64, outputPredictions []float64, startIndex int) error {
if len(rawPredictions) != t.NClasses {
return fmt.Errorf("expected len(rawPredictions) = %d (got %d)", t.NClasses, len(rawPredictions))
}

util.SoftmaxFloat64Slice(rawPredictions, outputPredictions, startIndex)
return nil
}

// NOutputGroups returns NClasses.
func (t *TransformSoftmax) NOutputGroups() int {
return t.NClasses
}

// Type returns the Softmax transformation type.
func (t *TransformSoftmax) Type() TransformType {
return Softmax
}

// Name returns the name of the Softmax transformation type.
func (t *TransformSoftmax) Name() string {
return Softmax.Name()
}
Loading

0 comments on commit f50d815

Please sign in to comment.