-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.go
132 lines (112 loc) · 2.53 KB
/
parse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
package main
import (
"encoding/csv"
"errors"
"fmt"
"io"
"strconv"
)
// parsedInput accumulates the contents of a CSV data set as it is parsed:
// the feature matrix, the target column (kept both as floats and as strings
// while the problem type is still undecided), and the feature names.
type parsedInput struct {
	// isRegression is true while the target column is being treated as
	// numeric; it is false when the targets are treated as class labels.
	isRegression bool
	// X holds one slice of feature values per parsed data row.
	X [][]float64
	// YClf holds the target values as strings; it will be nil when
	// isRegression = true.
	YClf []string
	// YReg holds the target values as floats; it will be nil when
	// isRegression = false.
	YReg []float64
	// VarNames holds one name per feature column.
	VarNames []string
}
// parseCSV reads every CSV record from r into a parsedInput. The first column
// of each record is the target and the remaining columns are features.
//
// The first record is offered to parseHeader: when it is accepted as a header
// its values become the variable names; otherwise names X1..Xn are generated
// and the record is parsed as a data row.
//
// Parsing starts in regression mode unless forceClf is set. The returned
// parsedInput keeps only the target slice that matches its final mode.
//
// On a read or parse failure the partially filled parsedInput is returned
// together with the error; this includes the error from the very first read
// (for example io.EOF when r yields no records).
func parseCSV(r io.Reader, forceClf bool) (*parsedInput, error) {
	records := csv.NewReader(r)

	// Regression is assumed until a target value fails to parse as a float
	// (ParseRow clears the flag), or never assumed when forceClf is set.
	out := &parsedInput{isRegression: !forceClf}

	// Grab the first row.
	first, err := records.Read()
	if err != nil {
		return out, err
	}

	if names, headerErr := parseHeader(first); headerErr == nil {
		out.VarNames = names
	} else {
		// Not a header: name the feature columns X1, X2, ..., Xn and treat
		// the record as the first data row.
		for n := 1; n < len(first); n++ {
			out.VarNames = append(out.VarNames, fmt.Sprintf("X%d", n))
		}
		if rowErr := out.ParseRow(first); rowErr != nil {
			return out, rowErr
		}
	}

	// Consume the remaining records until EOF.
	for {
		rec, readErr := records.Read()
		if readErr != nil {
			if readErr == io.EOF {
				break
			}
			return out, readErr
		}
		if rowErr := out.ParseRow(rec); rowErr != nil {
			return out, rowErr
		}
	}

	// Drop the target values that do not match the final mode.
	if out.isRegression {
		out.YClf = nil
	} else {
		out.YReg = nil
	}
	return out, nil
}
// ParseRow appends one data row to p. Columns 1..n are parsed as float
// features and appended to p.X; column 0 is the target.
//
// The target is always recorded as a string in p.YClf. While p.isRegression
// is still true it is also parsed as a float and appended to p.YReg. The
// first target that fails to parse switches p to classification mode:
// isRegression is cleared and YReg is dropped, so no placeholder value is
// ever stored for the unparseable target.
//
// It returns the error from parseFeatureVals when the feature columns cannot
// be parsed; in that case p is left unchanged.
func (p *parsedInput) ParseRow(row []string) error {
	xi, err := parseFeatureVals(row)
	if err != nil {
		return err
	}
	p.X = append(p.X, xi)

	// Parse as both regression and classification until a target value
	// turns out not to be a float.
	if p.isRegression {
		if yi, err := strconv.ParseFloat(row[0], 64); err != nil {
			// A failed target parse is not an error for the row: it only
			// means the targets are class labels, so the float targets
			// collected so far are no longer meaningful.
			p.isRegression = false
			p.YReg = nil
		} else {
			p.YReg = append(p.YReg, yi)
		}
	}
	p.YClf = append(p.YClf, row[0])
	return nil
}
// parseFeatureVals converts the feature columns of row (every column after
// the first, which holds the target) to float64 values.
//
// It returns an error when row has fewer than two columns, i.e. when there is
// no feature column at all, or the strconv.ParseFloat error for the first
// feature value that is not a valid float. The returned slice is nil whenever
// the error is non-nil.
func parseFeatureVals(row []string) ([]float64, error) {
	// A usable row needs the target column plus at least one feature.
	if len(row) < 2 {
		return nil, errors.New("row only has one column")
	}
	xi := make([]float64, 0, len(row)-1)
	for _, val := range row[1:] {
		fv, err := strconv.ParseFloat(val, 64)
		if err != nil {
			return nil, err
		}
		xi = append(xi, fv)
	}
	return xi, nil
}
// parseHeader decides whether row is a header row and, if so, returns the
// feature column names (every column after the first).
//
// Data rows may only contain numeric feature values, so the row is accepted
// as a header only when none of its feature columns parses as a float. As
// soon as one does, the names collected so far are returned together with a
// "not a header row" error. A row with no feature columns is accepted and
// yields an empty, non-nil name list.
func parseHeader(row []string) ([]string, error) {
	names := make([]string, 0)
	if len(row) <= 1 {
		return names, nil
	}
	for i := 1; i < len(row); i++ {
		field := row[i]
		if _, numErr := strconv.ParseFloat(field, 64); numErr == nil {
			// A numeric feature value means this is a data row.
			return names, errors.New("not a header row")
		}
		names = append(names, field)
	}
	return names, nil
}