@@ -37,6 +37,18 @@ type BM25EncoderParams struct {
37
37
Bm25Language string
38
38
}
39
39
40
// BM25EncoderFileParams holds the parameters for initializing a BM25 encoder
// from local files.
type BM25EncoderFileParams struct {
	// WordsFreqFile is the local file path of the word-frequency data.
	WordsFreqFile string
	// StopWordsFile is the local file path of the stop-word list.
	StopWordsFile string
	// UserDictFile is the local file path of the user-defined dictionary.
	UserDictFile string
}
51
+
40
52
type BM25LearnedParams struct {
41
53
TokenFreq map [string ]float64 `json:"token_freq,omitempty"`
42
54
DocCount int64 `json:"doc_count,omitempty"`
@@ -84,6 +96,63 @@ func NewBM25Encoder(params *BM25EncoderParams) (SparseEncoder, error) {
84
96
return bm25 , nil
85
97
}
86
98
99
+ func NewBM25EncoderByFiles (params * BM25EncoderFileParams ) (SparseEncoder , error ) {
100
+ bm25 := new (BM25Encoder )
101
+ var stopWords interface {}
102
+ if params .StopWordsFile == "" {
103
+ stopWords = false
104
+ } else {
105
+ stopWords = params .StopWordsFile
106
+ }
107
+ JiebaTokenizer , err := tokenizer .NewJiebaTokenizer (& tokenizer.TokenizerParams {
108
+ StopWords : stopWords ,
109
+ UserDictFilePath : params .UserDictFile ,
110
+ })
111
+ if err != nil {
112
+ return nil , err
113
+ }
114
+
115
+ bm25 .Tokenizer = JiebaTokenizer
116
+
117
+ if params .WordsFreqFile == "" {
118
+ return bm25 , nil
119
+ }
120
+
121
+ var data []byte
122
+ if ! tcvdbtext .FileExists (params .WordsFreqFile ) {
123
+ return nil , fmt .Errorf ("the filepath %v doesn't exist" , params .WordsFreqFile )
124
+ } else {
125
+ data , err = os .ReadFile (params .WordsFreqFile )
126
+ if err != nil {
127
+ return nil , fmt .Errorf ("cannot read file: %v" , err )
128
+ }
129
+ }
130
+
131
+ bm25ParamsByFile := new (BM25Params )
132
+ err = json .Unmarshal (data , bm25ParamsByFile )
133
+ if err != nil {
134
+ return nil , fmt .Errorf ("cannot parse file %v to JSON, err: %v" , params .WordsFreqFile , err .Error ())
135
+ }
136
+
137
+ bm25 .B = * bm25ParamsByFile .B
138
+ bm25 .K1 = * bm25ParamsByFile .K1
139
+ bm25 .BM25LearnedParams = bm25ParamsByFile .BM25LearnedParams
140
+
141
+ err = bm25 .Tokenizer .UpdateParameters (tokenizer.TokenizerParams {
142
+ ForSearch : bm25ParamsByFile .ForSearch ,
143
+ CutAll : bm25ParamsByFile .CutAll ,
144
+ Hmm : bm25ParamsByFile .Hmm ,
145
+
146
+ HashFunction : bm25ParamsByFile .HashFunction ,
147
+ })
148
+
149
+ if err != nil {
150
+ return nil , fmt .Errorf ("update parameters by file %v failed, err: %v" , params .WordsFreqFile , err .Error ())
151
+ }
152
+
153
+ return bm25 , nil
154
+ }
155
+
87
156
// GetTokenizer returns the tokenizer held in the encoder's Tokenizer field.
func (bm25 *BM25Encoder) GetTokenizer() tokenizer.Tokenizer {
	current := bm25.Tokenizer
	return current
}
0 commit comments