Skip to content

Commit 9f4bf70

Browse files
author
averiewang
committed
Merge branch 'feat/averiewang/encoder-init' into 'main' (merge request !77)
feat: tcvdbtext add [NewBM25EncoderByFiles] functions to init encoder
2 parents d861874 + a2003e9 commit 9f4bf70

File tree

7 files changed

+409
-1
lines changed

7 files changed

+409
-1
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# Changelog
22

3+
## v1.6.3
4+
* feat: tcvdbtext adds the [NewBM25EncoderByFiles] function to initialize the encoder in an offline environment without internet access
5+
36
## v1.6.2
47
* fix: upload file by io.Reader
58

tcvdbtext/README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Tencent VectorDB Sparse Encoder SDK
2+
3+
Go SDK for [Tencent VectorDB Sparse Encoder](https://cloud.tencent.com/document/product/1709/111372).
4+
5+
## Getting started
6+
7+
### Prerequisites
8+
1. Go 1.17 or higher
9+
10+
### Install TencentCloud VectorDB Sparse Encoder SDK
11+
12+
1. Use `go get` to install the latest version of the TencentCloud VectorDB Sparse Encoder SDK dependencies:
13+
```sh
14+
go get -u github.com/tencent/vectordatabase-sdk-go/tcvdbtext
15+
```
16+
17+
2. Try [sparse_vector_demo](examples/sparse_vector_demo/main.go) in an online environment with internet access.
18+
19+
3. Try [sparse_vector_offline_demo](examples/sparse_vector_offline_demo/main.go) in an offline environment without internet access.
20+
Before running the code, please download the files you need:
21+
22+
- [Chinese Words Frequency File](https://vectordb-public-1310738255.cos.ap-guangzhou.myqcloud.com/sparsevector/bm25_zh_default.json)
23+
- [English Words Frequency File](https://vectordb-public-1310738255.cos.ap-guangzhou.myqcloud.com/sparsevector/bm25_en_default.json)
24+
- [Default Stopwords File](https://vectordb-public-1310738255.cos.ap-guangzhou.myqcloud.com/sparsevector/default_stopwords.txt)

tcvdbtext/encoder/bm25_encoder.go

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,18 @@ type BM25EncoderParams struct {
3737
Bm25Language string
3838
}
3939

40+
// [BM25EncoderFileParams] holds the parameters for initializing a BM25 encoder from local files.
41+
//
42+
// Fields:
43+
// - WordsFreqFile: The local file path of the words frequency.
44+
// - StopWordsFile: The local file path of the stopwords.
45+
// - UserDictFile: The local file path of the user-defined dictionary.
46+
type BM25EncoderFileParams struct {
47+
WordsFreqFile string
48+
StopWordsFile string
49+
UserDictFile string
50+
}
51+
4052
type BM25LearnedParams struct {
4153
TokenFreq map[string]float64 `json:"token_freq,omitempty"`
4254
DocCount int64 `json:"doc_count,omitempty"`
@@ -84,6 +96,63 @@ func NewBM25Encoder(params *BM25EncoderParams) (SparseEncoder, error) {
8496
return bm25, nil
8597
}
8698

99+
func NewBM25EncoderByFiles(params *BM25EncoderFileParams) (SparseEncoder, error) {
100+
bm25 := new(BM25Encoder)
101+
var stopWords interface{}
102+
if params.StopWordsFile == "" {
103+
stopWords = false
104+
} else {
105+
stopWords = params.StopWordsFile
106+
}
107+
JiebaTokenizer, err := tokenizer.NewJiebaTokenizer(&tokenizer.TokenizerParams{
108+
StopWords: stopWords,
109+
UserDictFilePath: params.UserDictFile,
110+
})
111+
if err != nil {
112+
return nil, err
113+
}
114+
115+
bm25.Tokenizer = JiebaTokenizer
116+
117+
if params.WordsFreqFile == "" {
118+
return bm25, nil
119+
}
120+
121+
var data []byte
122+
if !tcvdbtext.FileExists(params.WordsFreqFile) {
123+
return nil, fmt.Errorf("the filepath %v doesn't exist", params.WordsFreqFile)
124+
} else {
125+
data, err = os.ReadFile(params.WordsFreqFile)
126+
if err != nil {
127+
return nil, fmt.Errorf("cannot read file: %v", err)
128+
}
129+
}
130+
131+
bm25ParamsByFile := new(BM25Params)
132+
err = json.Unmarshal(data, bm25ParamsByFile)
133+
if err != nil {
134+
return nil, fmt.Errorf("cannot parse file %v to JSON, err: %v", params.WordsFreqFile, err.Error())
135+
}
136+
137+
bm25.B = *bm25ParamsByFile.B
138+
bm25.K1 = *bm25ParamsByFile.K1
139+
bm25.BM25LearnedParams = bm25ParamsByFile.BM25LearnedParams
140+
141+
err = bm25.Tokenizer.UpdateParameters(tokenizer.TokenizerParams{
142+
ForSearch: bm25ParamsByFile.ForSearch,
143+
CutAll: bm25ParamsByFile.CutAll,
144+
Hmm: bm25ParamsByFile.Hmm,
145+
146+
HashFunction: bm25ParamsByFile.HashFunction,
147+
})
148+
149+
if err != nil {
150+
return nil, fmt.Errorf("update parameters by file %v failed, err: %v", params.WordsFreqFile, err.Error())
151+
}
152+
153+
return bm25, nil
154+
}
155+
87156
// GetTokenizer returns the tokenizer stored in the encoder's Tokenizer field.
func (bm25 *BM25Encoder) GetTokenizer() tokenizer.Tokenizer {
	current := bm25.Tokenizer
	return current
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package main
2+
3+
import (
4+
"fmt"
5+
"log"
6+
7+
"github.com/tencent/vectordatabase-sdk-go/tcvdbtext/encoder"
8+
)
9+
10+
func main() {
11+
bm25, err := encoder.NewBM25Encoder(&encoder.BM25EncoderParams{Bm25Language: "zh"})
12+
if err != nil {
13+
log.Fatalf(err.Error())
14+
}
15+
16+
text := "什么是腾讯云向量数据库。"
17+
18+
// 如需了解分词的情况,可参考下一行代码获取
19+
tokens := bm25.GetTokenizer().Tokenize(text)
20+
fmt.Println("tokens: ", tokens)
21+
22+
sparse_vectors, err := bm25.EncodeText(text)
23+
if err != nil {
24+
log.Fatalf(err.Error())
25+
}
26+
fmt.Println("sparse vectors: ", sparse_vectors)
27+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package main
2+
3+
import (
4+
"fmt"
5+
"log"
6+
7+
"github.com/tencent/vectordatabase-sdk-go/tcvdbtext/encoder"
8+
)
9+
10+
func main() {
11+
bm25, err := encoder.NewBM25EncoderByFiles(&encoder.BM25EncoderFileParams{
12+
StopWordsFile: "./stopwords.txt",
13+
//WordsFreqFile: "./bm25_zh_default.json",
14+
UserDictFile: "",
15+
})
16+
if err != nil {
17+
log.Fatalf(err.Error())
18+
}
19+
20+
text := "什么是腾讯云向量数据库。"
21+
22+
// 如需了解分词的情况,可参考下一行代码获取
23+
tokens := bm25.GetTokenizer().Tokenize(text)
24+
fmt.Println("tokens: ", tokens)
25+
26+
// [EncodeText] can be used after set WordsFreqFile in [NewBM25EncoderByFiles]
27+
// sparse_vectors, err := bm25.EncodeText(text)
28+
// if err != nil {
29+
// log.Fatalf(err.Error())
30+
// }
31+
// fmt.Println("sparse vectors: ", sparse_vectors)
32+
}

0 commit comments

Comments
 (0)