Skip to content

Commit

Permalink
Merge branch 'fix/files-encode' into 'main' (merge request !67)
Browse files Browse the repository at this point in the history
fix/files-encode
feat: remove lfs
  • Loading branch information
rogersqsliu committed Dec 20, 2024
2 parents 971b4e6 + f0b8018 commit 7ec82f7
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 27 deletions.
13 changes: 9 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
# Changelog

## v1.5.1
* fix: replace the large dictionary files stored in Git LFS with download URLs, so that NewBM25Encoder can load its language-specific dictionaries when customers use the "tcvdbtext" directory of this Go package.

## v1.5.0
* support binary vector
* support query count, and delete with limit
* add params TerminateAfter and CutoffFrequency, when hybrid search
* support modify vector indexes
* feat: support binary vector
* feat: support returning the document count via the Count function
* feat: support deleting documents with a limit
* feat: add params terminateAfter and cutoffFrequency for hybrid search
* feat: support hybrid search by text in embedding collections
* feat: support modifying vector indexes

## v1.4.9
* 为tcvectordb包的部分接口,增加注释
Expand Down
1 change: 0 additions & 1 deletion tcvdbtext/data/.gitattributes

This file was deleted.

3 changes: 0 additions & 3 deletions tcvdbtext/data/bm25_en_default.json

This file was deleted.

3 changes: 0 additions & 3 deletions tcvdbtext/data/bm25_zh_default.json

This file was deleted.

72 changes: 58 additions & 14 deletions tcvdbtext/encoder/bm25_encoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@ package encoder
import (
"encoding/json"
"fmt"
"io"
"log"
"math"
"net/http"
"os"
"path/filepath"
"runtime"
"strconv"

tcvdbtext "github.com/tencent/vectordatabase-sdk-go/tcvdbtext"
Expand Down Expand Up @@ -88,19 +89,58 @@ func (bm25 *BM25Encoder) GetTokenizer() tokenizer.Tokenizer {
}

func (bm25 *BM25Encoder) SetDefaultParams(bm25Language string) error {
_, filePath, _, _ := runtime.Caller(0)
dir := filepath.Dir(filePath)

bm25ParamsPath := ""
fileName := ""
if bm25Language == BM25_ZH_CONTENT {
bm25ParamsPath = dir + BM25Params_ZH_Path
fileName = "bm25_zh_default.json"
} else if bm25Language == BM25_EN_CONTENT {
bm25ParamsPath = dir + BM25Params_EN_Path
fileName = "bm25_en_default.json"
} else {
return fmt.Errorf("input name be 'zh' or 'en'")
return fmt.Errorf("input language name must be 'zh' or 'en'")
}
defaultStoragePath := "/tmp/tencent/vectordatabase/data/"
fileStoragePath := defaultStoragePath + fileName

if !tcvdbtext.FileExists(fileStoragePath) {
bm25ParamsUrl := ""
if bm25Language == BM25_ZH_CONTENT {
bm25ParamsUrl = "https://vectordb-public-1310738255.cos.ap-guangzhou.myqcloud.com/sparsevector/bm25_zh_default.json"
} else if bm25Language == BM25_EN_CONTENT {
bm25ParamsUrl = "https://vectordb-public-1310738255.cos.ap-guangzhou.myqcloud.com/sparsevector/bm25_en_default.json"
}
_, err := os.Stat(defaultStoragePath)
if os.IsNotExist(err) {
err := os.MkdirAll(defaultStoragePath, os.ModePerm)
if err != nil {
return fmt.Errorf("failed to create directory: %v", err.Error())
}
log.Printf("directory created: %v", defaultStoragePath)
} else if err != nil {
return fmt.Errorf("failed to check directory: %v", err.Error())
}

file, err := os.Create(fileStoragePath)
if err != nil {
return fmt.Errorf("failed to create temporary file %v, err: %v",
fileStoragePath, err.Error())
}
defer file.Close()

log.Printf("[Waring] start to download dictionary %v and store it in %v, please wait a moment",
bm25ParamsUrl, fileStoragePath)
resp, err := http.Get(bm25ParamsUrl)
if err != nil {
return fmt.Errorf("failed to download file %v, err: %v", bm25ParamsUrl, err)
}
defer resp.Body.Close()

_, err = io.Copy(file, resp.Body)
if err != nil {
return fmt.Errorf("failed to download url %v to local dir %v, err: %v",
bm25ParamsUrl, fileStoragePath, err.Error())
}
}

err := bm25.SetParams(bm25ParamsPath)
err := bm25.SetParams(fileStoragePath)
if err != nil {
return fmt.Errorf("use default settings file for language %v to set params failed, err: %v",
bm25Language, err.Error())
Expand All @@ -110,12 +150,16 @@ func (bm25 *BM25Encoder) SetDefaultParams(bm25Language string) error {
}

func (bm25 *BM25Encoder) SetParams(paramsFileLoadPath string) error {
var data []byte
var err error

if !tcvdbtext.FileExists(paramsFileLoadPath) {
return fmt.Errorf("the filepath %v doesn't exist", paramsFileLoadPath)
}
data, err := os.ReadFile(paramsFileLoadPath)
if err != nil {
return fmt.Errorf("cannot read file: %v", err)
} else {
data, err = os.ReadFile(paramsFileLoadPath)
if err != nil {
return fmt.Errorf("cannot read file: %v", err)
}
}

bm25ParamsByFile := new(BM25Params)
Expand Down
5 changes: 4 additions & 1 deletion tcvdbtext/encoder/bm25_encoder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ func Test_BM25Encoder_DownloadParams(t *testing.T) {

func Test_BM25Encoder_SetDefaultParams(t *testing.T) {
bm25Encoder, _ := NewBM25Encoder(nil)
bm25Encoder.SetDefaultParams("zh")
err := bm25Encoder.SetDefaultParams("zh")
if err != nil {
println(err.Error())
}
bm25Encoder.DownloadParams("./bm25_params.json")
}

Expand Down
2 changes: 1 addition & 1 deletion tcvectordb/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@

package tcvectordb

const SDKVersion = "v1.5.0"
const SDKVersion = "v1.5.1"

0 comments on commit 7ec82f7

Please sign in to comment.