Skip to content

Commit

Permalink
Merge branch 'feat/averiewang/support-multi-parsingType' into 'main' …
Browse files Browse the repository at this point in the history
…(merge request !63)

feat: support vision model parsing
  • Loading branch information
rogersqsliu committed Dec 2, 2024
2 parents e35903f + 6b249cd commit 77dff32
Show file tree
Hide file tree
Showing 16 changed files with 319 additions and 23 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

## v1.4.8
* CreateCollectionView和LoadAndSplitText接口支持设置文件解析的parsingType,可设置为AlgorithmParsing/VisionModelParsing

## v1.4.7
* 移除buildtag,支持windows平台上编译运行

Expand Down
15 changes: 13 additions & 2 deletions example/ai_demo/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"time"

"github.com/tencent/vectordatabase-sdk-go/tcvectordb"
"github.com/tencent/vectordatabase-sdk-go/tcvectordb/api"
"github.com/tencent/vectordatabase-sdk-go/tcvectordb/api/ai_document_set"
collection_view "github.com/tencent/vectordatabase-sdk-go/tcvectordb/api/collection_view"
)
Expand Down Expand Up @@ -114,6 +115,11 @@ func (d *AIDemo) CreateCollectionView(ctx context.Context, database, collectionV
AppendTitleToChunk: &appendTitleToChunk,
AppendKeywordsToChunk: &appendKeywordsToChunk,
},
// parsing files with vision model for all in this collectionView
// vision model parsing only for pdf filetype, and algorithm parsing for other supported filetypes
ParsingProcess: &api.ParsingProcess{
ParsingType: string(tcvectordb.VisionModelParsing),
},
})

if err != nil {
Expand Down Expand Up @@ -152,6 +158,11 @@ func (d *AIDemo) LoadAndSplitText(ctx context.Context, database, collection, fil
AppendTitleToChunk: &appendTitleToChunk,
AppendKeywordsToChunk: &appendKeywordsToChunk,
},
// parsing this file with vision model
// vision model parsing only for pdf filetype, and algorithm parsing for other supported filetypes
ParsingProcess: &api.ParsingProcess{
ParsingType: string(tcvectordb.VisionModelParsing),
},
})
if err != nil {
return nil, err
Expand Down Expand Up @@ -200,7 +211,7 @@ func (d *AIDemo) QueryAndSearch(ctx context.Context, database, collectionView st
// 查找与给定查询向量相似的向量。支持输入文本信息检索与输入文本相似的内容,同时,支持搭配标量字段的 Filter 表达式一并检索。
enableRerank := true
res, err := coll.Search(ctx, tcvectordb.SearchAIDocumentSetsParams{
Content: "什么是向量数据库",
Content: "平安保险的偿付能力是什么水平?",
ExpandChunk: []int{1, 0},
Filter: tcvectordb.NewFilter(`test_str="v1"`),
Limit: 2,
Expand Down Expand Up @@ -287,7 +298,7 @@ func main() {
err = testVdb.CreateCollectionView(ctx, database, collectionView)
printErr(err)
// 当前支持的文件格式markdown(.md或.markdown)、pdf(.pdf)、ppt(.pptx)、word(.docx)
loadFileRes, err := testVdb.LoadAndSplitText(ctx, database, collectionView, "../tcvdb.md")
loadFileRes, err := testVdb.LoadAndSplitText(ctx, database, collectionView, "../demo_files/demo_vision_model_parsing.pdf")
printErr(err)
time.Sleep(time.Second * 30) // 等待后台解析文件完成
err = testVdb.GetFile(ctx, database, collectionView, loadFileRes.DocumentSetName)
Expand Down
Binary file added example/demo_files/demo_pdf_image2text_search.pdf
Binary file not shown.
Binary file added example/demo_files/demo_vision_model_parsing.pdf
Binary file not shown.
File renamed without changes.
8 changes: 8 additions & 0 deletions tcvectordb/ai_collection_view.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ type AICollectionView struct {
Alias []string `json:"alias"`
Embedding *collection_view.DocumentEmbedding `json:"embedding"`
SplitterPreprocess *collection_view.SplitterPreprocess `json:"splitterPreprocess"`
ParsingProcess *api.ParsingProcess `json:"parsingProcess"`
IndexedDocumentSets uint64 `json:"indexedDocumentSets"`
TotalDocumentSets uint64 `json:"totalDocumentSets"`
UnIndexedDocumentSets uint64 `json:"unIndexedDocumentSets"`
Expand All @@ -66,6 +67,7 @@ type CreateCollectionViewParams struct {
Indexes Indexes
Embedding *collection_view.DocumentEmbedding
SplitterPreprocess *collection_view.SplitterPreprocess
ParsingProcess *api.ParsingProcess
ExpectedFileNum uint64
AverageFileSize uint64
}
Expand Down Expand Up @@ -105,6 +107,9 @@ func (i *implementerCollectionView) CreateCollectionView(ctx context.Context, na
req.SplitterPreprocess.AppendTitleToChunk = param.SplitterPreprocess.AppendTitleToChunk
req.SplitterPreprocess.AppendKeywordsToChunk = param.SplitterPreprocess.AppendKeywordsToChunk
}
if param.ParsingProcess != nil {
req.ParsingProcess = param.ParsingProcess
}
req.AverageFileSize = param.AverageFileSize
req.ExpectedFileNum = param.ExpectedFileNum

Expand Down Expand Up @@ -264,6 +269,9 @@ func (i *implementerCollectionView) toCollectionView(item *collection_view.Descr
AppendKeywordsToChunk: item.SplitterPreprocess.AppendKeywordsToChunk,
}
}
if item.ParsingProcess != nil {
coll.ParsingProcess = item.ParsingProcess
}

if item.Status != nil {
coll.IndexedDocumentSets = item.Status.IndexedDocumentSets
Expand Down
45 changes: 36 additions & 9 deletions tcvectordb/ai_document_sets.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"log"

"github.com/pkg/errors"
"github.com/tencent/vectordatabase-sdk-go/tcvectordb/api"
"github.com/tencent/vectordatabase-sdk-go/tcvectordb/api/ai_document_set"
"github.com/tencentyun/cos-go-sdk-v5"
)
Expand Down Expand Up @@ -66,6 +67,7 @@ type AIDocumentSet struct {
DocumentSetInfo *ai_document_set.DocumentSetInfo `json:"documentSetInfo"`
ScalarFields map[string]Field
SplitterPreprocess *ai_document_set.DocumentSplitterPreprocess `json:"splitterPreprocess,omitempty"`
ParsingProcess *api.ParsingProcess `json:"parsingProcess,omitempty"`
}

type implementerAIDocumentSets struct {
Expand Down Expand Up @@ -341,7 +343,8 @@ func (i *implementerAIDocumentSets) Update(ctx context.Context, updateFields map
}

type GetCosTmpSecretParams struct {
DocumentSetName string `json:"documentSetName"`
DocumentSetName string `json:"documentSetName"`
ParsingProcess *api.ParsingProcess `json:"parsingProcess,omitempty"`
}

type GetCosTmpSecretResult struct {
Expand All @@ -354,6 +357,8 @@ type GetCosTmpSecretResult struct {
TmpSecretID string `json:"tmpSecretId"`
TmpSecretKey string `json:"tmpSecretKey"`
SessionToken string `json:"token"`
Expiration string `json:"Expiration,omitempty"`
ExpiredTime int `json:"ExpiredTime,omitempty"`
MaxSupportContentLength int64 `json:"maxSupportContentLength"`
}

Expand All @@ -368,6 +373,7 @@ func (i *implementerAIDocumentSets) GetCosTmpSecret(ctx context.Context, param G
req.Database = i.database.DatabaseName
req.CollectionView = i.collectionView.CollectionViewName
req.DocumentSetName = param.DocumentSetName
req.ParsingProcess = param.ParsingProcess

err := i.Request(ctx, req, res)
if err != nil {
Expand All @@ -387,6 +393,8 @@ func (i *implementerAIDocumentSets) GetCosTmpSecret(ctx context.Context, param G
result.TmpSecretID = res.Credentials.TmpSecretID
result.TmpSecretKey = res.Credentials.TmpSecretKey
result.SessionToken = res.Credentials.SessionToken
result.Expiration = res.Credentials.Expiration
result.ExpiredTime = res.Credentials.ExpiredTime
result.MaxSupportContentLength = res.UploadCondition.MaxSupportContentLength

return result, nil
Expand All @@ -398,6 +406,14 @@ type LoadAndSplitTextParams struct {
LocalFilePath string
MetaData map[string]interface{}
SplitterPreprocess ai_document_set.DocumentSplitterPreprocess
ParsingProcess *api.ParsingProcess
}

// cosMetaConfig is the per-file processing configuration that is JSON-marshaled
// and attached as COS object metadata during file upload, so the backend knows
// how to split and parse the document. Its splitter fields mirror
// ai_document_set.DocumentSplitterPreprocess; ParsingProcess is added on top.
type cosMetaConfig struct {
// AppendTitleToChunk mirrors SplitterPreprocess.AppendTitleToChunk.
AppendTitleToChunk *bool `json:"appendTitleToChunk,omitempty"`
// AppendKeywordsToChunk mirrors SplitterPreprocess.AppendKeywordsToChunk.
AppendKeywordsToChunk *bool `json:"appendKeywordsToChunk,omitempty"`
// ChunkSplitter mirrors SplitterPreprocess.ChunkSplitter (only honored for
// markdown files, per loadAndSplitTextCheckParams).
ChunkSplitter *string `json:"chunkSplitter,omitempty"`
// ParsingProcess selects the server-side parsing strategy for this file
// (see api.ParsingProcess / the ParsingType constants).
ParsingProcess *api.ParsingProcess `json:"parsingProcess,omitempty"`
}

type LoadAndSplitTextResult struct {
Expand All @@ -416,6 +432,7 @@ func (i *implementerAIDocumentSets) LoadAndSplitText(ctx context.Context, param
defer reader.Close()
res, err := i.GetCosTmpSecret(ctx, GetCosTmpSecretParams{
DocumentSetName: param.DocumentSetName,
ParsingProcess: param.ParsingProcess,
})
if err != nil {
return nil, err
Expand All @@ -442,7 +459,15 @@ func (i *implementerAIDocumentSets) LoadAndSplitText(ctx context.Context, param
if err != nil {
return nil, fmt.Errorf("put param MetaData into cos header failed, err: %v", err.Error())
}
configMarshalData, err := json.Marshal(param.SplitterPreprocess)

cosMetaConfig := cosMetaConfig{
AppendTitleToChunk: param.SplitterPreprocess.AppendKeywordsToChunk,
AppendKeywordsToChunk: param.SplitterPreprocess.AppendKeywordsToChunk,
ChunkSplitter: param.SplitterPreprocess.ChunkSplitter,
ParsingProcess: param.ParsingProcess,
}

configMarshalData, err := json.Marshal(cosMetaConfig)
if err != nil {
return nil, fmt.Errorf("put param SplitterPreprocess into cos header failed, err: %v", err.Error())
}
Expand Down Expand Up @@ -482,13 +507,14 @@ func (i *implementerAIDocumentSets) loadAndSplitTextCheckParams(param *LoadAndSp
}
param.DocumentSetName = filepath.Base(param.LocalFilePath)
}

if param.SplitterPreprocess.ChunkSplitter != nil && *param.SplitterPreprocess.ChunkSplitter != "" {
fileType := strings.ToLower(filepath.Ext(param.DocumentSetName))
if !(fileType == "" || fileType == string(MarkdownFileType) || fileType == string(MdFileType)) {
log.Printf("[Waring] %s", "param SplitterPreprocess.ChunkSplitter will be ommitted, "+
"because only markdown filetype supports defining ChunkSplitter")
}
fileType := strings.ToLower(filepath.Ext(param.DocumentSetName))
isMarkdown := false
if fileType == "" || fileType == string(MarkdownFileType) || fileType == string(MdFileType) {
isMarkdown = true
}
if !isMarkdown && param.SplitterPreprocess.ChunkSplitter != nil && *param.SplitterPreprocess.ChunkSplitter != "" {
log.Printf("[Waring] %s", "param SplitterPreprocess.ChunkSplitter will be ommitted, "+
"because only markdown filetype supports defining ChunkSplitter")
}

if param.LocalFilePath != "" {
Expand Down Expand Up @@ -546,6 +572,7 @@ func (i *implementerAIDocumentSets) toDocumentSet(item ai_document_set.QueryDocu

documentSet.AIDocumentSetInterface = docSetImpl
documentSet.SplitterPreprocess = item.SplitterPreprocess
documentSet.ParsingProcess = item.ParsingProcess
return documentSet
}

Expand Down
9 changes: 6 additions & 3 deletions tcvectordb/api/ai_document_set/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,10 @@ type UpdateRes struct {

type UploadUrlReq struct {
api.Meta `path:"/ai/documentSet/uploadUrl" tags:"Document" method:"Post" summary:"获取cos上传签名"`
Database string `json:"database"`
CollectionView string `json:"collectionView"`
DocumentSetName string `json:"documentSetName"`
Database string `json:"database"`
CollectionView string `json:"collectionView"`
DocumentSetName string `json:"documentSetName"`
ParsingProcess *api.ParsingProcess `json:"parsingProcess,omitempty"`
}

type UploadUrlRes struct {
Expand All @@ -158,6 +159,8 @@ type Credentials struct {
TmpSecretID string `json:"TmpSecretId"`
TmpSecretKey string `json:"TmpSecretKey"`
SessionToken string `json:"Token"`
Expiration string `json:"Expiration,omitempty"`
ExpiredTime int `json:"ExpiredTime,omitempty"`
}

type GetReq struct {
Expand Down
3 changes: 3 additions & 0 deletions tcvectordb/api/ai_document_set/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"encoding/json"
"reflect"
"strings"

"github.com/tencent/vectordatabase-sdk-go/tcvectordb/api"
)

// Document document struct for document api
Expand All @@ -16,6 +18,7 @@ type QueryDocumentSet struct {
DocumentSetInfo *DocumentSetInfo `json:"documentSetInfo,omitempty"`
ScalarFields map[string]interface{} `json:"-"`
SplitterPreprocess *DocumentSplitterPreprocess `json:"splitterPreprocess,omitempty"`
ParsingProcess *api.ParsingProcess `json:"parsingProcess,omitempty"`
}

type DocumentSetInfo struct {
Expand Down
2 changes: 2 additions & 0 deletions tcvectordb/api/collection_view/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ type CreateReq struct {
// AverageFileSize uint64 `json:"averageFileSize,omitempty"`
Embedding *DocumentEmbedding `json:"embedding,omitempty"`
SplitterPreprocess *SplitterPreprocess `json:"splitterPreprocess,omitempty"`
ParsingProcess *api.ParsingProcess `json:"parsingProcess,omitempty"`
Indexes []*api.IndexColumn `json:"indexes,omitempty"`
ExpectedFileNum uint64 `json:"expectedFileNum,omitempty"`
AverageFileSize uint64 `json:"averageFileSize,omitempty"`
Expand Down Expand Up @@ -77,6 +78,7 @@ type DescribeCollectionViewItem struct {
// AverageFileSize uint64 `json:"averageFileSize,omitempty"`
Embedding *DocumentEmbedding `json:"embedding,omitempty"`
SplitterPreprocess *SplitterPreprocess `json:"splitterPreprocess,omitempty"`
ParsingProcess *api.ParsingProcess `json:"parsingProcess,omitempty"`
Indexes []*api.IndexColumn `json:"indexes,omitempty"`

CreateTime string `json:"createTime"`
Expand Down
3 changes: 3 additions & 0 deletions tcvectordb/api/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,6 @@ type IndexParams struct {
Nprobe uint32 `protobuf:"varint,3,opt,name=nprobe,proto3" json:"nprobe,omitempty"`
Nlist uint32 `protobuf:"varint,4,opt,name=nlist,proto3" json:"nlist,omitempty"`
}
// ParsingProcess configures how uploaded files are parsed on the server side.
// It is accepted by CreateCollectionView (collection-wide default) and by
// LoadAndSplitText / the COS upload-URL request (per file).
type ParsingProcess struct {
// ParsingType selects the parsing strategy; expected values are
// "AlgorithmParsing" or "VisionModelParsing" (see the ParsingType
// constants in package tcvectordb).
ParsingType string `json:"parsingType,omitempty"`
}
7 changes: 7 additions & 0 deletions tcvectordb/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,13 @@ const (
LanguageMulti Language = "multi"
)

// ParsingType enumerates the supported file-parsing strategies that can be set
// via api.ParsingProcess on CreateCollectionView and LoadAndSplitText.
type ParsingType string

const (
	// AlgorithmParsing parses files with the default algorithmic pipeline.
	AlgorithmParsing ParsingType = "AlgorithmParsing"
	// VisionModelParsing parses files with a vision model. Per the example
	// code, this applies only to PDF files; other supported file types are
	// still parsed algorithmically.
	VisionModelParsing ParsingType = "VisionModelParsing"
)

type AppendTitleToChunkType uint32

const (
Expand Down
2 changes: 1 addition & 1 deletion tcvectordb/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@

package tcvectordb

const SDKVersion = "v1.4.7"
// SDKVersion is the released version of this SDK; keep it in sync with CHANGELOG.md.
const SDKVersion = "v1.4.8"
Loading

0 comments on commit 77dff32

Please sign in to comment.