-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_auto_tag_index.py
59 lines (47 loc) · 1.26 KB
/
create_auto_tag_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
import pinecone
import json
# Credentials are read from the process environment. The API key is issued
# at app.pinecone.io; the environment name is shown next to the key in the
# Pinecone console. Either value is None when its variable is unset.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENV = os.getenv('PINECONE_ENVIRONMENT')

# Dimension used for the index when it is created.
VECTOR_LENGTH = 384

# Configure the Pinecone client before any index operation is attempted.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
index_name = 'auto-tag-ml'

# Create the index on the first run only; when an index with this name is
# already listed, it is reused without modification.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(
        name=index_name,
        dimension=VECTOR_LENGTH,
        metric="dotproduct",
    )

# Connect to the index (newly created or pre-existing).
index = pinecone.GRPCIndex(index_name)
# Load the precomputed records to be indexed. The encoding is given
# explicitly because JSON text is UTF-8, while open() would otherwise fall
# back to the platform's locale encoding and could mis-decode non-ASCII tags.
with open('data/auto_tag.json', encoding='utf-8') as f:
    cache = json.load(f)
from tqdm.auto import tqdm

# Number of vectors sent per upsert request. Sending one request per vector
# pays a network round trip for every record; grouping them keeps the same
# data and IDs while cutting the request count by this factor.
BATCH_SIZE = 100

# Walk `cache` in fixed-size slices. Iterating a range (which has a length)
# lets tqdm display a real progress bar with a total.
# NOTE(review): slicing assumes `cache` is a list of dicts, each carrying
# 'tags' and 'target_embedding' -- confirm against data/auto_tag.json.
for start in tqdm(range(0, len(cache), BATCH_SIZE)):
    batch = cache[start:start + BATCH_SIZE]

    # One record per item: the ID is the item's position in `cache` (as a
    # string), the vector is its stored embedding, and the tags are kept
    # as metadata.
    records = [
        {
            'id': str(position),
            "values": item["target_embedding"],
            "metadata": {'tags': item['tags']},
        }
        for position, item in enumerate(batch, start=start)
    ]

    # Upsert the whole batch to Pinecone in a single request.
    index.upsert(vectors=records)
# Report the index statistics. When run as a script, a bare expression's
# value is discarded, so the result is printed to make the check visible.
print(index.describe_index_stats())