-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathupload_vectors.py
56 lines (42 loc) · 1.46 KB
/
upload_vectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import feedparser
import os
import pinecone
import numpy as np
import openai
import requests
from bs4 import BeautifulSoup
# Embed the text of up to 50 posts from a blog's RSS feed with OpenAI and
# upload the resulting vectors to an existing Pinecone index. Each vector is
# stored with the post URL as metadata.

# OpenAI API key
openai.api_key = os.getenv('OPENAI_API_KEY')

# get the Pinecone API key and environment
pinecone_api = os.getenv('PINECONE_API_KEY')
pinecone_env = os.getenv('PINECONE_ENVIRONMENT')
pinecone.init(api_key=pinecone_api, environment=pinecone_env)

# set index; must exist (this script does not create it)
index = pinecone.Index('blog-index')

# URL of the RSS feed to parse
url = 'https://blog.baeke.info/feed/'

# Parse the RSS feed with feedparser
feed = feedparser.parse(url)

# get number of entries in feed
entries = len(feed.entries)
print("Number of entries: ", entries)

# Only the first 50 feed entries are processed; progress is reported against
# this count rather than the size of the whole feed.
selected_entries = feed.entries[:50]
total = len(selected_entries)

pinecone_vectors = []

for i, entry in enumerate(selected_entries):
    # report progress (1-based, so the last line reads "total of total")
    print("Processing entry ", i + 1, " of ", total)

    # Fetch the post; a timeout keeps one stalled request from hanging the
    # run, and a failed download skips this post instead of aborting.
    try:
        r = requests.get(entry.link, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        print("Skipping ", entry.link, ": download failed: ", exc)
        continue

    soup = BeautifulSoup(r.text, 'html.parser')

    # The post body is expected in <div class="entry-content">; pages without
    # it (or with an empty body) have nothing to embed and are skipped.
    content = soup.find('div', {'class': 'entry-content'})
    if content is None:
        print("Skipping ", entry.link, ": no entry-content element found")
        continue
    article = content.text
    if not article.strip():
        print("Skipping ", entry.link, ": entry-content is empty")
        continue

    # vectorize with OpenAI text-embedding-ada-002
    # NOTE(review): very long posts may exceed the model's input limit and
    # make this call fail -- confirm against the OpenAI embeddings docs.
    embedding = openai.Embedding.create(
        input=article,
        model="text-embedding-ada-002"
    )

    # extract the embedding vector from the response
    vector = embedding["data"][0]["embedding"]

    # append (id, vector, metadata) tuple to pinecone_vectors list; the id is
    # the entry's position in the feed, so skipped posts leave other ids intact
    pinecone_vectors.append((str(i), vector, {"url": entry.link}))

# At most 50 vectors are collected, so they are upserted to Pinecone in one
# call. Nothing is sent when every post was skipped or the feed was empty.
if pinecone_vectors:
    upsert_response = index.upsert(vectors=pinecone_vectors)
    print("Vector upload complete.")
else:
    print("No vectors to upload.")