This repository has been archived by the owner on Nov 13, 2024. It is now read-only.

KB initial docstring #76

Merged: 21 commits, Oct 18, 2023
Changes from 16 commits
Commits (21)
b5ddb77
add docstrings to kb
acatav Oct 16, 2023
ddc0118
Update resin/knoweldge_base/base.py
acatav Oct 17, 2023
8098d45
remove docstrings from base class
acatav Oct 17, 2023
98f7004
Merge remote-tracking branch 'origin/docstrings-template' into docstr…
acatav Oct 17, 2023
12ef6ce
add more details to KB description and init
acatav Oct 17, 2023
e1fc2f0
Update resin/knoweldge_base/knowledge_base.py
acatav Oct 17, 2023
474d22a
Update resin/knoweldge_base/knowledge_base.py
acatav Oct 17, 2023
a7e646e
Update resin/knoweldge_base/knowledge_base.py
acatav Oct 17, 2023
e2ae5be
Update resin/knoweldge_base/knowledge_base.py
acatav Oct 17, 2023
8dba979
Update resin/knoweldge_base/knowledge_base.py
acatav Oct 17, 2023
ac88997
Update resin/knoweldge_base/knowledge_base.py
acatav Oct 17, 2023
6819813
Update resin/knoweldge_base/knowledge_base.py
acatav Oct 17, 2023
b30f461
Update resin/knoweldge_base/knowledge_base.py
acatav Oct 17, 2023
c159837
add examples and details
acatav Oct 17, 2023
6326258
Merge remote-tracking branch 'origin/docstrings-template' into docstr…
acatav Oct 17, 2023
f2a71f3
remove weird line breaks
acatav Oct 17, 2023
a76067e
Merge remote-tracking branch 'origin/dev' into docstrings-template
acatav Oct 18, 2023
0417255
add tokenizer init to code examples
acatav Oct 18, 2023
4082411
remove docsting note about starter env delete
acatav Oct 18, 2023
9327df2
small PR comments
acatav Oct 18, 2023
ae5f7b4
Merge remote-tracking branch 'origin/dev' into docstrings-template
acatav Oct 18, 2023
5 changes: 3 additions & 2 deletions resin/knoweldge_base/base.py
@@ -12,7 +12,9 @@ class BaseKnowledgeBase(ABC, ConfigurableMixin):
"""

@abstractmethod
def query(self, queries: List[Query], global_metadata_filter: Optional[dict] = None
def query(self,
queries: List[Query],
global_metadata_filter: Optional[dict] = None
) -> List[QueryResult]:
pass

@@ -44,7 +46,6 @@ async def aquery(self,
async def aupsert(self,
documents: List[Document],
namespace: str = "",

) -> None:
pass

239 changes: 237 additions & 2 deletions resin/knoweldge_base/knowledge_base.py
@@ -38,6 +38,37 @@

class KnowledgeBase(BaseKnowledgeBase):

"""
The `KnowledgeBase` is used to store and retrieve text documents,
using an underlying Pinecone index.
Every document is chunked into multiple text snippets based on the text structure (e.g. Markdown or HTML formatting).
Each chunk is then encoded into a vector using an embedding model, and the resulting vectors are inserted into the Pinecone index.
After documents are inserted, the KnowledgeBase can be queried by sending a textual query, which is first encoded into a vector
and then used to retrieve the closest top-k document chunks.

Note: Since Resin defines its own data format,
you cannot use a pre-existing Pinecone index with Resin's KnowledgeBase.
The index must be created using `knowledge_base.create_resin_index()`
or the CLI command `resin new`.

When creating a new Resin service,
the user must first create the underlying Pinecone index.
This is a one-time setup process -
the index will exist on Pinecone's managed service until it is deleted.

Example:
>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.create_resin_index()

In any future interactions, the user simply needs to connect:

Example:
>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.connect()

Note: the KnowledgeBase is not connected to the index until connect() is called.
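
Example (a minimal end-to-end sketch, assuming the index was already created and the required API keys are configured in the environment; the document and query values are illustrative):

>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.connect()
>>> kb.upsert([Document(id="doc1", text="This is a document", source="my_source")])
>>> results = kb.query([Query(text="a document")])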
"""

_DEFAULT_COMPONENTS = {
'record_encoder': OpenAIRecordEncoder,
'chunker': MarkdownChunker,
@@ -53,6 +84,46 @@ def __init__(self,
default_top_k: int = 5,
index_params: Optional[dict] = None,
):
"""
Initialize the knowledge base object.

If the index does not exist, the user must first create it by calling `create_resin_index()`.

Note: Resin will add the prefix --resin to your selected index name.
You can retrieve the full index name via `knowledge_base.index_name` at any time,
or find it in the Pinecone console at https://app.pinecone.io/

Example:

Create a new index:

>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.create_resin_index()

In any future interactions,
the user simply needs to connect to the existing service:

>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.connect()

Note: the KnowledgeBase is not connected to the index until connect() is called.

Args:
index_name: The name of the underlying Pinecone index.
record_encoder: An instance of RecordEncoder to use for encoding documents
and queries. Defaults to OpenAIRecordEncoder.
chunker: An instance of Chunker to use for chunking documents.
Defaults to MarkdownChunker.
reranker: An instance of Reranker to use for reranking query results.
Defaults to TransparentReranker.
default_top_k: The default number of document chunks to return per query.
Defaults to 5.
index_params: A dictionary of parameters to pass to the index creation API.
See https://docs.pinecone.io/docs/python-client#create_index
Review comment (Contributor): @acatav I agree, adding this param to the constructor is really ugly...
I'll try to find a way to remove it and still support `from_config`.


Returns:
KnowledgeBase object.
"""
if default_top_k < 1:
raise ValueError("default_top_k must be greater than 0")

@@ -129,11 +200,27 @@ def _connection_error_msg(self) -> str:
)

def connect(self) -> None:
"""
Connect to the underlying Pinecone index.
This method must be called before making any other calls to the knowledge base.

Note: If the underlying index has not been provisioned yet, an exception will be raised.
To provision the index, use `create_resin_index()`.

Returns:
None if successful, raises an exception otherwise.
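
Example (a minimal sketch, assuming the index was already created):

>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.connect()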
"""
if self._index is None:
self._index = self._connect_index()
self.verify_index_connection()

def verify_index_connection(self) -> None:
"""
Verify that the knowledge base is connected to the index.

Returns:
None if successful, raises an exception otherwise.
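
Example (a minimal sketch, assuming the index was already created):

>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.connect()
>>> kb.verify_index_connection()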
"""
if self._index is None:
raise RuntimeError(self._connection_error_msg)

@@ -149,6 +236,40 @@ def create_resin_index(self,
dimension: Optional[int] = None,
index_params: Optional[dict] = None
):
"""
Create the underlying Pinecone index that will be used by the KnowledgeBase.
This is a one-time setup operation that needs to be done only once for every new Resin service.
Once the index is created, it will persist in Pinecone until explicitly deleted.

Since Resin defines its own data format, namely a few dedicated metadata fields,
you cannot use a pre-existing Pinecone index with Resin's KnowledgeBase.
The index must be created using `knowledge_base.create_resin_index()`.

Note: Resin will add the prefix --resin to your selected index name.
You can retrieve the full index name via `knowledge_base.index_name` at any time,
or find it in the Pinecone console at https://app.pinecone.io/

Note: This operation may take a few minutes to complete.
Once created, you can see the index in the Pinecone console.

Args:
indexed_fields: A list of metadata fields that will be indexed,
allowing them to be used later for metadata filtering.
All other metadata fields are stored but not indexed.
See: https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing.
Resin always indexes the built-in `document_id` field, which is added to every vector.
By default, all other metadata fields are **not** indexed unless explicitly included in this list.
dimension: The dimension of the vectors to index.
If `dimension` isn't explicitly provided,
Resin will try to infer the embedding dimension based on the configured `Encoder`.
index_params: A dictionary of parameters to pass to the index creation API.
For example, you can set the index's number of replicas
by passing {"replicas": 2}.
See https://docs.pinecone.io/docs/python-client#create_index

Returns:
None
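
Example (a minimal sketch; the `indexed_fields` and `index_params` values are illustrative):

>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.create_resin_index(indexed_fields=["website"], index_params={"replicas": 2})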
"""
# validate inputs
if indexed_fields is None:
indexed_fields = ['document_id']
@@ -221,9 +342,22 @@ def _get_full_index_name(index_name: str) -> str:

@property
def index_name(self) -> str:
"""
The name of the index the knowledge base is connected to.
"""
return self._index_name

def delete_index(self):
"""
Delete the underlying Pinecone index.

**Note: THIS OPERATION IS NOT REVERSIBLE!**
Once deleted, the index, together with any stored documents, cannot be restored!

After deletion, the `KnowledgeBase` will no longer be connected to a Pinecone index,
so you will not be able to insert documents or query.
If you wish to re-create an index with the same name, simply call `knowledge_base.create_resin_index()`.
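
Example (a minimal sketch; this permanently deletes the index and all stored documents):

>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.connect()
>>> kb.delete_index()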
"""
if self._index is None:
raise RuntimeError(self._connection_error_msg)
delete_index(self._index_name)
@@ -233,6 +367,36 @@ def query(self,
queries: List[Query],
global_metadata_filter: Optional[dict] = None
) -> List[QueryResult]:
"""
Query the knowledge base to retrieve document chunks.

This operation includes several steps:
1. Encode the queries to vectors using the underlying encoder.
2. Query the underlying Pinecone index to retrieve the top-k chunks for each query.
3. Rerank the results using the underlying reranker.
4. Return the results for each query as a list of QueryResult objects.

Args:
queries: A list of queries to run against the knowledge base.
global_metadata_filter: A metadata filter to apply to all queries,
in addition to any query-specific filters.
For example, the filter {"website": "wiki"}
will only return documents with the metadata
{"website": "wiki"} (if it was provided at upsert).
See https://docs.pinecone.io/docs/metadata-filtering
Returns:
A list of QueryResult objects.

Examples:
>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.connect()
>>> queries = [Query(text="How to make a cake"),
Query(text="How to make a pizza",
top_k=10,
metadata_filter={"website": "wiki"})]
>>> results = kb.query(queries)

"""
if self._index is None:
raise RuntimeError(self._connection_error_msg)

@@ -293,6 +457,41 @@ def upsert(self,
documents: List[Document],
namespace: str = "",
batch_size: int = 100):
"""
Upsert documents into the knowledge base.
Upsert stands for "update or insert": if a document with the same id already exists in the index,
it will be updated with the new document; otherwise, a new document will be inserted.

This operation includes several steps:
1. Chunk the documents into smaller chunks.
2. Encode the chunks to vectors.
3. Delete any existing chunks belonging to the same documents.
4. Upsert the chunks to the index.

Args:
documents: A list of documents to upsert.
namespace: The namespace in the underlying index to upsert documents into.
batch_size: The number of documents to upsert at once to the index,
after chunking and encoding.

Returns:
None

Example:

>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.connect()
>>> documents = [Document(id="doc1",
text="This is a document",
source="my_source",
metadata={"website": "wiki"}),
Document(id="doc2",
text="This is another document",
source="my_source",
metadata={"website": "wiki"})]
>>> kb.upsert(documents)
"""
if self._index is None:
raise RuntimeError(self._connection_error_msg)

@@ -337,11 +536,34 @@ def upsert(self,
# Upsert to Pinecone index
dataset.to_pinecone_index(self._index_name,
namespace=namespace,
should_create_index=False)
should_create_index=False,
batch_size=batch_size)

def delete(self,
document_ids: List[str],
namespace: str = "") -> None:
"""
Delete documents from the underlying Pinecone index.
Since each document is chunked into multiple chunks,
this operation will delete all chunks belonging to the given document ids.
This operation does not raise an exception if a document does not exist.

Note: The starter environment currently does not support the delete-by-metadata operation.
Therefore, in the starter environment this method will simply delete the first 32 chunks of each document.

Args:
document_ids: A list of document ids to delete from the index.
namespace: The namespace in the underlying index to delete documents from.

Returns:
None

Example:
>>> kb = KnowledgeBase(index_name="my_index")
>>> kb.connect()
>>> kb.delete(document_ids=["doc1", "doc2"])
"""
if self._index is None:
raise RuntimeError(self._connection_error_msg)

@@ -366,7 +588,20 @@ def delete(self,
)

@classmethod
def from_config(cls, config: Dict[str, Any], index_name: Optional[str] = None):
def from_config(cls,
config: Dict[str, Any],
index_name: Optional[str] = None) -> "KnowledgeBase":
"""
Create a KnowledgeBase object from a configuration dictionary.

Args:
config: A dictionary containing the configuration for the knowledge base.
index_name: The name of the index to connect to (optional).
If not provided, the index name will be read from the environment variable INDEX_NAME.

Returns:
A KnowledgeBase object.
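
Example (a minimal sketch; the config file name and its contents depend on your setup and are illustrative):

>>> import yaml
>>> with open("config.yaml") as f:
...     config = yaml.safe_load(f)
>>> kb = KnowledgeBase.from_config(config, index_name="my_index")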
"""
index_name = index_name or os.getenv("INDEX_NAME")
if index_name is None:
raise ValueError(