Commit 2496ed1

[Bug fix] Fix typos, static methods and other sanity improvements in the package (#1129)
infinite-wait authored Jan 8, 2024
1 parent 62c0c52 commit 2496ed1
Showing 41 changed files with 133 additions and 103 deletions.
13 changes: 9 additions & 4 deletions embedchain/app.py
@@ -9,9 +9,14 @@
import requests
import yaml

-from embedchain.cache import (Config, ExactMatchEvaluation,
-    SearchDistanceEvaluation, cache,
-    gptcache_data_manager, gptcache_pre_function)
+from embedchain.cache import (
+    Config,
+    ExactMatchEvaluation,
+    SearchDistanceEvaluation,
+    cache,
+    gptcache_data_manager,
+    gptcache_pre_function,
+)
from embedchain.client import Client
from embedchain.config import AppConfig, CacheConfig, ChunkerConfig
from embedchain.constants import SQLITE_PATH
@@ -27,7 +32,7 @@
from embedchain.vectordb.base import BaseVectorDB
from embedchain.vectordb.chroma import ChromaDB

-# Setup the user directory if doesn't exist already
+# Set up the user directory if it doesn't exist already
Client.setup_dir()
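Note on the import reflow: once the closing parenthesis is preceded by a trailing comma, Black-style formatters keep one name per line, so adding or removing an import later is a one-line diff. A stdlib sketch of the same layout (module and names chosen only for illustration):

from os.path import (
    basename,
    dirname,
)

# Appending `join,` above would touch exactly one diff line.
print(dirname("/tmp/a/b.txt"), basename("/tmp/a/b.txt"))  # prints: /tmp/a b.txt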


11 changes: 6 additions & 5 deletions embedchain/chunkers/base_chunker.py
@@ -17,15 +17,15 @@ def create_chunks(self, loader, src, app_id=None, config: Optional[ChunkerConfig
"""
Loads data and chunks it.
-:param loader: The loader which's `load_data` method is used to create
+:param loader: The loader whose `load_data` method is used to create
the raw data.
:param src: The data to be handled by the loader. Can be a URL for
remote sources or local content for local loaders.
:param app_id: App id used to generate the doc_id.
"""
documents = []
chunk_ids = []
-idMap = {}
+id_map = {}
min_chunk_size = config.min_chunk_size if config is not None else 1
logging.info(f"[INFO] Skipping chunks smaller than {min_chunk_size} characters")
data_result = loader.load_data(src)
@@ -49,8 +49,8 @@ def create_chunks(self, loader, src, app_id=None, config: Optional[ChunkerConfig
for chunk in chunks:
chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
chunk_id = f"{app_id}--{chunk_id}" if app_id is not None else chunk_id
-if idMap.get(chunk_id) is None and len(chunk) >= min_chunk_size:
-    idMap[chunk_id] = True
+if id_map.get(chunk_id) is None and len(chunk) >= min_chunk_size:
+    id_map[chunk_id] = True
chunk_ids.append(chunk_id)
documents.append(chunk)
metadatas.append(meta_data)
@@ -77,5 +77,6 @@ def set_data_type(self, data_type: DataType):

# TODO: This should be done during initialization. This means it has to be done in the child classes.

-def get_word_count(self, documents):
+@staticmethod
+def get_word_count(documents) -> int:
return sum([len(document.split(" ")) for document in documents])
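The renamed id_map only ever stores True, so it is effectively a set keyed by chunk hash. A self-contained sketch of the same dedup-and-filter logic using a real set (function name and defaults are illustrative, not the package API):

import hashlib

def dedup_chunks(chunks, url, app_id=None, min_chunk_size=1):
    # Mirrors create_chunks: hash chunk+url, optionally prefix with the
    # app id, and keep only the first occurrence of each id.
    seen = set()
    documents, chunk_ids = [], []
    for chunk in chunks:
        chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
        if app_id is not None:
            chunk_id = f"{app_id}--{chunk_id}"
        if chunk_id not in seen and len(chunk) >= min_chunk_size:
            seen.add(chunk_id)
            chunk_ids.append(chunk_id)
            documents.append(chunk)
    return documents, chunk_ids

docs, ids = dedup_chunks(["hello world", "hello world", "hi"], url="https://example.com")
print(len(docs))  # 2 -- the duplicate chunk is dropped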
2 changes: 1 addition & 1 deletion embedchain/client.py
@@ -31,7 +31,7 @@ def __init__(self, api_key=None, host="https://apiv2.embedchain.ai"):
)

@classmethod
-def setup_dir(self):
+def setup_dir(cls):
"""
Loads the user id from the config file if it exists, otherwise generates a new
one and saves it to the config file.
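By convention the first parameter of a @classmethod is named cls, since it receives the class object rather than an instance; the old self name worked only because parameter names are arbitrary. A minimal illustration:

class Base:
    @classmethod
    def setup_dir(cls):
        # cls is the class the method was called on, so subclasses
        # automatically get class-appropriate behavior.
        print(f"setting up directory for {cls.__name__}")

class Derived(Base):
    pass

Base.setup_dir()     # setting up directory for Base
Derived.setup_dir()  # setting up directory for Derived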
5 changes: 3 additions & 2 deletions embedchain/config/add_config.py
@@ -26,7 +26,7 @@ def __init__(
if self.min_chunk_size >= self.chunk_size:
raise ValueError(f"min_chunk_size {min_chunk_size} should be less than chunk_size {chunk_size}")
if self.min_chunk_size < self.chunk_overlap:
-logging.warn(
+logging.warning(
f"min_chunk_size {min_chunk_size} should be greater than chunk_overlap {chunk_overlap}, otherwise it is redundant." # noqa:E501
)

@@ -35,7 +35,8 @@ def __init__(
else:
self.length_function = length_function if length_function else len

-def load_func(self, dotpath: str):
+@staticmethod
+def load_func(dotpath: str):
if "." not in dotpath:
return getattr(builtins, dotpath)
else:
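The truncated else branch presumably imports the module part of the dotted path and fetches the attribute, the same pattern as DataFormatter._lazy_load further down this diff. A self-contained sketch under that assumption:

import builtins
from importlib import import_module

def load_func(dotpath: str):
    if "." not in dotpath:
        return getattr(builtins, dotpath)  # e.g. "len" -> built-in len
    module_path, func_name = dotpath.rsplit(".", 1)
    return getattr(import_module(module_path), func_name)

print(load_func("len")("abc"))                # 3
print(load_func("os.path.basename")("/a/b"))  # b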
9 changes: 6 additions & 3 deletions embedchain/config/cache_config.py
@@ -10,12 +10,12 @@ class CacheSimilarityEvalConfig(BaseConfig):
This is the evaluator to compare two embeddings according to their distance computed in embedding retrieval stage.
In the retrieval stage, `search_result` is the distance used for approximate nearest neighbor search and have been
put into `cache_dict`. `max_distance` is used to bound this distance to make it between [0-`max_distance`].
-`positive` is used to indicate this distance is directly proportional to the similarity of two entites.
-If `positive` is set `False`, `max_distance` will be used to substract this distance to get the final score.
+`positive` is used to indicate this distance is directly proportional to the similarity of two entities.
+If `positive` is set `False`, `max_distance` will be used to subtract this distance to get the final score.
:param max_distance: the bound of maximum distance.
:type max_distance: float
-:param positive: if the larger distance indicates more similar of two entities, It is True. Otherwise it is False.
+:param positive: if the larger distance indicates more similar of two entities, It is True. Otherwise, it is False.
:type positive: bool
"""

@@ -29,6 +29,7 @@ def __init__(
self.max_distance = max_distance
self.positive = positive

+@staticmethod
def from_config(config: Optional[Dict[str, Any]]):
if config is None:
return CacheSimilarityEvalConfig()
@@ -63,6 +64,7 @@ def __init__(
self.similarity_threshold = similarity_threshold
self.auto_flush = auto_flush

+@staticmethod
def from_config(config: Optional[Dict[str, Any]]):
if config is None:
return CacheInitConfig()
@@ -83,6 +85,7 @@ def __init__(
self.similarity_eval_config = similarity_eval_config
self.init_config = init_config

+@staticmethod
def from_config(config: Optional[Dict[str, Any]]):
if config is None:
return CacheConfig()
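All three from_config methods share the same factory shape: a None config yields an all-defaults instance, anything else is unpacked into the constructor. A hedged sketch of the pattern using the fields visible above (the default values here are placeholders, not the package's):

from typing import Any, Dict, Optional

class CacheInitConfig:
    def __init__(self, similarity_threshold: float = 0.8, auto_flush: int = 20):
        self.similarity_threshold = similarity_threshold
        self.auto_flush = auto_flush

    @staticmethod
    def from_config(config: Optional[Dict[str, Any]]):
        if config is None:
            return CacheInitConfig()      # all defaults
        return CacheInitConfig(**config)  # unpack user-supplied keys

print(CacheInitConfig.from_config({"auto_flush": 5}).auto_flush)  # 5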
10 changes: 6 additions & 4 deletions embedchain/config/llm/base.py
@@ -155,24 +155,26 @@ def __init__(
self.stream = stream
self.where = where

-def validate_prompt(self, prompt: Template) -> bool:
+@staticmethod
+def validate_prompt(prompt: Template) -> Optional[re.Match[str]]:
"""
validate the prompt
:param prompt: the prompt to validate
:type prompt: Template
:return: valid (true) or invalid (false)
-:rtype: bool
+:rtype: Optional[re.Match[str]]
"""
return re.search(query_re, prompt.template) and re.search(context_re, prompt.template)

-def _validate_prompt_history(self, prompt: Template) -> bool:
+@staticmethod
+def _validate_prompt_history(prompt: Template) -> Optional[re.Match[str]]:
"""
validate the prompt with history
:param prompt: the prompt to validate
:type prompt: Template
:return: valid (true) or invalid (false)
-:rtype: bool
+:rtype: Optional[re.Match[str]]
"""
return re.search(history_re, prompt.template)
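The new Optional[re.Match[str]] annotations match what the code actually returns: re.search yields a Match or None, and an `and` chain returns its last operand (or the first falsy one), never a bool. Callers that use the result in a truth test are unaffected, because Match objects are truthy. A sketch (the regex shapes are assumptions about the module-level patterns):

import re
from string import Template

query_re = r"\$query"      # assumed placeholder pattern
context_re = r"\$context"  # assumed placeholder pattern

def validate_prompt(prompt: Template):
    # Match when both placeholders are present, else None (falsy).
    return re.search(query_re, prompt.template) and re.search(context_re, prompt.template)

ok = Template("Use $context to answer $query")
print(bool(validate_prompt(ok)))                   # True
print(validate_prompt(Template("Answer $query")))  # None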
4 changes: 2 additions & 2 deletions embedchain/config/vectordb/qdrant.py
@@ -7,8 +7,8 @@
@register_deserializable
class QdrantDBConfig(BaseVectorDbConfig):
"""
-Config to initialize an qdrant client.
-:param url. qdrant url or list of nodes url to be used for connection
+Config to initialize a qdrant client.
+:param: url. qdrant url or list of nodes url to be used for connection
"""

def __init__(
2 changes: 1 addition & 1 deletion embedchain/config/vectordb/zilliz.py
@@ -26,7 +26,7 @@ def __init__(
:param uri: Cluster endpoint obtained from the Zilliz Console, defaults to None
:type uri: Optional[str], optional
:param token: API Key, if a Serverless Cluster, username:password, if a Dedicated Cluster, defaults to None
-:type port: Optional[str], optional
+:type token: Optional[str], optional
"""
self.uri = uri or os.environ.get("ZILLIZ_CLOUD_URI")
if not self.uri:
3 changes: 2 additions & 1 deletion embedchain/data_formatter/data_formatter.py
@@ -34,7 +34,8 @@ def __init__(
self.loader = self._get_loader(data_type=data_type, config=config.loader, loader=loader)
self.chunker = self._get_chunker(data_type=data_type, config=config.chunker, chunker=chunker)

-def _lazy_load(self, module_path: str):
+@staticmethod
+def _lazy_load(module_path: str):
module_path, class_name = module_path.rsplit(".", 1)
module = import_module(module_path)
return getattr(module, class_name)
16 changes: 7 additions & 9 deletions embedchain/embedchain.py
@@ -7,9 +7,7 @@
from dotenv import load_dotenv
from langchain.docstore.document import Document

-from embedchain.cache import (adapt, get_gptcache_session,
-    gptcache_data_convert,
-    gptcache_update_cache_callback)
+from embedchain.cache import adapt, get_gptcache_session, gptcache_data_convert, gptcache_update_cache_callback
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config import AddConfig, BaseLlmConfig, ChunkerConfig
from embedchain.config.base_app_config import BaseAppConfig
@@ -19,8 +17,7 @@
from embedchain.helpers.json_serializable import JSONSerializable
from embedchain.llm.base import BaseLlm
from embedchain.loaders.base_loader import BaseLoader
-from embedchain.models.data_type import (DataType, DirectDataType,
-    IndirectDataType, SpecialDataType)
+from embedchain.models.data_type import DataType, DirectDataType, IndirectDataType, SpecialDataType
from embedchain.telemetry.posthog import AnonymousTelemetry
from embedchain.utils.misc import detect_datatype, is_valid_json_string
from embedchain.vectordb.base import BaseVectorDB
@@ -84,7 +81,7 @@ def __init__(
# Attributes that aren't subclass related.
self.user_asks = []

-self.chunker: ChunkerConfig = None
+self.chunker: Optional[ChunkerConfig] = None
# Send anonymous telemetry
self._telemetry_props = {"class": self.__class__.__name__}
self.telemetry = AnonymousTelemetry(enabled=self.config.collect_metrics)
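An attribute initialized to None but annotated with the bare class type is rejected by strict type checkers; PEP 484 wants Optional (or the 3.10+ `ChunkerConfig | None` spelling) so every use site must handle the None case. A minimal illustration:

from typing import Optional

class ChunkerConfig:
    pass

class App:
    def __init__(self) -> None:
        # `self.chunker: ChunkerConfig = None` fails under mypy --strict,
        # since None is not a ChunkerConfig.
        self.chunker: Optional[ChunkerConfig] = None

app = App()
if app.chunker is not None:  # checkers now require narrowing before use
    print(app.chunker)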
@@ -290,7 +287,7 @@ def _get_existing_doc_id(self, chunker: BaseChunker, src: Any):
# Or it's different, then it will be added as a new text.
return None
elif chunker.data_type.value in [item.value for item in IndirectDataType]:
-# These types have a indirect source reference
+# These types have an indirect source reference
# As long as the reference is the same, they can be updated.
where = {"url": src}
if chunker.data_type == DataType.JSON and is_valid_json_string(src):
@@ -442,10 +439,11 @@ def _load_and_embed(
)
count_new_chunks = self.db.count() - chunks_before_addition

print((f"Successfully saved {src} ({chunker.data_type}). New chunks count: {count_new_chunks}"))
print(f"Successfully saved {src} ({chunker.data_type}). New chunks count: {count_new_chunks}")
return list(documents), metadatas, ids, count_new_chunks

-def _format_result(self, results):
+@staticmethod
+def _format_result(results):
return [
(Document(page_content=result[0], metadata=result[1] or {}), result[2])
for result in zip(
6 changes: 3 additions & 3 deletions embedchain/embedder/base.py
@@ -15,8 +15,8 @@ class EmbeddingFunc(EmbeddingFunction):
def __init__(self, embedding_fn: Callable[[list[str]], list[str]]):
self.embedding_fn = embedding_fn

-def __call__(self, input: Embeddable) -> Embeddings:
-    return self.embedding_fn(input)
+def __call__(self, input_: Embeddable) -> Embeddings:
+    return self.embedding_fn(input_)


class BaseEmbedder:
@@ -29,7 +29,7 @@ class BaseEmbedder:

def __init__(self, config: Optional[BaseEmbedderConfig] = None):
"""
-Intialize the embedder class.
+Initialize the embedder class.
:param config: embedder configuration option class, defaults to None
:type config: Optional[BaseEmbedderConfig], optional
4 changes: 2 additions & 2 deletions embedchain/embedder/google.py
@@ -13,11 +13,11 @@ def __init__(self, config: Optional[GoogleAIEmbedderConfig] = None) -> None:
super().__init__()
self.config = config or GoogleAIEmbedderConfig()

-def __call__(self, input: str) -> Embeddings:
+def __call__(self, input_: str) -> Embeddings:
model = self.config.model
title = self.config.title
task_type = self.config.task_type
-embeddings = genai.embed_content(model=model, content=input, task_type=task_type, title=title)
+embeddings = genai.embed_content(model=model, content=input_, task_type=task_type, title=title)
return embeddings["embedding"]
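The rename avoids shadowing the input() builtin inside the method body; the usual convention is a single trailing underscore (PEP 8 lists it for keyword clashes, and it is commonly extended to builtins). A quick demonstration, unrelated to the embedder itself:

def shadowed(input):
    # Inside this function the builtin input() is unreachable;
    # `input` is whatever the caller passed.
    return input.upper()

def safe(input_):
    # Trailing underscore; the builtin stays usable if needed.
    return input_.upper()

print(shadowed("hi"), safe("hi"))  # HI HI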


2 changes: 1 addition & 1 deletion embedchain/helpers/json_serializable.py
@@ -42,7 +42,7 @@ class JSONSerializable:
A class to represent a JSON serializable object.
This class provides methods to serialize and deserialize objects,
-as well as save serialized objects to a file and load them back.
+as well as to save serialized objects to a file and load them back.
"""

_deserializable_classes = set() # Contains classes that are whitelisted for deserialization.
15 changes: 8 additions & 7 deletions embedchain/llm/base.py
@@ -4,9 +4,7 @@
from langchain.schema import BaseMessage as LCBaseMessage

from embedchain.config import BaseLlmConfig
-from embedchain.config.llm.base import (DEFAULT_PROMPT,
-    DEFAULT_PROMPT_WITH_HISTORY_TEMPLATE,
-    DOCS_SITE_PROMPT_TEMPLATE)
+from embedchain.config.llm.base import DEFAULT_PROMPT, DEFAULT_PROMPT_WITH_HISTORY_TEMPLATE, DOCS_SITE_PROMPT_TEMPLATE
from embedchain.helpers.json_serializable import JSONSerializable
from embedchain.memory.base import ChatHistory
from embedchain.memory.message import ChatMessage
@@ -76,7 +74,7 @@ def generate_prompt(self, input_query: str, contexts: List[str], **kwargs: Dict[
:return: The prompt
:rtype: str
"""
context_string = (" | ").join(contexts)
context_string = " | ".join(contexts)
web_search_result = kwargs.get("web_search_result", "")
if web_search_result:
context_string = self._append_search_and_context(context_string, web_search_result)
@@ -110,7 +108,8 @@ def generate_prompt(self, input_query: str, contexts: List[str], **kwargs: Dict[
prompt = self.config.prompt.substitute(context=context_string, query=input_query)
return prompt

-def _append_search_and_context(self, context: str, web_search_result: str) -> str:
+@staticmethod
+def _append_search_and_context(context: str, web_search_result: str) -> str:
"""Append web search context to existing context
:param context: Existing context
@@ -134,7 +133,8 @@ def get_answer_from_llm(self, prompt: str):
"""
return self.get_llm_model_answer(prompt)

-def access_search_and_get_results(self, input_query: str):
+@staticmethod
+def access_search_and_get_results(input_query: str):
"""
Search the internet for additional context
@@ -153,7 +153,8 @@ def access_search_and_get_results(self, input_query: str):
logging.info(f"Access search to get answers for {input_query}")
return search.run(input_query)

-def _stream_response(self, answer: Any) -> Generator[Any, Any, None]:
+@staticmethod
+def _stream_response(answer: Any) -> Generator[Any, Any, None]:
"""Generator to be used as streaming response
:param answer: Answer chunk from llm
2 changes: 1 addition & 1 deletion embedchain/llm/google.py
@@ -44,7 +44,7 @@ def _get_answer(self, prompt: str) -> Union[str, Generator[Any, Any, None]]:
"temperature": self.config.temperature or 0.5,
}

-if self.config.top_p >= 0.0 and self.config.top_p <= 1.0:
+if 0.0 <= self.config.top_p <= 1.0:
generation_config_params["top_p"] = self.config.top_p
else:
raise ValueError("`top_p` must be > 0.0 and < 1.0")
2 changes: 1 addition & 1 deletion embedchain/llm/huggingface.py
@@ -48,7 +48,7 @@ def _from_model(prompt: str, config: BaseLlmConfig) -> str:
"max_new_tokens": config.max_tokens,
}

-if config.top_p > 0.0 and config.top_p < 1.0:
+if 0.0 < config.top_p < 1.0:
model_kwargs["top_p"] = config.top_p
else:
raise ValueError("`top_p` must be > 0.0 and < 1.0")
3 changes: 2 additions & 1 deletion embedchain/llm/ollama.py
@@ -20,7 +20,8 @@ def __init__(self, config: Optional[BaseLlmConfig] = None):
def get_llm_model_answer(self, prompt):
return self._get_answer(prompt=prompt, config=self.config)

-def _get_answer(self, prompt: str, config: BaseLlmConfig) -> Union[str, Iterable]:
+@staticmethod
+def _get_answer(prompt: str, config: BaseLlmConfig) -> Union[str, Iterable]:
callback_manager = [StreamingStdOutCallbackHandler()] if config.stream else [StdOutCallbackHandler()]

llm = Ollama(
2 changes: 1 addition & 1 deletion embedchain/loaders/base_loader.py
@@ -5,7 +5,7 @@ class BaseLoader(JSONSerializable):
def __init__(self):
pass

-def load_data():
+def load_data(self, url):
"""
Implemented by child classes
"""
2 changes: 1 addition & 1 deletion embedchain/loaders/directory_loader.py
@@ -32,7 +32,7 @@ def load_data(self, path: str):
doc_id = hashlib.sha256((str(data_list) + str(directory_path)).encode()).hexdigest()

for error in self.errors:
-logging.warn(error)
+logging.warning(error)

return {"doc_id": doc_id, "data": data_list}

3 changes: 2 additions & 1 deletion embedchain/loaders/docs_site_loader.py
@@ -49,7 +49,8 @@ def _get_all_urls(self, url):
urls = [link for link in self.visited_links if urlparse(link).netloc == urlparse(url).netloc]
return urls

-def _load_data_from_url(self, url):
+@staticmethod
+def _load_data_from_url(url: str) -> list:
response = requests.get(url)
if response.status_code != 200:
logging.info(f"Failed to fetch the website: {response.status_code}")