From 16211ad5fe54c33b29f4f93ca3982ad168ed4dc0 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 03:07:50 +0000 Subject: [PATCH 01/39] feat: Add pyproject.toml and pre-commit --- .github/dependabot.yml | 10 +++++ .github/matchers/pylint.json | 32 ++++++++++++++ .pre-commit-config.yaml | 85 ++++++++++++++++++++++++++++++++++++ pyproject.toml | 53 ++++++++++++++++++++++ 4 files changed, 180 insertions(+) create mode 100644 .github/dependabot.yml create mode 100644 .github/matchers/pylint.json create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..3459d67 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,10 @@ +version: 2 +updates: + # Maintain dependencies for GitHub Actions + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + # Raise pull requests for version updates + # to pip against the `main` branch + target-branch: "main" \ No newline at end of file diff --git a/.github/matchers/pylint.json b/.github/matchers/pylint.json new file mode 100644 index 0000000..ee5a60b --- /dev/null +++ b/.github/matchers/pylint.json @@ -0,0 +1,32 @@ +{ + "problemMatcher": [ + { + "severity": "warning", + "pattern": [ + { + "regexp": "^([^:]+):(\\d+):(\\d+): ([A-DF-Z]\\d+): \\033\\[[\\d;]+m([^\\033]+).*$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ], + "owner": "pylint-warning" + }, + { + "severity": "error", + "pattern": [ + { + "regexp": "^([^:]+):(\\d+):(\\d+): (E\\d+): \\033\\[[\\d;]+m([^\\033]+).*$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ], + "owner": "pylint-error" + } + ] + } \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..c0dc4cf --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,85 @@ +exclude: | + (?x)^( + 
tests/utils/ + ) + +ci: + autoupdate_commit_msg: "chore: update pre-commit hooks" + autofix_commit_msg: "style: pre-commit fixes" + +repos: + - repo: https://github.com/psf/black + rev: "24.2.0" + hooks: + - id: black-jupyter + + - repo: https://github.com/asottile/blacken-docs + rev: "1.16.0" + hooks: + - id: blacken-docs + additional_dependencies: [black==23.7.0] + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: "v4.5.0" + hooks: + - id: check-added-large-files + - id: check-case-conflict + - id: check-merge-conflict + - id: check-symlinks + - id: check-yaml + - id: debug-statements + - id: end-of-file-fixer + - id: mixed-line-ending + - id: name-tests-test + args: ["--pytest-test-first"] + - id: requirements-txt-fixer + - id: trailing-whitespace + + - repo: https://github.com/pre-commit/pygrep-hooks + rev: "v1.10.0" + hooks: + - id: rst-backticks + - id: rst-directive-colons + - id: rst-inline-touching-normal + + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v4.0.0-alpha.8" + hooks: + - id: prettier + types_or: [yaml, markdown, html, css, scss, javascript, json] + args: [--prose-wrap=always] + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: "v0.3.2" + hooks: + - id: ruff + args: ["--fix", "--show-fixes"] + + - repo: https://github.com/pre-commit/mirrors-mypy + rev: "v1.9.0" + hooks: + - id: mypy + files: src + args: ["--ignore-missing-imports"] + additional_dependencies: + - pytest + + - repo: https://github.com/codespell-project/codespell + rev: "v2.2.6" + hooks: + - id: codespell + args: ["--write-changes", "--ignore-words", ".codespell-whitelist"] + + - repo: https://github.com/kynan/nbstripout + rev: 0.7.1 + hooks: + - id: nbstripout + args: [--extra-keys=metadata.kernelspec metadata.language_info.version] + + - repo: local + hooks: + - id: disallow-caps + name: Disallow improper capitalization + language: pygrep + entry: PyBind|Numpy|Cmake|CCache|Github|PyTest + exclude: .pre-commit-config.yaml \ No newline at 
end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..06eee31 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,53 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "vectordb2" +dynamic = ["version"] +description = "A lightweight Python package for storing and retrieving text using chunking, embedding, and vector search" +readme = "README.md" +license = "" +authors = [ + { name = "Vladimir Prelovac", email = "vlad@kagi.com" }, +] +keywords = [ + "chunking", + "embedding", + "search", + "text", + "vector", +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] +dependencies = [ + "faiss-cpu", + "numpy>=1.21.0", + "scikit-learn>=0.24.0", + "scipy>=1.7.0", + "sentence_transformers", + "tensorflow_text", + "torch>=1.9.0", + "transformers>=4.10.0", +] + +[project.urls] +Homepage = "https://github.com/kagisearch/vectordb" + +[tool.hatch.version] +path = "vectordb/__init__.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/vectordb", +] From 455e04448f8112a47ade1ed7ebdfc3f0deb60dac Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 03:10:34 +0000 Subject: [PATCH 02/39] refactor: Update with pre-commit --- .github/dependabot.yml | 2 +- .github/matchers/pylint.json | 62 +++++++-------- .github/workflows/pylint.yml | 24 +++--- .gitignore | 2 +- .pre-commit-config.yaml | 2 +- README.md | 147 ++++++++++++++++++++--------------- images/.init | 1 - setup.py | 4 +- vectordb/__init__.py | 1 - vectordb/memory.py | 19 ++++- vectordb/vector_search.py | 4 
+- 11 files changed, 151 insertions(+), 117 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 3459d67..500dfaa 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -7,4 +7,4 @@ updates: interval: "weekly" # Raise pull requests for version updates # to pip against the `main` branch - target-branch: "main" \ No newline at end of file + target-branch: "main" diff --git a/.github/matchers/pylint.json b/.github/matchers/pylint.json index ee5a60b..e3a6bd1 100644 --- a/.github/matchers/pylint.json +++ b/.github/matchers/pylint.json @@ -1,32 +1,32 @@ { - "problemMatcher": [ - { - "severity": "warning", - "pattern": [ - { - "regexp": "^([^:]+):(\\d+):(\\d+): ([A-DF-Z]\\d+): \\033\\[[\\d;]+m([^\\033]+).*$", - "file": 1, - "line": 2, - "column": 3, - "code": 4, - "message": 5 - } - ], - "owner": "pylint-warning" - }, - { - "severity": "error", - "pattern": [ - { - "regexp": "^([^:]+):(\\d+):(\\d+): (E\\d+): \\033\\[[\\d;]+m([^\\033]+).*$", - "file": 1, - "line": 2, - "column": 3, - "code": 4, - "message": 5 - } - ], - "owner": "pylint-error" - } - ] - } \ No newline at end of file + "problemMatcher": [ + { + "severity": "warning", + "pattern": [ + { + "regexp": "^([^:]+):(\\d+):(\\d+): ([A-DF-Z]\\d+): \\033\\[[\\d;]+m([^\\033]+).*$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ], + "owner": "pylint-warning" + }, + { + "severity": "error", + "pattern": [ + { + "regexp": "^([^:]+):(\\d+):(\\d+): (E\\d+): \\033\\[[\\d;]+m([^\\033]+).*$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ], + "owner": "pylint-error" + } + ] +} diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 383e65c..8713965 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -9,15 +9,15 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10"] steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: 
actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pylint - - name: Analysing the code with pylint - run: | - pylint $(git ls-files '*.py') + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pylint + - name: Analysing the code with pylint + run: | + pylint $(git ls-files '*.py') diff --git a/.gitignore b/.gitignore index 64fa3de..9bdda1c 100644 --- a/.gitignore +++ b/.gitignore @@ -160,4 +160,4 @@ cython_debug/ #.idea/ -*~ \ No newline at end of file +*~ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c0dc4cf..9217977 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -82,4 +82,4 @@ repos: name: Disallow improper capitalization language: pygrep entry: PyBind|Numpy|Cmake|CCache|Github|PyTest - exclude: .pre-commit-config.yaml \ No newline at end of file + exclude: .pre-commit-config.yaml diff --git a/README.md b/README.md index 7df1263..16ad2f8 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,20 @@ # VectorDB -[![](https://dcbadge.vercel.app/api/server/aDNg6E9szy?compact=true&style=flat)](https://discord.gg/aDNg6E9szy) [![Twitter](https://img.shields.io/twitter/follow/KagiHQ?style=social)](https://twitter.com/KagiHQ) [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/license/mit/) +[![](https://dcbadge.vercel.app/api/server/aDNg6E9szy?compact=true&style=flat)](https://discord.gg/aDNg6E9szy) +[![Twitter](https://img.shields.io/twitter/follow/KagiHQ?style=social)](https://twitter.com/KagiHQ) +[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/license/mit/) -VectorDB is a simple, lightweight, fully local, end-to-end 
solution for using embeddings-based text retrieval. +VectorDB is a simple, lightweight, fully local, end-to-end solution for using +embeddings-based text retrieval. -Thanks to its low latency and small memory footprint, VectorDB is used to power AI features inside [Kagi Search](https://kagi.com). - -Check an [example Colab notebook](https://colab.research.google.com/drive/1pecKGCCru_Jvx7v0WRNrW441EBlcS5qS#scrollTo=Eh6o8m7d8eOk) where this is used to filter the content of [Kagi Small Web](https://kagi.com/smallweb) RSS feed based on stated user interests. +Thanks to its low latency and small memory footprint, VectorDB is used to power +AI features inside [Kagi Search](https://kagi.com). +Check an +[example Colab notebook](https://colab.research.google.com/drive/1pecKGCCru_Jvx7v0WRNrW441EBlcS5qS#scrollTo=Eh6o8m7d8eOk) +where this is used to filter the content of +[Kagi Small Web](https://kagi.com/smallweb) RSS feed based on stated user +interests. ## Installation @@ -19,7 +26,9 @@ pip install vectordb2 ## Usage -Quick example that loads data into memory, and runs retrieval. All data will be handled locally, including embeddings and vector search, completely trasparent for the user with maximum possible performance. +Quick example that loads data into memory, and runs retrieval. All data will be +handled locally, including embeddings and vector search, completely trasparent +for the user with maximum possible performance. ```python from vectordb import Memory @@ -28,24 +37,31 @@ from vectordb import Memory memory = Memory() memory.save( - ["apples are green", "oranges are orange"], # save your text content. for long text we will automatically chunk it - [{"url": "https://apples.com"}, {"url": "https://oranges.com"}], # associate any kind of metadata with it (optional) + [ + "apples are green", + "oranges are orange", + ], # save your text content. 
for long text we will automatically chunk it + [ + {"url": "https://apples.com"}, + {"url": "https://oranges.com"}, + ], # associate any kind of metadata with it (optional) ) # Search for top n relevant results, automatically using embeddings query = "green" -results = memory.search(query, top_n = 1) +results = memory.search(query, top_n=1) print(results) ``` -This returns the chunks with the added metadata and the vector distance (where 0 is the exact match and higher means further apart) +This returns the chunks with the added metadata and the vector distance (where 0 +is the exact match and higher means further apart) ```json [ { "chunk": "apples are green", - "metadata": {"url": "https://apples.com"}, + "metadata": { "url": "https://apples.com" }, "distance": 0.87 } ] @@ -53,62 +69,72 @@ This returns the chunks with the added metadata and the vector distance (where 0 ## Options - **Memory(memory_file=None, chunking_strategy={"mode":"sliding_window"}, embeddings="normal")** +- `memory_file`: _Optional._ Path to the memory file. If provided, memory will + persist to disk and loaded/saved to this file. +- `chunking_strategy`: _Optional._ Dictionary containing the chunking mode. -- `memory_file`: *Optional.* Path to the memory file. If provided, memory will persist to disk and loaded/saved to this file. -- `chunking_strategy`: *Optional.* Dictionary containing the chunking mode. - - Options:\ - `{'mode':'sliding_window', 'window_size': 240, 'overlap': 8}` (default)\ + Options:\ + `{'mode':'sliding_window', 'window_size': 240, 'overlap': 8}` (default)\ `{'mode':'paragraph'}` -- `embeddings`: *Optional.* - - Options:\ + +- `embeddings`: _Optional._ + + Options:\ `fast` - Uses Universal Sentence Encoder 4\ `normal` - Uses "BAAI/bge-small-en-v1.5" (default)\ `best` - Uses "BAAI/bge-base-en-v1.5"\ `multilingual` - Uses Universal Sentence Encoder Multilingual Large 3 - - You can also specify a custom HuggingFace model by name eg. `TaylorAI/bge-micro-v2`. 
See also [Pretrained models](https://www.sbert.net/docs/pretrained_models.html) and [MTEB](https://huggingface.co/spaces/mteb/leaderboard). + You can also specify a custom HuggingFace model by name eg. + `TaylorAI/bge-micro-v2`. See also + [Pretrained models](https://www.sbert.net/docs/pretrained_models.html) and + [MTEB](https://huggingface.co/spaces/mteb/leaderboard). **Memory.save(texts, metadata, memory_file=None)** -Save content to memory. Metadata will be automatically optimized to use less resources. +Save content to memory. Metadata will be automatically optimized to use less +resources. -- `texts`: *Required.* Text or list of texts to be saved. -- `metdata`: *Optional.* Metadata or list of metadata associated with the texts. -- `memory_file`: *Optional.* Path to persist the memory file. By default +- `texts`: _Required._ Text or list of texts to be saved. +- `metdata`: _Optional._ Metadata or list of metadata associated with the texts. +- `memory_file`: _Optional._ Path to persist the memory file. By default **Memory.search(query, top_n=5, unique=False, batch_results="flatten")** Search inside memory. -- `query`: *Required.* Query text or list of queries (see `batch_results` option below for handling results for a list). -- `top_n`: *Optional.* Number of most similar chunks to return (default: 5). -- `unique`: *Optional.* Return only items chunks from unique original texts (additional chunks coming from the same text will be ignored). Note this may return less chhunks than requested (default: False). -- `batch_results`: *Optional.* When input is a list of queries, output algorithm can be "flatten" or "diverse". Flatten returns true nearest neighbours across all input queries, meaning all results could come from just one query. "diverse" attempts to spread out the results, so that each query's nearest neighbours are equally added (neareast first across all queries, than 2nd nearest and so on). 
(default: "flatten") +- `query`: _Required._ Query text or list of queries (see `batch_results` option + below for handling results for a list). +- `top_n`: _Optional._ Number of most similar chunks to return (default: 5). +- `unique`: _Optional._ Return only items chunks from unique original texts + (additional chunks coming from the same text will be ignored). Note this may + return less chhunks than requested (default: False). +- `batch_results`: _Optional._ When input is a list of queries, output algorithm + can be "flatten" or "diverse". Flatten returns true nearest neighbours across + all input queries, meaning all results could come from just one query. + "diverse" attempts to spread out the results, so that each query's nearest + neighbours are equally added (neareast first across all queries, than 2nd + nearest and so on). (default: "flatten") **Memory.clear()** Clears the memory. - **Memory.dump()** Prints the contents of the memory. - ## Example ```python from vectordb import Memory memory = Memory( - chunking_strategy={"mode": "sliding_window", "window_size": 128, "overlap": 16}, embeddings='TaylorAI/bge-micro-v2' + chunking_strategy={"mode": "sliding_window", "window_size": 128, "overlap": 16}, + embeddings="TaylorAI/bge-micro-v2", ) texts = [ @@ -184,6 +210,7 @@ print(results) ``` Output: + ```json [ { @@ -203,48 +230,46 @@ Output: "distance": 0.83 } ] - ``` ## Embeddings performance analysis - -We constantly evaluate embedding models using standardized benchmarks (higher is better). Average latency is measured locally on CPU (lower is better). Benchmark data pulled from [MTEB](https://huggingface.co/spaces/mteb/leaderboard). 
- - - -| Model | Latency | Benchmark 1 | Benchmark 2 | Benchmark 3 | Benchmark 4 | -|-----------------------------------------------|----------|-------------|-------------|-------------|-------------| -| all-mpnet-base-v2 | 6.12 s | 80.28 | 65.07 | 43.69 | 83.04 | -| all-MiniLM-L6-v2 | 1.14 s | 78.9 | 63.05 | 42.35 | 82.37 | -| BAAI/bge-large-en-v1.5 | 20.8 s | 83.11 | 75.97 | 46.08 | 87.12 | -| BAAI/bge-base-en-v1.5 | 6.48 s | 82.4 | 75.53 | 45.77 | 86.55 | -| BAAI/bge-small-en-v1.5 | 1.85 s | 81.59 | 74.14 | 43.82 | 84.92 | -| TaylorAI/bge-micro-v2 | 0.671 s | 78.65 | 68.04 | 39.18 | 82.81 | -| TaylorAI/gte-tiny | 1.25 s | 80.46 | 70.35 | 42.09 | 82.83 | -| thenlper/gte-base | 6.28 s | 82.3 | 73.01 | 46.2 | 84.57 | -| thenlper/gte-small | 2.14 s | 82.07 | 72.31 | 44.89 | 83.54 | -| universal-sentence-encoder-large/5 | 0.769 s | 74.05 | 67.9 | 37.82 | 79.53 | -| universal-sentence-encoder-multilingual-large/3| 1.02 s | 75.35 | 65.78 | 35.06 | 79.62 | -| universal-sentence-encoder-multilingual/3 | 0.162 s | 75.39 | 63.42 | 34.82 | 75.43 | -| universal-sentence-encoder/4 | 0.019 s | 72.04 | 64.45 | 35.71 | 76.23 | - -*Relative embeddings latency on CPU* +We constantly evaluate embedding models using standardized benchmarks (higher is +better). Average latency is measured locally on CPU (lower is better). Benchmark +data pulled from [MTEB](https://huggingface.co/spaces/mteb/leaderboard). 
+ +| Model | Latency | Benchmark 1 | Benchmark 2 | Benchmark 3 | Benchmark 4 | +| ----------------------------------------------- | ------- | ----------- | ----------- | ----------- | ----------- | +| all-mpnet-base-v2 | 6.12 s | 80.28 | 65.07 | 43.69 | 83.04 | +| all-MiniLM-L6-v2 | 1.14 s | 78.9 | 63.05 | 42.35 | 82.37 | +| BAAI/bge-large-en-v1.5 | 20.8 s | 83.11 | 75.97 | 46.08 | 87.12 | +| BAAI/bge-base-en-v1.5 | 6.48 s | 82.4 | 75.53 | 45.77 | 86.55 | +| BAAI/bge-small-en-v1.5 | 1.85 s | 81.59 | 74.14 | 43.82 | 84.92 | +| TaylorAI/bge-micro-v2 | 0.671 s | 78.65 | 68.04 | 39.18 | 82.81 | +| TaylorAI/gte-tiny | 1.25 s | 80.46 | 70.35 | 42.09 | 82.83 | +| thenlper/gte-base | 6.28 s | 82.3 | 73.01 | 46.2 | 84.57 | +| thenlper/gte-small | 2.14 s | 82.07 | 72.31 | 44.89 | 83.54 | +| universal-sentence-encoder-large/5 | 0.769 s | 74.05 | 67.9 | 37.82 | 79.53 | +| universal-sentence-encoder-multilingual-large/3 | 1.02 s | 75.35 | 65.78 | 35.06 | 79.62 | +| universal-sentence-encoder-multilingual/3 | 0.162 s | 75.39 | 63.42 | 34.82 | 75.43 | +| universal-sentence-encoder/4 | 0.019 s | 72.04 | 64.45 | 35.71 | 76.23 | + +_Relative embeddings latency on CPU_ ![Embeddings Latency on CPU](images/speed_cpu.png) -*Relative embeddings latency on GPU* +_Relative embeddings latency on GPU_ ![Embeddings Latency on GPU](images/speed_gpu.png) - ![Embeddings Quality](images/quality.png) ![Scatter of Embeddings](images/scatter.png) - - ## Vector search performance analysis -VectorDB is also optimized for speed of retrieval. We automatically uses [Faiss](https://github.com/facebookresearch/faiss) for low number of chunks (<4000) and [mrpt](https://github.com/vioshyvo/mrpt) for high number of chunks to ensure maximum performance across the spectrum of use cases. +VectorDB is also optimized for speed of retrieval. 
We automatically uses +[Faiss](https://github.com/facebookresearch/faiss) for low number of chunks +(<4000) and [mrpt](https://github.com/vioshyvo/mrpt) for high number of chunks +to ensure maximum performance across the spectrum of use cases. ![Vector search engine comparison](images/comparison.png) diff --git a/images/.init b/images/.init index 8b13789..e69de29 100644 --- a/images/.init +++ b/images/.init @@ -1 +0,0 @@ - diff --git a/setup.py b/setup.py index c09b245..abaa9eb 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -#pylint: disable = line-too-long, trailing-whitespace, trailing-newlines, line-too-long, missing-module-docstring, import-error, too-few-public-methods, too-many-instance-attributes, too-many-locals +# pylint: disable = line-too-long, trailing-whitespace, trailing-newlines, line-too-long, missing-module-docstring, import-error, too-few-public-methods, too-many-instance-attributes, too-many-locals from setuptools import setup, find_packages @@ -14,7 +14,7 @@ "scipy>=1.7.0", "sentence_transformers", "faiss-cpu", - "tensorflow_text" + "tensorflow_text", ], author="Vladimir Prelovac", author_email="vlad@kagi.com", diff --git a/vectordb/__init__.py b/vectordb/__init__.py index f1db8dc..6a368e6 100644 --- a/vectordb/__init__.py +++ b/vectordb/__init__.py @@ -1,3 +1,2 @@ # pylint: disable = line-too-long, trailing-whitespace, trailing-newlines, line-too-long, missing-module-docstring, import-error, too-few-public-methods, too-many-instance-attributes, too-many-locals -from .memory import Memory diff --git a/vectordb/memory.py b/vectordb/memory.py index a5a8d47..345988a 100644 --- a/vectordb/memory.py +++ b/vectordb/memory.py @@ -3,6 +3,7 @@ for text and associated metadata, with functionality for saving, searching, and managing memory entries. 
""" + # pylint: disable = line-too-long, trailing-whitespace, trailing-newlines, line-too-long, missing-module-docstring, import-error, too-few-public-methods, too-many-instance-attributes, too-many-locals from typing import List, Dict, Any, Union @@ -130,10 +131,16 @@ def save( self.memory.append(entry) if memory_file is not None: - Storage(self.memory_file).save_to_disk([{"memory": self.memory, "metadata" :self.metadata_memory}]) + Storage(self.memory_file).save_to_disk( + [{"memory": self.memory, "metadata": self.metadata_memory}] + ) def search( - self, query: str, top_n: int = 5, unique: bool = False, batch_results: str = "flatten" + self, + query: str, + top_n: int = 5, + unique: bool = False, + batch_results: str = "flatten", ) -> List[Dict[str, Any]]: """ Searches for the most similar chunks to the given query in memory. @@ -154,7 +161,9 @@ def search( if len(embeddings) == 0: return [] - indices = self.vector_search.search_vectors(query_embedding, embeddings, top_n, batch_results) + indices = self.vector_search.search_vectors( + query_embedding, embeddings, top_n, batch_results + ) if unique: unique_indices = [] seen_text_indices = set() # Change the variable name @@ -192,7 +201,9 @@ def clear(self): self.text_index_counter = 0 if self.memory_file is not None: - Storage(self.memory_file).save_to_disk([{"memory": self.memory, "metadata" :self.metadata_memory}]) + Storage(self.memory_file).save_to_disk( + [{"memory": self.memory, "metadata": self.metadata_memory}] + ) def dump(self): """ diff --git a/vectordb/vector_search.py b/vectordb/vector_search.py index 04075eb..7ac3763 100644 --- a/vectordb/vector_search.py +++ b/vectordb/vector_search.py @@ -51,7 +51,7 @@ def get_unique_k_elements(i, d, k=15, diverse=False): dd.append(dist) if len(ii) >= k: break - + return np.array(ii), np.array(dd) @staticmethod @@ -114,7 +114,7 @@ def search_vectors( :param top_n: the number of most similar vectors to return. 
:param batch_results: when input is a list of vectors, output algo can be "flatten" or "diverse" :return: a list of indices of the top_n most similar vectors in the embeddings. - + """ if isinstance(query_embedding, list): query_embedding = np.array(query_embedding).astype(np.float32) From ddfe3331356cf8595f9e682b0632eace0a06e746 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 03:11:26 +0000 Subject: [PATCH 03/39] refactor: Remove extra whitespace --- vectordb/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vectordb/__init__.py b/vectordb/__init__.py index 6a368e6..c7b3add 100644 --- a/vectordb/__init__.py +++ b/vectordb/__init__.py @@ -1,2 +1 @@ # pylint: disable = line-too-long, trailing-whitespace, trailing-newlines, line-too-long, missing-module-docstring, import-error, too-few-public-methods, too-many-instance-attributes, too-many-locals - From 1aa706fccab420e7ef08674428f307daed82eae7 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 03:28:31 +0000 Subject: [PATCH 04/39] feat: Add codespell-whitelist for pre-commit and fix spellings in README and embedding --- .codespell-whitelist | 0 .github/workflows/ci.yml | 74 ++++++++++++++++++++++++++++++++++++ .github/workflows/pylint.yml | 23 ----------- README.md | 6 +-- vectordb/embedding.py | 2 +- 5 files changed, 78 insertions(+), 27 deletions(-) create mode 100644 .codespell-whitelist create mode 100644 .github/workflows/ci.yml delete mode 100644 .github/workflows/pylint.yml diff --git a/.codespell-whitelist b/.codespell-whitelist new file mode 100644 index 0000000..e69de29 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..c2799e2 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,74 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: 
https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: CI + +on: + workflow_dispatch: + pull_request: + push: + branches: + - main + - dev + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + FORCE_COLOR: 3 + PROJECT_NAME: "caustics" + +jobs: + build: + runs-on: ${{matrix.os}} + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11"] + os: [ubuntu-latest, windows-latest, macOS-latest] + + steps: + - name: Checkout caustics + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + allow-prereleases: true + + - name: Record State + run: | + pwd + echo github.ref is: ${{ github.ref }} + echo GITHUB_SHA is: $GITHUB_SHA + echo github.event_name is: ${{ github.event_name }} + echo github workspace: ${{ github.workspace }} + pip --version + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest pytest-cov torch wheel + + # We only want to install this on one run, because otherwise we'll have + # duplicate annotations. 
+ - name: Install error reporter + if: ${{ matrix.python-version == '3.10' }} + run: | + python -m pip install pytest-github-actions-annotate-failures + + - name: Install Caustics + run: | + pip install -e ".[dev]" + pip show ${{ env.PROJECT_NAME }} + + - name: Test with pytest + run: | + pytest -vvv --cov=${{ env.PROJECT_NAME }} --cov-report=xml --cov-report=term tests/ + + - name: Upload coverage reports to Codecov with GitHub Action + uses: codecov/codecov-action@v4 \ No newline at end of file diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml deleted file mode 100644 index 8713965..0000000 --- a/.github/workflows/pylint.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Pylint - -on: [push] - -jobs: - build: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.8", "3.9", "3.10"] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pylint - - name: Analysing the code with pylint - run: | - pylint $(git ls-files '*.py') diff --git a/README.md b/README.md index 16ad2f8..201604a 100644 --- a/README.md +++ b/README.md @@ -20,14 +20,14 @@ interests. To install VectorDB, use pip: -``` +```shell pip install vectordb2 ``` ## Usage Quick example that loads data into memory, and runs retrieval. All data will be -handled locally, including embeddings and vector search, completely trasparent +handled locally, including embeddings and vector search, completely transparent for the user with maximum possible performance. ```python @@ -99,7 +99,7 @@ Save content to memory. Metadata will be automatically optimized to use less resources. - `texts`: _Required._ Text or list of texts to be saved. -- `metdata`: _Optional._ Metadata or list of metadata associated with the texts. 
+- `metadata`: _Optional._ Metadata or list of metadata associated with the texts. - `memory_file`: _Optional._ Path to persist the memory file. By default **Memory.search(query, top_n=5, unique=False, batch_results="flatten")** diff --git a/vectordb/embedding.py b/vectordb/embedding.py index c3809ef..80b150e 100644 --- a/vectordb/embedding.py +++ b/vectordb/embedding.py @@ -33,7 +33,7 @@ def __init__(self, model_name: str = "normal"): for embeddings. """ self.sbert = True - print("Initiliazing embeddings: ", model_name) + print("Initializing embeddings: ", model_name) if model_name == "fast": self.model = hub.load( "https://tfhub.dev/google/universal-sentence-encoder/4" From 2f6fd08c258db721452ab180741d93171e27475f Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 03:31:45 +0000 Subject: [PATCH 05/39] feat: Add requirements file --- requirements.txt | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d5aeebf --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +torch>=1.9.0, +transformers>=4.10.0, +numpy>=1.21.0, +scikit-learn>=0.24.0, +scipy>=1.7.0, +sentence_transformers, +faiss-cpu, +tensorflow_text \ No newline at end of file From 6968b2d6ef69ff8a63652b105a594131b1002f2f Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 03:34:30 +0000 Subject: [PATCH 06/39] feat: Add CI for main and dev branches --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c2799e2..87d1654 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ concurrency: env: FORCE_COLOR: 3 - PROJECT_NAME: "caustics" + PROJECT_NAME: "vectordb2" jobs: build: From 6584cbca28966c994ff0e26cdf8f17113217bb70 Mon Sep 17 00:00:00 2001 From: Cordero 
Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 03:36:01 +0000 Subject: [PATCH 07/39] refactor: Fix PEP8 issues with README, requirements, and ci files --- .github/workflows/ci.yml | 2 +- README.md | 3 ++- requirements.txt | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 87d1654..6c417cb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -71,4 +71,4 @@ jobs: pytest -vvv --cov=${{ env.PROJECT_NAME }} --cov-report=xml --cov-report=term tests/ - name: Upload coverage reports to Codecov with GitHub Action - uses: codecov/codecov-action@v4 \ No newline at end of file + uses: codecov/codecov-action@v4 diff --git a/README.md b/README.md index 201604a..a9e899d 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,8 @@ Save content to memory. Metadata will be automatically optimized to use less resources. - `texts`: _Required._ Text or list of texts to be saved. -- `metadata`: _Optional._ Metadata or list of metadata associated with the texts. +- `metadata`: _Optional._ Metadata or list of metadata associated with the + texts. - `memory_file`: _Optional._ Path to persist the memory file. 
By default **Memory.search(query, top_n=5, unique=False, batch_results="flatten")** diff --git a/requirements.txt b/requirements.txt index d5aeebf..6ff77e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -torch>=1.9.0, -transformers>=4.10.0, +faiss-cpu, numpy>=1.21.0, scikit-learn>=0.24.0, scipy>=1.7.0, sentence_transformers, -faiss-cpu, -tensorflow_text \ No newline at end of file +tensorflow_text +torch>=1.9.0, +transformers>=4.10.0, From 4e22f4a3eae3e5687db54dfd4c65cfbeb9a15914 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 04:07:12 +0000 Subject: [PATCH 08/39] refactor: Move vectordb to src and update __init__ --- .codespell-whitelist | 1 + pyproject.toml | 6 +++--- {vectordb => src/vectordb}/__init__.py | 4 ++++ {vectordb => src/vectordb}/chunking.py | 0 {vectordb => src/vectordb}/embedding.py | 0 {vectordb => src/vectordb}/memory.py | 0 {vectordb => src/vectordb}/storage.py | 0 {vectordb => src/vectordb}/vector_search.py | 0 8 files changed, 8 insertions(+), 3 deletions(-) rename {vectordb => src/vectordb}/__init__.py (67%) rename {vectordb => src/vectordb}/chunking.py (100%) rename {vectordb => src/vectordb}/embedding.py (100%) rename {vectordb => src/vectordb}/memory.py (100%) rename {vectordb => src/vectordb}/storage.py (100%) rename {vectordb => src/vectordb}/vector_search.py (100%) diff --git a/.codespell-whitelist b/.codespell-whitelist index e69de29..db71550 100644 --- a/.codespell-whitelist +++ b/.codespell-whitelist @@ -0,0 +1 @@ +kagisearch \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 06eee31..67d12ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "vectordb2" dynamic = ["version"] description = "A lightweight Python package for storing and retrieving text using chunking, embedding, and vector search" readme = "README.md" -license = "" +license = "LICENSE" authors = [ { name = "Vladimir Prelovac", email = 
"vlad@kagi.com" }, ] @@ -45,9 +45,9 @@ dependencies = [ Homepage = "https://github.com/kagisearch/vectordb" [tool.hatch.version] -path = "vectordb/__init__.py" +path = "/src/vectordb/__init__.py" [tool.hatch.build.targets.sdist] include = [ - "/vectordb", + "/src/vectordb", ] diff --git a/vectordb/__init__.py b/src/vectordb/__init__.py similarity index 67% rename from vectordb/__init__.py rename to src/vectordb/__init__.py index c7b3add..6b2dc16 100644 --- a/vectordb/__init__.py +++ b/src/vectordb/__init__.py @@ -1 +1,5 @@ # pylint: disable = line-too-long, trailing-whitespace, trailing-newlines, line-too-long, missing-module-docstring, import-error, too-few-public-methods, too-many-instance-attributes, too-many-locals +from ._version import version as VERSION # noqa + +__version__ = VERSION +__author__ = "kagisearch" \ No newline at end of file diff --git a/vectordb/chunking.py b/src/vectordb/chunking.py similarity index 100% rename from vectordb/chunking.py rename to src/vectordb/chunking.py diff --git a/vectordb/embedding.py b/src/vectordb/embedding.py similarity index 100% rename from vectordb/embedding.py rename to src/vectordb/embedding.py diff --git a/vectordb/memory.py b/src/vectordb/memory.py similarity index 100% rename from vectordb/memory.py rename to src/vectordb/memory.py diff --git a/vectordb/storage.py b/src/vectordb/storage.py similarity index 100% rename from vectordb/storage.py rename to src/vectordb/storage.py diff --git a/vectordb/vector_search.py b/src/vectordb/vector_search.py similarity index 100% rename from vectordb/vector_search.py rename to src/vectordb/vector_search.py From 70b053a1fafd133cb15037dc898f433bfd7d4294 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 06:09:46 +0000 Subject: [PATCH 09/39] refactor: Update the pyproject.toml to dynamically handle versions --- pyproject.toml | 50 +++++++++++++++++++++++++++------------- requirements.txt | 14 +++++------ 
src/vectordb/_version.py | 16 +++++++++++++ tests/__init__.py | 0 4 files changed, 57 insertions(+), 23 deletions(-) create mode 100644 src/vectordb/_version.py create mode 100644 tests/__init__.py diff --git a/pyproject.toml b/pyproject.toml index 67d12ec..76dca15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,17 @@ [build-system] -requires = ["hatchling"] +requires = ["hatchling", "hatch-requirements-txt", "hatch-vcs"] build-backend = "hatchling.build" [project] name = "vectordb2" -dynamic = ["version"] +dynamic = [ + "dependencies", + "version" +] description = "A lightweight Python package for storing and retrieving text using chunking, embedding, and vector search" readme = "README.md" -license = "LICENSE" +requires-python = ">=3.8" +license = {file = "LICENSE"} authors = [ { name = "Vladimir Prelovac", email = "vlad@kagi.com" }, ] @@ -30,24 +34,38 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ] -dependencies = [ - "faiss-cpu", - "numpy>=1.21.0", - "scikit-learn>=0.24.0", - "scipy>=1.7.0", - "sentence_transformers", - "tensorflow_text", - "torch>=1.9.0", - "transformers>=4.10.0", -] [project.urls] Homepage = "https://github.com/kagisearch/vectordb" -[tool.hatch.version] -path = "/src/vectordb/__init__.py" +[project.optional-dependencies] +dev = [ + "pytest>=8.0,<9", + "pytest-cov>=4.1,<5", + "pytest-mock>=3.12,<4", + "pre-commit>=3.6,<4" +] + +[tool.hatch.metadata] +allow-direct-references = true [tool.hatch.build.targets.sdist] include = [ - "/src/vectordb", + "src/vectordb", ] + +[tool.hatch.metadata.hooks.requirements_txt] +files = ["requirements.txt"] + +[tool.hatch.version] +source = "vcs" + +[tool.hatch.build.hooks.vcs] +version-file = "src/vectordb/_version.py" + +[tool.hatch.version.raw-options] +local_scheme = "no-local-version" + +[tool.ruff] +# Same as Black. 
+line-length = 100 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 6ff77e6..1417962 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -faiss-cpu, -numpy>=1.21.0, -scikit-learn>=0.24.0, -scipy>=1.7.0, -sentence_transformers, +faiss-cpu +numpy>=1.21.0 +scikit-learn>=0.24.0 +scipy>=1.7.0 +sentence_transformers tensorflow_text -torch>=1.9.0, -transformers>=4.10.0, +torch>=1.9.0 +transformers>=4.10.0 diff --git a/src/vectordb/_version.py b/src/vectordb/_version.py new file mode 100644 index 0000000..656908c --- /dev/null +++ b/src/vectordb/_version.py @@ -0,0 +1,16 @@ +# file generated by setuptools_scm +# don't change, don't track in version control +TYPE_CHECKING = False +if TYPE_CHECKING: + from typing import Tuple, Union + VERSION_TUPLE = Tuple[Union[int, str], ...] +else: + VERSION_TUPLE = object + +version: str +__version__: str +__version_tuple__: VERSION_TUPLE +version_tuple: VERSION_TUPLE + +__version__ = version = '0.1.dev104' +__version_tuple__ = version_tuple = (0, 1, 'dev104') diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 From eaefab85d0c8ab0e518d28e1d2725884533aebaa Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 06:18:06 +0000 Subject: [PATCH 10/39] docs: Add Makefile and JupyterBook requirements --- CODE_OF_CONDUCT.md | 119 ++++++++++++++++++++++++++++++++++++++++++ docs/Makefile | 19 +++++++ docs/requirements.txt | 6 +++ 3 files changed, 144 insertions(+) create mode 100644 CODE_OF_CONDUCT.md create mode 100644 docs/Makefile create mode 100644 docs/requirements.txt diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..ef04fe7 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,119 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience 
for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +- Demonstrating empathy and kindness toward other people +- Being respectful of differing opinions, viewpoints, and experiences +- Giving and gracefully accepting constructive feedback +- Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +- Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +- The use of sexualized language or imagery, and sexual attention or advances of + any kind +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or email address, + without their explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. 
+ +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +[connor.stone@mila.quebec](mailto:connor.stone@mila.quebec). All complaints will +be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. 
+ +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the +[Contributor Covenant](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html), +version 2.1. \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..298ea9e --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,19 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..c9cb7bf --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,6 @@ +ipywidgets +jupyter-book +matplotlib +pyro-ppl +sphinx +sphinx_rtd_theme \ No newline at end of file From 3eca2439d119e809a7359cfaaadd89e1900d0ee6 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 07:00:57 +0000 Subject: [PATCH 11/39] docs: Add community health files --- .codespell-whitelist | 2 +- .devcontainer/devcontainer.json | 0 .devcontainer/environment.yml | 0 .devcontainer/postBuild.sh | 0 .pre-commit-config.yaml | 16 ++-- CODE_OF_CONDUCT.md | 6 +- CONTRIBUTING.md | 11 +++ docs/Makefile | 2 +- docs/requirements.txt | 2 +- docs/source/_toc.yml | 17 +++++ docs/source/contributing.rst | 130 ++++++++++++++++++++++++++++++++ docs/source/install.rst | 30 ++++++++ docs/source/license.rst | 24 ++++++ noxfile.py | 53 +++++++++++++ pyproject.toml | 2 +- src/vectordb/__init__.py | 2 +- src/vectordb/_version.py | 5 +- 17 files changed, 284 insertions(+), 18 deletions(-) create mode 100644 .devcontainer/devcontainer.json create mode 100644 .devcontainer/environment.yml create mode 100644 .devcontainer/postBuild.sh create mode 100644 CONTRIBUTING.md create mode 100644 docs/source/_toc.yml create mode 100644 docs/source/contributing.rst create mode 100644 docs/source/install.rst create mode 100644 docs/source/license.rst create mode 100644 noxfile.py diff --git a/.codespell-whitelist b/.codespell-whitelist index db71550..dd29978 100644 --- a/.codespell-whitelist +++ b/.codespell-whitelist @@ -1 +1 @@ -kagisearch \ No newline at end of file +kagisearch diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..e69de29 diff --git a/.devcontainer/environment.yml 
b/.devcontainer/environment.yml new file mode 100644 index 0000000..e69de29 diff --git a/.devcontainer/postBuild.sh b/.devcontainer/postBuild.sh new file mode 100644 index 0000000..e69de29 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9217977..9e5bf1e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -55,14 +55,14 @@ repos: - id: ruff args: ["--fix", "--show-fixes"] - - repo: https://github.com/pre-commit/mirrors-mypy - rev: "v1.9.0" - hooks: - - id: mypy - files: src - args: ["--ignore-missing-imports"] - additional_dependencies: - - pytest + # - repo: https://github.com/pre-commit/mirrors-mypy + # rev: "v1.9.0" + # hooks: + # - id: mypy + # files: src + # args: ["--ignore-missing-imports"] + # additional_dependencies: + # - pytest - repo: https://github.com/codespell-project/codespell rev: "v2.2.6" diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index ef04fe7..45f341c 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -60,8 +60,8 @@ representative at an online or offline event. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at -[connor.stone@mila.quebec](mailto:connor.stone@mila.quebec). All complaints will -be reviewed and investigated promptly and fairly. +[vlad@kagi.com](mailto:vlad@kagi.com). All complaints will be +reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. @@ -116,4 +116,4 @@ community. This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html), -version 2.1. \ No newline at end of file +version 2.1. 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..72f51e5 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,11 @@ +## Contributing to Vectordb + +Thank you for your interest in contributing to vectordb! We welcome +contributions from the community to help improve our project. + +To get started, please refer to our +[online documentation](https://vectordb.readthedocs.io/en/latest/contributing.html) +for detailed guidelines on how to contribute to vectordb. + +We appreciate your contributions and look forward to your involvement in making +vectordb even better! \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile index 298ea9e..5128596 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -16,4 +16,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/requirements.txt b/docs/requirements.txt index c9cb7bf..69c8e43 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,4 +3,4 @@ jupyter-book matplotlib pyro-ppl sphinx -sphinx_rtd_theme \ No newline at end of file +sphinx_rtd_theme diff --git a/docs/source/_toc.yml b/docs/source/_toc.yml new file mode 100644 index 0000000..b9773cd --- /dev/null +++ b/docs/source/_toc.yml @@ -0,0 +1,17 @@ +# Table of contents +# Learn more at https://jupyterbook.org/customize/toc.html + +format: jb-book +root: intro +chapters: + - file: getting_started + - file: install + - file: examples/index + sections: + - file: examples/Example_ImageFit_LM + - file: contributing + - file: citation + - file: license + - file: modules + - file: glossary + - file: genindex \ No newline at end of file diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst new file mode 100644 index 
0000000..b2eeaed --- /dev/null +++ b/docs/source/contributing.rst @@ -0,0 +1,130 @@ +See the `Scientific Python Developer Guide <https://learn.scientific-python.org/development/>`_ for a detailed +description of best practices for developing scientific packages. + +Quick development +----------------- + +The fastest way to start with development is to use ``nox``. This will set up a +virtual environment for you to run all the checks and tests. There are 2 ways to +install ``nox``: + +Codespaces +~~~~~~~~~~ + +Nox is pre-installed in the Codespaces environment. So, after activating a +Codespace, you can just open the terminal and run ``nox`` to run all the checks +and tests. + +Local +~~~~~ + +If you don't have nox, you can do the following to install ``nox``: + +.. code-block:: bash + + pip install nox + +If you use macOS, then ``nox`` is in brew: + +.. code-block:: bash + + brew install nox + +Nox basics +~~~~~~~~~~ + +What is it? +^^^^^^^^^^^ + +``nox`` is a command-line tool that automates testing in multiple Python +environments, similar to tox. Unlike tox, Nox uses a standard Python file for +configuration, you can find this configuration in ``noxfile.py``. + +How do I use it? +^^^^^^^^^^^^^^^^ + +To use, run ``nox``. This will lint and test using every installed version of +Python on your system, skipping ones that are not installed. You can also run +specific jobs: + +.. code-block:: bash + + nox -s lint # Lint only + nox -s tests # Python tests + nox -s build # Make an SDist and wheel + +Nox handles everything for you, including setting up a temporary virtual +environment for each run. + +Setting up a development environment manually +--------------------------------------------- + +You can set up a development environment by running: + +.. code-block:: bash + + python3 -m venv .venv + source ./.venv/bin/activate + pip install -v -e .[dev] + +If you have the +`Python Launcher for Unix <https://github.com/brettcannon/python-launcher>`_, you +can instead do: + +.. 
code-block:: bash + + py -m venv .venv + py -m pip install -v -e .[dev] + +Post setup +---------- + +You should prepare pre-commit, which will help you by checking that commits pass +required checks: + +.. code-block:: bash + + pip install pre-commit # or brew install pre-commit on macOS + pre-commit install # Will install a pre-commit hook into the git repo + +You can also/alternatively run ``pre-commit run`` (changes only) or +``pre-commit run --all-files`` to check even without installing the hook. + +Testing +------- + +Use pytest to run the unit checks: + +.. code-block:: bash + + pytest + +Coverage +-------- + +Use pytest-cov to generate coverage reports: + +.. code-block:: bash + + pytest --cov=vectordb2 + +Pre-commit +---------- + +This project uses pre-commit for all style checking. While you can run it with +nox, this is such an important tool that it deserves to be installed on its own. +Install pre-commit and run: + +.. code-block:: bash + + pre-commit run -a + +to check all files. + +Code of Conduct +--------------- + +By contributing to this project, you agree to abide by the `Code of Conduct +`_. +Please make sure to read and understand the guidelines outlined in the Code +of Conduct before making any contributions. \ No newline at end of file diff --git a/docs/source/install.rst b/docs/source/install.rst new file mode 100644 index 0000000..96bd85c --- /dev/null +++ b/docs/source/install.rst @@ -0,0 +1,30 @@ + +Installation +============ + +Regular Install +--------------- + +The easiest way to install is to make a new virtual environment then run:: + + pip install vectordb2 + +this will install all the required libraries and then install vectordb and you are ready to go! You can check out the tutorials afterwards to see some of vectordb's capabilities. 
+ + +Developer Install +----------------- + +First clone the repo with:: + + git clone git@github.com:kagisearch/vectordb.git + +this will create a directory ``vectordb`` wherever you ran the command. Next go into the directory and install in developer mode:: + + pip install -e ".[dev]" + +this will install all relevant libraries and then install vectordb in an editable format so any changes you make to the code will be included next time you import the package. To start making changes you should immediately create a new branch:: + + git checkout -b <new_branch_name> + +you can edit this branch however you like. If you are happy with the results and want to share with the rest of the community, then follow the contributors guide to create a pull request! \ No newline at end of file diff --git a/docs/source/license.rst b/docs/source/license.rst new file mode 100644 index 0000000..48bbc9f --- /dev/null +++ b/docs/source/license.rst @@ -0,0 +1,24 @@ +License +======= + +MIT License + +Copyright (c) [2023] [vectordb authors] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/noxfile.py b/noxfile.py new file mode 100644 index 0000000..95165cf --- /dev/null +++ b/noxfile.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +import shutil +from pathlib import Path + +import nox + +DIR = Path(__file__).parent.resolve() + +nox.options.sessions = ["lint", "pylint", "tests", "build"] + + +@nox.session +def lint(session: nox.Session) -> None: + """ + Run the linter. + """ + session.install("pre-commit") + session.run("pre-commit", "run", "--all-files", *session.posargs) + + +@nox.session +def pylint(session: nox.Session) -> None: + """ + Run PyLint. + """ + # This needs to be installed into the package environment, and is slower + # than a pre-commit check + session.install(".", "pylint") + session.run("pylint", "src", *session.posargs) + + +@nox.session +def tests(session: nox.Session) -> None: + """ + Run the unit and regular tests. Use --cov to activate coverage. + """ + session.install(".[dev]") + session.run("pytest", *session.posargs) + + +@nox.session +def build(session: nox.Session) -> None: + """ + Build an SDist and wheel. + """ + + build_p = DIR.joinpath("build") + if build_p.exists(): + shutil.rmtree(build_p) + + session.install("build") + session.run("python", "-m", "build") diff --git a/pyproject.toml b/pyproject.toml index 76dca15..c33ce68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,4 +68,4 @@ local_scheme = "no-local-version" [tool.ruff] # Same as Black. 
-line-length = 100 \ No newline at end of file +line-length = 100 diff --git a/src/vectordb/__init__.py b/src/vectordb/__init__.py index 6b2dc16..00985c8 100644 --- a/src/vectordb/__init__.py +++ b/src/vectordb/__init__.py @@ -2,4 +2,4 @@ from ._version import version as VERSION # noqa __version__ = VERSION -__author__ = "kagisearch" \ No newline at end of file +__author__ = "kagisearch" diff --git a/src/vectordb/_version.py b/src/vectordb/_version.py index 656908c..1e4bd65 100644 --- a/src/vectordb/_version.py +++ b/src/vectordb/_version.py @@ -3,6 +3,7 @@ TYPE_CHECKING = False if TYPE_CHECKING: from typing import Tuple, Union + VERSION_TUPLE = Tuple[Union[int, str], ...] else: VERSION_TUPLE = object @@ -12,5 +13,5 @@ __version_tuple__: VERSION_TUPLE version_tuple: VERSION_TUPLE -__version__ = version = '0.1.dev104' -__version_tuple__ = version_tuple = (0, 1, 'dev104') +__version__ = version = "0.1.dev104" +__version_tuple__ = version_tuple = (0, 1, "dev104") From 1f89aa90338947fb5104c836eb60826dcd683d83 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 07:30:03 +0000 Subject: [PATCH 12/39] docs: Add getting started/intro documentation --- .devcontainer/environment.yml | 17 +++++++++++ .devcontainer/postBuild.sh | 4 +++ docs/source/_toc.yml | 5 +-- docs/source/examples/ai_rss_reader.ipynb | 39 ++++++++++++++++++++++++ docs/source/getting_started.rst | 14 +++++++++ 5 files changed, 75 insertions(+), 4 deletions(-) create mode 100644 docs/source/examples/ai_rss_reader.ipynb create mode 100644 docs/source/getting_started.rst diff --git a/.devcontainer/environment.yml b/.devcontainer/environment.yml index e69de29..b4a2ea2 100644 --- a/.devcontainer/environment.yml +++ b/.devcontainer/environment.yml @@ -0,0 +1,17 @@ +channels: + - conda-forge +dependencies: + - python=3.8 + - jupyterlab + - jupyterlab-git + - faiss-cpu + - numpy>=1.21.0 + - scikit-learn>=0.24.0 + - scipy>=1.7.0 + - sentence_transformers + - 
tensorflow_text + - torch>=1.9.0 + - transformers>=4.10.0 + - pre-commit + - nox + - pip \ No newline at end of file diff --git a/.devcontainer/postBuild.sh b/.devcontainer/postBuild.sh index e69de29..d08d23b 100644 --- a/.devcontainer/postBuild.sh +++ b/.devcontainer/postBuild.sh @@ -0,0 +1,4 @@ +# These commands will be run after the devcontainer is built. + +# Install vectordb locally for development +python3 -m pip install -e . \ No newline at end of file diff --git a/docs/source/_toc.yml b/docs/source/_toc.yml index b9773cd..7a6610e 100644 --- a/docs/source/_toc.yml +++ b/docs/source/_toc.yml @@ -8,10 +8,7 @@ chapters: - file: install - file: examples/index sections: - - file: examples/Example_ImageFit_LM + - file: examples/ai_rss_reader - file: contributing - - file: citation - file: license - - file: modules - - file: glossary - file: genindex \ No newline at end of file diff --git a/docs/source/examples/ai_rss_reader.ipynb b/docs/source/examples/ai_rss_reader.ipynb new file mode 100644 index 0000000..b1ad7e5 --- /dev/null +++ b/docs/source/examples/ai_rss_reader.ipynb @@ -0,0 +1,39 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install feedparser vectordb2" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst new file mode 100644 index 0000000..ca35bc4 --- /dev/null +++ b/docs/source/getting_started.rst @@ -0,0 +1,14 @@ + +Getting Started +=============== + 
+Install +------- + +Please follow the instructions on the :doc:`install` page. For most users, the basic pip install is all that's needed. + + +Read The Docs +------------- + +Docs for all the main functions in vectordb are available at :doc:`vectordb` at varying degrees of completeness. Further development of the docs is always ongoing. \ No newline at end of file From 239a83ceb604d986e4d5cf791110b7d709c1af04 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 07:34:26 +0000 Subject: [PATCH 13/39] feat: Add Codespaces functionality --- .devcontainer/devcontainer.json | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index e69de29..a34204c 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,14 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +{ + "image":"quay.io/pangeo/base-image:latest", + + "customizations": { + "vscode": { + "extensions": [ + "ms-toolsai.jupyter", + "ms-python.python" + ] + } + }, + "postCreateCommand": "sh .devcontainer/postBuild.sh" +} \ No newline at end of file From dc2d639604139f4b7abe34027ab48dcdb10c80f6 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 07:43:10 +0000 Subject: [PATCH 14/39] feat: Add apt file to install important pkgs to Codespace --- .devcontainer/apt.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .devcontainer/apt.txt diff --git a/.devcontainer/apt.txt b/.devcontainer/apt.txt new file mode 100644 index 0000000..ba414aa --- /dev/null +++ b/.devcontainer/apt.txt @@ -0,0 +1,4 @@ +git +ncdu +wget +curl \ No newline at end of file From 9c63a1f06a462d19598d1b7c7734ab42956f7a93 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 00:47:46 -0700 Subject: [PATCH 15/39] fix: 
Add shell library installs to postBuild --- .devcontainer/postBuild.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.devcontainer/postBuild.sh b/.devcontainer/postBuild.sh index d08d23b..fb97747 100644 --- a/.devcontainer/postBuild.sh +++ b/.devcontainer/postBuild.sh @@ -1,4 +1,5 @@ # These commands will be run after the devcontainer is built. # Install vectordb locally for development -python3 -m pip install -e . \ No newline at end of file +python3 -m pip install -e . +apt-get install git ncdu wget curl From 7195b56d8096e64fd879b944c4e5f9740ac8da15 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 00:56:40 -0700 Subject: [PATCH 16/39] fix: Add shell package install to devcontainer --- .devcontainer/devcontainer.json | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index a34204c..f760360 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -10,5 +10,13 @@ ] } }, - "postCreateCommand": "sh .devcontainer/postBuild.sh" -} \ No newline at end of file + "postCreateCommand": "sh .devcontainer/postBuild.sh", + "features": { + "ghcr.io/devcontainers-contrib/features/black:2": {}, + "ghcr.io/devcontainers-contrib/features/pylint:2": {}, + "ghcr.io/devcontainers/features/git:1": {}, + "ghcr.io/devcontainers-contrib/features/curl-apt-get:1": {}, + "ghcr.io/devcontainers-contrib/features/ncdu:1": {}, + "ghcr.io/devcontainers-contrib/features/wget-apt-get:1": {} + } +} From d883e9dafb761407f89b5ad30577e3ea1a2307b8 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 00:58:13 -0700 Subject: [PATCH 17/39] refactor: Delete .devcontainer/apt.txt --- .devcontainer/apt.txt | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 .devcontainer/apt.txt diff --git a/.devcontainer/apt.txt b/.devcontainer/apt.txt deleted 
file mode 100644 index ba414aa..0000000 --- a/.devcontainer/apt.txt +++ /dev/null @@ -1,4 +0,0 @@ -git -ncdu -wget -curl \ No newline at end of file From 69666bf0b788f1209445a1665dab8e96092624a6 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 01:03:28 -0700 Subject: [PATCH 18/39] refactor: Install env pkgs in postBuild --- .devcontainer/postBuild.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.devcontainer/postBuild.sh b/.devcontainer/postBuild.sh index fb97747..4f40f74 100644 --- a/.devcontainer/postBuild.sh +++ b/.devcontainer/postBuild.sh @@ -1,5 +1,5 @@ # These commands will be run after the devcontainer is built. # Install vectordb locally for development -python3 -m pip install -e . -apt-get install git ncdu wget curl +python3 -m pip install --user -r requirements.txt # Install required packages +python3 -m pip install -e . # Install vectordb locally From a6f31c90aa6e4cf6b25560a445a667b441101dd6 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 01:04:25 -0700 Subject: [PATCH 19/39] refactor: Delete .devcontainer/environment.yml --- .devcontainer/environment.yml | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 .devcontainer/environment.yml diff --git a/.devcontainer/environment.yml b/.devcontainer/environment.yml deleted file mode 100644 index b4a2ea2..0000000 --- a/.devcontainer/environment.yml +++ /dev/null @@ -1,17 +0,0 @@ -channels: - - conda-forge -dependencies: - - python=3.8 - - jupyterlab - - jupyterlab-git - - faiss-cpu - - numpy>=1.21.0 - - scikit-learn>=0.24.0 - - scipy>=1.7.0 - - sentence_transformers - - tensorflow_text - - torch>=1.9.0 - - transformers>=4.10.0 - - pre-commit - - nox - - pip \ No newline at end of file From 96f6694c23df797d19961bab60ea155d0c453183 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 
01:05:48 -0700 Subject: [PATCH 20/39] refactor: Install dev tools in postBuild --- .devcontainer/postBuild.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.devcontainer/postBuild.sh b/.devcontainer/postBuild.sh index 4f40f74..7dea045 100644 --- a/.devcontainer/postBuild.sh +++ b/.devcontainer/postBuild.sh @@ -2,4 +2,5 @@ # Install vectordb locally for development python3 -m pip install --user -r requirements.txt # Install required packages +python3 -m pip install pre-commit nox # Install development tools python3 -m pip install -e . # Install vectordb locally From ce51bcabf21a9150a2f695da7ab094011b3a96d5 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 08:10:32 +0000 Subject: [PATCH 21/39] chore: Pre-commit fixes --- .devcontainer/devcontainer.json | 31 +++++++++++------------- CONTRIBUTING.md | 2 +- docs/source/_toc.yml | 2 +- docs/source/contributing.rst | 2 +- docs/source/examples/ai_rss_reader.ipynb | 8 +----- docs/source/getting_started.rst | 2 +- docs/source/install.rst | 2 +- docs/source/license.rst | 2 +- src/vectordb/_version.py | 4 +-- 9 files changed, 23 insertions(+), 32 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index f760360..dabf871 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,22 +1,19 @@ // For format details, see https://aka.ms/devcontainer.json. 
For config options, see the { - "image":"quay.io/pangeo/base-image:latest", + "image": "quay.io/pangeo/base-image:latest", - "customizations": { - "vscode": { - "extensions": [ - "ms-toolsai.jupyter", - "ms-python.python" - ] - } - }, - "postCreateCommand": "sh .devcontainer/postBuild.sh", - "features": { - "ghcr.io/devcontainers-contrib/features/black:2": {}, - "ghcr.io/devcontainers-contrib/features/pylint:2": {}, - "ghcr.io/devcontainers/features/git:1": {}, - "ghcr.io/devcontainers-contrib/features/curl-apt-get:1": {}, - "ghcr.io/devcontainers-contrib/features/ncdu:1": {}, - "ghcr.io/devcontainers-contrib/features/wget-apt-get:1": {} + "customizations": { + "vscode": { + "extensions": ["ms-toolsai.jupyter", "ms-python.python"] } + }, + "postCreateCommand": "sh .devcontainer/postBuild.sh", + "features": { + "ghcr.io/devcontainers-contrib/features/black:2": {}, + "ghcr.io/devcontainers-contrib/features/pylint:2": {}, + "ghcr.io/devcontainers/features/git:1": {}, + "ghcr.io/devcontainers-contrib/features/curl-apt-get:1": {}, + "ghcr.io/devcontainers-contrib/features/ncdu:1": {}, + "ghcr.io/devcontainers-contrib/features/wget-apt-get:1": {} + } } diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 72f51e5..6c24175 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,4 +8,4 @@ To get started, please refer to our for detailed guidelines on how to contribute to vectordb. We appreciate your contributions and look forward to your involvement in making -vectordb even better! \ No newline at end of file +vectordb even better! 
diff --git a/docs/source/_toc.yml b/docs/source/_toc.yml index 7a6610e..7e968c4 100644 --- a/docs/source/_toc.yml +++ b/docs/source/_toc.yml @@ -11,4 +11,4 @@ chapters: - file: examples/ai_rss_reader - file: contributing - file: license - - file: genindex \ No newline at end of file + - file: genindex diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst index b2eeaed..1c40dda 100644 --- a/docs/source/contributing.rst +++ b/docs/source/contributing.rst @@ -127,4 +127,4 @@ Code of Conduct By contributing to this project, you agree to abide by the `Code of Conduct `_. Please make sure to read and understand the guidelines outlined in the Code -of Conduct before making any contributions. \ No newline at end of file +of Conduct before making any contributions. diff --git a/docs/source/examples/ai_rss_reader.ipynb b/docs/source/examples/ai_rss_reader.ipynb index b1ad7e5..ca235d9 100644 --- a/docs/source/examples/ai_rss_reader.ipynb +++ b/docs/source/examples/ai_rss_reader.ipynb @@ -16,11 +16,6 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -30,8 +25,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index ca35bc4..3a83d17 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -11,4 +11,4 @@ Please follow the instructions on the :doc:`install` page. For most users, the b Read The Docs ------------- -Docs for all the main functions in vectordb are available at :doc:`vectordb` at varying degrees of completeness. Further development of the docs is always ongoing. 
\ No newline at end of file +Docs for all the main functions in vectordb are available at :doc:`vectordb` at varying degrees of completeness. Further development of the docs is always ongoing. diff --git a/docs/source/install.rst b/docs/source/install.rst index 96bd85c..a644b95 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -27,4 +27,4 @@ this will install all relevant libraries and then install vectordb in an editabl git checkout -b -you can edit this branch however you like. If you are happy with the results and want to share with the rest of the community, then follow the contributors guide to create a pull request! \ No newline at end of file +you can edit this branch however you like. If you are happy with the results and want to share with the rest of the community, then follow the contributors guide to create a pull request! diff --git a/docs/source/license.rst b/docs/source/license.rst index 48bbc9f..2a5b425 100644 --- a/docs/source/license.rst +++ b/docs/source/license.rst @@ -21,4 +21,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file +SOFTWARE. 
diff --git a/src/vectordb/_version.py b/src/vectordb/_version.py index 1e4bd65..d3eeb2b 100644 --- a/src/vectordb/_version.py +++ b/src/vectordb/_version.py @@ -13,5 +13,5 @@ __version_tuple__: VERSION_TUPLE version_tuple: VERSION_TUPLE -__version__ = version = "0.1.dev104" -__version_tuple__ = version_tuple = (0, 1, "dev104") +__version__ = version = "0.1.dev116" +__version_tuple__ = version_tuple = (0, 1, "dev116") From 58ec24c33bb895c209f7f1a40f5ebb3e8f64bbf8 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 08:25:18 +0000 Subject: [PATCH 22/39] fix: Add modules to __init__ --- src/vectordb/__init__.py | 15 +++++++++++++++ src/vectordb/_version.py | 5 ++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/vectordb/__init__.py b/src/vectordb/__init__.py index 00985c8..ac9fd0d 100644 --- a/src/vectordb/__init__.py +++ b/src/vectordb/__init__.py @@ -1,5 +1,20 @@ # pylint: disable = line-too-long, trailing-whitespace, trailing-newlines, line-too-long, missing-module-docstring, import-error, too-few-public-methods, too-many-instance-attributes, too-many-locals from ._version import version as VERSION # noqa +from .chunking import Chunker +from .embedding import BaseEmbedder, Embedder +from .memory import Memory +from .storage import Storage +from .vector_search import VectorSearch + __version__ = VERSION __author__ = "kagisearch" + +__all__ = [ + "Chunker", + "BaseEmbedder", + "Embedder", + "Memory", + "Storage", + "VectorStorage", +] \ No newline at end of file diff --git a/src/vectordb/_version.py b/src/vectordb/_version.py index d3eeb2b..761eb15 100644 --- a/src/vectordb/_version.py +++ b/src/vectordb/_version.py @@ -3,7 +3,6 @@ TYPE_CHECKING = False if TYPE_CHECKING: from typing import Tuple, Union - VERSION_TUPLE = Tuple[Union[int, str], ...] 
else: VERSION_TUPLE = object @@ -13,5 +12,5 @@ __version_tuple__: VERSION_TUPLE version_tuple: VERSION_TUPLE -__version__ = version = "0.1.dev116" -__version_tuple__ = version_tuple = (0, 1, "dev116") +__version__ = version = '0.1.dev117' +__version_tuple__ = version_tuple = (0, 1, 'dev117') From 211c4338c8bf6b5e1607ee07e90ab76d3fe18488 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 08:33:09 +0000 Subject: [PATCH 23/39] fix: Set Python version to 3.8 --- .devcontainer/devcontainer.json | 19 +++++++++++++++---- src/vectordb/_version.py | 4 ++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index dabf871..d281fc9 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -9,11 +9,22 @@ }, "postCreateCommand": "sh .devcontainer/postBuild.sh", "features": { - "ghcr.io/devcontainers-contrib/features/black:2": {}, - "ghcr.io/devcontainers-contrib/features/pylint:2": {}, - "ghcr.io/devcontainers/features/git:1": {}, + "ghcr.io/devcontainers-contrib/features/black:2": { + "version": "latest" + }, + "ghcr.io/devcontainers-contrib/features/pylint:2": { + "version": "latest" + }, + "ghcr.io/devcontainers/features/git:1": { + "ppa": true, + "version": "latest" + }, "ghcr.io/devcontainers-contrib/features/curl-apt-get:1": {}, "ghcr.io/devcontainers-contrib/features/ncdu:1": {}, - "ghcr.io/devcontainers-contrib/features/wget-apt-get:1": {} + "ghcr.io/devcontainers-contrib/features/wget-apt-get:1": {}, + "ghcr.io/devcontainers/features/python:1": { + "installTools": true, + "version": "3.8" + } } } diff --git a/src/vectordb/_version.py b/src/vectordb/_version.py index 761eb15..459fefe 100644 --- a/src/vectordb/_version.py +++ b/src/vectordb/_version.py @@ -12,5 +12,5 @@ __version_tuple__: VERSION_TUPLE version_tuple: VERSION_TUPLE -__version__ = version = '0.1.dev117' -__version_tuple__ = version_tuple = (0, 
1, 'dev117') +__version__ = version = '0.1.dev118' +__version_tuple__ = version_tuple = (0, 1, 'dev118') From 77318d382dabeee97c4375969083bc6be0191cf4 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 15:18:45 +0000 Subject: [PATCH 24/39] refactor: Add conda support --- .devcontainer/devcontainer.json | 6 +++++- requirements.txt | 4 ++-- src/vectordb/_version.py | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index d281fc9..eb61009 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -24,7 +24,11 @@ "ghcr.io/devcontainers-contrib/features/wget-apt-get:1": {}, "ghcr.io/devcontainers/features/python:1": { "installTools": true, - "version": "3.8" + "version": "latest" + }, + "ghcr.io/devcontainers/features/conda:1": { + "addCondaForge": true, + "version": "latest" } } } diff --git a/requirements.txt b/requirements.txt index 1417962..0923a54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -faiss-cpu +faiss-cpu>=1.8.0 numpy>=1.21.0 scikit-learn>=0.24.0 scipy>=1.7.0 -sentence_transformers +sentence_transformers>=2.6.1 tensorflow_text torch>=1.9.0 transformers>=4.10.0 diff --git a/src/vectordb/_version.py b/src/vectordb/_version.py index 459fefe..380ff21 100644 --- a/src/vectordb/_version.py +++ b/src/vectordb/_version.py @@ -12,5 +12,5 @@ __version_tuple__: VERSION_TUPLE version_tuple: VERSION_TUPLE -__version__ = version = '0.1.dev118' -__version_tuple__ = version_tuple = (0, 1, 'dev118') +__version__ = version = '0.1.dev119' +__version_tuple__ = version_tuple = (0, 1, 'dev119') From cafe7a6f624ca7bfb9249c5b4892fcf6230a0103 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 20:46:19 +0000 Subject: [PATCH 25/39] refactor: Add Dockerfile and update devcontainer.json --- .devcontainer/Dockerfile | 11 +++++++++++ 
.devcontainer/devcontainer.json | 28 +++++++--------------------- .devcontainer/environment.yml | 16 ++++++++++++++++ .devcontainer/postBuild.sh | 6 ++++++ src/vectordb/_version.py | 4 ++-- 5 files changed, 42 insertions(+), 23 deletions(-) create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/environment.yml diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..2719590 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,11 @@ +# Use the Pangeo base image +ARG PANGEO_BASE_IMAGE_TAG=master +FROM pangeo/base-image:${PANGEO_BASE_IMAGE_TAG} + +# Copy the environment.yaml file into the Docker image +COPY environment.yaml /tmp/environment.yaml + +# Use the conda command to create a new environment from the environment.yaml file +RUN conda env create -f ~/.devcontainer/environment.yaml +RUN conda init bash +RUN echo "conda activate ssec-scipy2024" >> ~/.bashrc diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index eb61009..be9025d 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,34 +1,20 @@ // For format details, see https://aka.ms/devcontainer.json. 
For config options, see the { - "image": "quay.io/pangeo/base-image:latest", + "build": { "dockerfile": "Dockerfile" }, "customizations": { "vscode": { - "extensions": ["ms-toolsai.jupyter", "ms-python.python"] + "extensions": [ + "ms-toolsai.jupyter", + "ms-python.python" + ] } }, "postCreateCommand": "sh .devcontainer/postBuild.sh", "features": { - "ghcr.io/devcontainers-contrib/features/black:2": { - "version": "latest" - }, - "ghcr.io/devcontainers-contrib/features/pylint:2": { - "version": "latest" - }, - "ghcr.io/devcontainers/features/git:1": { - "ppa": true, - "version": "latest" - }, + "ghcr.io/devcontainers/features/git:1": {}, "ghcr.io/devcontainers-contrib/features/curl-apt-get:1": {}, "ghcr.io/devcontainers-contrib/features/ncdu:1": {}, - "ghcr.io/devcontainers-contrib/features/wget-apt-get:1": {}, - "ghcr.io/devcontainers/features/python:1": { - "installTools": true, - "version": "latest" - }, - "ghcr.io/devcontainers/features/conda:1": { - "addCondaForge": true, - "version": "latest" + "ghcr.io/devcontainers-contrib/features/wget-apt-get:1": {} } - } } diff --git a/.devcontainer/environment.yml b/.devcontainer/environment.yml new file mode 100644 index 0000000..726b626 --- /dev/null +++ b/.devcontainer/environment.yml @@ -0,0 +1,16 @@ +name: vectordb-dev +channel: + - conda-forge +dependencies: + - python==3.8 + - faiss-cpu>=1.8.0 + - numpy>=1.21.0 + - scikit-learn>=0.24.0 + - scipy>=1.7.0 + - sentence_transformers>=2.6.1 + - tensorflow_text + - torch>=1.9.0 + - transformers>=4.10.0 + - pre-commit + - nox + - pip diff --git a/.devcontainer/postBuild.sh b/.devcontainer/postBuild.sh index 7dea045..ca573f5 100644 --- a/.devcontainer/postBuild.sh +++ b/.devcontainer/postBuild.sh @@ -1,5 +1,11 @@ # These commands will be run after the devcontainer is built. 
+# Setup Conda environment +conda env create -f ~/.devcontainer/SciPy2024/environment.yml # Create environment from environment.yml +conda init bash # Initialize conda for bash +source ~/.bashrc # Reload bash +conda activate scipy2024 # Activate the environment + # Install vectordb locally for development python3 -m pip install --user -r requirements.txt # Install required packages python3 -m pip install pre-commit nox # Install development tools diff --git a/src/vectordb/_version.py b/src/vectordb/_version.py index 380ff21..7e2fe9a 100644 --- a/src/vectordb/_version.py +++ b/src/vectordb/_version.py @@ -12,5 +12,5 @@ __version_tuple__: VERSION_TUPLE version_tuple: VERSION_TUPLE -__version__ = version = '0.1.dev119' -__version_tuple__ = version_tuple = (0, 1, 'dev119') +__version__ = version = '0.1.dev120' +__version_tuple__ = version_tuple = (0, 1, 'dev120') From 8b657e5f29fb510b8c91da2301bedc5fe15327f0 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Thu, 28 Mar 2024 20:47:31 +0000 Subject: [PATCH 26/39] refactor: Remove pre-commit and nox from postBuild --- .devcontainer/postBuild.sh | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.devcontainer/postBuild.sh b/.devcontainer/postBuild.sh index ca573f5..6236346 100644 --- a/.devcontainer/postBuild.sh +++ b/.devcontainer/postBuild.sh @@ -1,12 +1,3 @@ # These commands will be run after the devcontainer is built. -# Setup Conda environment -conda env create -f ~/.devcontainer/SciPy2024/environment.yml # Create environment from environment.yml -conda init bash # Initialize conda for bash -source ~/.bashrc # Reload bash -conda activate scipy2024 # Activate the environment - -# Install vectordb locally for development -python3 -m pip install --user -r requirements.txt # Install required packages -python3 -m pip install pre-commit nox # Install development tools python3 -m pip install -e . 
# Install vectordb locally From 385cc242a67535407d7fc81a2fcdfb9605e66552 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Sun, 7 Apr 2024 23:38:59 -0700 Subject: [PATCH 27/39] fix: Update Dockerfile to build Conda env --- .devcontainer/Dockerfile | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 2719590..86aaa63 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,11 +1,21 @@ -# Use the Pangeo base image -ARG PANGEO_BASE_IMAGE_TAG=master -FROM pangeo/base-image:${PANGEO_BASE_IMAGE_TAG} +# Use the latest version of the pangeo/base-notebook (includes CUDA support) +FROM pangeo/base-notebook:latest -# Copy the environment.yaml file into the Docker image -COPY environment.yaml /tmp/environment.yaml +# Set the user to root +USER root -# Use the conda command to create a new environment from the environment.yaml file -RUN conda env create -f ~/.devcontainer/environment.yaml -RUN conda init bash -RUN echo "conda activate ssec-scipy2024" >> ~/.bashrc +# Copy the environment and requirements files into the Docker image +COPY environment.yml /tmp/environment.yml +COPY requirements.txt /tmp/requirements.txt + +# Create a new Conda environment from the environment.yml file +RUN conda env create -f /tmp/environment.yml + +# Install wget, git, ncdu, and curl +RUN apt-get update && apt-get install -y \ + wget \ + git \ + ncdu \ + curl \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* From 20e9d1ad7898cde27ff82ee660a5912f82252e54 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Sun, 7 Apr 2024 23:49:08 -0700 Subject: [PATCH 28/39] chore: Update ci.yml --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6c417cb..fff6303 100644 --- a/.github/workflows/ci.yml +++ 
b/.github/workflows/ci.yml @@ -29,7 +29,7 @@ jobs: os: [ubuntu-latest, windows-latest, macOS-latest] steps: - - name: Checkout caustics + - name: Checkout vectordb uses: actions/checkout@v4 with: fetch-depth: 0 @@ -61,7 +61,7 @@ jobs: run: | python -m pip install pytest-github-actions-annotate-failures - - name: Install Caustics + - name: Install vectordb run: | pip install -e ".[dev]" pip show ${{ env.PROJECT_NAME }} From 508e75329ec9d555502d0e7ab6fbb78175437b52 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Fri, 19 Apr 2024 18:51:43 +0000 Subject: [PATCH 29/39] refactor: Update Dockerfile to use pangeo/base-notebook and install apt deps --- .devcontainer/Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 86aaa63..10f47da 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -4,9 +4,8 @@ FROM pangeo/base-notebook:latest # Set the user to root USER root -# Copy the environment and requirements files into the Docker image +# Copy the environment file into the Docker image COPY environment.yml /tmp/environment.yml -COPY requirements.txt /tmp/requirements.txt # Create a new Conda environment from the environment.yml file RUN conda env create -f /tmp/environment.yml @@ -18,4 +17,4 @@ RUN apt-get update && apt-get install -y \ ncdu \ curl \ && apt-get clean \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* \ No newline at end of file From 275857f45182cc7608def1443a8552baec8278b4 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Fri, 19 Apr 2024 21:01:48 +0000 Subject: [PATCH 30/39] refactor: Update requirement specs --- .devcontainer/environment.yml | 17 +++++++++-------- CONTRIBUTING.md | 2 +- requirements.txt | 14 +++++++------- src/vectordb/__init__.py | 2 +- src/vectordb/_version.py | 4 ++-- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git 
a/.devcontainer/environment.yml b/.devcontainer/environment.yml index 726b626..57c1c4e 100644 --- a/.devcontainer/environment.yml +++ b/.devcontainer/environment.yml @@ -1,16 +1,17 @@ name: vectordb-dev channel: - conda-forge + - defaults dependencies: - - python==3.8 - - faiss-cpu>=1.8.0 - - numpy>=1.21.0 - - scikit-learn>=0.24.0 - - scipy>=1.7.0 - - sentence_transformers>=2.6.1 + - python>=3.8,<3.12 + - faiss-cpu + - numpy + - scikit-learn + - scipy + - sentence_transformers - tensorflow_text - - torch>=1.9.0 - - transformers>=4.10.0 + - torch + - transformers - pre-commit - nox - pip diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6c24175..effb9b2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,4 @@ -## Contributing to Vectordb +# Contributing to Vectordb Thank you for your interest in contributing to vectordb! We welcome contributions from the community to help improve our project. diff --git a/requirements.txt b/requirements.txt index 0923a54..5ace5dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -faiss-cpu>=1.8.0 -numpy>=1.21.0 -scikit-learn>=0.24.0 -scipy>=1.7.0 -sentence_transformers>=2.6.1 +faiss-cpu +numpy +scikit-learn +scipy +sentence_transformers tensorflow_text -torch>=1.9.0 -transformers>=4.10.0 +torch +transformers diff --git a/src/vectordb/__init__.py b/src/vectordb/__init__.py index ac9fd0d..1902912 100644 --- a/src/vectordb/__init__.py +++ b/src/vectordb/__init__.py @@ -16,5 +16,5 @@ "Embedder", "Memory", "Storage", - "VectorStorage", + "VectorSearch", ] \ No newline at end of file diff --git a/src/vectordb/_version.py b/src/vectordb/_version.py index 7e2fe9a..007a556 100644 --- a/src/vectordb/_version.py +++ b/src/vectordb/_version.py @@ -12,5 +12,5 @@ __version_tuple__: VERSION_TUPLE version_tuple: VERSION_TUPLE -__version__ = version = '0.1.dev120' -__version_tuple__ = version_tuple = (0, 1, 'dev120') +__version__ = version = '0.1.dev125' +__version_tuple__ = version_tuple = (0, 1, 'dev125') From 
c93a9df7a9540dfb14de88bc2f33967e7664dafb Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Fri, 19 Apr 2024 21:18:20 +0000 Subject: [PATCH 31/39] feat: Create auto-build docker image --- .github/workflows/build.yml | 89 +++++++++++++++++++++++ {.devcontainer => docker}/Dockerfile | 0 {.devcontainer => docker}/environment.yml | 0 requirements.txt | 2 + 4 files changed, 91 insertions(+) create mode 100644 .github/workflows/build.yml rename {.devcontainer => docker}/Dockerfile (100%) rename {.devcontainer => docker}/environment.yml (100%) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..359b224 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,89 @@ +# Any push to the branches listed below re-builds the image, re-runs the image tests, and pushes `latest` and date tags to the GitHub Container Registry (ghcr.io) +name: Build +on: + push: + branches: + - main + - pre-commit + paths-ignore: + - 'LICENSE' + - 'README.md' + workflow_dispatch: + +env: + DOCKER_REGISTRY: ghcr.io + DOCKER_ORG: ${{ github.repository_owner }} + GITHUB_SHA: ${{ github.sha }} + GITHUB_REF: ${{ github.ref }} + +jobs: + build-images: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + strategy: + fail-fast: false + matrix: + IMAGE: [docker] + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + + - name: Get date tag + id: get_date + run: | + DATE_TAG="$( date -u '+%Y.%m.%d' )" + echo "date_tag=$DATE_TAG" >> $GITHUB_OUTPUT + + - name: Get registry and org + id: registry_org + run: | + ORG=$(echo "${{ env.DOCKER_ORG }}" | tr '[:upper:]' '[:lower:]') + echo "image_base=${{ env.DOCKER_REGISTRY }}/${ORG}" >> $GITHUB_OUTPUT + + # https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + - name: Free up disk space + run: | + df -h + docker image ls + sudo apt clean + sudo rm -rf /usr/local/lib/android /usr/share/dotnet /opt/ghc + df -h + + - name: Extract metadata (tags, labels) for Docker + id: meta + 
uses: docker/metadata-action@v4 + with: + images: ${{ steps.registry_org.outputs.image_base }}/${{ matrix.IMAGE }} + tags: | + # set latest tag for default branch + type=raw,value=latest + type=raw,value=${{ steps.get_date.outputs.date_tag }} + + - name: Log in to registry + uses: docker/login-action@v2 + with: + registry: ${{ env.DOCKER_REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Build and push Docker image + uses: docker/build-push-action@v4 + with: + context: ${{ matrix.IMAGE }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + push: true + + - name: Inspect Image + run: | + docker run ${{ steps.registry_org.outputs.image_base }}/${{ matrix.IMAGE }}:latest conda list + docker images ls + + - name: Test Image + run: | + docker run -u 1000 -w /srv/test -v $PWD:/srv/test ${{ steps.registry_org.outputs.image_base }}/${{ matrix.IMAGE }}:latest \ No newline at end of file diff --git a/.devcontainer/Dockerfile b/docker/Dockerfile similarity index 100% rename from .devcontainer/Dockerfile rename to docker/Dockerfile diff --git a/.devcontainer/environment.yml b/docker/environment.yml similarity index 100% rename from .devcontainer/environment.yml rename to docker/environment.yml diff --git a/requirements.txt b/requirements.txt index 5ace5dc..f4a74ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ faiss-cpu +nox numpy +pre-commit scikit-learn scipy sentence_transformers From 1fd079cba2fb3ab62a736192c59ff6135f8c22e7 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Fri, 19 Apr 2024 21:38:45 +0000 Subject: [PATCH 32/39] fix: Check pkgs available on conda-forge --- .github/workflows/build.yml | 1 + docker/environment.yml | 8 +++++--- requirements.txt | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml 
b/.github/workflows/build.yml index 359b224..fcd3662 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -4,6 +4,7 @@ on: push: branches: - main + - dev - pre-commit paths-ignore: - 'LICENSE' diff --git a/docker/environment.yml b/docker/environment.yml index 57c1c4e..34723bc 100644 --- a/docker/environment.yml +++ b/docker/environment.yml @@ -8,10 +8,12 @@ dependencies: - numpy - scikit-learn - scipy - - sentence_transformers - - tensorflow_text - - torch + - sentence-transformers + - tensorflow-hub + - pytorch - transformers - pre-commit - nox - pip + - pip: + - tensorflow_text diff --git a/requirements.txt b/requirements.txt index f4a74ed..a85c1d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,8 @@ numpy pre-commit scikit-learn scipy -sentence_transformers +sentence-transformers +tensorflow-hub tensorflow_text torch transformers From a1f7e9257cdced6cd1e7d2d574e2e15ee24dbea9 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Fri, 19 Apr 2024 21:53:26 +0000 Subject: [PATCH 33/39] refactor: Use docker image instead of Dockerfile in devcontainer.json --- .devcontainer/devcontainer.json | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index be9025d..bec02b3 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,6 +1,6 @@ // For format details, see https://aka.ms/devcontainer.json. 
For config options, see the { - "build": { "dockerfile": "Dockerfile" }, + "image": "ghcr.io/swarm-io-internal/docker:latest", "customizations": { "vscode": { @@ -10,11 +10,5 @@ ] } }, - "postCreateCommand": "sh .devcontainer/postBuild.sh", - "features": { - "ghcr.io/devcontainers/features/git:1": {}, - "ghcr.io/devcontainers-contrib/features/curl-apt-get:1": {}, - "ghcr.io/devcontainers-contrib/features/ncdu:1": {}, - "ghcr.io/devcontainers-contrib/features/wget-apt-get:1": {} - } + "postCreateCommand": "sh .devcontainer/postBuild.sh" } From f45431213a0cfaa04901025f4757678ac1bcadc0 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Fri, 19 Apr 2024 22:43:27 +0000 Subject: [PATCH 34/39] feat: Install support for GitHub repo install --- docker/environment.yml | 1 + pyproject.toml | 2 +- src/vectordb/_version.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docker/environment.yml b/docker/environment.yml index 34723bc..214b731 100644 --- a/docker/environment.yml +++ b/docker/environment.yml @@ -17,3 +17,4 @@ dependencies: - pip - pip: - tensorflow_text + - git+https://github.com/vioshyvo/mrpt/ diff --git a/pyproject.toml b/pyproject.toml index c33ce68..c26bac1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["hatchling", "hatch-requirements-txt", "hatch-vcs"] build-backend = "hatchling.build" [project] -name = "vectordb2" +name = "vectordb" dynamic = [ "dependencies", "version" diff --git a/src/vectordb/_version.py b/src/vectordb/_version.py index 007a556..9838fb0 100644 --- a/src/vectordb/_version.py +++ b/src/vectordb/_version.py @@ -12,5 +12,5 @@ __version_tuple__: VERSION_TUPLE version_tuple: VERSION_TUPLE -__version__ = version = '0.1.dev125' -__version_tuple__ = version_tuple = (0, 1, 'dev125') +__version__ = version = '0.1.dev129' +__version_tuple__ = version_tuple = (0, 1, 'dev129') From 4255800e2c4998d5e6b41617ff449a8742420726 Mon Sep 17 00:00:00 2001 From: Cordero 
Core <127983572+uwcdc@users.noreply.github.com> Date: Fri, 19 Apr 2024 23:04:16 +0000 Subject: [PATCH 35/39] fix: Remove git installation --- docker/environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/environment.yml b/docker/environment.yml index 214b731..34723bc 100644 --- a/docker/environment.yml +++ b/docker/environment.yml @@ -17,4 +17,3 @@ dependencies: - pip - pip: - tensorflow_text - - git+https://github.com/vioshyvo/mrpt/ From 84ecb1cbf54115c1e6f5110d06b4ff0e34534559 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Sat, 20 Apr 2024 05:37:34 +0000 Subject: [PATCH 36/39] fix: Update project name in ci.yml and add build-essentials to Dockerfile --- .github/workflows/ci.yml | 4 ++-- docker/Dockerfile | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fff6303..cb91d62 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ concurrency: env: FORCE_COLOR: 3 - PROJECT_NAME: "vectordb2" + PROJECT_NAME: "vectordb" jobs: build: @@ -25,7 +25,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.8","3.9", "3.10", "3.11"] os: [ubuntu-latest, windows-latest, macOS-latest] steps: diff --git a/docker/Dockerfile b/docker/Dockerfile index 10f47da..2234d5b 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -16,5 +16,6 @@ RUN apt-get update && apt-get install -y \ git \ ncdu \ curl \ + build-essential \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* \ No newline at end of file From e4c32a1fc4c4f6f6de1fd32d9b11f6bec7ae48f3 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Sat, 20 Apr 2024 06:28:11 +0000 Subject: [PATCH 37/39] docs: Update docs to include codespaces, docker, and conda instructions --- .devcontainer/postBuild.sh | 1 + README.md | 82 ++++++++++++++++++++++++++++++++++++++ 
src/vectordb/_version.py | 4 +- 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/.devcontainer/postBuild.sh b/.devcontainer/postBuild.sh index 6236346..3e63d21 100644 --- a/.devcontainer/postBuild.sh +++ b/.devcontainer/postBuild.sh @@ -1,3 +1,4 @@ # These commands will be run after the devcontainer is built. +python3 -m pip install git+https://github.com/vioshyvo/mrpt/ python3 -m pip install -e . # Install vectordb locally diff --git a/README.md b/README.md index a9e899d..64af681 100644 --- a/README.md +++ b/README.md @@ -277,3 +277,85 @@ to ensure maximum performance across the spectrum of use cases. ## License MIT License. + +## Contributing + +We welcome contributions to VectorDB! Here are the instructions to set up your development environment. + +### Activating a Codespace + +1. On the main page of the repository, click the `Code` button. +2. Click on `Open with Codespaces`. +3. Click on `New Codespace`. + +### Using a Docker Container + +1. Ensure [Docker](https://www.docker.com/products/docker-desktop/) is installed on your machine. +2. Pull the Docker image using the following command: + +```bash +docker pull ghcr.io/swarm-io-internal/docker:latest --platform linux/x86_64 +``` + +> Note: The "no matching manifest for linux/arm64/v8 in the manifest list entries" error occurs on Apple Silicon systems if you omit the `--platform` flag. + +3. Run the Docker container: + +```bash +docker run -p 8888:8888 --platform linux/x86_64 -it ghcr.io/swarm-io-internal/docker:latest bash +``` + +4. Clone the repository: + +```bash +git clone https://github.com/kagisearch/vectordb.git +``` + +5. Navigate to the cloned repository: + +```bash +cd vectordb +``` + +### Using a Conda Environment + +1. Ensure [Conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) is installed on your machine. +2. Clone the repository: + +```bash +git clone https://github.com/kagisearch/vectordb.git +``` + +3. 
Navigate to the cloned repository: + +```bash +cd vectordb +``` + +4. Create the Conda environment: + +```bash +conda env create -f ./docker/environment.yml +``` + +5. Initialize Conda: + +```bash +conda init +``` + +6. Source the bashrc file: + +```bash +. ~/.bashrc +``` + +7. Activate the Conda environment: + +```bash +conda activate vectordb-dev +``` + +Please make sure to update tests as appropriate when making changes. Also, update the documentation to reflect the changes you made. + +Happy coding! \ No newline at end of file diff --git a/src/vectordb/_version.py b/src/vectordb/_version.py index 9838fb0..c6c1682 100644 --- a/src/vectordb/_version.py +++ b/src/vectordb/_version.py @@ -12,5 +12,5 @@ __version_tuple__: VERSION_TUPLE version_tuple: VERSION_TUPLE -__version__ = version = '0.1.dev129' -__version_tuple__ = version_tuple = (0, 1, 'dev129') +__version__ = version = '0.1.dev132' +__version_tuple__ = version_tuple = (0, 1, 'dev132') From d46357afe6ecc568719ab3f723823255890a8c2d Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Tue, 14 May 2024 14:56:45 -0700 Subject: [PATCH 38/39] fix: Update install.rst organization --- docs/source/install.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/install.rst b/docs/source/install.rst index a644b95..a94b6a6 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -1,4 +1,3 @@ - Installation ============ @@ -17,7 +16,7 @@ Developer Install First clone the repo with:: - git clone git@github.com:Ciela-Institute/vectordb.git + git clone git@github.com:kagisearch/vectordb.git this will create a directory ``vectordb`` wherever you ran the command. 
Next go into the directory and install in developer mode:: From b5b65462d1d5b4a03a4d6ea131460f23224b5963 Mon Sep 17 00:00:00 2001 From: Cordero Core <127983572+uwcdc@users.noreply.github.com> Date: Fri, 17 May 2024 15:13:05 -0700 Subject: [PATCH 39/39] feat: Update docker image in devcontainer.json --- .devcontainer/devcontainer.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index bec02b3..84fe1fb 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,6 +1,6 @@ // For format details, see https://aka.ms/devcontainer.json. For config options, see the { - "image": "ghcr.io/swarm-io-internal/docker:latest", + "image": "ghcr.io/swarm-io-internal/data-science-lite:latest", "customizations": { "vscode": {