From d129e35c7f0a1965badb0f249410fe61513a75fe Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Thu, 13 Jun 2024 19:03:42 -0400 Subject: [PATCH 1/6] nn search --- src/api-raw-types.d.ts | 22 +++++++++++----------- src/index.ts | 27 +++++++++++++++++++++++++++ src/projection.ts | 16 ++++++++++++++++ tests/neighbors.test.js | 23 +++++++++++++++++++++++ 4 files changed, 77 insertions(+), 11 deletions(-) create mode 100644 tests/neighbors.test.js diff --git a/src/api-raw-types.d.ts b/src/api-raw-types.d.ts index 130a2c1..e007bd2 100644 --- a/src/api-raw-types.d.ts +++ b/src/api-raw-types.d.ts @@ -1693,7 +1693,7 @@ export interface components { */ model: | components['schemas']['NomicTextEmbeddingModel'] - | components['schemas']['NomicVisionEmbeddingModel']; + | components['schemas']['NomicImageEmbeddingModel']; /** * Tokens * @description The total tokens used. @@ -1742,9 +1742,9 @@ export interface components { atlas_index_id: string; /** * Queries - * @description The bytes of a batch of embeddings to get neighbors for + * @description A set of embeddings to query. Where `n` is the number of vectors to search and `d` is the vector dimensionality, this can be either an `n`x`d` numpy array encoded to base64, OR a list of `n` lists with `d` numbers per list. */ - queries: string; + queries: string | number[][]; /** * K * @description The number of neighbors to return @@ -2332,6 +2332,14 @@ export interface components { */ atom_strategies: string[]; }; + /** + * NomicImageEmbeddingModel + * @description An enumeration. + * @enum {unknown} + */ + NomicImageEmbeddingModel: + | 'nomic-embed-vision-v1' + | 'nomic-embed-vision-v1.5'; /** * NomicProjectModel * @description An enumeration. @@ -2348,14 +2356,6 @@ export interface components { | 'nomic-embed-text-v1' | 'nomic-embed-text-v1.5' | 'nomic-embed-code'; - /** - * NomicVisionEmbeddingModel - * @description An enumeration. 
- * @enum {unknown} - */ - NomicVisionEmbeddingModel: - | 'nomic-embed-vision-v1' - | 'nomic-embed-vision-v1.5'; /** ObtainAccessTokenRequest */ ObtainAccessTokenRequest: { /** diff --git a/src/index.ts b/src/index.ts index 387a59f..d4d2973 100644 --- a/src/index.ts +++ b/src/index.ts @@ -3,6 +3,7 @@ import type { AtlasUser } from './user.js'; import { AtlasProjection } from './projection.js'; import { AtlasDataset as AtlasDataset } from './project.js'; import type { Table } from 'apache-arrow'; +import type { components } from 'api-raw-types.js'; type IndexInitializationOptions = { project_id?: Atlas.UUID; @@ -113,4 +114,30 @@ export class AtlasIndex extends BaseAtlasClass { )) as Table; return tb; } + + /** + * + * @param param0 A keyed dictionary including `k` (the number of neighbors to return) + * and `queries` (a list of vectors to search for). + * @returns + */ + async nearest_neighbors_by_vector({ + k = 10, + queries, + }: Omit< + components['schemas']['EmbeddingNeighborRequest'], + 'atlas_index_id' + >): Promise { + const { neighbors, distances } = (await this.apiCall( + `/v1/project/data/get/nearest_neighbors/by_embedding`, + 'POST', + { + atlas_index_id: this.id, + k, + queries, + } + )) as components['schemas']['EmbeddingNeighborResponse']; + + return { neighbors, distances }; + } } diff --git a/src/projection.ts b/src/projection.ts index 13948f8..f1ff87a 100644 --- a/src/projection.ts +++ b/src/projection.ts @@ -3,6 +3,7 @@ import { BaseAtlasClass } from './user.js'; import type { AtlasUser } from './user.js'; import { AtlasDataset } from './project.js'; import type { AtlasIndex } from './index.js'; +import { components } from 'api-raw-types.js'; type UUID = string; @@ -300,6 +301,21 @@ export class AtlasProjection extends BaseAtlasClass { return `${protocol}://${this.user.apiLocation}/v1/project/${this.project_id}/index/projection/${this.id}/quadtree`; } + async nearest_neighbors_by_vector({ + k = 10, + queries, + }: Omit< + 
components['schemas']['EmbeddingNeighborRequest'], + 'atlas_index_id' + >): Promise { + const index = await this.index(); + const { neighbors, distances } = await index.nearest_neighbors_by_vector({ + k, + queries, + }); + return { neighbors, distances }; + } + async info() { if (this._info !== undefined) { return this._info; diff --git a/tests/neighbors.test.js b/tests/neighbors.test.js new file mode 100644 index 0000000..594a30c --- /dev/null +++ b/tests/neighbors.test.js @@ -0,0 +1,23 @@ +import { test } from 'uvu'; +import { AtlasProjection } from '../dist/projection.js'; +import { AtlasUser } from '../dist/user.js'; + +test('Neighbors', async () => { + // get user + console.log('getting user'); + const user = new AtlasUser({ useEnvToken: true }); + const projection = new AtlasProjection( + '728d4f4d-91ab-4852-a4a6-6cf41da1cd5e', + user, + { project_id: '449402ea-1730-475c-9b41-4bbbf98b4e49' } + ); + const vec = []; + for (let i = 0; i <= 768; i++) { + vec.push(Math.random()); + } + const result = await projection.nearest_neighbors_by_vector({ + queries: [vec], + k: 25, + }); + console.log({ result }); +}); From 40120cc86b8d51eb473d82fd1b9356f361616455 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Thu, 13 Jun 2024 22:26:28 -0400 Subject: [PATCH 2/6] add vector embedding endpoints --- src/index.ts | 1 - src/project.ts | 24 +++++++++++++----------- src/projection.ts | 18 +++++++++++++++++- src/user.ts | 1 - tests/neighbors.test.js | 10 +++++----- tests/user.test.js | 7 ++++++- 6 files changed, 41 insertions(+), 20 deletions(-) diff --git a/src/index.ts b/src/index.ts index d4d2973..2a0c1d7 100644 --- a/src/index.ts +++ b/src/index.ts @@ -137,7 +137,6 @@ export class AtlasIndex extends BaseAtlasClass { queries, } )) as components['schemas']['EmbeddingNeighborResponse']; - return { neighbors, distances }; } } diff --git a/src/project.ts b/src/project.ts index 7232998..1ec513a 100644 --- a/src/project.ts +++ b/src/project.ts @@ -220,8 +220,19 @@ export class 
AtlasDataset extends BaseAtlasClass { * @param ids A list of identifiers to fetch from the server. */ - async fetch_ids(ids?: string[]): Promise[]> { - throw new Error('Not implemented'); + async fetch_ids( + ids?: string[] + ): Promise>> { + if (ids === undefined) { + return {}; + } + const response = await this.apiCall( + '/v1/project/data/get', + 'POST', + { project_id: this.id, datum_ids: ids }, + null + ); + return response as Record>; } async createIndex( @@ -285,15 +296,6 @@ export class AtlasDataset extends BaseAtlasClass { return new AtlasIndex(id, this.user, { project: this }); } - async delete_data(ids: string[]): Promise { - // TODO: untested - // const info = await this.info - await this.user.apiCall('/v1/project/data/delete', 'POST', { - project_id: this.id, - datum_ids: ids, - }); - } - validate_metadata(): void { // validate metadata } diff --git a/src/projection.ts b/src/projection.ts index f1ff87a..fc58b3e 100644 --- a/src/projection.ts +++ b/src/projection.ts @@ -307,12 +307,28 @@ export class AtlasProjection extends BaseAtlasClass { }: Omit< components['schemas']['EmbeddingNeighborRequest'], 'atlas_index_id' - >): Promise { + >): Promise> { const index = await this.index(); const { neighbors, distances } = await index.nearest_neighbors_by_vector({ k, queries, }); + const project = await this.project(); + const datums = (await Promise.all( + neighbors.map((ids) => project.fetch_ids(ids).then((d) => d.datums)) + )) as Record[][]; + const filled_out: Record[][] = []; + for (let i = 0; i < neighbors.length; i++) { + filled_out[i] = []; + for (let j = 0; j < neighbors[i].length; j++) { + const d = { ...datums[i][j] }; + d._distance = distances[i][j]; + filled_out[i].push(d); + } + } + + console.log({ filled_out }); + return filled_out; return { neighbors, distances }; } diff --git a/src/user.ts b/src/user.ts index c82cb08..e023f3a 100644 --- a/src/user.ts +++ b/src/user.ts @@ -301,7 +301,6 @@ export class AtlasUser { Record | string | Array | Table | 
Uint8Array | null > { // make an API call - if (headers === null) { const credentials = await this.credentials; if (credentials === null) { diff --git a/tests/neighbors.test.js b/tests/neighbors.test.js index 594a30c..a36a6cf 100644 --- a/tests/neighbors.test.js +++ b/tests/neighbors.test.js @@ -1,23 +1,23 @@ import { test } from 'uvu'; import { AtlasProjection } from '../dist/projection.js'; import { AtlasUser } from '../dist/user.js'; +import * as assert from 'uvu/assert'; test('Neighbors', async () => { // get user - console.log('getting user'); const user = new AtlasUser({ useEnvToken: true }); const projection = new AtlasProjection( - '728d4f4d-91ab-4852-a4a6-6cf41da1cd5e', + '0efb002a-09b3-47df-b43e-71780879b501', user, - { project_id: '449402ea-1730-475c-9b41-4bbbf98b4e49' } + { project_id: 'b7d7ff07-7272-4481-8618-c05bcf6feca5' } ); const vec = []; - for (let i = 0; i <= 768; i++) { + for (let i = 0; i < 768; i++) { vec.push(Math.random()); } const result = await projection.nearest_neighbors_by_vector({ queries: [vec], k: 25, }); - console.log({ result }); + assert.is(result[0].length, 25); }); diff --git a/tests/user.test.js b/tests/user.test.js index b36d0d2..bc4db28 100644 --- a/tests/user.test.js +++ b/tests/user.test.js @@ -7,7 +7,12 @@ import { AtlasOrganization } from '../dist/organization.js'; test('AtlasOrganization test', async () => { const user = new AtlasUser({ useEnvToken: true }); - const info = await user.info(); + + const info = await user.info().catch((err) => { + console.error(err); + throw err; + }); + const organization = new AtlasOrganization( info.organizations[0].organization_id, user From a14c4f34274f9b297e27863b4aaf5ac40675681f Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Thu, 13 Jun 2024 22:28:48 -0400 Subject: [PATCH 3/6] docstring --- src/projection.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/projection.ts b/src/projection.ts index fc58b3e..39ca53c 100644 --- a/src/projection.ts +++ b/src/projection.ts @@ 
-301,6 +301,11 @@ export class AtlasProjection extends BaseAtlasClass { return `${protocol}://${this.user.apiLocation}/v1/project/${this.project_id}/index/projection/${this.id}/quadtree`; } + /** + * + * @param param0 an object with keys k (number of neighbors) and queries (list of vectors, where each one is the length of the embedding space). + * @returns A list of entries in sorted order, where each entry is a list of neighbors including distances in the `_distance` field. + */ async nearest_neighbors_by_vector({ k = 10, queries, From f61a30c8f56405bbba5aae806b30ee7dc32ba4d1 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Thu, 13 Jun 2024 22:31:15 -0400 Subject: [PATCH 4/6] version bump --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 3a6da68..f297a2a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@nomic-ai/atlas", - "version": "0.9.5", + "version": "0.10.0", "type": "module", "files": [ "dist" From 45dc938d2ac92bb422feb51a7ea9085a1e010874 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Thu, 13 Jun 2024 22:48:28 -0400 Subject: [PATCH 5/6] release notes, code review fixes --- RELEASE_NOTES.md | 4 ++++ src/projection.ts | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index a848288..24c4b86 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,7 @@ +# 0.10.0 + +- Add support for nearest-neighbor search by vector. + # 0.9.6 - Rename "AtlasProject" to "AtlasDataset" with backwards compatible alias. 
diff --git a/src/projection.ts b/src/projection.ts index 39ca53c..487edde 100644 --- a/src/projection.ts +++ b/src/projection.ts @@ -332,9 +332,7 @@ export class AtlasProjection extends BaseAtlasClass { } } - console.log({ filled_out }); return filled_out; - return { neighbors, distances }; } async info() { From 45fad752d7b4baa66c2d80aac4c20aee27fe4af3 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Thu, 13 Jun 2024 23:09:25 -0400 Subject: [PATCH 6/6] add publish action --- .github/workflows/npm-publish.yml | 35 +++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/workflows/npm-publish.yml diff --git a/.github/workflows/npm-publish.yml b/.github/workflows/npm-publish.yml new file mode 100644 index 0000000..27dea08 --- /dev/null +++ b/.github/workflows/npm-publish.yml @@ -0,0 +1,35 @@ +name: Publish @next release to npm +on: + push: + branches: + - main + +permissions: + contents: write + packages: write + deployments: write + +jobs: + publish: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-node@v2 + with: + node-version: '20.x' + registry-url: 'https://registry.npmjs.org' + - run: npm ci + - name: Configure Git user + run: | + git config --global user.email "github-actions@github.com" + git config --global user.name "GitHub Actions" + - run: npm version prerelease --preid=next + - name: Commit bumped version + run: | + # git add package.json package-lock.json + # git commit -m "Bump version to $(node -p "require('./package.json').version")" + git push + + - run: npm publish --tag next + env: + NODE_AUTH_TOKEN: ${{ secrets.NODE_AUTH_TOKEN }}