From e5c292a6fb2af8eefcc280c26c508e5361b40fc5 Mon Sep 17 00:00:00 2001 From: Tim Green Date: Mon, 8 Jul 2024 13:18:11 +0200 Subject: [PATCH] add outputs to working with vector data --- .../working-with-vector-data/notebook.ipynb | 3020 ++++++++++------- resources/nb-check.py | 3 - 2 files changed, 1819 insertions(+), 1204 deletions(-) diff --git a/notebooks/working-with-vector-data/notebook.ipynb b/notebooks/working-with-vector-data/notebook.ipynb index 1737dd7d..4ebb59dd 100644 --- a/notebooks/working-with-vector-data/notebook.ipynb +++ b/notebooks/working-with-vector-data/notebook.ipynb @@ -1,1203 +1,1821 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "b80549ed-1471-4fc5-8b1a-a3050246078a", - "metadata": {}, - "source": [ - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "
SingleStore Notebooks
\n", - "

Working with Vector Data

\n", - "
\n", - "
" - ] - }, - { - "cell_type": "markdown", - "id": "09cb34bc", - "metadata": {}, - "source": [ - "
\n", - " \n", - "
\n", - "

Note

\n", - "

This notebook can be run on a Free Starter Workspace. To create a Free Starter Workspace navigate to Start using the left nav. You can also use your existing Standard or Premium workspace with this Notebook.

\n", - "
\n", - "
" - ] - }, - { - "cell_type": "markdown", - "id": "782ffea9-fbc0-4942-8a1a-da8788ed2fec", - "metadata": {}, - "source": [ - "Using vector embeddings has become popular recently, but getting vector data into your\n", - "database can leave you with a lot of questions. This notebook shows various ways to\n", - "load vectors into SingleStoreDB from Python using the Python client, SQLAlchemy, pandas,\n", - "and the SQL magic commaands. It covers vectors in the form of numpy arrays or Python lists\n", - "of numerics.\n", - "\n", - "We'll use the following function to reset the vector data table between examples." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "f7fe2c95-9e0d-4b1b-ad24-d0536c4ef2d9", - "metadata": {}, - "outputs": [], - "source": [ - "def reset_table():\n", - " \"\"\"Reset the table for use in the examples below.\"\"\"\n", - " with s2.connect() as conn:\n", - " with conn.cursor() as cur:\n", - " cur.execute('DROP TABLE IF EXISTS vectors;')\n", - " cur.execute(r'''\n", - " CREATE TABLE vectors (\n", - " vec_f32 BLOB\n", - " );\n", - " ''')" - ] - }, - { - "cell_type": "markdown", - "id": "d087092f-696c-4735-9c66-33b8efc885ca", - "metadata": {}, - "source": [ - "At any time, if you want to see the actual query being sent to the database, you can set the following\n", - "environment variable before making the query to the server." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "45628671-dee1-41fe-ae77-b8c651c8c389", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# os.environ['SINGLESTOREDB_DEBUG_QUERIES'] = '1'" - ] - }, - { - "cell_type": "markdown", - "id": "9b1cd9d3", - "metadata": {}, - "source": [ - "
\n", - " \n", - "
\n", - "

Action Required

\n", - "

If you have a Free Starter Workspace deployed already, select the database from drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

\n", - "
\n", - "
" - ] - }, - { - "cell_type": "markdown", - "id": "e2e322f5-b81d-4249-b512-bd36f88aa168", - "metadata": {}, - "source": [ - "Create a database for our examples." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "fec12f93-7ca6-4f77-bc7f-355b0bfa98f9", - "metadata": {}, - "outputs": [], - "source": [ - "shared_tier_check = %sql show variables like 'is_shared_tier'\n", - "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", - " %sql DROP DATABASE IF EXISTS vector_data;\n", - " %sql CREATE DATABASE vector_data;" - ] - }, - { - "cell_type": "markdown", - "id": "1f2db020-7f76-44d0-9b32-cc81d35979ef", - "metadata": {}, - "source": [ - "
\n", - " \n", - "
\n", - "

Action Required

\n", - "

Make sure to select the vector_data database from the drop-down menu at the top of this notebook. It updates the connection_url which is used by the %%sql magic command and SQLAlchemy to make connections to the selected database.

\n", - "
\n", - "
" - ] - }, - { - "cell_type": "markdown", - "id": "3e65bd3b-49b4-48ca-8409-e3da89ebcce4", - "metadata": {}, - "source": [ - "## Generate numpy arrays containing vector data\n", - "\n", - "The code belowe generates 1,000 rows of 10 random 32-bit float numbers in a numpy array.\n", - "This data will be used in the following examples." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "c9fd6e9f-5513-45b3-bc4f-395e115ccd9e", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "aab67ef8-8bd1-4f9e-957a-ac8248984f4f", - "metadata": {}, - "outputs": [], - "source": [ - "vec_f32 = [np.random.rand(10).astype(np.float32) for _ in range(1000)]\n", - "vec_f32[:3]" - ] - }, - { - "cell_type": "markdown", - "id": "1539013e-6ad8-49cc-aafd-e0aa5c2dbf60", - "metadata": {}, - "source": [ - "### Create a Python list of float values from the numpy array\n", - "\n", - "We will show how to work with both numpy arrays and Python lists in the following examples.\n", - "This cell creates a list of Python lists of floats equivalent to the numpy arrays above." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c72202fa-3a15-42a0-83f2-2650a6d5faa6", - "metadata": {}, - "outputs": [], - "source": [ - "vec_f32_list = [list([float(y) for y in x]) for x in vec_f32]\n", - "vec_f32_list[:3]" - ] - }, - { - "cell_type": "markdown", - "id": "ebe71955-7358-4c7c-add8-162f5bca098a", - "metadata": {}, - "source": [ - "## Upload and downloading data to SingleStoreDB\n", - "\n", - "In the following sections, we'll describe how to use the SingleStoreDB Python client, SQLAlchemy, the `%%sql` magic,\n", - "and pandas to upload and download vector data." 
- ] - }, - { - "cell_type": "markdown", - "id": "2860a4f6-bfc6-4bc0-89d8-6c9d765f1240", - "metadata": {}, - "source": [ - "### Using SingleStoreDB Python client" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "35cd7e37-d05f-424f-98c7-ae61958c42d5", - "metadata": {}, - "outputs": [], - "source": [ - "import singlestoredb as s2\n", - "\n", - "conn = s2.connect()\n", - "cursor = conn.cursor()" - ] - }, - { - "cell_type": "markdown", - "id": "66e77736-4625-481b-9991-d7e7f28401cb", - "metadata": {}, - "source": [ - "#### Working with numpy arrays" - ] - }, - { - "cell_type": "markdown", - "id": "2d1453cd-21d2-4843-a41a-6aa1a33ce0a1", - "metadata": {}, - "source": [ - "The SingleStoreDB Python client supports numpy arrays natively. If a numpy array is passed as a parameter to a query,\n", - "it will be converted to a byte string containing the contents of the array. The data type of the numpy array is\n", - "preserved, so you need to ensure that it is the proper numpy dtype before uploading. You can change the data type\n", - "of a numpy array by using the `astype` method." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "5fa23885-106d-4b37-ade2-d7b6e6c8b593", - "metadata": {}, - "outputs": [], - "source": [ - "reset_table()" - ] - }, - { - "cell_type": "markdown", - "id": "a752e82f-bdf9-442e-94eb-9e29459da840", - "metadata": {}, - "source": [ - "Recall that `vec_f32` contained numpy arrays of float32 values." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "9fcdb1ce-254b-4420-815e-76cb2199ac05", - "metadata": {}, - "outputs": [], - "source": [ - "vec_f32[:3]" - ] - }, - { - "cell_type": "markdown", - "id": "df0f98b0-d916-4113-a34c-e0c13cffa242", - "metadata": {}, - "source": [ - "The `executemany` method will insert multiple rows of data in a single SQL query." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "b55d0954-9e8c-468b-b1da-019a3adf4fd2", - "metadata": {}, - "outputs": [], - "source": [ - "cursor.executemany('INSERT INTO vectors(vec_f32) VALUES (%s)', vec_f32)" - ] - }, - { - "cell_type": "markdown", - "id": "f929f1ed-2ee1-4209-a27d-121bec2a3a79", - "metadata": {}, - "source": [ - "To download the vector data from SingleStoreDB, you simple execute a `SELECT` statement. The data is held in\n", - "blob columns, so the result will simply contain byte strings." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "218071ef-0742-460b-b0a4-b079970ae568", - "metadata": {}, - "outputs": [], - "source": [ - "cursor.execute('SELECT vec_f32 FROM vectors LIMIT 5')" - ] - }, - { - "cell_type": "markdown", - "id": "22892481-3d71-48aa-abe3-ffd63b309419", - "metadata": {}, - "source": [ - "Since we want to use the data as numpy arrays, we can \"reconstitute\" the arrays as we read the data using the `np.frombuffer` function." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "52bfac93-5503-4144-8700-95db21f13897", - "metadata": {}, - "outputs": [], - "source": [ - "out_f32 = [np.frombuffer(x[0], dtype=np.float32) for x in cursor]\n", - "out_f32" - ] - }, - { - "cell_type": "markdown", - "id": "390b149f-8039-43ee-ae43-215ea7997a4f", - "metadata": {}, - "source": [ - "#### Working with Python lists\n", - "\n", - "It is also possible to upload Python lists without going through a numpy array using the [struct](https://docs.python.org/3/library/struct.html) package. In this method, we convert\n", - "the floats to a byte string and pass that byte string as the parameter to the `INSERT` statement. 
The possible format codes are as follows.\n", - "The little-endian indicator (`<`) should also be used.\n", - "\n", - "* f - float32\n", - "* d - float64\n", - "* b - int8\n", - "* h - int16\n", - "* l - int32\n", - "* q - int64" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "5707a569-4361-4d69-a078-5c71bb547dce", - "metadata": {}, - "outputs": [], - "source": [ - "reset_table()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "a0777da4-daba-4b06-8fb6-c7fcc30dcc25", - "metadata": {}, - "outputs": [], - "source": [ - "import struct\n", - "\n", - "# Construct the format for a vector of 10 32-bit floats, in this case it is '<10f'\n", - "fmt = '<{}f'.format(len(vec_f32_list[0]))\n", - "\n", - "vec_f32_list_bytes = [struct.pack(fmt, *x) for x in vec_f32_list]\n", - "vec_f32_list_bytes[:3]" - ] - }, - { - "cell_type": "markdown", - "id": "77a3b930-33cd-4436-a021-9e99ed94cd9c", - "metadata": {}, - "source": [ - "##### The `INSERT` and `SELECT` code is the same as for numy arrays" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "0a1f4d5b-50f1-4987-b8f8-613b2b6f03bd", - "metadata": {}, - "outputs": [], - "source": [ - "cursor.executemany('INSERT INTO vectors(vec_f32) VALUES (%s)', vec_f32_list_bytes)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "171acbee-c663-4073-843b-a3f83fa0a99a", - "metadata": {}, - "outputs": [], - "source": [ - "cursor.execute('SELECT vec_f32 FROM vectors LIMIT 5')" - ] - }, - { - "cell_type": "markdown", - "id": "b0b40daa-52a9-4bf8-aefd-4722974cb8f5", - "metadata": {}, - "source": [ - "To unpack the rows as Python lists, we use the `struct` package again." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "63490736-c68b-49d5-8db1-8ec203c7a583", - "metadata": {}, - "outputs": [], - "source": [ - "out_f32_list = [list(struct.unpack(fmt, x[0])) for x in cursor]\n", - "out_f32_list[:3]" - ] - }, - { - "cell_type": "markdown", - "id": "91927fbd-f19c-448a-926a-d4ee8dc3e607", - "metadata": {}, - "source": [ - "### Using SQLAlchemy\n", - "\n", - "In order to use SingleStoreDB with SQLAlchemy, you need to install the `sqlalchemy-singlestoredb` dialect as follows.\n", - "\n", - "```\n", - "pip install sqlalchemy-singlestoredb\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "7d715701-eba1-4868-b223-a17a6fa4b6ce", - "metadata": {}, - "outputs": [], - "source": [ - "import sqlalchemy as sa\n", - "\n", - "eng = sa.create_engine(connection_url)\n", - "conn = eng.connect()" - ] - }, - { - "cell_type": "markdown", - "id": "6ca1960a-a55e-465c-a4f2-3daeb56e2739", - "metadata": {}, - "source": [ - "The SQLAlchemy method works much like the SingleStoreDB method. However, SQLAlchemy (v2+) requires parameters to be\n", - "in a dictionary, and the substitution syntax is of the form `:var_name` where 'var_name' in the key in the dictionary." - ] - }, - { - "cell_type": "markdown", - "id": "473114ce-4b51-484d-90d9-eaafce4d4b58", - "metadata": {}, - "source": [ - "#### Working with numpy arrays" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "74707c74-2529-43e1-ba87-b693403b5e8d", - "metadata": {}, - "outputs": [], - "source": [ - "reset_table()" - ] - }, - { - "cell_type": "markdown", - "id": "86eff78c-4b8f-40d1-bc9f-978fd39dada6", - "metadata": {}, - "source": [ - "SQLAlchemy requires you to construct the query as a `sa.text` object. Parameters for inserting multple\n", - "rows are in a list of dictionaries." 
- ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "03905527-9239-4fd7-9a9b-4c35da0b7447", - "metadata": {}, - "outputs": [], - "source": [ - "query = sa.text('INSERT INTO vectors(vec_f32) VALUES (:vec_f32)')\n", - "conn.execute(query, [dict(vec_f32=x) for x in vec_f32])" - ] - }, - { - "cell_type": "markdown", - "id": "f95fb2be-e513-4555-b580-118f337e0f19", - "metadata": {}, - "source": [ - "Selecting the data works much as before as well." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "d7b22128-188c-475e-a1cb-5c52261d8403", - "metadata": {}, - "outputs": [], - "source": [ - "result = conn.execute(sa.text('SELECT vec_f32 FROM vectors LIMIT 5'))" - ] - }, - { - "cell_type": "markdown", - "id": "f7bc12cf-6ce6-4c20-8fa4-e83c2cb49e71", - "metadata": {}, - "source": [ - "We can use the `np.frombuffer` function again to convert the byte strings to numpy arrays." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "3391ee73-86c5-4913-b412-bf4d12fb9b68", - "metadata": {}, - "outputs": [], - "source": [ - "out_f32 = [np.frombuffer(x[0], dtype=np.float32) for x in result]\n", - "out_f32" - ] - }, - { - "cell_type": "markdown", - "id": "893ff3b8-5f16-4736-b157-52eec72a6fea", - "metadata": {}, - "source": [ - "#### Working with Python lists\n", - "\n", - "To upload Python lists of values, you use the `struct` package to construct the byte strings as described in the\n", - "\"Uploading Python Lists\" in the previous section. The rest of the code here stays the same with the exception of\n", - "replacing `vec_f32` with `vec_f32_list_bytes` as the query parameter for the `INSERT` query." 
- ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "4a4124d1-588b-408a-9d85-dc2acd8d8f31", - "metadata": {}, - "outputs": [], - "source": [ - "reset_table()" - ] - }, - { - "cell_type": "markdown", - "id": "3a09006f-6c12-40fd-b3d7-cd3b4c33f040", - "metadata": {}, - "source": [ - "Recall that we create a list of bytes (vector) objects in the previous example. This list of vectors\n", - "can be passed to the `INSERT` as well as numpy arrays." - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "465ffe10-cc32-400c-adec-f4e91f25fb98", - "metadata": {}, - "outputs": [], - "source": [ - "vec_f32_list_bytes[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "97b2069f-2cd2-4af5-95cc-87637d1fc838", - "metadata": {}, - "outputs": [], - "source": [ - "query = sa.text('INSERT INTO vectors(vec_f32) VALUES (:vec_f32)')\n", - "conn.execute(query, [dict(vec_f32=x) for x in vec_f32_list_bytes])" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "ea364348-8b95-4835-9481-11a7bf67fce0", - "metadata": {}, - "outputs": [], - "source": [ - "result = conn.execute(sa.text('SELECT vec_f32 FROM vectors LIMIT 5'))" - ] - }, - { - "cell_type": "markdown", - "id": "8fa7bd8e-8842-438f-a336-e93ecc321820", - "metadata": {}, - "source": [ - "Unpacking the Python lists works as before as well." - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "78b0619f-a057-4edb-a230-1e96c5b0b2e7", - "metadata": {}, - "outputs": [], - "source": [ - "out_f32_list = [list(struct.unpack(fmt, x[0])) for x in result]\n", - "out_f32_list[:3]" - ] - }, - { - "cell_type": "markdown", - "id": "2b2db64d-8e96-4f59-b91a-3731ee934287", - "metadata": {}, - "source": [ - "### Using pandas\n", - "\n", - "The pandas package has utilities for working with databases. The two primary methods / functions are\n", - "`DataFrame.to_sql` which uploads `DataFrame` data to a table, and `pd.read_sql` which downloads\n", - "data from a table." 
- ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "a53088c8-af5e-40f6-84b5-aa83cc81303f", - "metadata": {}, - "outputs": [], - "source": [ - "reset_table()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "50511825-6506-45b4-9b36-607dcee37dea", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "id": "91b876a8-da7d-48b5-89a9-8149fab91566", - "metadata": {}, - "source": [ - "First, we'll create a pandas `DataFrame` with our numpy arrays." - ] - }, - { - "cell_type": "markdown", - "id": "fdf50e43-68a2-4cfb-a6a0-215d442f27c8", - "metadata": {}, - "source": [ - "#### Working with numpy arrays" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "7f871623-9176-4865-97f4-5e89cf7c3a70", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(dict(vec_f32=pd.Series(vec_f32)))\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "c37150fa-e5f1-49d5-b13b-e26e9e88ed92", - "metadata": {}, - "source": [ - "We can use the `to_sql` method of the `DataFrame` to upload the data. Notice that we are using the SQLAlchemy\n", - "connection we created in the previous section as the `con` parameter." - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "1a853637-f29e-434a-9dd4-d2fb92bc4597", - "metadata": {}, - "outputs": [], - "source": [ - "df.to_sql('vectors', con=conn, if_exists='append', index=False)" - ] - }, - { - "cell_type": "markdown", - "id": "67fdc9d4-9d48-4af9-a4f9-b643a43992b9", - "metadata": {}, - "source": [ - "To read the data, we use the `read_sql` function. As before, we are getting byte strings back that will need to be\n", - "converted into numpy arrays." 
- ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "a75c5726-0ee7-4876-aac7-e71dc9752eae", - "metadata": {}, - "outputs": [], - "source": [ - "out_df = pd.read_sql('vectors', con=conn)\n", - "out_df.head(3)" - ] - }, - { - "cell_type": "markdown", - "id": "9d774b5f-88f9-45b3-a54d-229020aa16af", - "metadata": {}, - "source": [ - "We apply the `np.frombuffer` function to each element in the `vec_f32` column to reconstruct the numpy array." - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "48b56238-b251-479f-9d1f-271f46a7111e", - "metadata": {}, - "outputs": [], - "source": [ - "out_df['vec_f32'] = out_df['vec_f32'].apply(lambda x: np.frombuffer(x, dtype=np.float32))" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "c4e77d6b-a93c-47d2-89ce-b1c502950c71", - "metadata": {}, - "outputs": [], - "source": [ - "out_df.head(3)" - ] - }, - { - "cell_type": "markdown", - "id": "71b184dd-641c-4ef0-91cf-c581143d3945", - "metadata": {}, - "source": [ - "#### Working with Python lists\n", - "\n", - "Because Python lists are not typed arrays like numpy arrays, we have to convert them to bytes before\n", - "uploading them." - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "43187411-efe0-465d-b6dd-a167534f6823", - "metadata": {}, - "outputs": [], - "source": [ - "reset_table()" - ] - }, - { - "cell_type": "markdown", - "id": "6b0fa295-99e9-4846-9996-a704df463a36", - "metadata": {}, - "source": [ - "Construct a `DataFrame` using Python lists as the data." 
- ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "1ed1b6c2-3c79-42b9-a671-41b2828c4c31", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(dict(vec_f32=vec_f32_list))\n", - "df.head(3)" - ] - }, - { - "cell_type": "markdown", - "id": "94263962-9ec2-4e34-a08e-1e2ad41247dd", - "metadata": {}, - "source": [ - "Note that we are using our `fmt` value from a previous section to convert the Python lists\n", - "to bytes using `struct.pack`." - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "3cb1b6e1-a732-4a2f-a751-095d6727e6ae", - "metadata": {}, - "outputs": [], - "source": [ - "fmt" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "6cdaafa5-7406-488b-a780-744f23b5c0e4", - "metadata": {}, - "outputs": [], - "source": [ - "df['vec_f32'] = df['vec_f32'].apply(lambda x: struct.pack(fmt, *x))" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "af739340-e5fd-482a-96c8-5eedf8202f1c", - "metadata": {}, - "outputs": [], - "source": [ - "df['vec_f32'].head(3)" - ] - }, - { - "cell_type": "markdown", - "id": "6f2d8675-c1ee-44d2-ac17-eef1c543d71c", - "metadata": {}, - "source": [ - "Use the `to_sql` method to upload the `DataFrame`." - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "49dde7bd-9823-4c55-8f34-4e16643e6b8e", - "metadata": {}, - "outputs": [], - "source": [ - "df.to_sql('vectors', con=conn, if_exists='append', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "137a7f8e-d713-4179-bcad-66f194d1f839", - "metadata": {}, - "outputs": [], - "source": [ - "out_df = pd.read_sql('vectors', con=conn)\n", - "out_df.head(3)" - ] - }, - { - "cell_type": "markdown", - "id": "99233fdb-57b2-4290-9038-7c3e5eaf553e", - "metadata": {}, - "source": [ - "We now have to convert the byte strings back to Python lists." 
- ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "a60f967c-c8fe-4ad9-a11f-25f5fb35ce69", - "metadata": {}, - "outputs": [], - "source": [ - "out_df['vec_f32'] = out_df['vec_f32'].apply(lambda x: list(struct.unpack(fmt, x)))" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "2924f8b8-f543-4a2f-90c8-8e6e5c15275d", - "metadata": {}, - "outputs": [], - "source": [ - "out_df.head(3)" - ] - }, - { - "cell_type": "markdown", - "id": "8f070295-78e3-4137-82d6-8be8c64b3898", - "metadata": {}, - "source": [ - "### Using the `%%sql` / `%sql` magic commands\n", - "\n", - "While the SQL magic commands are convenient for invoking basic SQL commands, they aren't quite as good\n", - "for complex queries that insert data. The primary issue is that you must construct the query as a string\n", - "and ensure that all of your data is properly escaped. We'll demonstrate some basics here, but the\n", - "methods described in the previous sections are likely to work better." - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "5f13939e-2254-4956-9537-315f1dde1b63", - "metadata": {}, - "outputs": [], - "source": [ - "reset_table()" - ] - }, - { - "cell_type": "markdown", - "id": "3ac2349f-d2bd-452d-9e4f-d869ef0e774f", - "metadata": {}, - "source": [ - "#### Working with numpy arrays or Python lists\n", - "\n", - "The SQL magic commands do not do any of the automatic conversions of data to query parameters, so this must be done\n", - "manually before creating the query. This is done the same way whether the source is numpy arrays or Python lists.\n", - "In either case, you must convert the objects to byte strings as we have in the previous sections, then convert that\n", - "byte string into a hex literal that can be used in the query." 
- ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "f6781046-e636-4495-8a99-e035db8988aa", - "metadata": {}, - "outputs": [], - "source": [ - "# Convert an element of the numpy array to a hex string\n", - "vec_f32[0].tobytes().hex()" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "957f98e1-c3d5-4e7c-b43a-5583cdff045e", - "metadata": {}, - "outputs": [], - "source": [ - "# Convert an element of the Python list to a hex string\n", - "struct.pack(fmt, *vec_f32_list[0]).hex()" - ] - }, - { - "cell_type": "markdown", - "id": "5424355e-fffb-4cc7-b0c3-eba7012d1bd1", - "metadata": {}, - "source": [ - "To construct the query string for the `%%sql` command, we need to build the entire list of values to insert\n", - "in a separate step. We'll insert the `X` at the beginning of the string to indicate a hex literal to\n", - "SingleStoreDB. We'll also add the parentheses around the value for inserting multiple rows of data using\n", - "the `INSERT` statement." - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "e269be32-5b56-4e19-baed-6420d6fd4bfb", - "metadata": {}, - "outputs": [], - "source": [ - "params = [\"(X'{}')\".format(x.tobytes().hex()) for x in vec_f32]\n", - "params[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "bd51d277-eec1-4787-b9b9-7a943f3eea0c", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "INSERT INTO vectors(vec_f32) VALUES {{ ','.join(params) }}" - ] - }, - { - "cell_type": "markdown", - "id": "5b982cc3-5e8a-460c-beff-440dbae58144", - "metadata": {}, - "source": [ - "We can now select the data." 
- ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "8a9ce43b-2ebc-4c9a-8898-afacaff13df9", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql out <<\n", - "SELECT * FROM vectors LIMIT 5" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "b10b2f71-02fd-4630-8ae0-7845a8385934", - "metadata": {}, - "outputs": [], - "source": [ - "out" - ] - }, - { - "cell_type": "markdown", - "id": "3178ea48-2bfd-44d2-8211-a291dd5bf5ba", - "metadata": {}, - "source": [ - "At this point, there is nothing we can do with SQL magic commands to convert the data back into numpy arrays or Python\n", - "lists. We need to drop to Python for that." - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "5b6d0c98-a016-423c-a460-aa617615bcdf", - "metadata": {}, - "outputs": [], - "source": [ - "out_df = pd.DataFrame(out)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "5a3e6552-b66c-460e-9394-04b6b1a25795", - "metadata": {}, - "outputs": [], - "source": [ - "out_df['vec_f32'] = out_df['vec_f32'].apply(lambda x: np.frombuffer(x, dtype=np.float32))" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "821ac65c-b8d5-47f5-8b14-945ed8e8d1fa", - "metadata": {}, - "outputs": [], - "source": [ - "out_df.head(3)" - ] - }, - { - "cell_type": "markdown", - "id": "ad5c2b96-0002-4948-87a4-949a68c3e0a2", - "metadata": {}, - "source": [ - "### Using JSON\n", - "\n", - "It is also possible to use JSON to create vectors, however, this method require serializing and deserializing JSON on either\n", - "end which isn't quite a efficient as the techniques above. It also requires using the `JSON_ARRAY_PACK` and `JSON_ARRAY_UNPACK`\n", - "functions in your queries to go back and forth between the vector bytes and JSON. Here is an example of inserting the\n", - "Python list of floats." 
- ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "df5af6d1-15e1-4867-a02c-31634a65393b", - "metadata": {}, - "outputs": [], - "source": [ - "import json" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "79f06760-9039-408a-a4c2-6331947dd3e4", - "metadata": {}, - "outputs": [], - "source": [ - "params = ['(JSON_ARRAY_PACK(\"{}\"))'.format(json.dumps(x)) for x in vec_f32_list]\n", - "params[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "92217c8d-f374-49a6-8fb8-f21666681f95", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql\n", - "INSERT INTO vectors(vec_f32) VALUES {{ ','.join(params) }}" - ] - }, - { - "cell_type": "markdown", - "id": "bd323c0a-7de4-4c56-9fc4-f2a22f4f661c", - "metadata": {}, - "source": [ - "If you use the `JSON_ARRAY_UNPACK` function in your `SELECT` statement, you can download the data as JSON." - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "b9aac5ba-efea-466b-82c7-12fa02174630", - "metadata": {}, - "outputs": [], - "source": [ - "%%sql out <<\n", - "SELECT JSON_ARRAY_UNPACK(vec_f32) AS 'vec_f32' FROM vectors LIMIT 5" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "9f9e9b6e-5a25-483b-9d40-aea95a302b5f", - "metadata": {}, - "outputs": [], - "source": [ - "out = pd.DataFrame(out)\n", - "out" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "bbfe895a-0f09-4094-a835-793329ee388e", - "metadata": {}, - "outputs": [], - "source": [ - "out['vec_f32'][0]" - ] - }, - { - "cell_type": "markdown", - "id": "6bdb300a-a8f6-40cc-a5f9-de54508bb22b", - "metadata": {}, - "source": [ - "Notice that since the data type of the column in the `SELECT` is JSON, it automatically gets converted to a Python list\n", - "in the client." 
- ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "b5234146-b058-4462-b5d0-516ae699efc6", - "metadata": {}, - "outputs": [], - "source": [ - "type(out['vec_f32'][0])" - ] - }, - { - "cell_type": "markdown", - "id": "8a8cfd32-c903-4e9b-b27b-253fcbca6ad4", - "metadata": {}, - "source": [ - "## Conclusion\n", - "\n", - "As you can see, there are various interfaces available for uploading and downloading vector data. Depending on\n", - "which Python framework you are using and what format your data is in, you can pick and choose which\n", - "methods work for your use-case." - ] - }, - { - "cell_type": "markdown", - "id": "42060943", - "metadata": {}, - "source": [ - "
\n", - " \n", - "
\n", - "

Action Required

\n", - "

If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

\n", - "
\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "8f911f36-0153-4959-828a-41e637cc9887", - "metadata": {}, - "outputs": [], - "source": [ - "shared_tier_check = %sql show variables like 'is_shared_tier'\n", - "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", - " %sql DROP DATABASE IF EXISTS vector_data;" - ] - }, - { - "cell_type": "markdown", - "id": "546a9cee-db0a-438b-9fcc-081223339a9f", - "metadata": {}, - "source": [ - "
\n", - "
" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "b80549ed-1471-4fc5-8b1a-a3050246078a", + "metadata": {}, + "source": "
\n
\n \n
\n
\n
SingleStore Notebooks
\n

Working with Vector Data

\n
\n
" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "09cb34bc", + "metadata": {}, + "source": "
\n \n
\n

Note

\n

This notebook can be run on a Free Starter Workspace. To create a Free Starter Workspace navigate to Start using the left nav. You can also use your existing Standard or Premium workspace with this Notebook.

\n
\n
" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "782ffea9-fbc0-4942-8a1a-da8788ed2fec", + "metadata": {}, + "source": "Using vector embeddings has become popular recently, but getting vector data into your\ndatabase can leave you with a lot of questions. This notebook shows various ways to\nload vectors into SingleStoreDB from Python using the Python client, SQLAlchemy, pandas,\nand the SQL magic commands. It covers vectors in the form of numpy arrays or Python lists\nof numerics.\n\nWe'll use the following function to reset the vector data table between examples." + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f7fe2c95-9e0d-4b1b-ad24-d0536c4ef2d9", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:17:48.855573Z", + "iopub.status.busy": "2024-07-08T10:17:48.855142Z", + "iopub.status.idle": "2024-07-08T10:17:48.859970Z", + "shell.execute_reply": "2024-07-08T10:17:48.859339Z", + "shell.execute_reply.started": "2024-07-08T10:17:48.855543Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "def reset_table():\n \"\"\"Reset the table for use in the examples below.\"\"\"\n with s2.connect() as conn:\n with conn.cursor() as cur:\n cur.execute('DROP TABLE IF EXISTS vectors;')\n cur.execute(r'''\n CREATE TABLE vectors (\n vec_f32 BLOB\n );\n ''')" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d087092f-696c-4735-9c66-33b8efc885ca", + "metadata": {}, + "source": "At any time, if you want to see the actual query being sent to the database, you can set the following\nenvironment variable before making the query to the server." 
+ }, + { + "cell_type": "code", + "execution_count": 17, + "id": "45628671-dee1-41fe-ae77-b8c651c8c389", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:17:56.269133Z", + "iopub.status.busy": "2024-07-08T10:17:56.268821Z", + "iopub.status.idle": "2024-07-08T10:17:56.272891Z", + "shell.execute_reply": "2024-07-08T10:17:56.272271Z", + "shell.execute_reply.started": "2024-07-08T10:17:56.269107Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "import os\n\n# os.environ['SINGLESTOREDB_DEBUG_QUERIES'] = '1'" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9b1cd9d3", + "metadata": {}, + "source": "
\n \n
\n

Action Required

\n

If you have a Free Starter Workspace deployed already, select the database from drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

\n
\n
" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e2e322f5-b81d-4249-b512-bd36f88aa168", + "metadata": {}, + "source": "Create a database for our examples." + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "fec12f93-7ca6-4f77-bc7f-355b0bfa98f9", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:02.152863Z", + "iopub.status.busy": "2024-07-08T10:18:02.152449Z", + "iopub.status.idle": "2024-07-08T10:18:02.170967Z", + "shell.execute_reply": "2024-07-08T10:18:02.170417Z", + "shell.execute_reply.started": "2024-07-08T10:18:02.152823Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "shared_tier_check = %sql show variables like 'is_shared_tier'\nif not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n %sql DROP DATABASE IF EXISTS vector_data;\n %sql CREATE DATABASE vector_data;" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1f2db020-7f76-44d0-9b32-cc81d35979ef", + "metadata": {}, + "source": "
\n \n
\n

Action Required

\n

Make sure to select the vector_data database from the drop-down menu at the top of this notebook. It updates the connection_url which is used by the %%sql magic command and SQLAlchemy to make connections to the selected database.

\n
\n
" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3e65bd3b-49b4-48ca-8409-e3da89ebcce4", + "metadata": {}, + "source": "## Generate numpy arrays containing vector data\n\nThe code belowe generates 1,000 rows of 10 random 32-bit float numbers in a numpy array.\nThis data will be used in the following examples." + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c9fd6e9f-5513-45b3-bc4f-395e115ccd9e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:10.258253Z", + "iopub.status.busy": "2024-07-08T10:18:10.257852Z", + "iopub.status.idle": "2024-07-08T10:18:10.262656Z", + "shell.execute_reply": "2024-07-08T10:18:10.262157Z", + "shell.execute_reply.started": "2024-07-08T10:18:10.258218Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "import numpy as np" + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "aab67ef8-8bd1-4f9e-957a-ac8248984f4f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:11.377606Z", + "iopub.status.busy": "2024-07-08T10:18:11.376977Z", + "iopub.status.idle": "2024-07-08T10:18:11.385730Z", + "shell.execute_reply": "2024-07-08T10:18:11.385210Z", + "shell.execute_reply.started": "2024-07-08T10:18:11.377575Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "[array([0.94529617, 0.5391597 , 0.3666218 , 0.27040002, 0.33434793,\n 0.81942284, 0.7387169 , 0.0188459 , 0.07248586, 0.5413058 ],\n dtype=float32),\n array([0.39620587, 0.56646174, 0.09738464, 0.6073699 , 0.86925113,\n 0.40876037, 0.17535466, 0.5120548 , 0.03570552, 0.842908 ],\n dtype=float32),\n array([0.10918448, 0.43081337, 0.03388631, 0.55986017, 0.80183506,\n 0.6763027 , 0.25283858, 0.41930103, 0.7678156 , 0.13405219],\n dtype=float32)]" + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "vec_f32 = [np.random.rand(10).astype(np.float32) for _ in 
range(1000)]\nvec_f32[:3]" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1539013e-6ad8-49cc-aafd-e0aa5c2dbf60", + "metadata": {}, + "source": "### Create a Python list of float values from the numpy array\n\nWe will show how to work with both numpy arrays and Python lists in the following examples.\nThis cell creates a list of Python lists of floats equivalent to the numpy arrays above." + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "c72202fa-3a15-42a0-83f2-2650a6d5faa6", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:16.815963Z", + "iopub.status.busy": "2024-07-08T10:18:16.815597Z", + "iopub.status.idle": "2024-07-08T10:18:16.833510Z", + "shell.execute_reply": "2024-07-08T10:18:16.832856Z", + "shell.execute_reply.started": "2024-07-08T10:18:16.815935Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "[[0.9452961683273315,\n 0.5391597151756287,\n 0.36662179231643677,\n 0.2704000174999237,\n 0.3343479335308075,\n 0.8194228410720825,\n 0.7387169003486633,\n 0.018845897167921066,\n 0.07248586416244507,\n 0.5413057804107666],\n [0.396205872297287,\n 0.5664617419242859,\n 0.09738463908433914,\n 0.6073698997497559,\n 0.8692511320114136,\n 0.4087603688240051,\n 0.17535465955734253,\n 0.5120548009872437,\n 0.03570551797747612,\n 0.8429080247879028],\n [0.10918448120355606,\n 0.43081337213516235,\n 0.03388631343841553,\n 0.5598601698875427,\n 0.8018350601196289,\n 0.6763026714324951,\n 0.2528385818004608,\n 0.41930103302001953,\n 0.7678155899047852,\n 0.13405218720436096]]" + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "vec_f32_list = [list([float(y) for y in x]) for x in vec_f32]\nvec_f32_list[:3]" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ebe71955-7358-4c7c-add8-162f5bca098a", + "metadata": {}, + "source": "## Uploading and downloading data to SingleStoreDB\n\nIn the following 
sections, we'll describe how to use the SingleStoreDB Python client, SQLAlchemy, the `%%sql` magic,\nand pandas to upload and download vector data." + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2860a4f6-bfc6-4bc0-89d8-6c9d765f1240", + "metadata": {}, + "source": "### Using SingleStoreDB Python client" + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "35cd7e37-d05f-424f-98c7-ae61958c42d5", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:23.121668Z", + "iopub.status.busy": "2024-07-08T10:18:23.121294Z", + "iopub.status.idle": "2024-07-08T10:18:23.179874Z", + "shell.execute_reply": "2024-07-08T10:18:23.179303Z", + "shell.execute_reply.started": "2024-07-08T10:18:23.121631Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "import singlestoredb as s2\n\nconn = s2.connect()\ncursor = conn.cursor()" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "66e77736-4625-481b-9991-d7e7f28401cb", + "metadata": {}, + "source": "#### Working with numpy arrays" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2d1453cd-21d2-4843-a41a-6aa1a33ce0a1", + "metadata": {}, + "source": "The SingleStoreDB Python client supports numpy arrays natively. If a numpy array is passed as a parameter to a query,\nit will be converted to a byte string containing the contents of the array. The data type of the numpy array is\npreserved, so you need to ensure that it is the proper numpy dtype before uploading. You can change the data type\nof a numpy array by using the `astype` method." 
+ }, + { + "cell_type": "code", + "execution_count": 24, + "id": "5fa23885-106d-4b37-ade2-d7b6e6c8b593", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:31.557933Z", + "iopub.status.busy": "2024-07-08T10:18:31.557522Z", + "iopub.status.idle": "2024-07-08T10:18:31.786400Z", + "shell.execute_reply": "2024-07-08T10:18:31.785791Z", + "shell.execute_reply.started": "2024-07-08T10:18:31.557894Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "reset_table()" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a752e82f-bdf9-442e-94eb-9e29459da840", + "metadata": {}, + "source": "Recall that `vec_f32` contained numpy arrays of float32 values." + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "9fcdb1ce-254b-4420-815e-76cb2199ac05", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:34.245193Z", + "iopub.status.busy": "2024-07-08T10:18:34.244825Z", + "iopub.status.idle": "2024-07-08T10:18:34.250481Z", + "shell.execute_reply": "2024-07-08T10:18:34.249923Z", + "shell.execute_reply.started": "2024-07-08T10:18:34.245165Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "[array([0.94529617, 0.5391597 , 0.3666218 , 0.27040002, 0.33434793,\n 0.81942284, 0.7387169 , 0.0188459 , 0.07248586, 0.5413058 ],\n dtype=float32),\n array([0.39620587, 0.56646174, 0.09738464, 0.6073699 , 0.86925113,\n 0.40876037, 0.17535466, 0.5120548 , 0.03570552, 0.842908 ],\n dtype=float32),\n array([0.10918448, 0.43081337, 0.03388631, 0.55986017, 0.80183506,\n 0.6763027 , 0.25283858, 0.41930103, 0.7678156 , 0.13405219],\n dtype=float32)]" + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "vec_f32[:3]" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "df0f98b0-d916-4113-a34c-e0c13cffa242", + "metadata": {}, + "source": "The `executemany` method will insert multiple rows of 
data in a single SQL query." + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "b55d0954-9e8c-468b-b1da-019a3adf4fd2", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:39.098628Z", + "iopub.status.busy": "2024-07-08T10:18:39.098235Z", + "iopub.status.idle": "2024-07-08T10:18:39.189921Z", + "shell.execute_reply": "2024-07-08T10:18:39.189227Z", + "shell.execute_reply.started": "2024-07-08T10:18:39.098588Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "1000" + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "cursor.executemany('INSERT INTO vectors(vec_f32) VALUES (%s)', vec_f32)" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f929f1ed-2ee1-4209-a27d-121bec2a3a79", + "metadata": {}, + "source": "To download the vector data from SingleStoreDB, you simply execute a `SELECT` statement. The data is held in\nblob columns, so the result will simply contain byte strings." + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "218071ef-0742-460b-b0a4-b079970ae568", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:41.361391Z", + "iopub.status.busy": "2024-07-08T10:18:41.361072Z", + "iopub.status.idle": "2024-07-08T10:18:41.464475Z", + "shell.execute_reply": "2024-07-08T10:18:41.463938Z", + "shell.execute_reply.started": "2024-07-08T10:18:41.361365Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "5" + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "cursor.execute('SELECT vec_f32 FROM vectors LIMIT 5')" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "22892481-3d71-48aa-abe3-ffd63b309419", + "metadata": {}, + "source": "Since we want to use the data as numpy arrays, we can \"reconstitute\" the arrays as we read the data using the `np.frombuffer` function." 
+ }, + { + "cell_type": "code", + "execution_count": 28, + "id": "52bfac93-5503-4144-8700-95db21f13897", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:44.107280Z", + "iopub.status.busy": "2024-07-08T10:18:44.106910Z", + "iopub.status.idle": "2024-07-08T10:18:44.113105Z", + "shell.execute_reply": "2024-07-08T10:18:44.112599Z", + "shell.execute_reply.started": "2024-07-08T10:18:44.107248Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "[array([0.7970012 , 0.34688511, 0.14492278, 0.73332036, 0.37237272,\n 0.15225586, 0.26400378, 0.1706023 , 0.2066024 , 0.2986435 ],\n dtype=float32),\n array([0.15736586, 0.7116634 , 0.55218774, 0.8527479 , 0.8662606 ,\n 0.21808125, 0.19137949, 0.19946271, 0.31750116, 0.4048979 ],\n dtype=float32),\n array([0.329683 , 0.7214109 , 0.48456433, 0.6002015 , 0.4030805 ,\n 0.32164323, 0.02634622, 0.10913838, 0.13508031, 0.33974582],\n dtype=float32),\n array([0.35889304, 0.37261793, 0.22267127, 0.57628405, 0.10873934,\n 0.66360027, 0.67708856, 0.69097304, 0.8924684 , 0.07560002],\n dtype=float32),\n array([0.46105748, 0.24171682, 0.3059963 , 0.95824414, 0.33805165,\n 0.30686185, 0.89336896, 0.70329565, 0.45199844, 0.6623023 ],\n dtype=float32)]" + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "out_f32 = [np.frombuffer(x[0], dtype=np.float32) for x in cursor]\nout_f32" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "390b149f-8039-43ee-ae43-215ea7997a4f", + "metadata": {}, + "source": "#### Working with Python lists\n\nIt is also possible to upload Python lists without going through a numpy array using the [struct](https://docs.python.org/3/library/struct.html) package. In this method, we convert\nthe floats to a byte string and pass that byte string as the parameter to the `INSERT` statement. 
The possible format codes are as follows.\nThe little-endian indicator (`<`) should also be used.\n\n* f - float32\n* d - float64\n* b - int8\n* h - int16\n* l - int32\n* q - int64" + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "5707a569-4361-4d69-a078-5c71bb547dce", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:47.096886Z", + "iopub.status.busy": "2024-07-08T10:18:47.096493Z", + "iopub.status.idle": "2024-07-08T10:18:47.260905Z", + "shell.execute_reply": "2024-07-08T10:18:47.260302Z", + "shell.execute_reply.started": "2024-07-08T10:18:47.096847Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "reset_table()" + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "a0777da4-daba-4b06-8fb6-c7fcc30dcc25", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:48.953913Z", + "iopub.status.busy": "2024-07-08T10:18:48.953526Z", + "iopub.status.idle": "2024-07-08T10:18:48.960729Z", + "shell.execute_reply": "2024-07-08T10:18:48.960117Z", + "shell.execute_reply.started": "2024-07-08T10:18:48.953859Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "[b'\\xee\\xfeq?_\\x06\\n?\\xda\\xb5\\xbb>\\xdfq\\x8a>\\xa7/\\xab>\\xb2\\xc5Q?\\x8d\\x1c=?\\xb6b\\x9a\\xa3\\x03\\x11?\\x99q\\xc7=\\x98|\\x1b?>\\x87^?\\nI\\xd1>,\\x903>\\x06\\x16\\x03?\\xf3?\\x12=\\xd2\\xc8W?',\n b'\\x1d\\x9c\\xdf=\\x92\\x93\\xdc>`\\xcc\\n=\\xffR\\x0f?\\x10EM?,\"-?\\x0ft\\x81>\\xa0\\xae\\xd6>\\x90\\x8fD?\\xfaD\\t>']" + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "import struct\n\n# Construct the format for a vector of 10 32-bit floats, in this case it is '<10f'\nfmt = '<{}f'.format(len(vec_f32_list[0]))\n\nvec_f32_list_bytes = [struct.pack(fmt, *x) for x in vec_f32_list]\nvec_f32_list_bytes[:3]" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": 
"77a3b930-33cd-4436-a021-9e99ed94cd9c", + "metadata": {}, + "source": "##### The `INSERT` and `SELECT` code is the same as for numy arrays" + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "0a1f4d5b-50f1-4987-b8f8-613b2b6f03bd", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:51.292495Z", + "iopub.status.busy": "2024-07-08T10:18:51.292173Z", + "iopub.status.idle": "2024-07-08T10:18:51.385711Z", + "shell.execute_reply": "2024-07-08T10:18:51.385230Z", + "shell.execute_reply.started": "2024-07-08T10:18:51.292469Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "1000" + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "cursor.executemany('INSERT INTO vectors(vec_f32) VALUES (%s)', vec_f32_list_bytes)" + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "171acbee-c663-4073-843b-a3f83fa0a99a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:52.173344Z", + "iopub.status.busy": "2024-07-08T10:18:52.172822Z", + "iopub.status.idle": "2024-07-08T10:18:52.195163Z", + "shell.execute_reply": "2024-07-08T10:18:52.194551Z", + "shell.execute_reply.started": "2024-07-08T10:18:52.173317Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "5" + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "cursor.execute('SELECT vec_f32 FROM vectors LIMIT 5')" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b0b40daa-52a9-4bf8-aefd-4722974cb8f5", + "metadata": {}, + "source": "To unpack the rows as Python lists, we use the `struct` package again." 
+ }, + { + "cell_type": "code", + "execution_count": 34, + "id": "63490736-c68b-49d5-8db1-8ec203c7a583", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:54.463863Z", + "iopub.status.busy": "2024-07-08T10:18:54.463482Z", + "iopub.status.idle": "2024-07-08T10:18:54.468633Z", + "shell.execute_reply": "2024-07-08T10:18:54.468135Z", + "shell.execute_reply.started": "2024-07-08T10:18:54.463812Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "[[0.3937252461910248,\n 0.5025281310081482,\n 0.17226243019104004,\n 0.7201003432273865,\n 0.987917423248291,\n 0.36919161677360535,\n 0.03498654067516327,\n 0.7478368878364563,\n 0.34253644943237305,\n 0.33940786123275757],\n [0.761231005191803,\n 0.8932342529296875,\n 0.06776423007249832,\n 0.8769919276237488,\n 0.48779383301734924,\n 0.9544709920883179,\n 0.8270399570465088,\n 0.9150049686431885,\n 0.8350704908370972,\n 0.9739500880241394],\n [0.9656015634536743,\n 0.4987963140010834,\n 0.6006644368171692,\n 0.000701306969858706,\n 0.5339081287384033,\n 0.22828376293182373,\n 0.3365790545940399,\n 0.2838159203529358,\n 0.3415278196334839,\n 0.7082713842391968]]" + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "out_f32_list = [list(struct.unpack(fmt, x[0])) for x in cursor]\nout_f32_list[:3]" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "91927fbd-f19c-448a-926a-d4ee8dc3e607", + "metadata": {}, + "source": "### Using SQLAlchemy\n\nIn order to use SingleStoreDB with SQLAlchemy, you need to install the `sqlalchemy-singlestoredb` dialect as follows.\n\n```\npip install sqlalchemy-singlestoredb\n```" + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "7d715701-eba1-4868-b223-a17a6fa4b6ce", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:56.780648Z", + "iopub.status.busy": "2024-07-08T10:18:56.780341Z", + "iopub.status.idle": 
"2024-07-08T10:18:56.865380Z", + "shell.execute_reply": "2024-07-08T10:18:56.864512Z", + "shell.execute_reply.started": "2024-07-08T10:18:56.780622Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "import sqlalchemy as sa\n\neng = sa.create_engine(connection_url)\nconn = eng.connect()" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6ca1960a-a55e-465c-a4f2-3daeb56e2739", + "metadata": {}, + "source": "The SQLAlchemy method works much like the SingleStoreDB method. However, SQLAlchemy (v2+) requires parameters to be\nin a dictionary, and the substitution syntax is of the form `:var_name` where 'var_name' in the key in the dictionary." + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "473114ce-4b51-484d-90d9-eaafce4d4b58", + "metadata": {}, + "source": "#### Working with numpy arrays" + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "74707c74-2529-43e1-ba87-b693403b5e8d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:18:59.775878Z", + "iopub.status.busy": "2024-07-08T10:18:59.775248Z", + "iopub.status.idle": "2024-07-08T10:18:59.960585Z", + "shell.execute_reply": "2024-07-08T10:18:59.959930Z", + "shell.execute_reply.started": "2024-07-08T10:18:59.775840Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "reset_table()" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "86eff78c-4b8f-40d1-bc9f-978fd39dada6", + "metadata": {}, + "source": "SQLAlchemy requires you to construct the query as a `sa.text` object. Parameters for inserting multple\nrows are in a list of dictionaries." 
+ }, + { + "cell_type": "code", + "execution_count": 38, + "id": "03905527-9239-4fd7-9a9b-4c35da0b7447", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:01.260387Z", + "iopub.status.busy": "2024-07-08T10:19:01.259954Z", + "iopub.status.idle": "2024-07-08T10:19:01.391284Z", + "shell.execute_reply": "2024-07-08T10:19:01.390589Z", + "shell.execute_reply.started": "2024-07-08T10:19:01.260347Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "" + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "query = sa.text('INSERT INTO vectors(vec_f32) VALUES (:vec_f32)')\nconn.execute(query, [dict(vec_f32=x) for x in vec_f32])" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f95fb2be-e513-4555-b580-118f337e0f19", + "metadata": {}, + "source": "Selecting the data works much as before as well." + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "d7b22128-188c-475e-a1cb-5c52261d8403", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:03.961370Z", + "iopub.status.busy": "2024-07-08T10:19:03.961041Z", + "iopub.status.idle": "2024-07-08T10:19:04.043306Z", + "shell.execute_reply": "2024-07-08T10:19:04.042658Z", + "shell.execute_reply.started": "2024-07-08T10:19:03.961344Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "result = conn.execute(sa.text('SELECT vec_f32 FROM vectors LIMIT 5'))" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f7bc12cf-6ce6-4c20-8fa4-e83c2cb49e71", + "metadata": {}, + "source": "We can use the `np.frombuffer` function again to convert the byte strings to numpy arrays." 
+ }, + { + "cell_type": "code", + "execution_count": 40, + "id": "3391ee73-86c5-4913-b412-bf4d12fb9b68", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:05.737302Z", + "iopub.status.busy": "2024-07-08T10:19:05.736951Z", + "iopub.status.idle": "2024-07-08T10:19:05.743601Z", + "shell.execute_reply": "2024-07-08T10:19:05.743022Z", + "shell.execute_reply.started": "2024-07-08T10:19:05.737274Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "[array([0.06707381, 0.3336899 , 0.23638362, 0.54146034, 0.21330866,\n 0.57814604, 0.7436944 , 0.21778256, 0.32921487, 0.18143076],\n dtype=float32),\n array([0.17625922, 0.6122456 , 0.65093136, 0.680956 , 0.99456173,\n 0.785619 , 0.8397423 , 0.34446132, 0.9549833 , 0.53008443],\n dtype=float32),\n array([0.12105445, 0.27007556, 0.33191404, 0.35239697, 0.104354 ,\n 0.560923 , 0.95614606, 0.6793355 , 0.12789273, 0.01870769],\n dtype=float32),\n array([0.05535996, 0.13312466, 0.9434161 , 0.52270526, 0.24034844,\n 0.73964477, 0.8723515 , 0.02157358, 0.14537902, 0.8052284 ],\n dtype=float32),\n array([0.17335513, 0.87024 , 0.11818643, 0.40915504, 0.65390265,\n 0.519701 , 0.1028851 , 0.8442223 , 0.64491796, 0.31468135],\n dtype=float32)]" + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "out_f32 = [np.frombuffer(x[0], dtype=np.float32) for x in result]\nout_f32" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "893ff3b8-5f16-4736-b157-52eec72a6fea", + "metadata": {}, + "source": "#### Working with Python lists\n\nTo upload Python lists of values, you use the `struct` package to construct the byte strings as described in the\n\"Working with Python lists\" part of the previous section. The rest of the code here stays the same with the exception of\nreplacing `vec_f32` with `vec_f32_list_bytes` as the query parameter for the `INSERT` query." 
+ }, + { + "cell_type": "code", + "execution_count": 41, + "id": "4a4124d1-588b-408a-9d85-dc2acd8d8f31", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:08.294658Z", + "iopub.status.busy": "2024-07-08T10:19:08.294295Z", + "iopub.status.idle": "2024-07-08T10:19:08.456480Z", + "shell.execute_reply": "2024-07-08T10:19:08.455864Z", + "shell.execute_reply.started": "2024-07-08T10:19:08.294624Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "reset_table()" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3a09006f-6c12-40fd-b3d7-cd3b4c33f040", + "metadata": {}, + "source": "Recall that we created a list of bytes (vector) objects in the previous example. This list of vectors\ncan be passed to the `INSERT` as well as numpy arrays." + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "465ffe10-cc32-400c-adec-f4e91f25fb98", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:09.948319Z", + "iopub.status.busy": "2024-07-08T10:19:09.947985Z", + "iopub.status.idle": "2024-07-08T10:19:09.952550Z", + "shell.execute_reply": "2024-07-08T10:19:09.952002Z", + "shell.execute_reply.started": "2024-07-08T10:19:09.948287Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "[b'\\xee\\xfeq?_\\x06\\n?\\xda\\xb5\\xbb>\\xdfq\\x8a>\\xa7/\\xab>\\xb2\\xc5Q?\\x8d\\x1c=?\\xb6b\\x9a\\xa3\\x03\\x11?\\x99q\\xc7=\\x98|\\x1b?>\\x87^?\\nI\\xd1>,\\x903>\\x06\\x16\\x03?\\xf3?\\x12=\\xd2\\xc8W?',\n b'\\x1d\\x9c\\xdf=\\x92\\x93\\xdc>`\\xcc\\n=\\xffR\\x0f?\\x10EM?,\"-?\\x0ft\\x81>\\xa0\\xae\\xd6>\\x90\\x8fD?\\xfaD\\t>']" + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "vec_f32_list_bytes[:3]" + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "97b2069f-2cd2-4af5-95cc-87637d1fc838", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:10.797188Z", + 
"iopub.status.busy": "2024-07-08T10:19:10.796924Z", + "iopub.status.idle": "2024-07-08T10:19:10.881795Z", + "shell.execute_reply": "2024-07-08T10:19:10.881215Z", + "shell.execute_reply.started": "2024-07-08T10:19:10.797165Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "" + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "query = sa.text('INSERT INTO vectors(vec_f32) VALUES (:vec_f32)')\nconn.execute(query, [dict(vec_f32=x) for x in vec_f32_list_bytes])" + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "ea364348-8b95-4835-9481-11a7bf67fce0", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:12.211197Z", + "iopub.status.busy": "2024-07-08T10:19:12.210873Z", + "iopub.status.idle": "2024-07-08T10:19:12.271980Z", + "shell.execute_reply": "2024-07-08T10:19:12.271568Z", + "shell.execute_reply.started": "2024-07-08T10:19:12.211165Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "result = conn.execute(sa.text('SELECT vec_f32 FROM vectors LIMIT 5'))" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8fa7bd8e-8842-438f-a336-e93ecc321820", + "metadata": {}, + "source": "Unpacking the Python lists works as before as well." 
+ }, + { + "cell_type": "code", + "execution_count": 45, + "id": "78b0619f-a057-4edb-a230-1e96c5b0b2e7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:13.782840Z", + "iopub.status.busy": "2024-07-08T10:19:13.782465Z", + "iopub.status.idle": "2024-07-08T10:19:13.788115Z", + "shell.execute_reply": "2024-07-08T10:19:13.787433Z", + "shell.execute_reply.started": "2024-07-08T10:19:13.782807Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "[[0.6067176461219788,\n 0.2106485515832901,\n 0.6345869302749634,\n 0.6352039575576782,\n 0.829525887966156,\n 0.2823314368724823,\n 0.017225714400410652,\n 0.22034095227718353,\n 0.24362443387508392,\n 0.7712428569793701],\n [0.26649677753448486,\n 0.6021978259086609,\n 0.8979067206382751,\n 0.9429398775100708,\n 0.589701771736145,\n 0.24339258670806885,\n 0.3752290904521942,\n 0.34352484345436096,\n 0.647399365901947,\n 0.19694264233112335],\n [0.6997039914131165,\n 0.08066725730895996,\n 0.19695895910263062,\n 0.08963707834482193,\n 0.3289657235145569,\n 0.8245747089385986,\n 0.782729983329773,\n 0.43013912439346313,\n 0.765410304069519,\n 0.8552709817886353]]" + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "out_f32_list = [list(struct.unpack(fmt, x[0])) for x in result]\nout_f32_list[:3]" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2b2db64d-8e96-4f59-b91a-3731ee934287", + "metadata": {}, + "source": "### Using pandas\n\nThe pandas package has utilities for working with databases. The two primary methods / functions are\n`DataFrame.to_sql` which uploads `DataFrame` data to a table, and `pd.read_sql` which downloads\ndata from a table." 
+ }, + { + "cell_type": "code", + "execution_count": 46, + "id": "a53088c8-af5e-40f6-84b5-aa83cc81303f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:17.888364Z", + "iopub.status.busy": "2024-07-08T10:19:17.887727Z", + "iopub.status.idle": "2024-07-08T10:19:18.052419Z", + "shell.execute_reply": "2024-07-08T10:19:18.051726Z", + "shell.execute_reply.started": "2024-07-08T10:19:17.888328Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "reset_table()" + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "50511825-6506-45b4-9b36-607dcee37dea", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:18.663853Z", + "iopub.status.busy": "2024-07-08T10:19:18.663572Z", + "iopub.status.idle": "2024-07-08T10:19:18.666951Z", + "shell.execute_reply": "2024-07-08T10:19:18.666376Z", + "shell.execute_reply.started": "2024-07-08T10:19:18.663829Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "import pandas as pd" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "91b876a8-da7d-48b5-89a9-8149fab91566", + "metadata": {}, + "source": "First, we'll create a pandas `DataFrame` with our numpy arrays." + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fdf50e43-68a2-4cfb-a6a0-215d442f27c8", + "metadata": {}, + "source": "#### Working with numpy arrays" + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "7f871623-9176-4865-97f4-5e89cf7c3a70", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:20.728780Z", + "iopub.status.busy": "2024-07-08T10:19:20.728373Z", + "iopub.status.idle": "2024-07-08T10:19:20.740570Z", + "shell.execute_reply": "2024-07-08T10:19:20.739971Z", + "shell.execute_reply.started": "2024-07-08T10:19:20.728752Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
vec_f32
0[0.94529617, 0.5391597, 0.3666218, 0.27040002,...
1[0.39620587, 0.56646174, 0.09738464, 0.6073699...
2[0.10918448, 0.43081337, 0.033886313, 0.559860...
3[0.027094776, 0.03226529, 0.49422556, 0.171387...
4[0.65606296, 0.022113293, 0.57438064, 0.867151...
\n
", + "text/plain": " vec_f32\n0 [0.94529617, 0.5391597, 0.3666218, 0.27040002,...\n1 [0.39620587, 0.56646174, 0.09738464, 0.6073699...\n2 [0.10918448, 0.43081337, 0.033886313, 0.559860...\n3 [0.027094776, 0.03226529, 0.49422556, 0.171387...\n4 [0.65606296, 0.022113293, 0.57438064, 0.867151..." + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "df = pd.DataFrame(dict(vec_f32=pd.Series(vec_f32)))\ndf.head()" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c37150fa-e5f1-49d5-b13b-e26e9e88ed92", + "metadata": {}, + "source": "We can use the `to_sql` method of the `DataFrame` to upload the data. Notice that we are using the SQLAlchemy\nconnection we created in the previous section as the `con` parameter." + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "1a853637-f29e-434a-9dd4-d2fb92bc4597", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:23.525455Z", + "iopub.status.busy": "2024-07-08T10:19:23.525138Z", + "iopub.status.idle": "2024-07-08T10:19:23.657476Z", + "shell.execute_reply": "2024-07-08T10:19:23.656748Z", + "shell.execute_reply.started": "2024-07-08T10:19:23.525430Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "1000" + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "df.to_sql('vectors', con=conn, if_exists='append', index=False)" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "67fdc9d4-9d48-4af9-a4f9-b643a43992b9", + "metadata": {}, + "source": "To read the data, we use the `read_sql` function. As before, we are getting byte strings back that will need to be\nconverted into numpy arrays." 
+ }, + { + "cell_type": "code", + "execution_count": 50, + "id": "a75c5726-0ee7-4876-aac7-e71dc9752eae", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:25.078098Z", + "iopub.status.busy": "2024-07-08T10:19:25.077716Z", + "iopub.status.idle": "2024-07-08T10:19:25.183282Z", + "shell.execute_reply": "2024-07-08T10:19:25.182630Z", + "shell.execute_reply.started": "2024-07-08T10:19:25.078061Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
vec_f32
0b\"\\xc4.\\x81<\\x8a\\x0fW?\\x11~\\xcd>_\\x82@?Vq\\x05?...
1b'rp\\xdc>U\\xd7\\x89>\\xe6BC?\\xe7\\xcd\\xfb>P\\xe4\\x...
2b\"\\xaf\\x10,?\\xc9\\x8c\\\\?\\xa3\\xccQ>c\\xd0'?\\xe2y\\...
\n
", + "text/plain": " vec_f32\n0 b\"\\xc4.\\x81<\\x8a\\x0fW?\\x11~\\xcd>_\\x82@?Vq\\x05?...\n1 b'rp\\xdc>U\\xd7\\x89>\\xe6BC?\\xe7\\xcd\\xfb>P\\xe4\\x...\n2 b\"\\xaf\\x10,?\\xc9\\x8c\\\\?\\xa3\\xccQ>c\\xd0'?\\xe2y\\..." + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "out_df = pd.read_sql('vectors', con=conn)\nout_df.head(3)" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9d774b5f-88f9-45b3-a54d-229020aa16af", + "metadata": {}, + "source": "We apply the `np.frombuffer` function to each element in the `vec_f32` column to reconstruct the numpy array." + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "48b56238-b251-479f-9d1f-271f46a7111e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:31.138151Z", + "iopub.status.busy": "2024-07-08T10:19:31.137759Z", + "iopub.status.idle": "2024-07-08T10:19:31.144020Z", + "shell.execute_reply": "2024-07-08T10:19:31.143344Z", + "shell.execute_reply.started": "2024-07-08T10:19:31.138119Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "out_df['vec_f32'] = out_df['vec_f32'].apply(lambda x: np.frombuffer(x, dtype=np.float32))" + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "c4e77d6b-a93c-47d2-89ce-b1c502950c71", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:31.801250Z", + "iopub.status.busy": "2024-07-08T10:19:31.800889Z", + "iopub.status.idle": "2024-07-08T10:19:31.808158Z", + "shell.execute_reply": "2024-07-08T10:19:31.807685Z", + "shell.execute_reply.started": "2024-07-08T10:19:31.801223Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
vec_f32
0[0.01576937, 0.84008086, 0.40135244, 0.7519893...
1[0.4305454, 0.26922098, 0.76273954, 0.4918053,...
2[0.6721296, 0.8615232, 0.20488219, 0.6555235, ...
\n
", + "text/plain": " vec_f32\n0 [0.01576937, 0.84008086, 0.40135244, 0.7519893...\n1 [0.4305454, 0.26922098, 0.76273954, 0.4918053,...\n2 [0.6721296, 0.8615232, 0.20488219, 0.6555235, ..." + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "out_df.head(3)" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "71b184dd-641c-4ef0-91cf-c581143d3945", + "metadata": {}, + "source": "#### Working with Python lists\n\nBecause Python lists are not typed arrays like numpy arrays, we have to convert them to bytes before\nuploading them." + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "43187411-efe0-465d-b6dd-a167534f6823", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:34.513673Z", + "iopub.status.busy": "2024-07-08T10:19:34.513278Z", + "iopub.status.idle": "2024-07-08T10:19:34.677447Z", + "shell.execute_reply": "2024-07-08T10:19:34.676902Z", + "shell.execute_reply.started": "2024-07-08T10:19:34.513645Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "reset_table()" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6b0fa295-99e9-4846-9996-a704df463a36", + "metadata": {}, + "source": "Construct a `DataFrame` using Python lists as the data." + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "1ed1b6c2-3c79-42b9-a671-41b2828c4c31", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:35.886020Z", + "iopub.status.busy": "2024-07-08T10:19:35.885662Z", + "iopub.status.idle": "2024-07-08T10:19:35.895096Z", + "shell.execute_reply": "2024-07-08T10:19:35.894403Z", + "shell.execute_reply.started": "2024-07-08T10:19:35.885991Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
vec_f32
0[0.9452961683273315, 0.5391597151756287, 0.366...
1[0.396205872297287, 0.5664617419242859, 0.0973...
2[0.10918448120355606, 0.43081337213516235, 0.0...
\n
", + "text/plain": " vec_f32\n0 [0.9452961683273315, 0.5391597151756287, 0.366...\n1 [0.396205872297287, 0.5664617419242859, 0.0973...\n2 [0.10918448120355606, 0.43081337213516235, 0.0..." + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "df = pd.DataFrame(dict(vec_f32=vec_f32_list))\ndf.head(3)" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "94263962-9ec2-4e34-a08e-1e2ad41247dd", + "metadata": {}, + "source": "Note that we are using our `fmt` value from a previous section to convert the Python lists\nto bytes using `struct.pack`." + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "3cb1b6e1-a732-4a2f-a751-095d6727e6ae", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:38.742645Z", + "iopub.status.busy": "2024-07-08T10:19:38.742282Z", + "iopub.status.idle": "2024-07-08T10:19:38.747538Z", + "shell.execute_reply": "2024-07-08T10:19:38.746778Z", + "shell.execute_reply.started": "2024-07-08T10:19:38.742621Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "'<10f'" + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "fmt" + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "6cdaafa5-7406-488b-a780-744f23b5c0e4", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:39.431018Z", + "iopub.status.busy": "2024-07-08T10:19:39.430520Z", + "iopub.status.idle": "2024-07-08T10:19:39.436348Z", + "shell.execute_reply": "2024-07-08T10:19:39.435661Z", + "shell.execute_reply.started": "2024-07-08T10:19:39.430982Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "df['vec_f32'] = df['vec_f32'].apply(lambda x: struct.pack(fmt, *x))" + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "af739340-e5fd-482a-96c8-5eedf8202f1c", + "metadata": { + "execution": { + "iopub.execute_input": 
"2024-07-08T10:19:40.064483Z", + "iopub.status.busy": "2024-07-08T10:19:40.064070Z", + "iopub.status.idle": "2024-07-08T10:19:40.070538Z", + "shell.execute_reply": "2024-07-08T10:19:40.069900Z", + "shell.execute_reply.started": "2024-07-08T10:19:40.064445Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "0 b'\\xee\\xfeq?_\\x06\\n?\\xda\\xb5\\xbb>\\xdfq\\x8a>\\xa...\n1 b'\\x7f\\xdb\\xca>\\xa3\\x03\\x11?\\x99q\\xc7=\\x98|\\x1...\n2 b'\\x1d\\x9c\\xdf=\\x92\\x93\\xdc>`\\xcc\\n=\\xffR\\x0f?...\nName: vec_f32, dtype: object" + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "df['vec_f32'].head(3)" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6f2d8675-c1ee-44d2-ac17-eef1c543d71c", + "metadata": {}, + "source": "Use the `to_sql` method to upload the `DataFrame`." + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "49dde7bd-9823-4c55-8f34-4e16643e6b8e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:42.029746Z", + "iopub.status.busy": "2024-07-08T10:19:42.029379Z", + "iopub.status.idle": "2024-07-08T10:19:42.132050Z", + "shell.execute_reply": "2024-07-08T10:19:42.131556Z", + "shell.execute_reply.started": "2024-07-08T10:19:42.029718Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "1000" + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "df.to_sql('vectors', con=conn, if_exists='append', index=False)" + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "137a7f8e-d713-4179-bcad-66f194d1f839", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:42.793734Z", + "iopub.status.busy": "2024-07-08T10:19:42.793430Z", + "iopub.status.idle": "2024-07-08T10:19:42.873760Z", + "shell.execute_reply": "2024-07-08T10:19:42.873122Z", + "shell.execute_reply.started": 
"2024-07-08T10:19:42.793703Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
vec_f32
0b'>kS?;\\xec\\x8b>0\\xe5\\x19?\\r\\xddF?\\xf2\\x044?u\\...
1b'\\xedJ[=\\xbfq\\xce=G#\\xa4<\\xcbV\\xe3=\\xeb:;?\\xa...
2b'\\x0e\\x08n>\\xe8\\xb2\\x98>\\x10\\x133>\\xd4\\xf7\\x1...
\n
", + "text/plain": " vec_f32\n0 b'>kS?;\\xec\\x8b>0\\xe5\\x19?\\r\\xddF?\\xf2\\x044?u\\...\n1 b'\\xedJ[=\\xbfq\\xce=G#\\xa4<\\xcbV\\xe3=\\xeb:;?\\xa...\n2 b'\\x0e\\x08n>\\xe8\\xb2\\x98>\\x10\\x133>\\xd4\\xf7\\x1..." + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "out_df = pd.read_sql('vectors', con=conn)\nout_df.head(3)" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "99233fdb-57b2-4290-9038-7c3e5eaf553e", + "metadata": {}, + "source": "We now have to convert the byte strings back to Python lists." + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "a60f967c-c8fe-4ad9-a11f-25f5fb35ce69", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:44.813581Z", + "iopub.status.busy": "2024-07-08T10:19:44.813205Z", + "iopub.status.idle": "2024-07-08T10:19:44.820245Z", + "shell.execute_reply": "2024-07-08T10:19:44.819575Z", + "shell.execute_reply.started": "2024-07-08T10:19:44.813545Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "out_df['vec_f32'] = out_df['vec_f32'].apply(lambda x: list(struct.unpack(fmt, x)))" + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "2924f8b8-f543-4a2f-90c8-8e6e5c15275d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:45.338554Z", + "iopub.status.busy": "2024-07-08T10:19:45.338175Z", + "iopub.status.idle": "2024-07-08T10:19:45.345208Z", + "shell.execute_reply": "2024-07-08T10:19:45.344648Z", + "shell.execute_reply.started": "2024-07-08T10:19:45.338521Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
vec_f32
0[0.8258551359176636, 0.2732866704463959, 0.601...
1[0.0535382516682148, 0.10080289095640182, 0.02...
2[0.2324526011943817, 0.29823994636535645, 0.17...
\n
", + "text/plain": " vec_f32\n0 [0.8258551359176636, 0.2732866704463959, 0.601...\n1 [0.0535382516682148, 0.10080289095640182, 0.02...\n2 [0.2324526011943817, 0.29823994636535645, 0.17..." + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "out_df.head(3)" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8f070295-78e3-4137-82d6-8be8c64b3898", + "metadata": {}, + "source": "### Using the `%%sql` / `%sql` magic commands\n\nWhile the SQL magic commands are convenient for invoking basic SQL commands, they aren't quite as good\nfor complex queries that insert data. The primary issue is that you must construct the query as a string\nand ensure that all of your data is properly escaped. We'll demonstrate some basics here, but the\nmethods described in the previous sections are likely to work better." + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "5f13939e-2254-4956-9537-315f1dde1b63", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:49.081113Z", + "iopub.status.busy": "2024-07-08T10:19:49.080780Z", + "iopub.status.idle": "2024-07-08T10:19:49.265261Z", + "shell.execute_reply": "2024-07-08T10:19:49.264490Z", + "shell.execute_reply.started": "2024-07-08T10:19:49.081088Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "reset_table()" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3ac2349f-d2bd-452d-9e4f-d869ef0e774f", + "metadata": {}, + "source": "#### Working with numpy arrays or Python lists\n\nThe SQL magic commands do not do any of the automatic conversions of data to query parameters, so this must be done\nmanually before creating the query. This is done the same way whether the source is numpy arrays or Python lists.\nIn either case, you must convert the objects to byte strings as we have in the previous sections, then convert that\nbyte string into a hex literal that can be used in the query." 
+ }, + { + "cell_type": "code", + "execution_count": 63, + "id": "f6781046-e636-4495-8a99-e035db8988aa", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:50.554191Z", + "iopub.status.busy": "2024-07-08T10:19:50.553850Z", + "iopub.status.idle": "2024-07-08T10:19:50.559649Z", + "shell.execute_reply": "2024-07-08T10:19:50.558965Z", + "shell.execute_reply.started": "2024-07-08T10:19:50.554162Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "'eefe713f5f060a3fdab5bb3edf718a3ea72fab3eb2c5513f8d1c3d3fb6629a3c7873943d04930a3f'" + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "# Convert an element of the numpy array to a hex string\nvec_f32[0].tobytes().hex()" + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "957f98e1-c3d5-4e7c-b43a-5583cdff045e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:51.292915Z", + "iopub.status.busy": "2024-07-08T10:19:51.292559Z", + "iopub.status.idle": "2024-07-08T10:19:51.297697Z", + "shell.execute_reply": "2024-07-08T10:19:51.296979Z", + "shell.execute_reply.started": "2024-07-08T10:19:51.292882Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "'eefe713f5f060a3fdab5bb3edf718a3ea72fab3eb2c5513f8d1c3d3fb6629a3c7873943d04930a3f'" + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "# Convert an element of the Python list to a hex string\nstruct.pack(fmt, *vec_f32_list[0]).hex()" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5424355e-fffb-4cc7-b0c3-eba7012d1bd1", + "metadata": {}, + "source": "To construct the query string for the `%%sql` command, we need to build the entire list of values to insert\nin a separate step. We'll insert the `X` at the beginning of the string to indicate a hex literal to\nSingleStoreDB. 
We'll also add the parentheses around the value for inserting multiple rows of data using\nthe `INSERT` statement." + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "e269be32-5b56-4e19-baed-6420d6fd4bfb", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:53.236912Z", + "iopub.status.busy": "2024-07-08T10:19:53.236581Z", + "iopub.status.idle": "2024-07-08T10:19:53.242966Z", + "shell.execute_reply": "2024-07-08T10:19:53.242469Z", + "shell.execute_reply.started": "2024-07-08T10:19:53.236887Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "[\"(X'eefe713f5f060a3fdab5bb3edf718a3ea72fab3eb2c5513f8d1c3d3fb6629a3c7873943d04930a3f')\",\n \"(X'7fdbca3ea303113f9971c73d987c1b3f3e875e3f0a49d13e2c90333e0616033ff33f123dd2c8573f')\",\n \"(X'1d9cdf3d9293dc3e60cc0a3dff520f3f10454d3f2c222d3f0f74813ea0aed63e908f443ffa44093e')\"]" + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "params = [\"(X'{}')\".format(x.tobytes().hex()) for x in vec_f32]\nparams[:3]" + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "bd51d277-eec1-4787-b9b9-7a943f3eea0c", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:53.978476Z", + "iopub.status.busy": "2024-07-08T10:19:53.978189Z", + "iopub.status.idle": "2024-07-08T10:19:54.221876Z", + "shell.execute_reply": "2024-07-08T10:19:54.221385Z", + "shell.execute_reply.started": "2024-07-08T10:19:53.978452Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/html": "\n \n \n \n \n \n \n
", + "text/plain": "++\n||\n++\n++" + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "%%sql\nINSERT INTO vectors(vec_f32) VALUES {{ ','.join(params) }}" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5b982cc3-5e8a-460c-beff-440dbae58144", + "metadata": {}, + "source": "We can now select the data." + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "8a9ce43b-2ebc-4c9a-8898-afacaff13df9", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:55.729083Z", + "iopub.status.busy": "2024-07-08T10:19:55.728712Z", + "iopub.status.idle": "2024-07-08T10:19:55.790793Z", + "shell.execute_reply": "2024-07-08T10:19:55.790284Z", + "shell.execute_reply.started": "2024-07-08T10:19:55.729055Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "%%sql out <<\nSELECT * FROM vectors LIMIT 5" + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "b10b2f71-02fd-4630-8ae0-7845a8385934", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:56.271121Z", + "iopub.status.busy": "2024-07-08T10:19:56.270799Z", + "iopub.status.idle": "2024-07-08T10:19:56.276388Z", + "shell.execute_reply": "2024-07-08T10:19:56.275960Z", + "shell.execute_reply.started": "2024-07-08T10:19:56.271091Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
vec_f32
b'\\xfe\\xd7@?\\xecr\\xec>\\xbdW1?\\xbb_\\xcb>\\x88\\xb9\\xf4>\\x04G\\xaa>#d\\xf3=\\x07\\xb5\\xcb>\\xcd\\xd7&?{Es?'
b'\\x0e\\x08n>\\xe8\\xb2\\x98>\\x10\\x133>\\xd4\\xf7\\x1b?Q$-?t\\x11\\xfa>,}S?\\xa8\\x14k;\\x1a\\xf8h>\\xf8\\xbc-?'
b'1~\\x01>\\xb7>/?\\xd3\\x10\\x1f?Z\\xcc\\x05=>X\\xa8>\\x01\\r\\x1b>Q\\xf1\\xc3>/-\\xab=\\xea\\x9c6?\\xbc\\xd6|?'
b'\\xe3\\x02\\xba=zz)?<\\xa33?\\x15\\x03\\x14>\\x99\\x97\\x19?(\\x13#?!\\xe9\\xe9>\\xba#i?\\xdc\\xe1y?\\xe1\\xc1)?'
b'\\x8c\\x86\\x12?\\xc3+h?\\xba?=<\\xa45\\xda>\\xb5\\xf2\\r?\\xdb\\xa8\\x08?]\\x84&?\\xf2\\xd1s?\\xec\\xce\\xab>\\x10\\x19k?'
", + "text/plain": "+----------------------------------------------------------------------------------------------------------+\n| vec_f32 |\n+----------------------------------------------------------------------------------------------------------+\n| b'\\xfe\\xd7@?\\xecr\\xec>\\xbdW1?\\xbb_\\xcb>\\x88\\xb9\\xf4>\\x04G\\xaa>#d\\xf3=\\x07\\xb5\\xcb>\\xcd\\xd7&?{Es?' |\n| b'\\x0e\\x08n>\\xe8\\xb2\\x98>\\x10\\x133>\\xd4\\xf7\\x1b?Q$-?t\\x11\\xfa>,}S?\\xa8\\x14k;\\x1a\\xf8h>\\xf8\\xbc-?' |\n| b'1~\\x01>\\xb7>/?\\xd3\\x10\\x1f?Z\\xcc\\x05=>X\\xa8>\\x01\\r\\x1b>Q\\xf1\\xc3>/-\\xab=\\xea\\x9c6?\\xbc\\xd6|?' |\n| b'\\xe3\\x02\\xba=zz)?<\\xa33?\\x15\\x03\\x14>\\x99\\x97\\x19?(\\x13#?!\\xe9\\xe9>\\xba#i?\\xdc\\xe1y?\\xe1\\xc1)?' |\n| b'\\x8c\\x86\\x12?\\xc3+h?\\xba?=<\\xa45\\xda>\\xb5\\xf2\\r?\\xdb\\xa8\\x08?]\\x84&?\\xf2\\xd1s?\\xec\\xce\\xab>\\x10\\x19k?' |\n+----------------------------------------------------------------------------------------------------------+" + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "out" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3178ea48-2bfd-44d2-8211-a291dd5bf5ba", + "metadata": {}, + "source": "At this point, there is nothing we can do with SQL magic commands to convert the data back into numpy arrays or Python\nlists. We need to drop to Python for that." 
+ }, + { + "cell_type": "code", + "execution_count": 69, + "id": "5b6d0c98-a016-423c-a460-aa617615bcdf", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:58.489208Z", + "iopub.status.busy": "2024-07-08T10:19:58.488903Z", + "iopub.status.idle": "2024-07-08T10:19:58.493581Z", + "shell.execute_reply": "2024-07-08T10:19:58.493004Z", + "shell.execute_reply.started": "2024-07-08T10:19:58.489183Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "out_df = pd.DataFrame(out)" + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "5a3e6552-b66c-460e-9394-04b6b1a25795", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:59.152051Z", + "iopub.status.busy": "2024-07-08T10:19:59.151741Z", + "iopub.status.idle": "2024-07-08T10:19:59.156177Z", + "shell.execute_reply": "2024-07-08T10:19:59.155684Z", + "shell.execute_reply.started": "2024-07-08T10:19:59.152026Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "out_df['vec_f32'] = out_df['vec_f32'].apply(lambda x: np.frombuffer(x, dtype=np.float32))" + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "821ac65c-b8d5-47f5-8b14-945ed8e8d1fa", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:19:59.763146Z", + "iopub.status.busy": "2024-07-08T10:19:59.762871Z", + "iopub.status.idle": "2024-07-08T10:19:59.770886Z", + "shell.execute_reply": "2024-07-08T10:19:59.770351Z", + "shell.execute_reply.started": "2024-07-08T10:19:59.763122Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
vec_f32
0[0.7532958, 0.46181428, 0.69274503, 0.39721474...
1[0.2324526, 0.29823995, 0.1748774, 0.6092503, ...
2[0.12645794, 0.6845507, 0.62135047, 0.03266558...
\n
", + "text/plain": " vec_f32\n0 [0.7532958, 0.46181428, 0.69274503, 0.39721474...\n1 [0.2324526, 0.29823995, 0.1748774, 0.6092503, ...\n2 [0.12645794, 0.6845507, 0.62135047, 0.03266558..." + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "out_df.head(3)" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ad5c2b96-0002-4948-87a4-949a68c3e0a2", + "metadata": {}, + "source": "### Using JSON\n\nIt is also possible to use JSON to create vectors; however, this method requires serializing and deserializing JSON on either\nend, which isn't quite as efficient as the techniques above. It also requires using the `JSON_ARRAY_PACK` and `JSON_ARRAY_UNPACK`\nfunctions in your queries to go back and forth between the vector bytes and JSON. Here is an example of inserting the\nPython list of floats." + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "df5af6d1-15e1-4867-a02c-31634a65393b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:20:02.617827Z", + "iopub.status.busy": "2024-07-08T10:20:02.617511Z", + "iopub.status.idle": "2024-07-08T10:20:02.621408Z", + "shell.execute_reply": "2024-07-08T10:20:02.620884Z", + "shell.execute_reply.started": "2024-07-08T10:20:02.617789Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "import json" + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "79f06760-9039-408a-a4c2-6331947dd3e4", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:20:03.508306Z", + "iopub.status.busy": "2024-07-08T10:20:03.507927Z", + "iopub.status.idle": "2024-07-08T10:20:03.521454Z", + "shell.execute_reply": "2024-07-08T10:20:03.520719Z", + "shell.execute_reply.started": "2024-07-08T10:20:03.508280Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "['(JSON_ARRAY_PACK(\"[0.9452961683273315, 0.5391597151756287, 0.36662179231643677, 
0.2704000174999237, 0.3343479335308075, 0.8194228410720825, 0.7387169003486633, 0.018845897167921066, 0.07248586416244507, 0.5413057804107666]\"))',\n '(JSON_ARRAY_PACK(\"[0.396205872297287, 0.5664617419242859, 0.09738463908433914, 0.6073698997497559, 0.8692511320114136, 0.4087603688240051, 0.17535465955734253, 0.5120548009872437, 0.03570551797747612, 0.8429080247879028]\"))',\n '(JSON_ARRAY_PACK(\"[0.10918448120355606, 0.43081337213516235, 0.03388631343841553, 0.5598601698875427, 0.8018350601196289, 0.6763026714324951, 0.2528385818004608, 0.41930103302001953, 0.7678155899047852, 0.13405218720436096]\"))']" + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "params = ['(JSON_ARRAY_PACK(\"{}\"))'.format(json.dumps(x)) for x in vec_f32_list]\nparams[:3]" + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "92217c8d-f374-49a6-8fb8-f21666681f95", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:20:04.593439Z", + "iopub.status.busy": "2024-07-08T10:20:04.593134Z", + "iopub.status.idle": "2024-07-08T10:20:04.913522Z", + "shell.execute_reply": "2024-07-08T10:20:04.913009Z", + "shell.execute_reply.started": "2024-07-08T10:20:04.593413Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/html": "\n \n \n \n \n \n \n
", + "text/plain": "++\n||\n++\n++" + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "%%sql\nINSERT INTO vectors(vec_f32) VALUES {{ ','.join(params) }}" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "bd323c0a-7de4-4c56-9fc4-f2a22f4f661c", + "metadata": {}, + "source": "If you use the `JSON_ARRAY_UNPACK` function in your `SELECT` statement, you can download the data as JSON." + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "b9aac5ba-efea-466b-82c7-12fa02174630", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:20:06.436559Z", + "iopub.status.busy": "2024-07-08T10:20:06.436232Z", + "iopub.status.idle": "2024-07-08T10:20:06.547095Z", + "shell.execute_reply": "2024-07-08T10:20:06.546606Z", + "shell.execute_reply.started": "2024-07-08T10:20:06.436534Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "%%sql out <<\nSELECT JSON_ARRAY_UNPACK(vec_f32) AS 'vec_f32' FROM vectors LIMIT 5" + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "9f9e9b6e-5a25-483b-9d40-aea95a302b5f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:20:08.175495Z", + "iopub.status.busy": "2024-07-08T10:20:08.175127Z", + "iopub.status.idle": "2024-07-08T10:20:08.183521Z", + "shell.execute_reply": "2024-07-08T10:20:08.182908Z", + "shell.execute_reply.started": "2024-07-08T10:20:08.175458Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
vec_f32
0[0.177021876, 0.717939079, 0.825487614, 0.7764...
1[0.749338746, 0.589595854, 0.704872251, 0.9270...
2[0.971682549, 0.574555218, 0.174982488, 0.4692...
3[0.814588428, 0.773147047, 0.970053494, 0.9038...
4[0.247024894, 0.828292727, 0.599695325, 0.4499...
\n
", + "text/plain": " vec_f32\n0 [0.177021876, 0.717939079, 0.825487614, 0.7764...\n1 [0.749338746, 0.589595854, 0.704872251, 0.9270...\n2 [0.971682549, 0.574555218, 0.174982488, 0.4692...\n3 [0.814588428, 0.773147047, 0.970053494, 0.9038...\n4 [0.247024894, 0.828292727, 0.599695325, 0.4499..." + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "out = pd.DataFrame(out)\nout" + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "bbfe895a-0f09-4094-a835-793329ee388e", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:20:09.160995Z", + "iopub.status.busy": "2024-07-08T10:20:09.160596Z", + "iopub.status.idle": "2024-07-08T10:20:09.166041Z", + "shell.execute_reply": "2024-07-08T10:20:09.165388Z", + "shell.execute_reply.started": "2024-07-08T10:20:09.160959Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "[0.177021876,\n 0.717939079,\n 0.825487614,\n 0.77646929,\n 0.137723535,\n 0.358667195,\n 0.41495508,\n 0.027805429,\n 0.291372836,\n 0.413403481]" + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "out['vec_f32'][0]" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6bdb300a-a8f6-40cc-a5f9-de54508bb22b", + "metadata": {}, + "source": "Notice that since the data type of the column in the `SELECT` is JSON, it automatically gets converted to a Python list\nin the client." 
+ }, + { + "cell_type": "code", + "execution_count": 79, + "id": "b5234146-b058-4462-b5d0-516ae699efc6", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:20:10.909585Z", + "iopub.status.busy": "2024-07-08T10:20:10.909330Z", + "iopub.status.idle": "2024-07-08T10:20:10.914795Z", + "shell.execute_reply": "2024-07-08T10:20:10.914331Z", + "shell.execute_reply.started": "2024-07-08T10:20:10.909556Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [ + { + "data": { + "text/plain": "list" + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": "type(out['vec_f32'][0])" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8a8cfd32-c903-4e9b-b27b-253fcbca6ad4", + "metadata": {}, + "source": "## Conclusion\n\nAs you can see, there are various interfaces available for uploading and downloading vector data. Depending on\nwhich Python framework you are using and what format your data is in, you can pick and choose which\nmethods work for your use-case." + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "42060943", + "metadata": {}, + "source": "
\n \n
\n

Action Required

\n

If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

\n
\n
" + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "8f911f36-0153-4959-828a-41e637cc9887", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-08T10:20:14.177578Z", + "iopub.status.busy": "2024-07-08T10:20:14.177232Z", + "iopub.status.idle": "2024-07-08T10:20:14.190149Z", + "shell.execute_reply": "2024-07-08T10:20:14.189456Z", + "shell.execute_reply.started": "2024-07-08T10:20:14.177548Z" + }, + "language": "python", + "trusted": true + }, + "outputs": [], + "source": "shared_tier_check = %sql show variables like 'is_shared_tier'\nif not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n %sql DROP DATABASE IF EXISTS vector_data;" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "546a9cee-db0a-438b-9fcc-081223339a9f", + "metadata": {}, + "source": "
\n
" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + }, + "singlestore_cell_default_language": "python", + "singlestore_connection": { + "connectionID": "58e4dd02-7da3-425b-8bcb-3321cc7f1b41", + "defaultDatabase": "database_e9b85" + }, + "singlestore_row_limit": 300 }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 + } diff --git a/resources/nb-check.py b/resources/nb-check.py index dc971943..6a5744d2 100755 --- a/resources/nb-check.py +++ b/resources/nb-check.py @@ -149,9 +149,6 @@ def new_markdown_cell(cell_id: str, content: list[str]) -> dict[str, Any]: for i, cell in enumerate(cells): if 'metadata' in cell: cell['metadata'] = {} - # TODO: do not remove outputs once helios has migrated to published zips - if 'outputs' in cell: - cell['outputs'] = [] # Remove empty cells at the end of the notebook end = len(cells) - 1