Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add extra dimension support #865

Merged
merged 6 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions scripts/.env.sample
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# resource switch
FLAG_EMBEDDING_MODEL = "AOAI" # "AOAI" or "COHERE"
FLAG_COHERE = "ENGLISH" # "MULTILINGUAL" or "ENGLISH" options for Cohere embedding models
FLAG_AOAI = "V3" # "V2" or "V3" options for AOAI embedding models

# update vector dimension based on model chosen
VECTOR_DIMENSION = 1536 # change to the desired dimension, e.g., 1536 for AOAI ada-002, 1024 for Cohere
Expand Down
26 changes: 13 additions & 13 deletions scripts/config.json
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
[
{
"data_path": "<path to data>",
"location": "<azure region, e.g. 'westus2'>",
"subscription_id": "<subscription id>",
"resource_group": "<resource group name>",
"search_service_name": "<search service name to use or create>",
"index_name": "<index name to use or create>",
"chunk_size": 1024,
"token_overlap": 128,
"semantic_config_name": "default",
"language": "<Language to support, for example use 'en' for English. Check supported languages here under lucene - https://learn.microsoft.com/en-us/azure/search/index-add-language-analyzers"
}
[
{
"data_path": "<path to data>",
"location": "<azure region, e.g. 'westus2'>",
"subscription_id": "<subscription id>",
SophieGarden marked this conversation as resolved.
Show resolved Hide resolved
"resource_group": "<resource group name>",
"search_service_name": "<search service name to use or create>",
"index_name": "<index name to use or create>",
"chunk_size": 1024,
"token_overlap": 128,
"semantic_config_name": "default",
"language": "<Language to support, for example use 'en' for English. Check supported languages here under lucene - https://learn.microsoft.com/en-us/azure/search/index-add-language-analyzers"
}
]
2 changes: 1 addition & 1 deletion scripts/data_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def create_or_update_search_index(
"type": "Collection(Edm.Single)",
"searchable": True,
"retrievable": True,
"dimensions": os.getenv("VECTOR_DIMENSION", 1536),
"dimensions": int(os.getenv("VECTOR_DIMENSION", 1536)),
"vectorSearchConfiguration": vector_config_name
})

Expand Down
10 changes: 8 additions & 2 deletions scripts/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,7 @@ def get_embedding(text, embedding_model_endpoint=None, embedding_model_key=None,

FLAG_EMBEDDING_MODEL = os.getenv("FLAG_EMBEDDING_MODEL", "AOAI")
FLAG_COHERE = os.getenv("FLAG_COHERE", "ENGLISH")
FLAG_AOAI = os.getenv("FLAG_AOAI", "V3")

if azure_credential is None and (endpoint is None or key is None):
raise Exception("EMBEDDING_MODEL_ENDPOINT and EMBEDDING_MODEL_KEY are required for embedding")
Expand All @@ -666,8 +667,13 @@ def get_embedding(text, embedding_model_endpoint=None, embedding_model_key=None,
else:
api_key = embedding_model_key if embedding_model_key else os.getenv("AZURE_OPENAI_API_KEY")

client = AzureOpenAI(api_version=api_version, azure_endpoint=base_url, azure_ad_token=api_key)
embeddings = client.embeddings.create(model=deployment_id, input=text)
client = AzureOpenAI(api_version=api_version, azure_endpoint=base_url, api_key=api_key)
if FLAG_AOAI == "V2":
embeddings = client.embeddings.create(model=deployment_id, input=text)
elif FLAG_AOAI == "V3":
embeddings = client.embeddings.create(model=deployment_id,
input=text,
dimensions=int(os.getenv("VECTOR_DIMENSION", 1536)))

return embeddings.dict()['data'][0]['embedding']

Expand Down
3 changes: 3 additions & 0 deletions scripts/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ Disclaimer: Make sure there are no duplicate pages in your data. That could impa

`python data_preparation.py --config config.json --njobs=4`

### Batch creation of index
Use the script `run_batch_create_index.py` to create multiple indexes in one batch run.

## Optional: Use URL prefix
Each document can be associated with a URL that is stored with each document chunk in the Azure Cognitive Search index in the `url` field. If your documents were downloaded from the web, you can specify a URL prefix to use to construct the document URLs when ingesting your data. Your config file should have an additional `url_prefix` parameter like so:

Expand Down
79 changes: 79 additions & 0 deletions scripts/run_batch_create_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""Batch-create multiple search indexes by invoking data_preparation.py once per dataset.

For each entry in ``run_config_by_data_path_3_small_512_512`` this script:
  1. clones the base config from ``./config.json``,
  2. points ``data_path``/``index_name`` at the dataset's folder and index,
  3. writes a per-dataset ``config.<key>.json``, and
  4. runs ``data_preparation.py`` as a subprocess.
"""
import copy
import json
import os
import subprocess
import tqdm
from openai import AzureOpenAI  # NOTE(review): currently unused here — confirm before removing
from dotenv import load_dotenv

load_dotenv()

FORM_RECOGNIZER_KEY = os.getenv("FORM_RECOGNIZER_KEY")
# Fail fast: a missing key would otherwise surface later as an opaque
# TypeError when None is placed into the subprocess argument list.
if not FORM_RECOGNIZER_KEY:
    raise RuntimeError("FORM_RECOGNIZER_KEY is not set; define it in the environment or .env")

with open("./config.json", "r") as f:
    config = json.loads(f.read())

# Example batch configuration. Each value is either:
#   - a plain string: the index name (all options default), or
#   - a dict with keys:
#       "index"               (required) index name to create/update
#       "subfolder"           (optional) subdirectory of the dataset folder
#       "form-rec-use-layout" (optional, default True) whether to pass
#                             --form-rec-use-layout to data_preparation.py
run_config_by_data_path_3_small_512_512 = {
    "aks": "aks_embed_003_small_512_512_index",
    "azure-docs": {
        "index": "azure_embed_003_small_512_512_index",
        "subfolder": "azure-docs",
    },
    "test_loranorm": {
        "index": "test_loranorm_embed_003_small_512_512_index",
        "form-rec-use-layout": False,
    },

}

for key, cfg in tqdm.tqdm(run_config_by_data_path_3_small_512_512.items()):
    # folder is where the dataset for this index is saved
    folder = os.path.join("/index_data", key)

    if isinstance(cfg, str):
        index = cfg
        form_rec_use_layout = True
    else:
        index = cfg["index"]
        form_rec_use_layout = cfg.get("form-rec-use-layout", True)
        if "subfolder" in cfg:
            folder = os.path.join(folder, cfg["subfolder"])


    config_key = copy.deepcopy(config[0])
    config_key["data_path"] = os.path.abspath(folder)
    config_key["index_name"] = index

    print(config_key["data_path"])
    with open(f"./config.{key}.json", "w") as f:
        f.write(json.dumps([config_key]))

    command = [
        "python",
        "data_preparation.py",
        "--config",
        f"config.{key}.json",
        "--embedding-model-endpoint",
        # Placeholder — presumably should be the real endpoint (e.g. from an
        # env var); TODO confirm. Quotes are not embedded in the argument
        # because the list form passes each item verbatim.
        "EMBEDDING_MODEL_ENDPOINT",
        "--form-rec-resource",
        "test-tprompt",
        "--form-rec-key",
        FORM_RECOGNIZER_KEY,
    ] + (["--form-rec-use-layout"] if form_rec_use_layout else []) + [
        "--njobs=8",
    ]
    # Pass the argument list directly (shell=False): joining into a single
    # string without shell=True fails on POSIX and invites quoting bugs.
    # text=True decodes stdout/stderr so the error prints are readable.
    proc = subprocess.run(command, capture_output=True, text=True)
    if proc.returncode != 0:
        print("Error running", command)
        print(proc.stderr)
        print(proc.stdout)






Loading