Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add initial support for ingesting visual content #1026

Merged
merged 6 commits
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-r requirements.txt
azure-ai-formrecognizer==3.2.1
azure-ai-documentintelligence==1.0.0b2
Markdown==3.4.4
requests==2.31.0
tqdm==4.66.1
Expand All @@ -9,6 +9,7 @@ bs4==0.0.1
urllib3==2.1.0
pytest==7.4.0
pytest-asyncio==0.23.2
PyMuPDF==1.24.5
azure-storage-blob
chardet
azure-keyvault-secrets
Expand Down
23 changes: 17 additions & 6 deletions scripts/data_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import time

import requests
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential
from azure.identity import AzureCliCredential
from azure.search.documents import SearchClient
Expand Down Expand Up @@ -209,6 +209,14 @@ def create_or_update_search_index(
"type": "Edm.String",
"searchable": True,
},
{
"name": "image_mapping",
"type": "Edm.String",
"searchable": False,
"sortable": False,
"facetable": False,
"filterable": False
}
],
"suggesters": [],
"scoringProfiles": [],
Expand Down Expand Up @@ -356,7 +364,7 @@ def validate_index(service_name, subscription_id, resource_group, index_name):
print(f"Request failed. Please investigate. Status code: {response.status_code}")
break

def create_index(config, credential, form_recognizer_client=None, embedding_model_endpoint=None, use_layout=False, njobs=4):
def create_index(config, credential, form_recognizer_client=None, embedding_model_endpoint=None, use_layout=False, njobs=4, captioning_model_endpoint=None, captioning_model_key=None):
service_name = config["search_service_name"]
subscription_id = config["subscription_id"]
resource_group = config["resource_group"]
Expand Down Expand Up @@ -410,7 +418,8 @@ def create_index(config, credential, form_recognizer_client=None, embedding_mode
elif os.path.exists(data_config["path"]):
result = chunk_directory(data_config["path"], num_tokens=config["chunk_size"], token_overlap=config.get("token_overlap",0),
azure_credential=credential, form_recognizer_client=form_recognizer_client, use_layout=use_layout, njobs=njobs,
add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint, url_prefix=data_config["url_prefix"])
add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint, url_prefix=data_config["url_prefix"],
captioning_model_endpoint=captioning_model_endpoint, captioning_model_key=captioning_model_key)
else:
raise Exception(f"Path {data_config['path']} does not exist and is not a blob URL. Please check the path and try again.")

Expand Down Expand Up @@ -443,11 +452,13 @@ def valid_range(n):
parser.add_argument("--config", type=str, help="Path to config file containing settings for data preparation")
parser.add_argument("--form-rec-resource", type=str, help="Name of your Form Recognizer resource to use for PDF cracking.")
parser.add_argument("--form-rec-key", type=str, help="Key for your Form Recognizer resource to use for PDF cracking.")
parser.add_argument("--form-rec-use-layout", default=False, action='store_true', help="Whether to use Layout model for PDF cracking, if False will use Read model.")
parser.add_argument("--form-rec-use-layout", default=True, action='store_true', help="Whether to use Layout model for PDF cracking, if False will use Read model.")
parser.add_argument("--njobs", type=valid_range, default=4, help="Number of jobs to run (between 1 and 32). Default=4")
parser.add_argument("--embedding-model-endpoint", type=str, help="Endpoint for the embedding model to use for vector search. Format: 'https://<AOAI resource name>.openai.azure.com/openai/deployments/<Ada deployment name>/embeddings?api-version=2024-03-01-Preview'")
parser.add_argument("--embedding-model-key", type=str, help="Key for the embedding model to use for vector search.")
parser.add_argument("--search-admin-key", type=str, help="Admin key for the search service. If not provided, will use Azure CLI to get the key.")
parser.add_argument("--azure-openai-endpoint", type=str, help="Endpoint for the (Azure) OpenAI API. Format: 'https://<AOAI resource name>.openai.azure.com/openai/deployments/<vision model name>/chat/completions?api-version=2024-04-01-preview'")
parser.add_argument("--azure-openai-key", type=str, help="Key for the (Azure) OpenAI API.")
args = parser.parse_args()

with open(args.config) as f:
Expand All @@ -464,15 +475,15 @@ def valid_range(n):
os.environ["FORM_RECOGNIZER_ENDPOINT"] = f"https://{args.form_rec_resource}.cognitiveservices.azure.com/"
os.environ["FORM_RECOGNIZER_KEY"] = args.form_rec_key
if args.njobs==1:
form_recognizer_client = DocumentAnalysisClient(endpoint=f"https://{args.form_rec_resource}.cognitiveservices.azure.com/", credential=AzureKeyCredential(args.form_rec_key))
form_recognizer_client = DocumentIntelligenceClient(endpoint=f"https://{args.form_rec_resource}.cognitiveservices.azure.com/", credential=AzureKeyCredential(args.form_rec_key))
print(f"Using Form Recognizer resource {args.form_rec_resource} for PDF cracking, with the {'Layout' if args.form_rec_use_layout else 'Read'} model.")

for index_config in config:
print("Preparing data for index:", index_config["index_name"])
if index_config.get("vector_config_name") and not args.embedding_model_endpoint:
raise Exception("ERROR: Vector search is enabled in the config, but no embedding model endpoint and key were provided. Please provide these values or disable vector search.")

create_index(index_config, credential, form_recognizer_client, embedding_model_endpoint=args.embedding_model_endpoint, use_layout=args.form_rec_use_layout, njobs=args.njobs)
create_index(index_config, credential, form_recognizer_client, embedding_model_endpoint=args.embedding_model_endpoint, use_layout=args.form_rec_use_layout, njobs=args.njobs, captioning_model_endpoint=args.azure_openai_endpoint, captioning_model_key=args.azure_openai_key)
print("Data preparation for index", index_config["index_name"], "completed")

print(f"Data preparation script completed. {len(config)} indexes updated.")
Loading
Loading