diff --git a/labelbase/annotate.py b/labelbase/annotate.py
index 8627974..b624923 100644
--- a/labelbase/annotate.py
+++ b/labelbase/annotate.py
@@ -365,7 +365,6 @@ def flatten_label(client:labelboxClient, label_dict:dict, ontology_index:dict, d
         if column_name not in flat_label.keys():
             flat_label[column_name] = []
         if "bounding_box" in obj.keys():
-            print(obj)
             annotation_value = [obj["bounding_box"]["top"], obj["bounding_box"]["left"], obj["bounding_box"]["height"], obj["bounding_box"]["width"]]
             if "page_number" in obj.keys():
                 annotation_value.append(obj["page_number"])
diff --git a/labelbase/connector.py b/labelbase/connector.py
index bc4c1e7..918440d 100644
--- a/labelbase/connector.py
+++ b/labelbase/connector.py
@@ -187,6 +187,8 @@ def determine_actions(
     attachments_action = True if attachment_index and not create_action else False
     # Determine if we're batching data rows
     batch_action = False if (project_id == project_id_col == "") else True
+    print(project_id)
+    print(project_id_col)
     # Determine the upload_method if we're batching to projects
     annotate_action = upload_method if (upload_method in ["mal", "import", "ground-truth"]) and annotation_index and batch_action else "" # "ground-truth" defaults to "import" if no model information is given
diff --git a/labelbase/converters/__pycache__/__init__.cpython-311.pyc b/labelbase/converters/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index edb3e85..0000000
Binary files a/labelbase/converters/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/labelbase/converters/__pycache__/coco.cpython-311.pyc b/labelbase/converters/__pycache__/coco.cpython-311.pyc
deleted file mode 100644
index 45df18a..0000000
Binary files a/labelbase/converters/__pycache__/coco.cpython-311.pyc and /dev/null differ
diff --git a/labelbase/downloader.py b/labelbase/downloader.py
index 65c8712..eeb12ae 100644
--- a/labelbase/downloader.py
+++ b/labelbase/downloader.py
@@ -5,7 +5,7 @@
 from labelbase.annotate import flatten_label
 
 def export_and_flatten_labels(client:labelboxClient, project, include_metadata:bool=True, include_performance:bool=True,
-                              include_agreement:bool=False, verbose:bool=False, mask_method:str="png", divider="///", export_filters:dict=None):
+                              include_agreement:bool=False, include_label_details:bool=False, verbose:bool=False, mask_method:str="png", divider="///", export_filters:dict=None):
     """ Exports and flattens labels from a Labelbox Project
     Args:
         client : Required (labelbox.Client) - Labelbox Client object
@@ -82,6 +82,7 @@ def export_and_flatten_labels(client:labelboxClient, project, include_metadata:b
             flat_label["seconds_to_create"] = nested_label['performance_details']['seconds_to_create']
             flat_label["seconds_to_review"] = nested_label['performance_details']['seconds_to_review']
             flat_label["seconds_to_label"] = nested_label['performance_details']['seconds_to_create'] - nested_label['performance_details']['seconds_to_review']
+        if include_metadata:
             for metadata in label['metadata_fields']:
                 try:
                     if metadata['value'] in metadata_schema_to_name_key.keys():
@@ -115,6 +116,10 @@ def export_and_flatten_labels(client:labelboxClient, project, include_metadata:b
                     metadata_value = metadata['value']
                 if field_name != "lb_integration_source":
                     flat_label[f'metadata{divider}{metadata_type}{divider}{field_name}'] = metadata_value
+        if include_label_details:
+            flat_label["created_by"] = nested_label['label_details']["created_by"]
+            flat_label["updated_at"] = nested_label['label_details']["updated_at"]
+            flat_label["created_at"] = nested_label['label_details']["created_at"]
         flattened_labels.append(flat_label)
     if verbose:
         print(f"Labels flattened")
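A minimal usage sketch of the new include_label_details flag; the API key and project ID are placeholders, and the list-of-flat-dicts return shape is an assumption inferred from the flattened_labels list the function builds:

from labelbox import Client
from labelbase.downloader import export_and_flatten_labels

client = Client(api_key="YOUR_API_KEY")          # placeholder credentials
project = client.get_project("YOUR_PROJECT_ID")  # placeholder project ID

# include_label_details=True should surface the three label_details columns
flat_labels = export_and_flatten_labels(client, project, include_label_details=True)
for flat_label in flat_labels[:5]:
    print(flat_label["created_by"], flat_label["created_at"], flat_label["updated_at"])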
diff --git a/labelbase/metadata.py b/labelbase/metadata.py
index 030ce21..13ab5b0 100644
--- a/labelbase/metadata.py
+++ b/labelbase/metadata.py
@@ -3,6 +3,7 @@
 from datetime import datetime
 from dateutil import parser
 import pytz
+import pandas
 
 def get_metadata_schema_to_type(client:labelboxClient, lb_mdo=False, invert:bool=False):
     """ Creates a dictionary where {key=metadata_schema_id: value=metadata_type}
@@ -121,6 +122,8 @@
         return_value = None
     if str(metadata_value) == "nan": # Catch NaN values
         return_value = None
+    if pandas.isna(metadata_value): # Catch pandas df NaN values
+        return_value = None
     # By metadata type
     if metadata_type == "enum": # For enums, it must be a schema ID - if we can't match it, we have to skip it
         name_key = f"{parent_name}{divider}{str(metadata_value)}"
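The extra pandas.isna() check matters because str(x) == "nan" only matches float NaN, while values coming out of a pandas DataFrame can also be None or pandas.NaT. A small self-contained sketch of the difference:

import pandas

# str(x) == "nan" only flags float NaN; pandas.isna also flags the other
# missing-value markers a DataFrame can hand back, such as None and NaT.
for value in [float("nan"), None, pandas.NaT, "real-value"]:
    str_check = str(value) == "nan"
    pandas_check = bool(pandas.isna(value))
    print(f"{value!r:>14}  str-check={str_check}  pandas.isna={pandas_check}")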
diff --git a/labelbase/uploader.py b/labelbase/uploader.py
index a12ab38..57a193c 100644
--- a/labelbase/uploader.py
+++ b/labelbase/uploader.py
@@ -2,6 +2,7 @@
 from labelbox import Dataset as labelboxDataset
 from labelbox import Project as labelboxProject
 import uuid
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 def create_global_key_to_label_id_dict(client:labelboxClient, project_id:str, global_keys:list):
     """ Creates a dictionary where { key=global_key : value=label_id } by exporting labels from a project
@@ -37,7 +38,7 @@ def create_global_key_to_data_row_id_dict(client:labelboxClient, global_keys:lis
             global_key_to_data_row_dict[gks[i]] = res['results'][i]
     return global_key_to_data_row_dict
 
-def check_global_keys(client:labelboxClient, global_keys:list, batch_size=1000):
+def check_global_keys(client:labelboxClient, global_keys:list):
     """ Checks if data rows exist for a set of global keys - if data rows exist, returns as dictionary { key=data_row_id : value=global_key }
     Args:
         client : Required (labelbox.client.Client) - Labelbox Client object
@@ -52,20 +53,18 @@
     # Enforce global keys as strings
     global_keys_list = [str(x) for x in global_keys]
     # Batch global key checks
-    for i in range(0, len(global_keys_list), batch_size):
-        batch_gks = global_keys_list[i:] if i + batch_size >= len(global_keys_list) else global_keys_list[i:i+batch_size]
-        # Get the datarow ids
-        res = client.get_data_row_ids_for_global_keys(batch_gks)
-        # Check query job results for fetched data rows
-        for i in range(0, len(res["results"])):
-            data_row_id = res["results"][i]
-            if data_row_id:
-                existing_drid_to_gk[data_row_id] = batch_gks[i]
+    # Get the datarow ids
+    res = client.get_data_row_ids_for_global_keys(global_keys_list)
+    # Check query job results for fetched data rows
+    for i in range(0, len(res["results"])):
+        data_row_id = res["results"][i]
+        if data_row_id:
+            existing_drid_to_gk[data_row_id] = global_keys_list[i]
     return existing_drid_to_gk
 
 def batch_create_data_rows(
     client:labelboxClient, upload_dict:dict, skip_duplicates:bool=True,
-    divider:str="___", batch_size:int=20000, verbose:bool=False):
+    divider:str="___", batch_size:int=100000, verbose:bool=False):
     """ Uploads data rows, skipping duplicate global keys or auto-generating new unique ones.
     upload_dict must be in the following format:
@@ -94,7 +93,7 @@
     """
     # Default error message
-    e = "Success"
+    e = {}
     # Vet all global keys
     global_keys = list(upload_dict.keys()) # Get all global keys
     if verbose:
@@ -103,11 +102,14 @@
         gks = global_keys[i:] if i + batch_size >= len(global_keys) else global_keys[i:i+batch_size] # Batch of global keys to vet
         existing_data_row_to_global_key = check_global_keys(client, gks) # Returns empty list if there are no duplicates
         loop_counter = 0
+        if skip_duplicates:
+            e.setdefault('skipped_global_keys', []) # setdefault so later batches don't reset the list
         while existing_data_row_to_global_key:
             if skip_duplicates: # Drop in-use global keys if we're skipping duplicates
                 if verbose:
                     print(f"Warning: Global keys in this upload are in use by active data rows, skipping the upload of data rows affected")
                 for gk in existing_data_row_to_global_key.values():
+                    e['skipped_global_keys'].append(gk)
                     del upload_dict[gk]
                 break
             else: # Create new suffix for taken global keys if we're not skipping duplicates
@@ -135,7 +137,9 @@
             dataset_id_to_upload_list[dataset_id] = []
         dataset_id_to_upload_list[dataset_id].append(data_row)
     # Perform uploads grouped by dataset ID
+    e['errors'] = []
     for dataset_id in dataset_id_to_upload_list:
+        task_list = []
         dataset = client.get_dataset(dataset_id)
         upload_list = dataset_id_to_upload_list[dataset_id]
         if verbose:
@@ -147,16 +151,13 @@
             if verbose:
                 print(f'Batch #{batch_number}: {len(batch)} data rows')
             task = dataset.create_data_rows(batch)
-            task.wait_till_done()
-            errors = task.errors
-            if errors:
-                if verbose:
-                    print(f'Error: Upload batch number {batch_number} unsuccessful')
-                e = errors
-                break
-            else:
-                if verbose:
-                    print(f'Success: Upload batch number {batch_number} successful')
+            task_list.append(task)
+        with ThreadPoolExecutor() as exc:
+            futures = [exc.submit(get_results_from_task, x) for x in task_list]
+            for future in as_completed(futures):
+                errors = future.result()
+                if errors:
+                    e['errors'] += errors
     if verbose:
         print(f'Upload complete - all data rows uploaded')
     return e, upload_dict
@@ -487,3 +488,7 @@
         except Exception as error:
             e = error
     return e
+
+def get_results_from_task(task):
+    task.wait_till_done()
+    return task.errors
diff --git a/setup.py b/setup.py
index a9e5a38..f77f44b 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 setuptools.setup(
     name='labelbase',
-    version='0.1.05',
+    version='0.1.06',
     author='Labelbox',
     author_email='raphael@labelbox.com',
     description='Labelbox Helper Library',
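The new upload flow queues every create_data_rows task first, then waits on all of them in parallel and pools any errors via get_results_from_task. A self-contained sketch of that fan-out pattern; FakeTask is a hypothetical stand-in for the labelbox SDK task objects returned by dataset.create_data_rows:

from concurrent.futures import ThreadPoolExecutor, as_completed

class FakeTask:
    """Hypothetical stand-in for an SDK upload task (wait_till_done / errors)."""
    def __init__(self, errors):
        self._errors = errors
    def wait_till_done(self):
        pass  # a real task would block here until the server finishes
    @property
    def errors(self):
        return self._errors

def get_results_from_task(task):
    # Same helper the diff adds: block on the task, then hand back its errors
    task.wait_till_done()
    return task.errors

task_list = [FakeTask([]), FakeTask(["row 7 failed"]), FakeTask([])]
e = {"errors": []}
with ThreadPoolExecutor() as exc:
    futures = [exc.submit(get_results_from_task, t) for t in task_list]
    for future in as_completed(futures):
        errors = future.result()
        if errors:
            e["errors"] += errors
print(e)  # {'errors': ['row 7 failed']}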