This repository was archived by the owner on Aug 1, 2024. It is now read-only.
1 change: 0 additions & 1 deletion labelbase/annotate.py
@@ -365,7 +365,6 @@ def flatten_label(client:labelboxClient, label_dict:dict, ontology_index:dict, d
             if column_name not in flat_label.keys():
                 flat_label[column_name] = []
             if "bounding_box" in obj.keys():
-                print(obj)
                 annotation_value = [obj["bounding_box"]["top"], obj["bounding_box"]["left"], obj["bounding_box"]["height"], obj["bounding_box"]["width"]]
                 if "page_number" in obj.keys():
                     annotation_value.append(obj["page_number"])
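The deleted line was debug output; the surviving lines flatten a bounding box into a positional [top, left, height, width] list, with the page number appended for multi-page assets. A runnable sketch with a hypothetical exported object:

```python
# Hypothetical exported annotation object, shaped like the one handled above:
obj = {"bounding_box": {"top": 10, "left": 20, "height": 30, "width": 40}, "page_number": 2}

annotation_value = [obj["bounding_box"]["top"], obj["bounding_box"]["left"],
                    obj["bounding_box"]["height"], obj["bounding_box"]["width"]]
if "page_number" in obj.keys():
    annotation_value.append(obj["page_number"])

print(annotation_value)  # [10, 20, 30, 40, 2]
```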
2 changes: 2 additions & 0 deletions labelbase/connector.py
@@ -187,6 +187,8 @@ def determine_actions(
     attachments_action = True if attachment_index and not create_action else False
     # Determine if we're batching data rows
     batch_action = False if (project_id == project_id_col == "") else True
+    print(project_id)
+    print(project_id_col)
     # Determine the upload_method if we're batching to projects
     annotate_action = upload_method if (upload_method in ["mal", "import", "ground-truth"]) and annotation_index and batch_action else ""
     # "ground-truth" defaults to "import" if no model information is given
2 binary files not shown.
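Aside from the two added print calls (which read like temporary debug output), the key line above is the chained comparison: `project_id == project_id_col == ""` is True only when both values are empty strings. A minimal sketch with hypothetical values:

```python
# Chained comparison: (a == b == "") is True only when a == b and b == "".
project_id, project_id_col = "", ""
print(False if (project_id == project_id_col == "") else True)  # False - nothing to batch

project_id, project_id_col = "cl1234", ""
print(False if (project_id == project_id_col == "") else True)  # True - a project ID was given
```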
7 changes: 6 additions & 1 deletion labelbase/downloader.py
@@ -5,7 +5,7 @@
 from labelbase.annotate import flatten_label

 def export_and_flatten_labels(client:labelboxClient, project, include_metadata:bool=True, include_performance:bool=True,
-                               include_agreement:bool=False, verbose:bool=False, mask_method:str="png", divider="///", export_filters:dict=None):
+                               include_agreement:bool=False, include_label_details:bool=False, verbose:bool=False, mask_method:str="png", divider="///", export_filters:dict=None):
     """ Exports and flattens labels from a Labelbox Project
     Args:
         client : Required (labelbox.Client) - Labelbox Client object
@@ -82,6 +82,7 @@ def export_and_flatten_labels(client:labelboxClient, project, include_metadata:b
flat_label["seconds_to_create"] = nested_label['performance_details']['seconds_to_create']
flat_label["seconds_to_review"] = nested_label['performance_details']['seconds_to_review']
flat_label["seconds_to_label"] = nested_label['performance_details']['seconds_to_create'] - nested_label['performance_details']['seconds_to_review']
if include_metadata:
for metadata in label['metadata_fields']:
try:
if metadata['value'] in metadata_schema_to_name_key.keys():
@@ -115,6 +116,10 @@ def export_and_flatten_labels(client:labelboxClient, project, include_metadata:b
                     metadata_value = metadata['value']
                 if field_name != "lb_integration_source":
                     flat_label[f'metadata{divider}{metadata_type}{divider}{field_name}'] = metadata_value
+        if include_label_details:
+            flat_label["created_by"] = nested_label['label_details']["created_by"]
+            flat_label["updated_at"] = nested_label['label_details']["updated_at"]
+            flat_label["created_at"] = nested_label['label_details']["created_at"]
         flattened_labels.append(flat_label)
     if verbose:
         print(f"Labels flattened")
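A hypothetical call exercising the new flag (the API key and project ID are placeholders; everything else matches the signature above):

```python
# Hypothetical usage of the new include_label_details flag.
import labelbox
from labelbase.downloader import export_and_flatten_labels

client = labelbox.Client(api_key="YOUR_API_KEY")
project = client.get_project("YOUR_PROJECT_ID")

flattened = export_and_flatten_labels(
    client, project,
    include_label_details=True,  # adds created_by / updated_at / created_at per label
)
```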
3 changes: 3 additions & 0 deletions labelbase/metadata.py
@@ -3,6 +3,7 @@
 from datetime import datetime
 from dateutil import parser
 import pytz
+import pandas

 def get_metadata_schema_to_type(client:labelboxClient, lb_mdo=False, invert:bool=False):
     """ Creates a dictionary where {key=metadata_schema_id: value=metadata_type}
@@ -121,6 +122,8 @@ def process_metadata_value(metadata_value, metadata_type:str, parent_name:str, m
     return_value = None
     if str(metadata_value) == "nan": # Catch NaN values
         return_value = None
+    if pandas.isna(metadata_value): #Catch pandas df NaN values
+        return_value = None
     # By metadata type
     if metadata_type == "enum": # For enums, it must be a schema ID - if we can't match it, we have to skip it
         name_key = f"{parent_name}{divider}{str(metadata_value)}"
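The new branch catches missing values the string test above it misses. A minimal sketch of the difference; note that pandas.isna on a list or array returns an elementwise array, so this check assumes metadata_value is a scalar:

```python
import numpy
import pandas

# The old check only catches values whose str() is exactly "nan":
print(str(float("nan")) == "nan")  # True  - caught by the old check
print(str(None) == "nan")          # False - missed by the old check
print(str(pandas.NaT) == "nan")    # False - missed by the old check

# pandas.isna recognizes all of these scalar "missing" markers:
print(pandas.isna(float("nan")))   # True
print(pandas.isna(numpy.nan))      # True
print(pandas.isna(None))           # True
print(pandas.isna(pandas.NaT))     # True
```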
60 changes: 38 additions & 22 deletions labelbase/uploader.py
@@ -2,6 +2,7 @@
 from labelbox import Dataset as labelboxDataset
 from labelbox import Project as labelboxProject
 import uuid
+from concurrent.futures import ThreadPoolExecutor, as_completed

 def create_global_key_to_label_id_dict(client:labelboxClient, project_id:str, global_keys:list):
     """ Creates a dictionary where { key=global_key : value=label_id } by exporting labels from a project
@@ -37,7 +38,7 @@ def create_global_key_to_data_row_id_dict(client:labelboxClient, global_keys:lis
         global_key_to_data_row_dict[gks[i]] = res['results'][i]
     return global_key_to_data_row_dict

-def check_global_keys(client:labelboxClient, global_keys:list, batch_size=1000):
+def check_global_keys(client:labelboxClient, global_keys:list):
     """ Checks if data rows exist for a set of global keys - if data rows exist, returns as dictionary { key=data_row_id : value=global_key }
     Args:
         client : Required (labelbox.client.Client) - Labelbox Client object
@@ -52,20 +53,18 @@ def check_global_keys(client:labelboxClient, global_keys:list, batch_size=1000):
     # Enforce global keys as strings
     global_keys_list = [str(x) for x in global_keys]
     # Batch global key checks
-    for i in range(0, len(global_keys_list), batch_size):
-        batch_gks = global_keys_list[i:] if i + batch_size >= len(global_keys_list) else global_keys_list[i:i+batch_size]
-        # Get the datarow ids
-        res = client.get_data_row_ids_for_global_keys(batch_gks)
-        # Check query job results for fetched data rows
-        for i in range(0, len(res["results"])):
-            data_row_id = res["results"][i]
-            if data_row_id:
-                existing_drid_to_gk[data_row_id] = batch_gks[i]
+    # Get the datarow ids
+    res = client.get_data_row_ids_for_global_keys(global_keys_list)
+    # Check query job results for fetched data rows
+    for i in range(0, len(res["results"])):
+        data_row_id = res["results"][i]
+        if data_row_id:
+            existing_drid_to_gk[data_row_id] = global_keys_list[i]
     return existing_drid_to_gk
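With the manual batching removed, the whole key list goes to get_data_row_ids_for_global_keys in a single call. A minimal sketch of the index-aligned result handling, assuming (as the loop above does) that results line up with the input keys and missing rows come back falsy:

```python
# Hypothetical query result: results are aligned by index with the input
# keys, and entries are falsy when no data row exists for that key.
res = {"results": ["drid-aaa", "", "drid-ccc"]}
global_keys_list = ["gk-1", "gk-2", "gk-3"]

existing_drid_to_gk = {}
for i in range(0, len(res["results"])):
    data_row_id = res["results"][i]
    if data_row_id:
        existing_drid_to_gk[data_row_id] = global_keys_list[i]

print(existing_drid_to_gk)  # {'drid-aaa': 'gk-1', 'drid-ccc': 'gk-3'}
```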

 def batch_create_data_rows(
     client:labelboxClient, upload_dict:dict, skip_duplicates:bool=True,
-    divider:str="___", batch_size:int=20000, verbose:bool=False):
+    divider:str="___", batch_size:int=100000, verbose:bool=False):
     """ Uploads data rows, skipping duplicate global keys or auto-generating new unique ones.

     upload_dict must be in the following format:
@@ -94,7 +93,7 @@ def batch_create_data_rows(

"""
# Default error message
e = "Success"
e = {}
# Vet all global keys
global_keys = list(upload_dict.keys()) # Get all global keys
if verbose:
@@ -103,11 +102,14 @@
         gks = global_keys[i:] if i + batch_size >= len(global_keys) else global_keys[i:i+batch_size] # Batch of global keys to vet
         existing_data_row_to_global_key = check_global_keys(client, gks) # Returns empty list if there are no duplicates
         loop_counter = 0
+        if skip_duplicates:
+            e['skipped_global_keys'] = []
         while existing_data_row_to_global_key:
             if skip_duplicates: # Drop in-use global keys if we're skipping duplicates
                 if verbose:
                     print(f"Warning: Global keys in this upload are in use by active data rows, skipping the upload of data rows affected")
                 for gk in existing_data_row_to_global_key.values():
+                    e['skipped_global_keys'].append(gk)
                     del upload_dict[gk]
                 break
             else: # Create new suffix for taken global keys if we're not skipping duplicates
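Because e is now a dict instead of the old "Success" string, callers get a structured report. A hypothetical shape of the return value when duplicates are skipped:

```python
# Hypothetical return value from batch_create_data_rows after this change;
# the keys mirror the ones initialized above and in the upload loop below.
e = {
    "skipped_global_keys": ["gk-7", "gk-9"],  # in-use keys dropped when skip_duplicates=True
    "errors": [],                             # task errors collected after the uploads
}

if e.get("errors"):
    print(f"{len(e['errors'])} upload error(s)")
else:
    print(f"Done; skipped {len(e['skipped_global_keys'])} duplicate global key(s)")
```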
@@ -135,7 +137,9 @@ def batch_create_data_rows(
             dataset_id_to_upload_list[dataset_id] = []
         dataset_id_to_upload_list[dataset_id].append(data_row)
     # Perform uploads grouped by dataset ID
+    e['errors'] = []
     for dataset_id in dataset_id_to_upload_list:
+        task_list = []
         dataset = client.get_dataset(dataset_id)
         upload_list = dataset_id_to_upload_list[dataset_id]
         if verbose:
@@ -147,16 +151,24 @@ def batch_create_data_rows(
             if verbose:
                 print(f'Batch #{batch_number}: {len(batch)} data rows')
             task = dataset.create_data_rows(batch)
-            task.wait_till_done()
-            errors = task.errors
-            if errors:
-                if verbose:
-                    print(f'Error: Upload batch number {batch_number} unsuccessful')
-                e = errors
-                break
-            else:
-                if verbose:
-                    print(f'Success: Upload batch number {batch_number} successful')
+            task_list.append(task)
+            # task.wait_till_done()
+            # errors = task.errors
+            # e['upload_results'].append(task.uid)
+            # if errors:
+            #     if verbose:
+            #         print(f'Error: Upload batch number {batch_number} unsuccessful')
+            #     e['errors'] = errors
+            #     break
+            # else:
+            #     if verbose:
+            #         print(f'Success: Upload batch number {batch_number} successful')
+        with ThreadPoolExecutor() as exc:
+            futures = [exc.submit(get_results_from_task, x) for x in task_list]
+            for future in as_completed(futures):
+                errors = future.result()
+                if errors:
+                    e['errors'] += errors
     if verbose:
         print(f'Upload complete - all data rows uploaded')
     return e, upload_dict
@@ -487,3 +499,7 @@ def batch_upload_predictions(
     except Exception as error:
         e = error
     return e
+
+def get_results_from_task(task):
+    task.wait_till_done()
+    return task.errors
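The new helper lets batch_create_data_rows submit every create_data_rows task first and only then block on them, so the waits overlap instead of running back-to-back. A self-contained sketch of the pattern, using a hypothetical FakeTask stand-in for the Labelbox Task object (wait_till_done plus an .errors attribute):

```python
# Self-contained sketch of the concurrent wait pattern used above.
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

class FakeTask:
    """Hypothetical stand-in for labelbox's Task object."""
    def __init__(self, uid, errors=None):
        self.uid, self.errors = uid, errors
    def wait_till_done(self):
        time.sleep(0.1)  # simulate blocking on a backend job

def get_results_from_task(task):
    task.wait_till_done()
    return task.errors

task_list = [FakeTask("t1"), FakeTask("t2", errors=["bad asset"]), FakeTask("t3")]
all_errors = []
with ThreadPoolExecutor() as exc:
    futures = [exc.submit(get_results_from_task, t) for t in task_list]
    for future in as_completed(futures):
        errors = future.result()
        if errors:
            all_errors += errors

print(all_errors)  # ['bad asset']
```

Waiting serially makes total wall time the sum of every batch's processing time; overlapping the waits brings it closer to the slowest single batch.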
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

 setuptools.setup(
     name='labelbase',
-    version='0.1.05',
+    version='0.1.06',
     author='Labelbox',
     author_email='raphael@labelbox.com',
     description='Labelbox Helper Library',