Skip to content

Commit 66bbe0d

Browse files
authored
Merge pull request #323 from roboflow/import-paligemma-format
Import paligemma format into text-image-pairs project
2 parents f0656cb + 009e322 commit 66bbe0d

19 files changed

+82
-15
lines changed

roboflow/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from roboflow.models import CLIPModel, GazeModel # noqa: F401
1616
from roboflow.util.general import write_line
1717

18-
__version__ = "1.1.44"
18+
__version__ = "1.1.45"
1919

2020

2121
def check_key(api_key, model, notebook, num_retries=0):

roboflow/roboflowpy.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ def download(args):
4747

4848

4949
def import_dataset(args):
50-
rf = roboflow.Roboflow()
50+
api_key = load_roboflow_api_key(args.workspace)
51+
rf = roboflow.Roboflow(api_key)
5152
workspace = rf.workspace(args.workspace)
5253
workspace.upload_dataset(
5354
dataset_path=args.folder,

roboflow/util/folderparser.py

+32-9
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from .image_utils import load_labelmap
99

1010
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp"}
11-
ANNOTATION_EXTENSIONS = {".txt", ".json", ".xml", ".csv"}
11+
ANNOTATION_EXTENSIONS = {".txt", ".json", ".xml", ".csv", ".jsonl"}
1212
LABELMAPS_EXTENSIONS = {".labels", ".yaml", ".yml"}
1313

1414

@@ -107,13 +107,14 @@ def _map_annotations_to_images_1tomany(images, annotationFiles):
107107
dirname = image["dirname"]
108108
annotationsInSameDir = annotationsByDirname.get(dirname, [])
109109
if annotationsInSameDir:
110-
if len(annotationsInSameDir) > 1:
111-
print(f"warning: found multiple annotation files on dir {dirname}")
112-
annotationFile = annotationsInSameDir[0]
113-
format = annotationFile["parsedType"]
114-
image["annotationfile"] = _filterIndividualAnnotations(
115-
image, annotationFile, format, imgRefMap, annotationMap
116-
)
110+
for annotationFile in annotationsInSameDir:
111+
format = annotationFile["parsedType"]
112+
filtered_annotations = _filterIndividualAnnotations(
113+
image, annotationFile, format, imgRefMap, annotationMap
114+
)
115+
if filtered_annotations:
116+
image["annotationfile"] = filtered_annotations
117+
break
117118

118119

119120
def _build_image_and_annotation_maps(annotationFiles):
@@ -182,11 +183,16 @@ def _filterIndividualAnnotations(image, annotation, format, imgRefMap, annotatio
182183
return _annotation
183184
else:
184185
return None
186+
elif format == "jsonl":
187+
jsonlLines = [json.dumps(line) for line in parsed if line["image"] == image["name"]]
188+
if jsonlLines:
189+
_annotation = {"name": "annotation.jsonl", "rawText": "\n".join(jsonlLines)}
190+
return _annotation
185191
return None
186192

187193

188194
def _loadAnnotations(folder, annotations):
189-
valid_extensions = {".json", ".csv"}
195+
valid_extensions = {".json", ".csv", ".jsonl"}
190196
annotations = [a for a in annotations if a["extension"] in valid_extensions]
191197
for ann in annotations:
192198
extension = ann["extension"]
@@ -197,12 +203,29 @@ def _loadAnnotations(folder, annotations):
197203
if parsedType:
198204
ann["parsed"] = parsed
199205
ann["parsedType"] = parsedType
206+
elif extension == ".jsonl":
207+
ann["parsed"] = _read_jsonl(f"{folder}{ann['file']}")
208+
ann["parsedType"] = "jsonl"
200209
elif extension == ".csv":
201210
ann["parsedType"] = "csv"
202211
ann["parsed"] = _parseAnnotationCSV(f"{folder}{ann['file']}")
203212
return annotations
204213

205214

215+
def _read_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the file is parsed as one JSON document. Blank or
    whitespace-only lines are skipped silently. Lines that are not valid JSON
    are skipped with a warning that names the file and the 1-based line
    number.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one parsed object per valid line, in file order.
    """
    data = []
    with open(path) as file:
        for linenum, line in enumerate(file, 1):
            # Lines yielded by file iteration keep their trailing newline, so
            # strip before testing for emptiness; an unstripped blank line is
            # never falsy and would be reported as invalid JSON.
            line = line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"Warning: Skipping invalid JSON line in {path}:{linenum}")
    return data
227+
228+
206229
def _parseAnnotationCSV(filename):
207230
# TODO: use a proper CSV library?
208231
with open(filename) as f:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# ChartQA > 2024-08-28 7:21pm
2+
https://universe.roboflow.com/roboflow-jvuqo/chartqa-c9zny
3+
4+
Provided by a Roboflow user
5+
License: CC BY 4.0
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{"image":"de960ddd58344041754d5f984f8f82c2_png.rf.011864613b53c6b6a0c0a7086b657a71.jpg","prefix":"What region in Italy had the highest number of mafia crimes in 2018?","suffix":"Calabria"}
2+
{"image":"de960ddd58344041754d5f984f8f82c2_png.rf.011864613b53c6b6a0c0a7086b657a71.jpg","prefix":"How many criminal reports were recorded in the region of Calabria in 2018?","suffix":"896"}
3+
{"image":"de960ddd58344041754d5f984f8f82c2_png.rf.011864613b53c6b6a0c0a7086b657a71.jpg","prefix":"What region in Italy had the highest number of mafia crimes in 2018?","suffix":"Calabria"}
4+
{"image":"de960ddd58344041754d5f984f8f82c2_png.rf.011864613b53c6b6a0c0a7086b657a71.jpg","prefix":"How many criminal reports were recorded in the region of Calabria in 2018?","suffix":"896"}
5+
{"image":"de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg","prefix":"Which sector had the highest ROI in 2013?","suffix":"Retail"}
6+
{"image":"de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg","prefix":"Which sector had the highest ROI in 2014?","suffix":"Electronics"}
7+
{"image":"e1893eee3f64bda1eac88da795ad3a00_png.rf.01248d761c27015da1fa5f3c4daea759.jpg","prefix":"How much did Hermes' national general cargo revenue add up to in 2009?","suffix":"100"}
8+
{"image":"e1893eee3f64bda1eac88da795ad3a00_png.rf.01248d761c27015da1fa5f3c4daea759.jpg","prefix":"How much did Hermes' national general cargo revenue add up to in 2009?","suffix":"100"}
9+
{"image":"eaab023f1ce380c4c9163415facc3c0d_png.rf.01c5a1f19653c056bbb3b0c8fc2d752d.jpg","prefix":"What's the percentage value of leftmost bar?","suffix":"24"}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{"image":"63a6c783083d5c7c7290bc81877a4ee9_png.rf.5c02d037f48bc3df56e6d0e3e6e053e4.jpg","prefix":"How many research and public policy oriented organizations were there among the registered environmental and conservation organizations in the United States in 2005?","suffix":"372"}
2+
{"image":"63a6c783083d5c7c7290bc81877a4ee9_png.rf.5c02d037f48bc3df56e6d0e3e6e053e4.jpg","prefix":"How many research and public policy oriented organizations were there among the registered environmental and conservation organizations in the United States in 2005?","suffix":"372"}
3+
{"image":"5964b4c268577652f171d52dc317d82d_png.rf.5bf49f8aa575f586001710b1d79968fd.jpg","prefix":"What was the crude birth rate in Costa Rica in 2019?","suffix":"13.69"}
4+
{"image":"5964b4c268577652f171d52dc317d82d_png.rf.5bf49f8aa575f586001710b1d79968fd.jpg","prefix":"What was the crude birth rate in Costa Rica in 2019?","suffix":"13.69"}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{"image":"fa68474f5b30c3d647ec1f5cddf41570_png.rf.000949c9aafeb8c594a936a0ef92993f.jpg","prefix":"How many murders and manslaughters were recorded by the Belgian police in 2020?","suffix":"874"}
2+
{"image":"fa68474f5b30c3d647ec1f5cddf41570_png.rf.000949c9aafeb8c594a936a0ef92993f.jpg","prefix":"How many murders and manslaughters were recorded by the Belgian police in 2020?","suffix":"874"}
3+
{"image":"aca6fd05e9b2830518288ba082aa6f76_png.rf.001543e209328197472f6587dfa8a6d6.jpg","prefix":"What was the unemployment rate in Chile in 2020?","suffix":"11.51"}

tests/manual/debugme.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
# f"import {thisdir}/data/cultura-pepino-yolov8_voc -w wolfodorpythontests -p yellow-auto -c 100".split() # noqa: E501 // docs
4242
# f"import {thisdir}/data/cultura-pepino-yolov5pytorch -w wolfodorpythontests -p yellow-auto -c 100 -n papaiasso".split() # noqa: E501 // docs
4343
# f"import {thisdir}/../datasets/mosquitos -w wolfodorpythontests -p yellow-auto -n papaiasso".split() # noqa: E501 // docs
44-
f"deployment list".split() # noqa: E501 // docs
44+
# f"deployment list".split() # noqa: E501 // docs
45+
f"import -w tonyprivate -p meh-plvrv {thisdir}/../datasets/paligemma/".split() # noqa: E501 // docs
4546
)
4647
args.func(args)

tests/manual/uselocal

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
#!/bin/env bash
2-
cp data/.config-staging data/.config
2+
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
3+
cp $SCRIPT_DIR/data/.config-staging $SCRIPT_DIR/data/.config
34
export API_URL=https://localhost.roboflow.one
45
export APP_URL=https://localhost.roboflow.one
6+
export DEDICATED_DEPLOYMENT_URL=https://staging.roboflow.cloud
7+
export ROBOFLOW_CONFIG_DIR=$SCRIPT_DIR/data/.config
58
# need to set it in /etc/hosts to the IP of host.docker.internal!

tests/manual/useprod

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#!/bin/env bash
22

3-
cp data/.config-prod data/.config
3+
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
4+
cp $SCRIPT_DIR/data/.config-prod $SCRIPT_DIR/data/.config
45
export API_URL=https://api.roboflow.com
56
export APP_URL=https://app.roboflow.com
67
export OBJECT_DETECTION_URL=https://detect.roboflow.one
78
export DEDICATED_DEPLOYMENT_URL=https://roboflow.cloud
9+
export ROBOFLOW_CONFIG_DIR=$SCRIPT_DIR/data/.config

tests/manual/usestaging

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#!/bin/env bash
22

3-
cp data/.config-staging data/.config
3+
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
4+
cp $SCRIPT_DIR/data/.config-staging $SCRIPT_DIR/data/.config
45
export API_URL=https://api.roboflow.one
56
export APP_URL=https://app.roboflow.one
67
export OBJECT_DETECTION_URL=https://lambda-object-detection.staging.roboflow.com
78
export DEDICATED_DEPLOYMENT_URL=https://staging.roboflow.cloud
9+
export ROBOFLOW_CONFIG_DIR=$SCRIPT_DIR/data/.config

tests/util/test_folderparser.py

+14
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,20 @@ def test_parse_mosquitos_csv(self):
5252
expected += "train_10308.jpeg,1058,943,japonicus/koreicus,28,187,908,815\n"
5353
assert testImage["annotationfile"]["rawText"] == expected
5454

55+
def test_paligemma_format(self):
    """Check that the paligemma sample folder yields per-image JSONL annotations.

    Parses the folder, selects one image by its ``file`` path, and verifies
    the annotation name and the raw text attached to that image.
    """
    dataset = folderparser.parsefolder(f"{thisdir}/../datasets/paligemma")
    wanted = "/dataset/de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg"
    matches = [img for img in dataset["images"] if img["file"] == wanted]
    picked = matches[0]
    assert picked["annotationfile"]["name"] == "annotation.jsonl"
    # Two records are expected for this image, newline-joined with no trailing newline.
    expected = (
        '{"image": "de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg",'
        ' "prefix": "Which sector had the highest ROI in 2013?", "suffix": "Retail"}\n'
        '{"image": "de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg",'
        ' "prefix": "Which sector had the highest ROI in 2014?", "suffix": "Electronics"}'
    )
    assert picked["annotationfile"]["rawText"] == expected
68+
5569

5670
def _assertJsonMatchesFile(actual, filename):
5771
with open(filename) as file:

0 commit comments

Comments
 (0)