Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dataset splitter #16

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,16 @@ issac_to_yolo/sample_yolo_dataset
*.code-workspace

# data
/data/*
!data/readme.md
/data*/*/
YOLO_*/
transformer_output*/

# don't ignore yolo_to_yolo (it matches the above pattern and isn't just a dataset)
!yolo_to_yolo/

# IDE
.idea/
.vscode/
.vscode/
*.lnk
78 changes: 78 additions & 0 deletions yolo_to_yolo/scripts/isaac_subset_maker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import os
import shutil
import re
from tkinter import Tk, filedialog

def prompt_for_directory(prompt_text: str) -> str:
    """
    Asks the user to pick a folder through a Tkinter directory chooser.

    Args:
        prompt_text (str): Caption used as the title of the chooser window.

    Returns:
        str: The value reported by the chooser for the picked folder.
    """
    hidden_root = Tk()
    # Only the chooser itself should appear, so the root window is hidden.
    hidden_root.withdraw()
    chosen_dir = filedialog.askdirectory(title=prompt_text)
    # The root window exists solely to host the dialog; dispose of it now.
    hidden_root.destroy()
    return chosen_dir

def copy_files(input_dir: str, output_dir: str, end_number: int, even_only: bool = False) -> None:
    """
    Copies numbered files from the input directory to the output directory.

    A directory entry is considered when its name contains an underscore
    followed by digits and a dot (for example ``rgb_0042.png``); the digits
    are taken as the file number. Because the leading part of the pattern is
    greedy, the right-most ``_<digits>.`` occurrence in the name is used.
    Entries that are not regular files (such as sub-directories whose names
    happen to match) are skipped instead of aborting the whole copy.

    Args:
        input_dir (str): The directory to copy files from.
        output_dir (str): The directory to copy files to. It is created,
            including missing parents, when it does not exist yet.
        end_number (int): Maximum file number (inclusive) to copy.
        even_only (bool): Whether to copy only files with even numbers
            (default is False).
    """
    # exist_ok=True replaces the separate existence check and avoids failing
    # if the directory appears between the check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    # Matches the number that follows an underscore and precedes a dot.
    pattern = re.compile(r'.*_([0-9]+)\.')

    for filename in os.listdir(input_dir):
        match = pattern.match(filename)
        if not match:
            continue

        file_number = int(match.group(1))
        if file_number > end_number:
            continue
        if even_only and file_number % 2 != 0:
            continue

        src = os.path.join(input_dir, filename)
        # os.listdir also yields directories; shutil.copy2 cannot copy those.
        if not os.path.isfile(src):
            continue

        dst = os.path.join(output_dir, filename)
        shutil.copy2(src, dst)

def main() -> None:
    """
    Interactive entry point for building a subset of numbered files.

    Asks for a source folder, a destination folder, an upper file number and
    whether only even-numbered files are wanted, then delegates the actual
    copying to ``copy_files``. Stops early, with a message, when a folder
    dialog yields no selection or the number cannot be parsed.
    """
    source_dir = prompt_for_directory("Select Input Directory")
    if not source_dir:
        print("No input directory selected. Exiting.")
        return

    target_dir = prompt_for_directory("Select Output Directory")
    if not target_dir:
        print("No output directory selected. Exiting.")
        return

    raw_limit = input("Enter the ending number: ")
    try:
        upper_bound = int(raw_limit)
    except ValueError:
        print("Invalid number entered. Exiting.")
        return

    answer = input("Copy even numbers only? (y/n): ")
    only_even = answer.lower().strip() == "y"
    if only_even:
        # Round an odd limit down to the nearest even number.
        upper_bound -= upper_bound % 2

    print("Copying files...")
    copy_files(source_dir, target_dir, upper_bound, only_even)
    print("Done!")

if __name__ == "__main__":
main()
82 changes: 82 additions & 0 deletions yolo_to_yolo/scripts/yolo_subset_maker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from pathlib import Path
import tkinter as tk
from tkinter import filedialog

from yolo_io import YoloReader, YoloWriter
from yolo_io_types import PredictionTask, Task

def prompt_for_file(prompt_text: str) -> str:
    """
    Asks the user to pick a YAML file through a Tkinter file chooser.

    Args:
        prompt_text (str): The text to use as the title of the chooser window

    Returns:
        str: The value reported by the chooser for the picked file
    """
    yaml_only = [("YAML files", "*.yaml")]

    hidden_root = tk.Tk()
    # Only the chooser itself should appear, so the root window is hidden.
    hidden_root.withdraw()
    chosen_file = filedialog.askopenfilename(title=prompt_text, filetypes=yaml_only)
    # The root window exists solely to host the dialog; dispose of it now.
    hidden_root.destroy()
    return chosen_file

def prompt_for_directory(prompt_text: str) -> str:
    """
    Asks the user to pick a folder through a Tkinter directory chooser.

    Args:
        prompt_text (str): The text to use as the title of the chooser window

    Returns:
        str: The value reported by the chooser for the picked folder
    """
    hidden_root = tk.Tk()
    # Only the chooser itself should appear, so the root window is hidden.
    hidden_root.withdraw()
    chosen_dir = filedialog.askdirectory(title=prompt_text)
    # The root window exists solely to host the dialog; dispose of it now.
    hidden_root.destroy()
    return chosen_dir

def filter_and_copy_files(reader: YoloReader, writer: YoloWriter, total_files_to_copy: int) -> None:
    """
    Copies a proportional sample of the train, val and test subsets to the writer.

    The size of each subset is measured by counting the ``*.png`` files in its
    images directory. Each subset then contributes a share of
    ``total_files_to_copy`` proportional to its size. Shares are rounded
    independently, so the number of files actually written can differ
    slightly from ``total_files_to_copy``.

    Args:
        reader (YoloReader): The reader instance to read the YOLO dataset
        writer (YoloWriter): The writer instance to write the filtered YOLO dataset
        total_files_to_copy (int): The total number of files to copy across the subsets

    Raises:
        ValueError: If no ``*.png`` images are found in any subset, because the
            proportions would be undefined (previously a ZeroDivisionError).
    """
    tasks_to_process = (Task.TRAIN, Task.VAL, Task.TEST)

    # Determine the number of images in each subset
    subset_sizes = {}
    for task in tasks_to_process:
        images_dir = reader.parent_dir.joinpath(
            reader.descriptor.get_image_and_labels_dirs(task).images
        )
        # Count lazily rather than building a list of paths just for len().
        subset_sizes[task] = sum(1 for _ in images_dir.glob("*.png"))
    total_images = sum(subset_sizes.values())

    if total_images == 0:
        raise ValueError(
            f"No .png images found in the train/val/test image directories under {reader.parent_dir}"
        )

    # Calculate how many images to copy from each subset
    files_to_copy = {
        task: round((subset_sizes[task] / total_images) * total_files_to_copy)
        for task in tasks_to_process
    }

    for task in tasks_to_process:
        image_data_gen = reader.read(tasks=(task,))
        # range() comes first in zip() so that, once the quota is reached,
        # no additional item is pulled from the reader's generator.
        for _, yolo_image_data in zip(range(files_to_copy[task]), image_data_gen):
            writer.write([yolo_image_data])

def main() -> None:
    """
    Interactive entry point for building a smaller YOLO dataset.

    Asks for the dataset YAML file, an output directory and the total number
    of files to copy, then reads the dataset with ``YoloReader`` and writes
    the sampled subset with ``YoloWriter`` (both in detection mode). Stops
    early, with a message, when a dialog is cancelled or the number cannot be
    parsed.
    """
    # The raw dialog results are checked BEFORE wrapping them in Path:
    # Path("") turns into Path(".") and Path objects are always truthy, so a
    # check on the Path would never detect a cancelled dialog.
    yaml_selection = prompt_for_file("Select the YOLO dataset YAML file")
    if not yaml_selection:
        print("No YAML file selected. Exiting.")
        return
    input_yaml = Path(yaml_selection)

    dir_selection = prompt_for_directory("Select Output Directory")
    if not dir_selection:
        print("No output directory selected. Exiting.")
        return
    output_dir = Path(dir_selection)

    try:
        total_files_to_copy = int(input("Enter the total number of files to copy: "))
    except ValueError:
        print("Invalid number entered. Exiting.")
        return

    # Create reader and writer instances
    reader = YoloReader(yaml_path=input_yaml, prediction_task=PredictionTask.DETECTION)
    writer = YoloWriter(out_dir=output_dir, prediction_task=PredictionTask.DETECTION, classes=reader.classes)

    print("Copying files...")
    filter_and_copy_files(reader, writer, total_files_to_copy)
    print("Done!")

if __name__ == "__main__":
main()
8 changes: 4 additions & 4 deletions yolo_to_yolo/yolo_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,17 +117,17 @@ def _parse_label_line(self, label_line: str) -> YoloLabel:
"""
split = label_line.strip().split()

if self.prediction_task == PredictionTask.DETECTION and len(split) != 5:
if self.prediction_task.value == PredictionTask.DETECTION.value and len(split) != 5:
raise ValueError(f"Label line for detection should have 5 fields, got '{label_line}'")

if self.prediction_task == PredictionTask.SEGMENTATION and (len(split) - 1) % 2:
if self.prediction_task.value == PredictionTask.SEGMENTATION.value and (len(split) - 1) % 2:
raise ValueError(f"Got odd number of points in label line: {label_line}")

classname = self.classes[int(split[0])]

if self.prediction_task == PredictionTask.DETECTION:
if self.prediction_task.value == PredictionTask.DETECTION.value:
location_data = YoloBbox(*map(float, split[1:]))
elif self.prediction_task == PredictionTask.SEGMENTATION:
elif self.prediction_task.value == PredictionTask.SEGMENTATION.value:
location_data = YoloOutline([Point(float(x), float(y)) for x, y in batched(split[1:], 2)])
else:
raise NotImplementedError(
Expand Down
17 changes: 11 additions & 6 deletions yolo_to_yolo/yolo_io_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,20 +49,25 @@ def check_dirs_exist(self):
raise NotADirectoryError(f"{sub_dir} is not a directory")

def create_dirs(self):
if self.parent_dir.is_dir() and not any(self.parent_dir.iterdir()):
raise IsADirectoryError(f"{self.parent_dir} exists and is not empty")
# Check if parent_dir exists and if it is empty
if self.parent_dir.is_dir():
if any(self.parent_dir.iterdir()): # This checks if the directory is not empty
raise IsADirectoryError(f"{self.parent_dir} exists and is not empty")
else:
self.parent_dir.mkdir(parents=True, exist_ok=True) # Create parent_dir if it doesn't exist

# Create subdirectories for train, val, and test if they do not exist
for task_dir in (self.train_dirs, self.val_dirs, self.test_dirs):
for sub_dir in task_dir:
sub_dir.mkdir(parents=True, exist_ok=True)

def get_image_and_labels_dirs(self, task: Task) -> YoloSubsetDirs:
match task:
case Task.TRAIN:
match task.value:
case Task.TRAIN.value:
return self.train_dirs
case Task.VAL:
case Task.VAL.value:
return self.val_dirs
case Task.TEST:
case Task.TEST.value:
return self.test_dirs
case _:
raise ValueError(f"Task {task} is invalid")
Expand Down