Skip to content

Commit

Permalink
Change dataset check on saturation ratio.
Browse files Browse the repository at this point in the history
  • Loading branch information
ebezzam committed Jun 6, 2024
1 parent 5ffe0f5 commit 0f3dd52
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 13 deletions.
3 changes: 2 additions & 1 deletion configs/analyze_dataset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ hydra:
chdir: True # change to output folder

dataset_path: null
desired_range: [150, 254]
desired_range: [150, 255]
saturation_percent: 0.05
delete_bad: False
n_files: null
start_idx: null
59 changes: 47 additions & 12 deletions scripts/measure/analyze_measured_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,19 @@
import matplotlib.pyplot as plt
import time
import tqdm
import re


def convert(text):
    """Turn one chunk of a natural-sort key into a comparable value.

    Parameters
    ----------
    text : str
        A chunk of a filename, e.g. one piece produced by splitting on
        runs of ASCII digits.

    Returns
    -------
    int or str
        ``int(text)`` when the chunk consists solely of ASCII digits
        (``0-9``), otherwise the lower-cased chunk.
    """
    # Deliberately not ``str.isdigit()``: it is also true for characters such
    # as "²" (which ``int`` rejects with ValueError) and for non-ASCII decimal
    # digits (which would turn a text chunk into an int and make keys of mixed
    # types incomparable). Only plain 0-9 runs are treated as numbers.
    if re.fullmatch(r"[0-9]+", text):
        return int(text)
    return text.lower()


def alphanum_key(key):
    """Build the sort key used for natural ordering of a string.

    The string is split on runs of the digits 0-9 (the runs themselves are
    kept because the pattern has a capturing group), and every resulting
    chunk is passed through ``convert``.

    Parameters
    ----------
    key : str
        String to derive a sort key from.

    Returns
    -------
    list
        The converted chunks, in their original order.
    """
    chunks = re.split("([0-9]+)", key)
    return list(map(convert, chunks))


def natural_sort(arr):
    """Return a new list with the items of ``arr`` ordered by ``alphanum_key``.

    Parameters
    ----------
    arr : iterable of str
        Strings to order; the input itself is left unmodified.

    Returns
    -------
    list of str
        The items of ``arr`` sorted using ``alphanum_key`` as the key.
    """
    ordered = list(arr)
    ordered.sort(key=alphanum_key)
    return ordered


@hydra.main(version_base=None, config_path="../../configs", config_name="analyze_dataset")
Expand All @@ -24,13 +37,14 @@ def analyze_dataset(config):
desired_range = config.desired_range
delete_bad = config.delete_bad
start_idx = config.start_idx
saturation_percent = config.saturation_percent

assert (
folder is not None
), "Must specify folder to analyze in config or through command line (folder=PATH)."

# get all PNG files in folder
files = sorted(glob.glob(os.path.join(folder, "*.png")))
files = natural_sort(glob.glob(os.path.join(folder, "*.png")))
print("Found {} files".format(len(files)))
if start_idx is not None:
files = files[start_idx:]
Expand All @@ -48,10 +62,9 @@ def analyze_dataset(config):
im = np.array(Image.open(fn))
max_val = im.max()
max_vals.append(max_val)
saturation_ratio = np.sum(im >= desired_range[1]) / im.size

# if out of desired range, print filename
if max_val < desired_range[0] or max_val > desired_range[1]:
# print("File {} has max value {}".format(fn, max_val))
if max_val < desired_range[0]:
n_bad_files += 1
bad_files.append(fn)

Expand All @@ -61,6 +74,28 @@ def analyze_dataset(config):
else:
print("File {} has max value {}".format(fn, max_val))

elif saturation_ratio > saturation_percent:
n_bad_files += 1
bad_files.append(fn)

if delete_bad:
os.remove(fn)
print("REMOVED file {}".format(fn))
else:
print("File {} has saturation ratio {}".format(fn, saturation_ratio))

# # if out of desired range, print filename
# if max_val < desired_range[0] or saturation_ratio > saturation_percent:
# # print("File {} has max value {}".format(fn, max_val))
# n_bad_files += 1
# bad_files.append(fn)

# if delete_bad:
# os.remove(fn)
# print("REMOVED file {}".format(fn))
# else:
# print("File {} has max value {}".format(fn, max_val))

proc_time = time.time() - start_time
print("Went through {} files in {:.2f} seconds".format(len(files), proc_time))
print(
Expand All @@ -69,6 +104,14 @@ def analyze_dataset(config):
)
)

# plot histogram
output_folder = os.getcwd()
output_fp = os.path.join(output_folder, "max_vals.png")
plt.hist(max_vals, bins=100)
plt.savefig(output_fp)

print("Saved histogram to {}".format(output_fp))

# command line input on whether to delete bad files
if not delete_bad:
response = None
Expand All @@ -80,14 +123,6 @@ def analyze_dataset(config):
else:
print("Not deleting bad files")

# plot histogram
output_folder = os.getcwd()
output_fp = os.path.join(output_folder, "max_vals.png")
plt.hist(max_vals, bins=100)
plt.savefig(output_fp)

print("Saved histogram to {}".format(output_fp))


if __name__ == "__main__":
analyze_dataset()

0 comments on commit 0f3dd52

Please sign in to comment.