diff --git a/labs/06/svhn_competition.py b/labs/06/svhn_competition.py
index 7e95de3..80136f4 100644
--- a/labs/06/svhn_competition.py
+++ b/labs/06/svhn_competition.py
@@ -3,10 +3,12 @@
 import datetime
 import os
 import re
+
 os.environ.setdefault("KERAS_BACKEND", "torch")  # Use PyTorch backend unless specified otherwise
 
 import keras
 import numpy as np
+from torchvision.transforms import Resize
 import torch
 
 import bboxes_utils
@@ -66,17 +68,63 @@ def main(args: argparse.Namespace) -> None:
     # - "classes", a `[num_digits]` vector with classes of image digits,
     # - "bboxes", a `[num_digits, 4]` vector with bounding boxes of image digits.
     svhn = SVHN()
-    def get_inputs(dataset=svhn.train):
-        imgs, clsses, bboxes = [], [], []
-        for example in dataset:
-            img, cls, bbox = example["image"], example["classes"], example["bboxes"]
-            imgs.append(img)
-            clsses.append(cls)
-            bboxes.append(bbox)
-        return np.array(imgs), np.array(clsses), np.array(bboxes)
-    train_imgs, train_clsses, train_bboxes = get_inputs()
-    dev_imgs, dev_clsses, dev_bboxes = get_inputs(svhn.dev)
-    test_imgs, _, _ = get_inputs(svhn.test)
+
+    def transform_data(example):
+        """Resize one example to 224x224 and convert its gold digits to anchor targets."""
+        image = example["image"]
+        classes = example["classes"]  # Gold classes
+        bboxes = example["bboxes"]  # Gold bboxes
+
+        threshold = 0.5
+
+        # Resize image
+        resize_transform = Resize((224, 224))
+        resized_image = resize_transform(image)
+
+        anchors = []
+
+        # 224 / 7 = 32: anchors must tile the 7x7 grid of the 224x224 resized image.
+        # (The previous value 2 ** (2 / 3) * 4 * (2**7) ~= 813 put anchors far outside
+        # the image, so no gold box could ever be matched to any anchor.)
+        anchor_size = 224 / 7
+
+        for i in range(7):
+            for j in range(7):
+                # top, left, bottom, right
+                anchors.append(
+                    [
+                        i * anchor_size,
+                        j * anchor_size,
+                        (i + 1) * anchor_size,
+                        (j + 1) * anchor_size,
+                    ]
+                )
+
+        anchors = np.array(anchors)
+        anchor_classes, anchor_bboxes = bboxes_utils.bboxes_training(
+            anchors, bboxes, classes, threshold
+        )
+        # bboxes_training returns 0 for background anchors and gold class + 1 otherwise,
+        # so shift by one before one-hot encoding: background becomes the all-zero
+        # vector the sigmoid focal loss expects instead of colliding with digit 0.
+        onehot_encoded_anchor_classes = keras.ops.one_hot(anchor_classes - 1, svhn.LABELS)
+
+        # If class is zero, return 1, otherwise return 0
+        is_background_weights = (anchor_classes == 0).astype(np.float32)
+
+        return (
+            resized_image,
+            (onehot_encoded_anchor_classes, anchor_bboxes),
+            is_background_weights,
+        )
+
+    transformed_train_dataset = torch.utils.data.DataLoader(
+        svhn.train.transform(transform_data), batch_size=args.batch_size, shuffle=True
+    )
+    transformed_dev_dataset = torch.utils.data.DataLoader(
+        svhn.dev.transform(transform_data), batch_size=args.batch_size, shuffle=False
+    )
+    transformed_test_dataset = torch.utils.data.DataLoader(
+        svhn.test.transform(transform_data), batch_size=args.batch_size, shuffle=False
+    )
 
     # Load the EfficientNetV2-B0 model. It assumes the input images are
     # represented in the [0-255] range.
@@ -92,15 +140,15 @@ def get_inputs(dataset=svhn.train):
     )
 
     # TODO: Create the model and train it
-    backbone.trainable = False
-    inputs = keras.layers.Input(shape=train_imgs.shape)
+    backbone.trainable = False
+    inputs = keras.layers.Input(shape=(224, 224, 3))
     # backbone outputs bottom to up: block1a, block2b, block3b, block5e, top
     # shapes: 7x7x1280, 14x14x112, 28x28x40, 56x56x24, 112x112x16
     # top, block5e, block3b, block2b, block1a = backbone(inputs)
 
     # FPN: feature pyramid network
     # 1. First build feature pyramid to extract features, using 3 layers (layer 3-5)
-    def fpn(backbone=backbone, inputs=inputs, layers=(3,4,5)):
+    def fpn(backbone=backbone, inputs=inputs, layers=(3, 4, 5)):
         conv3_11 = keras.layers.Conv2D(256, 1, 1, "same")
         conv4_11 = keras.layers.Conv2D(256, 1, 1, "same")
         conv5_11 = keras.layers.Conv2D(256, 1, 1, "same")
@@ -123,6 +171,7 @@ def fpn(backbone=backbone, inputs=inputs, layers=(3,4,5)):
         p6_output = conv6_33(c5_output)
         p7_output = conv7_33(keras.activations.relu(p6_output))
         return p3_output, p4_output, p5_output, p6_output, p7_output
+
     ### classification and bbox regression head
     ### 9 is the anchor number
     def heads(input_feature, type="classification"):
@@ -139,7 +188,7 @@ def heads(input_feature, type="classification"):
         conv4 = keras.layers.ReLU()(keras.layers.Conv2D(256, 3, 1, "same")(conv3))
         outputs = keras.layers.Conv2D(output_size, 3, 1, "same", activation=activ)(conv4)
         return outputs
-    
+
     fpn_features = fpn()
     cls_outputs, reg_outputs = [], []
     for feature in fpn_features:
@@ -147,7 +196,7 @@ def heads(input_feature, type="classification"):
         reg_output = heads(feature, "regression")
         cls_outputs.append(keras.ops.reshape(cls_output, (args.batch_size, -1, svhn.LABELS)))
         reg_outputs.append(keras.ops.reshape(reg_output, (args.batch_size, -1, 4)))
-    
+
     cls_outputs = keras.ops.concatenate(cls_outputs, axis=1)
     reg_outputs = keras.ops.concatenate(reg_outputs, axis=1)
     model_outputs = keras.ops.concatenate([reg_outputs, cls_outputs], axis=-1)
@@ -156,12 +205,15 @@ def heads(input_feature, type="classification"):
     model.compile(
         optimizer=keras.optimizers.Adam(learning_rate=args.learning_rate),
-        loss=(
-            keras.losses.BinaryFocalCrossEntropy(),
-            keras.losses.Huber()),
+        loss=(keras.losses.BinaryFocalCrossentropy(), keras.losses.Huber()),
     )
-    model.fit()
+    model.fit(
+        transformed_train_dataset,
+        epochs=args.epochs,
+        validation_data=transformed_dev_dataset,
+        callbacks=[TorchTensorBoardCallback(args.logdir)],
+    )
 
     # Generate test set annotations, but in `args.logdir` to allow parallel execution.
     os.makedirs(args.logdir, exist_ok=True)