Create predictions only for non-labeled images #51

Open · wants to merge 4 commits into master
6 changes: 5 additions & 1 deletion .gitignore
@@ -117,4 +117,8 @@ models/*.h5
tag/*.csv

# TF exported graph files
.pb
*.pb
#train/ppf_inference_graphs/frozen_inference_graph.pb

# ignore configs
work_config.ini
2 changes: 1 addition & 1 deletion tag/download_vott_json.py
@@ -376,7 +376,7 @@ def create_vott_json(file_location, num_rows, user_folders, pick_max, image_loc,
str(csv_file_loc / "totag.csv"))
file_date = [(blob.name, blob.properties.last_modified) for blob in
block_blob_service.list_blobs(container_name) if re.match(r'tagging_(.*).csv', blob.name)]
ideal_class_balance = config_file["ideal_class_balance"].split(",")

if file_date:
block_blob_service.get_blob_to_path(container_name, max(file_date, key=lambda x: x[1])[0],
str(csv_file_loc / "tagging.csv"))
2 changes: 2 additions & 0 deletions test/.gitignore
@@ -0,0 +1,2 @@
*.jpg
*.gz
6 changes: 6 additions & 0 deletions test/tagged.csv
@@ -0,0 +1,6 @@
filename,class,xmin,xmax,ymin,ymax,height,width,folder,box_confidence,image_confidence
st1026.png,knot,0.69150746,0.7407868,0.3375946,0.39474854,512,488,board_images_png,0.9990602,0.54169416
st1026.png,knot,0.29255274,0.37531677,0.41773036,0.48604906,512,488,board_images_png,0.74185294,0.54169416
st1026.png,knot,0.29603952,0.35703427,0.40142354,0.49790853,512,488,board_images_png,0.54169416,0.54169416
st1578.png,knot,0.54391885,0.60184073,0.7846939,0.85633487,512,488,board_images_png,0.9994636,0.9942725
st1578.png,knot,0.60079277,0.6762777,0.36906424,0.4369791,512,488,board_images_png,0.9942725,0.9942725
16 changes: 15 additions & 1 deletion test/test_create_predictions.py
@@ -29,7 +29,7 @@ def setUp(self):
opener = urllib.request.URLopener()
opener.retrieve(url, model_file)

def tearDown(self):
def tearDown(self):
if os.path.exists("untagged.csv"):
os.remove("untagged.csv")
if os.path.exists("tagged_preds.csv"):
@@ -66,6 +66,20 @@ def test_get_suggestions(self):
user_folders=True)
self.assertEqual(filecmp.cmp('untagged.csv', 'untagged_source.csv'), True, "generated untagged.csv is correct")

def test_get_suggestions_pretagged(self):
classes = 'knot,defect'
cur_detector = TFDetector(classes.split(','), 'model_knots.pb')
image_dir = "test_workdir_train"
untagged_output = 'untagged.csv'
tagged_output = 'tagged_preds.csv'
cur_tagged = "tagged.csv"
cur_tagging = None
get_suggestions(cur_detector, image_dir, untagged_output, tagged_output, cur_tagged, cur_tagging,
filetype="*.png", min_confidence=0.5,
user_folders=True)
self.assertEqual(filecmp.cmp('untagged.csv', 'untagged_source_exclude_tagged.csv'), True,
"generated untagged.csv is correct")


if __name__ == '__main__':
unittest.main()
4 changes: 4 additions & 0 deletions test/untagged_source_exclude_tagged.csv
@@ -0,0 +1,4 @@
filename,class,xmin,xmax,ymin,ymax,height,width,folder,box_confidence,image_confidence
st1194.png,knot,0.6518282,0.70353997,0.7374667,0.80387944,512,488,board_images_png,0.99921286,0.99921286
st1611.png,knot,0.65116334,0.7139255,0.86043906,0.9666604,512,488,board_images_png,0.99822897,0.9488958
st1611.png,knot,0.07768918,0.1141083,0.332125,0.36988598,512,488,board_images_png,0.9488958,0.9488958
87 changes: 64 additions & 23 deletions train/create_predictions.py
@@ -7,6 +7,8 @@
from collections import defaultdict
import numpy as np

CV2_COLOR_LOAD_FLAG = 1

NUM_CHANNELS=3
FOLDER_LOCATION=8

@@ -17,6 +19,7 @@

#name,prediction[CLASS_IDX],prediction[XMIN_IDX],prediction[XMAX_IDX],prediction[YMIN_IDX],prediction[YMAX_IDX],height,width,folder,prediction[BOX_CONFID_IDX], confidence
BOX_CONFID_IDX = 0
FILENAME_LOCATION = 0
CLASS_IDX = 1
XMIN_IDX = 3
XMAX_IDX = 5
@@ -65,10 +68,48 @@ def make_csv_output(all_predictions: List[List[List[int]]], all_names: List[str]
prediction[CLASS_IDX], prediction[XMIN_IDX], prediction[XMAX_IDX],
prediction[YMIN_IDX], prediction[YMAX_IDX], height, width,
prediction[BOX_CONFID_IDX], confidence])
print(untagged_output, tagged_output)

def get_images_for_prediction(subdir, filetype, already_tagged_this_folder, image_size):
'''
Walks through the given directory of images and constructs ndarrays of image data that will be
used to get the model's predictions (for subsequent review by human annotators).
Images that have already been reviewed (tagged) are excluded from the list.
:param subdir: local directory containing the images
:param filetype: extension of the images
:param already_tagged_this_folder: list of image names that have already been tagged and can therefore be skipped
:param image_size: target image size
:return: the function returns 3 values
- all_images_this_folder -- ndarray of the resized image data
- all_names_this_folder -- names of the images that will get pre-tagged
- all_sizes_this_folder -- original sizes of the images
'''
all_image_files = []
cur_image_names = list(subdir.rglob(filetype))
print("Total image names: ", len(cur_image_names))
all_image_files += [str(image_name) for image_name in cur_image_names]
foldername = subdir.stem
all_nonfilt_names = [(foldername, filename.name) for filename in cur_image_names]

# check if images have been tagged already
print("Already tagged {0} images in folder {1}.".format(len(already_tagged_this_folder), foldername))
all_names_this_folder = [t for t in all_nonfilt_names if t[1] not in already_tagged_this_folder]
print("{0} images are taken for prediction from folder {1}".format(len(all_names_this_folder), foldername))

all_filt_imagepaths = [filepath for filepath in cur_image_names if filepath.name not in already_tagged_this_folder]

# Reversed because numpy is row-major
all_sizes_this_folder = [cv2.imread(str(image_path), CV2_COLOR_LOAD_FLAG).shape[:2] for image_path in
all_filt_imagepaths]
all_images_this_folder = np.zeros((len(all_names_this_folder), *reversed(image_size), NUM_CHANNELS), dtype=np.uint8)
for curindex, image_path in enumerate(all_filt_imagepaths):
all_images_this_folder[curindex] = cv2.resize(cv2.imread(str(image_path), CV2_COLOR_LOAD_FLAG), image_size)

return all_images_this_folder, all_names_this_folder, all_sizes_this_folder

def get_suggestions(detector, basedir: str, untagged_output: str,
tagged_output: str, cur_tagged: str, cur_tagging: str, min_confidence: float =.2,
image_size: Tuple=(1000,750), filetype: str="*.jpg", minibatchsize: int=50,
image_size: Tuple=(1000,750), filetype: str="*.jpg", minibatchsize: int=16,
user_folders: bool=True):
'''Gets suggestions from a given detector and uses them to generate VOTT tags

@@ -81,9 +122,9 @@ def get_suggestions(detector, basedir: str, untagged_output: str,
arguments to the Detector class
'''
basedir = Path(basedir)
CV2_COLOR_LOAD_FLAG = 1
all_predictions = []
all_images = None
all_tagged = []
all_names = []
if user_folders:
# TODO: Cross reference with ToTag
# download latest tagging and tagged
@@ -99,25 +140,26 @@ def get_suggestions(detector, basedir: str, untagged_output: str,
all_tagged.extend(list(reader))
already_tagged = defaultdict(set)
for row in all_tagged:
already_tagged[row[FOLDER_LOCATION]].add(row[0])
already_tagged[row[FOLDER_LOCATION]].add(row[FILENAME_LOCATION])
print("Already tagged {0} images.".format(sum(map(len, already_tagged.values()))))
subdirs = [subfile for subfile in basedir.iterdir() if subfile.is_dir()]
print("subdirs: ", subdirs)
all_names = []
all_image_files = []
all_sizes = []
for subdir in subdirs:
cur_image_names = list(subdir.rglob(filetype))
print("Total image names: ", len(cur_image_names))
all_image_files += [str(image_name) for image_name in cur_image_names]
foldername = subdir.stem
all_names += [(foldername, filename.name) for filename in cur_image_names]
# Reversed because numpy is row-major
all_sizes = [cv2.imread(image, CV2_COLOR_LOAD_FLAG).shape[:2] for image in all_image_files]
all_images = np.zeros((len(all_image_files), *reversed(image_size), NUM_CHANNELS), dtype=np.uint8)
for curindex, image in enumerate(all_image_files):
all_images[curindex] = cv2.resize(cv2.imread(image, CV2_COLOR_LOAD_FLAG), image_size)
already_tagged_this_folder = already_tagged[foldername]
all_images_this_folder, all_names_this_folder, all_sizes_this_folder =\
get_images_for_prediction(subdir, filetype, already_tagged_this_folder, image_size)

all_names += all_names_this_folder
if all_images is None:
all_images = all_images_this_folder
else:
all_images = np.concatenate((all_images,all_images_this_folder), axis=0)
Collaborator:
I would suggest either getting rid of the np.concatenate or doing it at the end instead of at each folder.
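A minimal sketch of the suggestion above, assuming get_images_for_prediction keeps its current return values (illustrative only, not part of this PR): gather the per-folder batches in a list and call np.concatenate once after the loop, so the accumulated array is not reallocated for every folder.

import numpy as np

# Illustrative sketch of the reviewer's suggestion: collect per-folder
# results and concatenate a single time after the loop.
image_batches, all_names, all_sizes = [], [], []
for subdir in subdirs:
    already_tagged_this_folder = already_tagged[subdir.stem]
    images, names, sizes = get_images_for_prediction(
        subdir, filetype, already_tagged_this_folder, image_size)
    image_batches.append(images)
    all_names += names
    all_sizes += sizes
# One concatenation instead of one per folder; fall back to an empty batch
# if every image in every folder has already been tagged.
if image_batches:
    all_images = np.concatenate(image_batches, axis=0)
else:
    all_images = np.zeros((0, *reversed(image_size), NUM_CHANNELS), dtype=np.uint8)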

all_sizes += all_sizes_this_folder

print("Shape of all_images: ", all_images.shape)
all_predictions = detector.predict(all_images, min_confidence=min_confidence)
all_predictions = detector.predict(all_images, min_confidence=min_confidence, batch_size=minibatchsize)
else:
with open(cur_tagged, 'r') as file:
reader = csv.reader(file)
Expand All @@ -127,13 +169,11 @@ def get_suggestions(detector, basedir: str, untagged_output: str,
reader = csv.reader(file)
next(reader, None)
already_tagged |= {row[0] for row in reader}
all_image_files = list(basedir.rglob(filetype))
all_names = [filename.name for filename in all_image_files]
all_sizes = [cv2.imread(str(image), CV2_COLOR_LOAD_FLAG).shape[:2] for image in all_image_files]
all_images = np.zeros((len(all_image_files), *reversed(image_size), NUM_CHANNELS), dtype=np.uint8)
for curindex, image in enumerate(all_image_files):
all_images[curindex] = cv2.resize(cv2.imread(str(image), CV2_COLOR_LOAD_FLAG), image_size)
all_predictions = detector.predict(all_images, batch_size=2, min_confidence=min_confidence)

already_tagged_this_folder = already_tagged
all_images, all_names, all_sizes = \
get_images_for_prediction(basedir, filetype, already_tagged_this_folder, image_size)
all_predictions = detector.predict(all_images, batch_size=1, min_confidence=min_confidence)
make_csv_output(all_predictions, all_names, all_sizes, untagged_output, tagged_output, already_tagged, user_folders)

if __name__ == "__main__":
@@ -169,6 +209,7 @@ def get_suggestions(detector, basedir: str, untagged_output: str,
else:
classes = config_file["classes"].split(",")
model = str(Path(config_file["inference_output_dir"])/"frozen_inference_graph.pb")
print("using model: ", model)
if file_date:
block_blob_service.get_blob_to_path(container_name, max(file_date, key=lambda x:x[1])[0], "tagged.csv")
cur_tagged = "tagged.csv"