# flowcam_data_processor.py

import cv2
import numpy as np
from os import listdir, path
import pandas as pd

from typing import List, Tuple


class FlowcamDataProcessor:
    """Process flb or lst files and image collage files in a directory.

    Given a directory, this class looks for flb or lst files, parses them
    into pandas dataframes, and can cut individual particle images out of
    the image collages referenced by those dataframes.

    # Arguments
        flowcam_files_dir: A String
            Directory containing files generated by FlowCam, this
            should contain one or more image collage files (tif) and
            one or more flb or lst files.
            (flb and lst files generated by FlowCam are identical formats)

    """

    def __init__(self, flowcam_files_dir: str) -> None:
        self.flowcam_files_dir = flowcam_files_dir

    def parse_lst_or_flb_file(self, file_content: List[str],
                              target: int) -> pd.DataFrame:
        """Parse the lines of a flb or lst file into a pandas dataframe.

        The first two lines are skipped. After that, a line containing
        exactly one '|' is treated as a field declaration (the text before
        the '|' becomes a column name); every other non-blank line is
        treated as one particle, with its '|'-separated values stored as
        strings.

        Arguments:
            file_content {List[str]} -- lines of the file, e.g. the result
                of ``file.readlines()``; trailing line endings are allowed
            target {int} -- particle class id, stored for every row in the
                ``_target`` column

        Returns:
            pd.DataFrame -- one row per particle, one column per declared
                field, plus the ``_target`` column

        Raises:
            ValueError -- if a particle line has more values than there
                are declared fields (raised by pandas)
        """
        # thinking in terms of a csv file, there are column names
        col_names: List[str] = []
        # and column values for each sample
        col_values: List[List[str]] = []

        # ignore the first two lines of the file
        for raw_line in file_content[2:]:
            # readlines() keeps the line ending; without stripping it the
            # last value of every row would end in '\n'
            line = raw_line.rstrip('\r\n')

            # blank lines (e.g. a trailing newline at the end of the file)
            # carry no data and must not become an empty particle row
            if not line.strip():
                continue

            # if line contains only one '|' then it is a column name
            if line.count('|') == 1:
                col_names.append(line.split('|')[0])
            # else each line contains numerous features for each particle
            else:
                col_values.append(line.split('|'))

        # passing the names to the constructor (rather than assigning
        # df.columns afterwards) also works when there are no data rows
        df = pd.DataFrame(col_values, columns=col_names)
        # target is the particle class id
        df['_target'] = target

        return df

    def process_lst_or_flb_files(self, target: int = 0) -> pd.DataFrame:
        """Parse every flb or lst file in the directory into one dataframe.

        Files are processed in sorted filename order so that the row order
        of the result is reproducible between runs.

        Keyword Arguments:
            target {int} -- particle class id assigned to every row
                (default: {0})

        Returns:
            pd.DataFrame -- concatenation of the dataframes parsed from
                each flb/lst file, with columns sorted by name

        Raises:
            ValueError -- if the directory contains no flb or lst file
        """
        # in case there is more than one lst file,
        # create list to store processed data from each one
        dataframes: List[pd.DataFrame] = []

        # Iterate through the files generated by FlowCam
        for filename in sorted(listdir(self.flowcam_files_dir)):
            # only the lst/flb files contain info for the samples
            if not filename.endswith((".lst", ".flb")):
                continue

            with open(path.join(self.flowcam_files_dir, filename)) as file:
                file_content = file.readlines()

            dataframes.append(
                self.parse_lst_or_flb_file(file_content, target))

        if not dataframes:
            # pd.concat would raise a ValueError here anyway; keep the
            # exception type but say what is actually wrong
            raise ValueError(
                f"No .lst or .flb files found in {self.flowcam_files_dir!r}")

        # dataframe contains information for all samples in this class (target)
        return pd.concat(dataframes, sort=True)

    def snip_images(self,
                    df: pd.DataFrame,
                    desired_image_size: Tuple[int, int],
                    resize: bool = True) -> np.ndarray:
        """Cut each particle in the dataframe out of its image collage.

        For each row, the collage named in ``collage_file`` is loaded from
        ``flowcam_files_dir`` and the rectangle described by ``image_x``,
        ``image_y``, ``image_w`` and ``image_h`` is "snipped" out.
        Optionally the snipped images are rotated to portrait orientation
        and resized to a uniform size. Pixel values are divided by 255.

        Arguments:
            df {pd.DataFrame} -- particle data with the columns
                ``collage_file``, ``image_x``, ``image_y``, ``image_w``
                and ``image_h``
            desired_image_size {Tuple[int, int]} -- target size, passed
                unchanged to ``cv2.resize`` as ``dsize`` (which OpenCV
                documents as (width, height)); only used when ``resize``
                is True

        Keyword Arguments:
            resize {bool} -- rotate landscape images to portrait and resize
                every image to ``desired_image_size`` (default: {True})

        Returns:
            np.ndarray -- array containing all snipped images; 4d when all
                images share one shape, which ``resize=True`` ensures

        Raises:
            FileNotFoundError -- if a collage file is missing or cannot be
                decoded as an image
        """
        images_data: List[np.ndarray] = []

        # Many particles can come from the same collage, so keep the most
        # recently loaded one instead of re-reading it for every row.
        # NOTE(review): this only saves work when rows that share a collage
        # are adjacent in df; otherwise it behaves like a plain reload.
        loaded_path = None
        loaded_collage = None

        for _, row in df.iterrows():
            image_collage_path = path.join(
                self.flowcam_files_dir, row['collage_file'])

            if image_collage_path != loaded_path:
                collage = cv2.imread(image_collage_path)
                # cv2.imread signals failure by returning None, not raising
                if collage is None:
                    raise FileNotFoundError(
                        f"Could not read image collage "
                        f"{image_collage_path!r} "
                        f"(file missing or not a valid image)")
                loaded_path = image_collage_path
                loaded_collage = collage

            # snip out the particle from the image collage
            top, left = int(row['image_y']), int(row['image_x'])
            height, width = int(row['image_h']), int(row['image_w'])
            im = loaded_collage[top:top + height, left:left + width]

            if resize:
                # make all images have portrait orientation
                (h, w) = im.shape[:2]
                if w > h:
                    im = cv2.rotate(im, cv2.ROTATE_90_CLOCKWISE)

                im = cv2.resize(im, desired_image_size)

            # the division allocates a new float array, so the cached
            # collage is never modified through the slice above
            im = im / 255

            images_data.append(im)

        return np.array(images_data)