# extraction_data_medpc_BR.py

# -*- coding: utf-8 -*-
r"""
Created on Tue Dec 13

Functions to analyse social operant conditioning.

@author: redon

Variables:
\ A = fixed ratio
\ B = session duration max
\ C = nb np max in session
\ D = timer for session
\ E = timer for table
\ F = count noldus
\ G = array noldus
\ H = idx array noldus
\ I = count inactive
\ J = array inactive
\ K = idx array inactive
\ L = count inactive all
\ M = array inactive all
\ N = idx array inactive all
\ O = temporary count active all
\ P = count active
\ Q = array active
\ R = idx array active
\ S = count active all
\ T = array active all
\ U = idx array active all
"""

import pandas as pd
import numpy as np
import re
import datetime
import pathlib


def ethovision_reader(file):
    '''
    Split an ethovision raw csv into a header dataframe and a data dataframe

    The second cell of the first row of the file is taken as the number of
    header rows; the file is then read again once for each part.

    Parameters:
    ----------------
    file: String
        Path to the ethovision raw csv

    Returns:
    ----------------
    df_header: Dataframe
        First two columns of the first (number of header rows - 2) rows,
        the first column being used as index

    df_ethovision: Dataframe
        Everything located after those rows, with '-' read as NaN and all
        values cast to float. The first row following the column names is
        dropped before the cast (presumably the units row - to be confirmed
        against an actual export)

    '''
    # The header size is stored in the file itself: row 0, column 1
    first_row = pd.read_csv(file, header=None, usecols=[0, 1], nrows=1)
    n_info_rows = first_row.at[0, 1] - 2

    # Experiment information: name in the index, value in the only column
    df_header = pd.read_csv(file, header=None, index_col=0, usecols=[0, 1], nrows=n_info_rows)

    # Session data: column names come from the first row that is not skipped
    raw_table = pd.read_csv(file, skiprows=n_info_rows, header=0, na_values='-')
    df_ethovision = raw_table.drop(0).astype('float')

    return df_header, df_ethovision


def start_time_med(med_file):
    '''
    Read the start date and start time written in a medpc file

    The file is scanned line by line for "Start Date: MM/DD/YY" and
    "Start Time: HH:MM:SS". If a field appears several times, the last
    occurrence is the one that is kept.

    Parameters:
    ----------------
    med_file: String
        Path to the medpc file

    Returns:
    ----------------
    start_med_time: datetime.datetime
        Start date and time combined, parsed with '%m/%d/%y %H:%M:%S'

    Raises:
    ----------------
    ValueError
        If the start date or the start time cannot be found in the file
        (or if the values found do not form a valid date/time)

    '''
    time_regex_med = re.compile(r"Start Time: (\d{2}:\d{2}:\d{2})")
    date_regex_med = re.compile(r"Start Date: (\d{2}/\d{2}/\d{2})")

    # Explicit sentinels so that a missing field can be reported clearly
    start_time = None
    start_date = None

    with open(med_file) as f:
        # Stream the file instead of loading every line in memory
        for line in f:
            match_time = time_regex_med.search(line)
            if match_time:
                start_time = match_time.group(1)
            match_date = date_regex_med.search(line)
            if match_date:
                start_date = match_date.group(1)

    missing = []
    if start_date is None:
        missing.append('Start Date')
    if start_time is None:
        missing.append('Start Time')
    if missing:
        raise ValueError('Could not find ' + ' and '.join(missing) + ' in ' + str(med_file))

    start_med_str = start_date + ' ' + start_time
    start_med_time = datetime.datetime.strptime(start_med_str, '%m/%d/%y %H:%M:%S')

    return start_med_time


def detect_floats(sequence):
    '''Function extracting every number written in a chunk of non-organized txt file

    Arg:

        sequence = chunk of non organized txt file containing several numbers to extract
            type Str

    Returns

        list_floats = List containing each single number detected in the sequence,
                      in order of appearance (empty list if none is found)
            type List of floats

    Note: signs and exponents are not part of the pattern, so '-5' gives 5.0.

    '''
    # One or more digits, optionally followed by a period and one or more digits.
    # The decimal part is optional as a whole so that single-digit numbers
    # such as '5' are detected too.
    pattern = re.compile(r'[0-9]+(?:\.[0-9]+)?')

    # Find each single number in the sequence
    list_floats_str = pattern.findall(sequence)

    # Convert the list of string into list of floats to allow future computation
    list_floats = [float(x) for x in list_floats_str]

    return list_floats


def import_txt(file):
    """Function (i) loading a medpc file into a 2 columns dataframe
    and (ii) turning the second column of its array version into lists of floats
    WARNING: use detect_floats()

    Arg:

        file = path or name of the file to open, must be txt or csv-like
            type Str

    Return:

        array_results = array taken from df_medpc_all, where each element of
                        column 1 has been replaced by the output of detect_floats()
            type Array

        df_medpc_all = dataframe read from the file (first line skipped,
                       split on ':', missing cells filled with 'empty')
            type DataFrame

    """

    # Splitting on ':' gives the 2 columns; lines that end up with more fields
    # (such as the ones holding a time) only trigger a warning
    raw_table = pd.read_csv(file,
                            sep=':',
                            skiprows=[0],
                            on_bad_lines='warn',
                            header=None)
    df_medpc_all = raw_table.fillna('empty')

    # NOTE(review): .values may share memory with df_medpc_all depending on the
    # pandas version, in which case the loop below also rewrites the dataframe
    # - to be confirmed before relying on the content of df_medpc_all
    array_results = df_medpc_all.values

    # Replace each raw text cell of column 1 by the list of numbers it holds
    for row_idx, raw_cell in enumerate(array_results[:, 1]):
        array_results[row_idx, 1] = detect_floats(raw_cell)

    return array_results, df_medpc_all


def extract_arrays(array_result):
    """Function iterating through the rows of the array to gather all the data
    into a dictionary with the variable's name (key) and its content (value)

    A row whose column 0 starts with an uppercase letter opens a new variable,
    named after that first letter only. Any other row is a continuation: its
    values are appended to the variable opened last.

    Arg:

        array_result = 2 columns array: label as string (column 0) and
                       list of values (column 1)

    Return:

        dict_result = dictionary gathering variable names (keys) and their content (value):
                      0 when no value was found, the value itself when there is only one,
                      the list of values otherwise. If a name appears several times,
                      the last occurrence is kept
            type Dict

        list_variables = list of variable names as string, in order of appearance
            type List of Str

        list_values = list of variable values in list
            type List of lists

    Raises:

        IndexError if a continuation row appears before any variable row
    """

    # Initiate empty lists to host data
    list_variables = []
    list_values = []

    # Define a regex that would match any uppercase letter
    letter = re.compile(r'[A-Z]')

    # Iterate through the rows of the array. Only the current row is inspected:
    # looking at the following row is not needed to classify it, and doing so
    # went past the end of the array when the last row was a variable row.
    for row in array_result:
        label = row[0]
        row_values = row[1]
        name_match = letter.match(label)

        if name_match is not None:
            # Column 0 starts with a capital letter: new variable
            list_variables.append(name_match.group())
            list_values.append(row_values)
        else:
            # Continuation row: extend the content of the last variable
            # (a new list is built, the lists stored in array_result stay untouched)
            list_values[-1] = list_values[-1] + row_values

    # Create the dictionary with all extracted data: name as keys and values as values
    dict_result = dict(zip(list_variables, list_values))

    # Iterate through element within the dictionary
    # (only existing keys are reassigned, so iterating over items() is safe)
    for key, value in dict_result.items():
        # If no value is found, set to 0
        if len(value) == 0:
            dict_result[key] = 0

        # if only one element is present in the value, extract it from the value list
        elif len(value) == 1:
            dict_result[key] = value[0]

    return dict_result, list_variables, list_values