-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextraction_data_medpc_BR.py
215 lines (159 loc) · 7.11 KB
/
extraction_data_medpc_BR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 13
Functions to analyse social operant conditioning
@author: redon
Variables:
\ A = fixed ratio
\ B = session duration max
\ C = nb np max in session
\ D = timer for session
\ E = timer for table
\ F = count noldus
\ G = array noldus
\ H = idx array noldus
\ I = count inactive
\ J = array inactive
\ K = idx array inactive
\ L = count inactive all
\ M = array inactive all
\ N = idx array inactive all
\ O = temporary count active all
\ P = count active
\ Q = array active
\ R = idx array active
\ S = count active all
\ T = array active all
\ U = idx array active all
"""
import pandas as pd
import numpy as np
import re
import datetime
import pathlib
def ethovision_reader(file):
    '''
    Split an ethovision raw csv into its header block and its data table.

    The cell at row 0, column 1 of the file is read as the number of header
    rows. The file is then read two more times: once for the header block and
    once for the data table that follows it.

    Parameters:
    ----------------
    file: String
        Path to the ethovision raw csv

    Returns:
    ----------------
    df_header: Dataframe
        Columns 0 and 1 of the first (header count - 2) rows, indexed by
        column 0 (name of information) with the value in column 1
    df_ethovision: Dataframe
        Data table cast to float; '-' entries are read as NaN and the first
        row below the column names is dropped
    '''
    # The very first row carries the header length in its second cell
    first_row = pd.read_csv(file, header=None, usecols=[0, 1], nrows=1)
    header_length = first_row.at[0, 1]

    # The last two header rows are not part of the information block: the
    # first of them is used below as the column names of the data table
    info_row_count = header_length - 2

    # Experiment information: one row per item (name -> value)
    df_header = pd.read_csv(
        file,
        header=None,
        index_col=0,
        usecols=[0, 1],
        nrows=info_row_count,
    )

    # Session data: skip the information block, treat '-' as missing
    raw_table = pd.read_csv(file, skiprows=info_row_count, header=0, na_values='-')
    # Row 0 is removed before the float cast
    # NOTE(review): presumably this is the units row of the export - confirm
    df_ethovision = raw_table.drop(0).astype('float')

    return df_header, df_ethovision
def start_time_med(med_file):
    """Function returning the session start timestamp written in a medpc file

    The file is scanned line by line for a 'Start Date: MM/DD/YY' field and a
    'Start Time: HH:MM:SS' field. If a field appears several times, the last
    occurrence is used.

    Arg:
        med_file = path of the medpc text file to scan
            type Str
    Return:
        start_med_time = start date and time combined
            type datetime.datetime
    Raises:
        ValueError if the file contains no 'Start Date' or no 'Start Time'
        field in the expected format
    """
    time_regex_med = re.compile(r"Start Time: (\d{2}:\d{2}:\d{2})")
    date_regex_med = re.compile(r"Start Date: (\d{2}/\d{2}/\d{2})")

    # Explicit sentinels so that a missing field is reported clearly instead
    # of surfacing as an unbound local variable
    start_time = None
    start_date = None

    with open(med_file) as f:
        # Stream the file instead of loading every line in memory
        for line in f:
            match_time = time_regex_med.search(line)
            if match_time:
                start_time = match_time.group(1)
            match_date = date_regex_med.search(line)
            if match_date:
                start_date = match_date.group(1)

    missing = [
        field
        for field, found in (("Start Date", start_date), ("Start Time", start_time))
        if found is None
    ]
    if missing:
        raise ValueError(
            f"{med_file}: no {' / '.join(missing)} field found in the expected format"
        )

    start_med_str = start_date + ' ' + start_time
    return datetime.datetime.strptime(start_med_str, '%m/%d/%y %H:%M:%S')
def detect_floats(sequence):
    '''Function extracting every unsigned number found in a chunk of non-organized text

    Arg:
        sequence = chunk of non organized txt file containing several numbers to extract
            type Str
    Returns
        list_floats = List containing each single number detected in the sequence,
        in order of appearance (empty list when no number is found)
            type List of floats
    '''
    # One or more digits, optionally followed by a period and more digits.
    # The fractional part is optional as a whole so that single-digit integers
    # such as "5" are kept (a pattern requiring digits on both sides of an
    # optional period would need at least two digits and drop them).
    # A leading '-' is not part of the match: numbers are read unsigned.
    pattern = re.compile(r'[0-9]+(?:\.[0-9]+)?')
    # Convert each matched string into a float to allow future computation
    return [float(number) for number in pattern.findall(sequence)]
def import_txt(file):
    """Function (i) loading a medpc file into a dataframe split on ':'
    and (ii) replacing each cell of the second column by the list of floats it contains
    WARNING: use detect_floats()

    Arg:
        file = path or name of the file to open, must be txt or csv-like
            type Str
    Return:
        array_results = array taken from df_medpc_all, whose column 1 holds
        lists of floats
            type Array
        df_medpc_all = dataframe read from the file, missing cells filled with 'empty'
            type DataFrame
    """
    raw_frame = pd.read_csv(
        file,
        sep=':',  # split "name: value" lines into columns
        skiprows=[0],  # the first line of the file is not parsed
        on_bad_lines='warn',  # lines with too many fields only trigger a warning
        header=None,
    )
    df_medpc_all = raw_frame.fillna('empty')

    # Array counterpart of the dataframe
    # NOTE(review): .values may share memory with df_medpc_all, in which case
    # the in-place replacement below also changes the dataframe - confirm
    array_results = df_medpc_all.values

    # Swap every raw text cell of column 1 for its list of floats
    for row_idx, row in enumerate(array_results):
        array_results[row_idx, 1] = detect_floats(row[1])

    return array_results, df_medpc_all
def extract_arrays(array_result):
    """Function iterating through the rows to extract and organize all the data
    into a dictionary with the variable's name (key) and its content (value)

    A row whose column 0 starts with a capital letter opens a new variable,
    named after that single leading letter. Any other row is a continuation:
    its column 1 values are appended to the variable opened last.

    Arg:
        array_result = 2 column array: column 0 holds the row label (Str),
        column 1 the list of floats of that row
    Return:
        dict_result = dictionary gathering variable names (keys) and their values.
        A variable without any value is stored as 0, a variable with one value
        is stored as that value, otherwise the list of values is stored.
        When several rows share the same leading letter, the last one wins.
            type Dict
        list_variables = list of variable names as strings, one per opening row
            type List of Str
        list_values = list of variable values in lists (never collapsed)
            type List of lists
    Raises:
        ValueError if the first row is a continuation row (no variable to attach it to)
    """
    # Initiate empty lists to host data
    list_variables = []
    list_values = []
    # Regex matching any uppercase letter at the start of the label
    letter = re.compile(r'[A-Z]')

    for row in array_result:
        label, numbers = row[0], row[1]
        name_match = letter.match(label)
        if name_match is not None:
            # New variable. No look-ahead at the next row is needed: whether
            # or not continuation rows follow, the row is stored the same way
            # (looking ahead would also run past the end on the last row).
            list_variables.append(name_match.group())
            # Copy so that extending below never alters the input cell
            list_values.append(list(numbers))
        elif list_values:
            # Continuation row: add its values to the variable opened last
            list_values[-1].extend(numbers)
        else:
            raise ValueError(
                f"row {label!r} holds values but no variable name precedes it"
            )

    # Create the dictionary with all extracted data: name as keys and values as values
    dict_result = dict(zip(list_variables, list_values))

    # Collapse trivial contents (only existing keys are reassigned, so the
    # dictionary size does not change while iterating)
    for key, value in dict_result.items():
        if len(value) == 0:
            # If no value is found, set to 0
            dict_result[key] = 0
        elif len(value) == 1:
            # If only one element is present, extract it from the value list
            dict_result[key] = value[0]

    return dict_result, list_variables, list_values