Skip to content

Commit

Permalink
Added error handling for data content errors relating to run order correction
Browse files Browse the repository at this point in the history
  • Loading branch information
carolinesands committed Feb 9, 2024
1 parent 3f78659 commit 5f585c9
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 178 deletions.
16 changes: 15 additions & 1 deletion nPYc/batchAndROCorrection/_batchAndROCorrection.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from datetime import datetime, timedelta
from ..objects._msDataset import MSDataset
from ..enumerations import AssayRole, SampleType
from ..utilities._errorHandling import npycToolboxError


def correctMSdataset(data,
Expand Down Expand Up @@ -60,9 +61,18 @@ def correctMSdataset(data,
raise TypeError("parallelise must be a boolean")
if not isinstance(excludeFailures, bool):
raise TypeError("excludeFailures must be a boolean")
if not isinstance(correctionSampleType,SampleType):
if not isinstance(correctionSampleType, SampleType):
raise TypeError("correctionType must be a SampleType")

# Input data checks
if ('Acquired Time' not in data.sampleMetadata.columns) and ('Run Order' not in data.sampleMetadata.columns):
raise npycToolboxError(
'Unable to run batch and run order correction without `Run Order` or `Acquired Time` columns in `dataset.sampleMetadata`')

if 'Correction Batch' not in data.sampleMetadata.columns:
raise npycToolboxError(
'Unable to run batch and run order correction without `dataset.sampleMetadata[`Correction Batch`]`, add manually or run `inferBatches`')

# Define the samples to be corrected (only corrected if have value in 'Correction Batch' and not listed for
# exclusion in 'samplesNotCorrected'
samplesForCorrection = data.sampleMetadata['Correction Batch'].values.astype(float)
Expand All @@ -75,6 +85,10 @@ def correctMSdataset(data,
except KeyError:
raise KeyError('data.Attributes[\'samplesNotCorrected\'] must contain valid SampleType/AssayRole enumeration entries')

# Check Run Order available for all samples to be corrected and raise error if not
if numpy.any(numpy.isnan(data.sampleMetadata.loc[~numpy.isnan(samplesForCorrection), 'Run Order'])):
raise npycToolboxError("Unable to run batch and run order correction without `dataset.sampleMetadata[`Run Order`]` info for ALL samples")

with warnings.catch_warnings():
warnings.simplefilter('ignore', category=RuntimeWarning)

Expand Down
13 changes: 6 additions & 7 deletions nPYc/objects/_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
from ..utilities import removeDuplicateColumns
from ..utilities import normalisation
from ..utilities.normalisation._normaliserABC import Normaliser
from ..utilities._errorHandling import npycToolboxError
import warnings
from IPython.display import display


class Dataset:
Expand Down Expand Up @@ -1246,9 +1246,9 @@ def _matchBasicCSV(self, filePath):
# If Acquired Time column is in the CSV file, reformat data to allow operations on timestamps and timedeltas,
# which are used in some plotting functions
if 'Acquired Time' in csvData:
csv_datetime = pandas.to_datetime(csvData['Acquired Time'], errors='ignore')
csv_datetime = pandas.to_datetime(csvData['Acquired Time'], errors='coerce') # errors='ignore'
csv_datetime = csv_datetime.dt.strftime('%d-%b-%Y %H:%M:%S')
csvData['Acquired Time'] = csv_datetime.apply(lambda x: datetime.strptime(x, '%d-%b-%Y %H:%M:%S')).astype('O')
csvData['Acquired Time'] = csv_datetime.apply(lambda x: datetime.strptime(x, '%d-%b-%Y %H:%M:%S') if isinstance(x, str) else None).astype('O')

# Left join, without sort, so the intensityData matrix and the sample Masks are kept in order
# Preserve information about sample mask alongside merge even on the case of samples missing from CSV file.
Expand Down Expand Up @@ -1314,13 +1314,12 @@ def _matchBasicCSV(self, filePath):
# If not in the new CSV, but previously there, keep it and don't mask
if len(metadataNotAvailable) > 0:
joinedTable.loc[metadataNotAvailable, 'Metadata Available'] = False
# self.sampleMask[metadataNotAvailable] = False
# joinedTable.loc[metadataNotAvailable, 'Exclusion Details'] = 'No Metadata in CSV'

# Print warning that samples should be added to basic CSV or excluded from dataset
print('The following samples should be added to "Basic CSV" file, or excluded from dataset or nPYc-Toolbox functionality may be compromised:')
raise npycToolboxError('The following samples should be added to "Basic CSV" file, or excluded from dataset else nPYc-Toolbox functionality may be compromised: ',
table=acquired_butnotcsv)
#display(acquired_butnotcsv)
print(*acquired_butnotcsv['Sample File Name'].values, sep='\n')
#print(*acquired_butnotcsv['Sample File Name'].values, sep='\n')

# 1) ACQ and in "include Sample" - drop and set mask to false
# Samples Not ACQ and in "include Sample" set to False - drop and ignore from the dataframe
Expand Down
203 changes: 64 additions & 139 deletions nPYc/objects/_msDataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from ..utilities._filters import blankFilter
from ..utilities.normalisation._normaliserABC import Normaliser
from ..utilities.normalisation._nullNormaliser import NullNormaliser
from ..utilities._errorHandling import npycToolboxError


class MSDataset(Dataset):
Expand Down Expand Up @@ -548,6 +549,7 @@ def addSampleInfo(self, descriptionFormat=None, filePath=None, filenameSpec=None
* **'ISATAB'** ISATAB study designs
* **'Filenames'** Parses sample information out of the filenames, based on the named capture groups in the regex passed in *filenamespec*
* **'Basic CSV'** Joins the :py:attr:`sampleMetadata` table with the data in the ``csv`` file at *filePath=*, matching on the 'Sample File Name' column in both.
* **'Infer batches'** Infers the sample batch based on Acquired Time/Run Order
:param str descriptionFormat: Format of metadata to be added
:param str filePath: Path to the additional data to be added
Expand All @@ -560,11 +562,12 @@ def addSampleInfo(self, descriptionFormat=None, filePath=None, filenameSpec=None
if filenameSpec is None: # Use spec from SOP
filenameSpec = self.Attributes['filenameSpec']
self._getSampleMetadataFromFilename(filenameSpec)
elif descriptionFormat == 'Infer Batches':
self._inferBatches()
else:
super().addSampleInfo(descriptionFormat=descriptionFormat, filePath=filePath,
filetype=filetype, filenameSpec=filenameSpec, **kwargs)
if descriptionFormat in ['Filenames', 'Basic CSV', 'Raw Data']:
self._inferBatches()


def _loadQIDataset(self, path):

Expand Down Expand Up @@ -1291,6 +1294,7 @@ def _getSampleMetadataFromRawData(self, rawDataPath, filetype="Waters .raw"):
# Print warning that samples are missing info, raw files should be located or samples should be excluded from dataset
missingSampleInfo = ~self.sampleMetadata['Sample File Name'].isin(instrumentParams['Sample File Name'])
if sum(missingSampleInfo) > 0:
# CAZ TODO raise error if samples missing from raw data folder
print('Raw data for the following samples should be added to the raw data folder, or samples should be excluded from dataset else nPYc-Toolbox functionality may be compromised:\n')
print(*self.sampleMetadata.loc[missingSampleInfo, 'Sample File Name'].values, sep='\n')

Expand Down Expand Up @@ -1436,158 +1440,79 @@ def _inferBatches(self, gapLength=24):
"""
sampleMetadata = self.sampleMetadata.copy()

# Try to infer batches from 'Acquired Time' (preference) or 'Run Order'
try:
# Initial checks and set up

# Check `Acquired Time` or `Run Order` available
if ('Acquired Time' not in sampleMetadata.columns) and ('Run Order' not in sampleMetadata.columns):
raise npycToolboxError('Unable to infer batches without `Run Order` or `Acquired Time` columns in dataset.sampleMetadata')

# Infer batches from `Acquired Time` (preference) or `Run Order`
if 'Acquired Time' in sampleMetadata.columns:
usefield = 'Acquired Time'

# Generate sampleMetadata sorted by run order
if ('Run Order' not in sampleMetadata.columns):
if 'Run Order' not in sampleMetadata.columns:
sampleMetadata['Order'] = sampleMetadata.sort_values(by='Acquired Time').index
sampleMetadata['Run Order'] = sampleMetadata.sort_values(by='Order').index
sampleMetadata.drop('Order', axis=1, inplace=True)

sortedSampleMetadata = sampleMetadata.sort_values(by='Run Order')

# Use 'Acquired Time'(preference) or 'Run Order'
if ('Acquired Time' in sampleMetadata.columns):
usefield = 'Acquired Time'
else:
usefield = 'Run Order'

else:
usefield = 'Run Order'

# Check `Acquired Time` or `Run Order` information available for all samples
if numpy.any(sampleMetadata[usefield].isnull()):
raise npycToolboxError("Unable to run batch and run order correction without `sampleMetadata[`" + usefield + "`]` info for ALL samples")

# Set first batch
sampleMetadata['Correction Batch'] = 1
sampleMetadata['Batch'] = 1
# Generate sampleMetadata sorted by run order
sortedSampleMetadata = sampleMetadata.sort_values(by='Run Order')

# Calculate the consecutive time differences
timeDelta = sortedSampleMetadata[usefield].diff()
# Set first batch
sampleMetadata['Correction Batch'] = 1
sampleMetadata['Batch'] = 1

# Calculate the consecutive time differences
timeDelta = sortedSampleMetadata[usefield].diff()
if usefield == 'Acquired Time':
batchTimeSplits = [sortedSampleMetadata.loc[idx, usefield] for idx, x in
sortedSampleMetadata.iterrows() if timeDelta.loc[idx] > timedelta(hours=gapLength)]
batchTimeSplits.extend([sortedSampleMetadata[usefield].max()])
batchNumber = 1

for idx, batchSplit in enumerate(batchTimeSplits):
currentBatchIndex = sampleMetadata[usefield] <= batchSplit
if idx > 0:
currentBatchIndex &= sampleMetadata[usefield] >= batchTimeSplits[idx - 1]
sampleMetadata.loc[currentBatchIndex, 'Correction Batch'] = batchNumber
sampleMetadata.loc[currentBatchIndex, 'Batch'] = batchNumber
batchNumber += 1

# Handle the 'Dilution Series' field
if sum(sampleMetadata['AssayRole'] == AssayRole.LinearityReference) > 0:
SRD_series = 1
previousDilutionRunOrder = sortedSampleMetadata.loc[
sortedSampleMetadata['AssayRole'] == AssayRole.LinearityReference, 'Run Order'].min()
previousBatch = 1
for idx, row in sortedSampleMetadata.loc[
sortedSampleMetadata['AssayRole'] == AssayRole.LinearityReference, :].iterrows():
if (row['Run Order'] - previousDilutionRunOrder > 1) or (row['Batch'] > previousBatch):
SRD_series += 1
sampleMetadata.loc[idx, 'Dilution Series'] = SRD_series
previousDilutionRunOrder = row['Run Order']
previousBatch = row['Batch']

# Method Reference, Dilution Series, and Blanks should have "Correction Batch" = nan
SamplesNoBatchCorrection = sampleMetadata['AssayRole'].isin([AssayRole.Blank, AssayRole.LinearityReference])
sampleMetadata.loc[SamplesNoBatchCorrection, 'Correction Batch'] = numpy.nan

# Handle cases where a first batch contains only blanks or pre-injection blanks.
if numpy.nanmin(sampleMetadata['Correction Batch']) > 1:
batchDiff = numpy.nanmin(sampleMetadata['Correction Batch']) - 1
sampleMetadata['Correction Batch'] -= batchDiff

self.sampleMetadata = sampleMetadata

except (AttributeError, KeyError, TypeError):
warnings.warn('Unable to infer batches without complete run order or acquired time info, skipping.')

"""
# If 'Acquired Time' data present
if ('Acquired Time' in sampleMetadata.columns) and (not sampleMetadata['Acquired Time'].isnull().all()):
else:
batchTimeSplits = [sortedSampleMetadata.loc[idx, usefield] for idx, x in
sortedSampleMetadata.iterrows() if timeDelta.loc[idx] > gapLength]
batchTimeSplits.extend([sortedSampleMetadata[usefield].max()])
batchNumber = 1

for idx, batchSplit in enumerate(batchTimeSplits):
currentBatchIndex = sampleMetadata[usefield] <= batchSplit
if idx > 0:
currentBatchIndex &= sampleMetadata[usefield] >= batchTimeSplits[idx - 1]
sampleMetadata.loc[currentBatchIndex, 'Correction Batch'] = batchNumber
sampleMetadata.loc[currentBatchIndex, 'Batch'] = batchNumber
batchNumber += 1

# Handle the 'Dilution Series' field
if sum(sampleMetadata['AssayRole'] == AssayRole.LinearityReference) > 0:
SRD_series = 1
previousDilutionRunOrder = sortedSampleMetadata.loc[
sortedSampleMetadata['AssayRole'] == AssayRole.LinearityReference, 'Run Order'].min()
previousBatch = 1
for idx, row in sortedSampleMetadata.loc[
sortedSampleMetadata['AssayRole'] == AssayRole.LinearityReference, :].iterrows():
if (row['Run Order'] - previousDilutionRunOrder > 1) or (row['Batch'] > previousBatch):
SRD_series += 1
sampleMetadata.loc[idx, 'Dilution Series'] = SRD_series
previousDilutionRunOrder = row['Run Order']
previousBatch = row['Batch']

if ('Run Order' not in sampleMetadata.columns):
sampleMetadata['Order'] = sampleMetadata.sort_values(by='Acquired Time').index
sampleMetadata['Run Order'] = sampleMetadata.sort_values(by='Order').index
sampleMetadata.drop('Order', axis=1, inplace=True)
# Method Reference, Dilution Series, and Blanks should have "Correction Batch" = nan
SamplesNoBatchCorrection = sampleMetadata['AssayRole'].isin([AssayRole.Blank, AssayRole.LinearityReference])
sampleMetadata.loc[SamplesNoBatchCorrection, 'Correction Batch'] = numpy.nan

sortedSampleMetadata = sampleMetadata.sort_values(by='Run Order')
sampleMetadata['Correction Batch'] = 1
sampleMetadata['Batch'] = 1
# Handle cases where a first batch contains only blanks or pre-injection blanks.
if numpy.nanmin(sampleMetadata['Correction Batch']) > 1:
batchDiff = numpy.nanmin(sampleMetadata['Correction Batch']) - 1
sampleMetadata['Correction Batch'] -= batchDiff

timeDelta = sortedSampleMetadata['Acquired Time'].diff()
self.sampleMetadata = sampleMetadata

batchTimeSplits = [sortedSampleMetadata.loc[idx, 'Acquired Time'] for idx, x in
sortedSampleMetadata.iterrows() if timeDelta.loc[idx] > timedelta(hours=gapLength)]
batchTimeSplits.extend([sortedSampleMetadata['Acquired Time'].max()])
batchNumber = 1
for idx, batchSplit in enumerate(batchTimeSplits):
currentBatchIndex = sampleMetadata['Acquired Time'] <= batchSplit
if idx > 0:
currentBatchIndex &= sampleMetadata['Acquired Time'] >= batchTimeSplits[idx - 1]
sampleMetadata.loc[currentBatchIndex, 'Correction Batch'] = batchNumber
sampleMetadata.loc[currentBatchIndex, 'Batch'] = batchNumber
batchNumber += 1
# Handle the 'Dilution Series' field
if sum(sampleMetadata['AssayRole'] == AssayRole.LinearityReference) > 0:
SRD_series = 1
previousDilutionRunOrder = sortedSampleMetadata.loc[
sortedSampleMetadata['AssayRole'] == AssayRole.LinearityReference, 'Run Order'].min()
previousBatch = 1
for idx, row in sortedSampleMetadata.loc[
sortedSampleMetadata['AssayRole'] == AssayRole.LinearityReference, :].iterrows():
if (row['Run Order'] - previousDilutionRunOrder > 1) or (row['Batch'] > previousBatch):
SRD_series += 1
sampleMetadata.loc[idx, 'Dilution Series'] = SRD_series
previousDilutionRunOrder = row['Run Order']
previousBatch = row['Batch']
# Method Reference, Dilution Series, and Blanks should have "Correction Batch" = nan
SamplesNoBatchCorrection = sampleMetadata['AssayRole'].isin([AssayRole.Blank, AssayRole.LinearityReference])
sampleMetadata.loc[SamplesNoBatchCorrection, 'Correction Batch'] = numpy.nan
# Handle cases where a first batch contains only blanks or pre-injection blanks.
if numpy.nanmin(sampleMetadata['Correction Batch']) > 1:
batchDiff = numpy.nanmin(sampleMetadata['Correction Batch']) - 1
sampleMetadata['Correction Batch'] -= batchDiff
self.sampleMetadata = sampleMetadata
elif ('Run Order' in sampleMetadata.columns) and (not sampleMetadata['Run Order'].isnull().all()):
sortedSampleMetadata = sampleMetadata.sort_values(by='Run Order')
sampleMetadata['Correction Batch'] = 1
sampleMetadata['Batch'] = 1
# Handle the 'Dilution Series' field
if sum(sampleMetadata['AssayRole'] == AssayRole.LinearityReference) > 0:
SRD_series = 1
previousDilutionRunOrder = sortedSampleMetadata.loc[
sortedSampleMetadata['AssayRole'] == AssayRole.LinearityReference, 'Run Order'].min()
previousBatch = 1
for idx, row in sortedSampleMetadata.loc[
sortedSampleMetadata['AssayRole'] == AssayRole.LinearityReference, :].iterrows():
if (row['Run Order'] - previousDilutionRunOrder > 1) or (row['Batch'] > previousBatch):
SRD_series += 1
sampleMetadata.loc[idx, 'Dilution Series'] = SRD_series
previousDilutionRunOrder = row['Run Order']
previousBatch = row['Batch']
# Method Reference, Dilution Series, and Blanks should have "Correction Batch" = nan
SamplesNoBatchCorrection = sampleMetadata['AssayRole'].isin(
[AssayRole.Blank, AssayRole.LinearityReference])
sampleMetadata.loc[SamplesNoBatchCorrection, 'Correction Batch'] = numpy.nan
# Handle cases where a first batch contains only blanks or pre-injection blanks.
if numpy.nanmin(sampleMetadata['Correction Batch']) > 1:
batchDiff = numpy.nanmin(sampleMetadata['Correction Batch']) - 1
sampleMetadata['Correction Batch'] -= batchDiff
self.sampleMetadata = sampleMetadata
"""

def amendBatches(self, sampleRunOrder):
"""
Expand Down
Loading

0 comments on commit 5f585c9

Please sign in to comment.