Skip to content

Commit

Permalink
Merge pull request #208 from julianu/main
Browse files (browse the repository at this point in the history)
Various small fixes
  • Loading branch information
julianu authored Jul 11, 2024
2 parents a17b10c + 0be2a0d commit bcffe82
Show file tree
Hide file tree
Showing 3 changed files with 7,939 additions and 1,231 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,7 +4,7 @@

<groupId>de.mpc.pia</groupId>
<artifactId>pia</artifactId>
<version>1.5.2</version>
<version>1.5.3</version>
<name>PIA - Protein Inference Algorithms</name>
<url>https://github.com/mpc-bioinformatics/pia</url>

Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,7 +12,8 @@
import de.mpc.pia.tools.obo.OBOMapper;

import org.apache.commons.text.StringEscapeUtils;
import org.apache.log4j.Logger;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.biojava.nbio.ontology.Term;
import org.biojava.nbio.ontology.Triple;
import uk.ac.ebi.jmzidml.model.mzidml.*;
Expand All @@ -35,7 +36,7 @@
class MzIdentMLFileParser {

/** logger for this class */
private static final Logger LOGGER = Logger.getLogger(MzIdentMLFileParser.class);
private static final Logger LOGGER = LogManager.getLogger();


/** the used PIA compiler */
Expand Down Expand Up @@ -136,10 +137,10 @@ private boolean parseFile(String name, String fileName) {
// get the AnalysisCollection:SpectrumIdentification for the SpectrumIdentificationLists
AnalysisCollection analysisCollection = unmarshaller.unmarshal(AnalysisCollection.class);

LOGGER.debug("scanning analysisCollection: " + analysisCollection
+ "\n\tgetSpectrumIdentification " + analysisCollection.getSpectrumIdentification()
+ "\n\tgetProteinDetection " + analysisCollection.getProteinDetection()
);
LOGGER.debug("scanning analysisCollection: {} "
+ "\n\tgetSpectrumIdentification {}"
+ "\n\tgetProteinDetection {}" ,
analysisCollection, analysisCollection.getSpectrumIdentification(), analysisCollection.getProteinDetection());

for (SpectrumIdentification si : analysisCollection.getSpectrumIdentification()) {
if (specIdLists.keySet().contains(si.getSpectrumIdentificationListRef())) {
Expand All @@ -153,8 +154,7 @@ private boolean parseFile(String name, String fileName) {
neededSpectraData.addAll(si.getInputSpectra().stream().map(InputSpectra::getSpectraDataRef).toList());
neededSearchDatabases.addAll(si.getSearchDatabaseRef().stream().map(SearchDatabaseRef::getSearchDatabaseRef).toList());
} else {
LOGGER.warn("file contains SpectrumIdentification ("
+ si.getId() + ") without SpectrumIdentificationList!");
LOGGER.warn("file contains SpectrumIdentification ({}) without SpectrumIdentificationList!", si.getId());
}
}

Expand Down Expand Up @@ -199,34 +199,20 @@ private boolean parseFile(String name, String fileName) {
dbSequences = new HashMap<>();
for (DBSequence dbSeq : sc.getDBSequence()) {
dbSequences.put(dbSeq.getId(), dbSeq);

LOGGER.debug("added dbSequence: " + dbSeq.getId() + " -> " + dbSequences.get(dbSeq.getId()));
}

// get/hash the SequenceCollection:Peptides
peptides = new HashMap<>();
for (uk.ac.ebi.jmzidml.model.mzidml.Peptide peptide: sc.getPeptide()) {
peptides.put(peptide.getId(), peptide);

LOGGER.debug("added peptide: " + peptide.getId()
+ " -> " + peptides.get(peptide.getId())
+ "\n\tpeptideSequence " + peptide.getPeptideSequence()
);
}

// get/hash the SequenceCollection:PeptideEvidences
peptideEvidences = new HashMap<>();
for (PeptideEvidence pepEvidence : sc.getPeptideEvidence()) {
peptideEvidences.put(pepEvidence.getId(), pepEvidence);

LOGGER.debug("added pepEvidence: " + pepEvidence.getId()
+ " -> " + peptideEvidences.get(pepEvidence.getId())
+ "\n\tdbSequenceRef " + pepEvidence.getDBSequenceRef()
+ "\n\tdbSequence " + pepEvidence.getDBSequence()
);
}


boolean ok = true;

// go through the SpectrumIdentificationList:SpectrumIdentificationResult:SpectrumIdentificationItem and build the PeptideSpectrumMatches, Accessions and Peptides
Expand All @@ -238,10 +224,10 @@ private boolean parseFile(String name, String fileName) {
}
}

LOGGER.info("inserted new: \n\t" +
pepNr + " peptides\n\t" +
specNr + " peptide spectrum matches\n\t" +
accNr + " accessions");
LOGGER.info("inserted new: \n"
+ "\t{} peptides\n"
+ "\t{} peptide spectrum matches\n"
+ "\t{} accessions", pepNr, specNr, accNr);
return ok;
}

Expand All @@ -258,15 +244,15 @@ private boolean createUnmarshaller(String name, String fileName) {
File mzidFile = new File(fileName);

if (!mzidFile.canRead()) {
LOGGER.error("could not read '" + fileName + "'.");
LOGGER.error("could not read '{}'.", fileName);
return false;
}

file = compiler.insertNewFile(name, fileName,
InputFileParserFactory.InputFileTypes.MZIDENTML_INPUT.getFileSuffix());

unmarshaller = new MzIdentMLUnmarshaller(mzidFile);
LOGGER.debug("Version of mzIdentML file: " + unmarshaller.getMzIdentMLVersion());
LOGGER.debug("Version of mzIdentML file: {}", unmarshaller.getMzIdentMLVersion());
return true;
}

Expand Down Expand Up @@ -336,9 +322,12 @@ private void checkEnzymeRegEx(Enzyme enzyme) {
getAndSetEnzymeRegexFromOBO(oboID, enzyme);
} else {
// TODO: parse the enzyme regex from a userParam
LOGGER.error("unsupported enzyme: " + param.getName() + " / " + param.getValue());
LOGGER.error("unsupported enzyme: {} / {}", param.getName(), param.getValue());
}
}
} else if ((enzyme.getSiteRegexp() != null) && enzyme.getSiteRegexp().contains(" ")) {
// if there are blanks in the regexp, remove them
enzyme.setSiteRegexp(enzyme.getSiteRegexp().replaceAll("\\s", ""));
}
}

Expand Down Expand Up @@ -407,7 +396,7 @@ private boolean addSpectrumIdentificationList(SpectrumIdentificationList specIDL
}

// go through all the SpectrumIdentificationResults and build the PSMs
LOGGER.debug("Processing " + specIDList.getSpectrumIdentificationResult().size() + " specIdResults");
LOGGER.debug("Processing {} specIdResults", specIDList.getSpectrumIdentificationResult().size());
boolean ok = true;
for (SpectrumIdentificationResult specIdRes : specIDList.getSpectrumIdentificationResult()) {
ok = addSpectrumIdentificationResult(specIdRes, spectrumID, specIDListsDBRefs, specIDListsEnzymes,
Expand Down Expand Up @@ -634,8 +623,7 @@ private boolean processSpectrumIdentificationItem(SpectrumIdentificationItem spe
processModification(mod, sequence, psm);
}
} else {
LOGGER.warn("no peptide for the peptide_ref " + specIdItem.getPeptideRef() +
" in the SequenceCollection -> can't get Modifications for it.");
LOGGER.warn("no peptide for the peptide_ref {} in the SequenceCollection -> can't get Modifications for it.", specIdItem.getPeptideRef());
}

// the PSM is finished here
Expand Down Expand Up @@ -679,14 +667,13 @@ private Peptide parseSIIPeptideEvidences(List<PeptideEvidenceRef> peptideEvidenc
PeptideEvidence pepEvidence = peptideEvidences.get(pepEvRef.getPeptideEvidenceRef());

if (pepEvidence == null) {
LOGGER.error("PeptideEvidence " + pepEvRef.getPeptideEvidenceRef() + " not found!");
LOGGER.error("PeptideEvidence {} not found!", pepEvRef.getPeptideEvidenceRef());
return null;
}

DBSequence dbSeq = dbSequences.get(pepEvidence.getDBSequenceRef());
if (dbSeq == null) {
LOGGER.error("DBSequence " + pepEvidence.getDBSequenceRef()
+ " for pepEvidence " + pepEvidence.getId() + " not found!");
LOGGER.error("DBSequence {} for pepEvidence {} not found!", pepEvidence.getDBSequenceRef(), pepEvidence.getId());
return null;
}

Expand All @@ -700,7 +687,7 @@ private Peptide parseSIIPeptideEvidences(List<PeptideEvidenceRef> peptideEvidenc
sequence = pepEvSequence;
} else {
if (!sequence.equals(pepEvSequence)) {
LOGGER.error("Different sequences found for a PSM: " + sequence + " != " + pepEvSequence);
LOGGER.error("Different sequences found for a PSM: {} != {}", sequence, pepEvSequence);
return null;
}
}
Expand Down Expand Up @@ -736,16 +723,15 @@ private static String getPeptideEvidenceSequence(Integer start, Integer end,
LOGGER.error("No peptide sequence found for a peptide!");
}

if ((proteinSequence != null) && (peptide != null) && proteinSequence.trim().length() > 0) {
if ((start != null) && (end != null) && (proteinSequence != null) && (peptide != null) && proteinSequence.trim().length() > 0) {
// some exporters get the start and stop of sequences wrong
if (start-1 < 0) {
start++;
}
String dbEvSeq = proteinSequence.substring(start-1, end);

if ((dbEvSeq != null) && !dbEvSeq.equals(pepEvSequence)) {
LOGGER.warn("PSM sequence fromSearchDB differs to sequence from Peptide: " +
dbEvSeq + " != " + pepEvSequence + ". Only sequence from Peptide is used.");
LOGGER.warn("PSM sequence fromSearchDB differs to sequence from Peptide: {} != {}. Only sequence from Peptide is used.", dbEvSeq, pepEvSequence);
}
}

Expand Down Expand Up @@ -782,10 +768,10 @@ private Accession addAccessionInformationFromPeptideEvidence(DBSequence dbSeq, S
if (proteinSequence != null) {
if ((acc.getDbSequence() != null) &&
!proteinSequence.equals(acc.getDbSequence())) {
LOGGER.warn("Different DBSequences found for same Accession, this is not suported!\n" +
"\t Accession: " + acc.getAccession() +
'\t' + dbSeq.getSeq() + '\n' +
'\t' + acc.getDbSequence());
LOGGER.warn("Different DBSequences found for same Accession, this is not suported!\n"
+ "\tAccession: {}"
+ "\t{}\n"
+ "\t!= {}", acc.getAccession(), dbSeq.getSeq(), acc.getDbSequence());
} else if (acc.getDbSequence() == null) {
// found a sequence now
acc.setDbSequence(proteinSequence);
Expand Down Expand Up @@ -1084,7 +1070,7 @@ private static boolean addScoreFromParam(PeptideSpectrumMatch psm, UserParam use
*/
public static boolean checkFileType(String fileName) {
boolean isMzIdentMLFile = false;
LOGGER.debug("checking whether this is an mzIdentML file: " + fileName);
LOGGER.debug("checking whether this is an mzIdentML file: {}", fileName);

try (Stream<String> stream = Files.lines(Paths.get(fileName))) {
// read in the first 10, not empty lines
Expand All @@ -1096,24 +1082,27 @@ public static boolean checkFileType(String fileName) {
int idx = 0;

// optional declaration
if (lines.get(idx).trim().matches("<\\?xml version=\"[0-9.]+\"( encoding=\"[^\"]+\"){0,1}( standalone=\\\"[^\\\"]+\\\"){0,1}\\?>")) {
LOGGER.debug("file has the XML declaration line:" + lines.get(idx));
String line = lines.get(idx);
if (line.trim().matches("<\\?xml version=\"[0-9.]+\"( encoding=\"[^\"]+\")?( standalone=\\\"[^\\\"]+\\\")?\\?>")) {
LOGGER.debug("file has the XML declaration line: {}", line);
idx++;
}

// optional stylesheet declaration
if (lines.get(idx).trim().matches("<\\?xml-stylesheet.+\\?>")) {
LOGGER.debug("file has the XML stylesheet line:" + lines.get(idx));
line = lines.get(idx);
if (line.trim().matches("<\\?xml-stylesheet.+\\?>")) {
LOGGER.debug("file has the XML stylesheet line: {}", line);
idx++;
}

// now the MzIdentML element must be next
if (lines.get(idx).trim().matches("<MzIdentML .+")) {
line = lines.get(idx);
if (line.trim().matches("<MzIdentML .+")) {
isMzIdentMLFile = true;
LOGGER.debug("file has the MzIdentML element: " + lines.get(idx));
LOGGER.debug("file has the MzIdentML element: {}", line);
}
} catch (Exception e) {
LOGGER.error("Could not check file " + fileName, e);
LOGGER.error("Could not check file {}", fileName, e);
}

return isMzIdentMLFile;
Expand Down
Loading

0 comments on commit bcffe82

Please sign in to comment.