Skip to content

Commit

Permalink
fetch and insert all possible seqCol objects from one assembly read
Browse files Browse the repository at this point in the history
  • Loading branch information
waterflow80 committed Aug 2, 2023
1 parent 6097a92 commit cc12985
Show file tree
Hide file tree
Showing 9 changed files with 127 additions and 129 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import uk.ac.ebi.eva.evaseqcol.service.SeqColService;

import java.io.IOException;
import java.util.List;
import java.util.Optional;

@RequestMapping("/collection/admin")
Expand All @@ -27,16 +28,15 @@ public AdminController(SeqColService seqColService) {
}

/**
* Naming convention should be either ENA, GENBANK or UCSC */
@PutMapping(value = "/seqcols/{asmAccession}/{namingConvention}")
* Fetch and insert all possible seqCol objects given the assembly accession
* NOTE: "all possible" means with every naming convention that exists in the fetched assembly report. */
@PutMapping(value = "/seqcols/{asmAccession}")
public ResponseEntity<?> fetchAndInsertSeqColByAssemblyAccessionAndNamingConvention(
@PathVariable String asmAccession, @PathVariable String namingConvention) {
// TODO: REMOVE THE NAMING CONVENTION PATH VARIABLE AND MAKE IT GENERIC
@PathVariable String asmAccession) {
try {
Optional<String> level0Digest = seqColService.fetchAndInsertSeqColByAssemblyAccession(
asmAccession, SeqColEntity.NamingConvention.valueOf(namingConvention));
List<String> level0Digests = seqColService.fetchAndInsertAllSeqColByAssemblyAccession(asmAccession);
return new ResponseEntity<>(
"Successfully inserted seqCol for assemblyAccession " + asmAccession + "\nDigest=" + level0Digest.get()
"Successfully inserted seqCol object(s) for assembly accession " + asmAccession + "\nSeqCol digests=" + level0Digests
, HttpStatus.OK);
} catch (IllegalArgumentException e) {
return new ResponseEntity<>(e.getMessage(), HttpStatus.BAD_REQUEST);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,123 +9,59 @@
import uk.ac.ebi.eva.evaseqcol.entities.AssemblySequenceEntity;
import uk.ac.ebi.eva.evaseqcol.entities.SeqColEntity;
import uk.ac.ebi.eva.evaseqcol.entities.SeqColExtendedDataEntity;
import uk.ac.ebi.eva.evaseqcol.entities.SeqColLevelOneEntity;
import uk.ac.ebi.eva.evaseqcol.digests.DigestCalculator;
import uk.ac.ebi.eva.evaseqcol.refget.ChecksumCalculator;
import uk.ac.ebi.eva.evaseqcol.refget.MD5Calculator;
import uk.ac.ebi.eva.evaseqcol.refget.SHA512Calculator;
import uk.ac.ebi.eva.evaseqcol.utils.JSONLevelOne;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

@Repository("NCBISeqColDataSource")
public class NCBISeqColDataSource implements SeqColDataSource{

private final Logger logger = LoggerFactory.getLogger(NCBISeqColDataSource.class);

private final NCBIAssemblyDataSource assemblyDataSource;
private final NCBIAssemblySequenceDataSource assemblySequenceDataSource;
private DigestCalculator digestCalculator = new DigestCalculator();
private ChecksumCalculator sha512Calculator = new SHA512Calculator();
private ChecksumCalculator md5Caclculator = new MD5Calculator();

@Autowired
public NCBISeqColDataSource(NCBIAssemblyDataSource assemblyDataSource,
NCBIAssemblySequenceDataSource assemblySequenceDataSource
) {
this.assemblyDataSource = assemblyDataSource;
this.assemblySequenceDataSource = assemblySequenceDataSource;
}
this.assemblySequenceDataSource = assemblySequenceDataSource;}

@Override
/**
* Download both the Assembly Report and the Sequences FASTA file for the given accession
* and return the seqCol extended data list: names, lengths and sequences */
public Optional<List<SeqColExtendedDataEntity>> getSeqColExtendedDataListByAccession(
String accession, SeqColEntity.NamingConvention namingConvention) throws IOException {
* and return a Map with the following content:
* {
* "sameValueAttributes" : [extendedLengths, extendedSequences, extendedMd5Sequences],
* "namesAttributes" : [extendedNames1, extendedNames2, ...]
* }
* The "sameValueAttributes" are the attributes that have the same value across multiple seqCol for the same assembly
* accession.
* The "namesAttributes" entry holds the lists of sequences' names, one list for each possible naming convention.*/
public Optional<Map<String, List<SeqColExtendedDataEntity>>> getAllPossibleSeqColExtendedData(String accession) throws IOException {
Map<String, List<SeqColExtendedDataEntity>> seqColResultData = new HashMap<>();
Optional<AssemblyEntity> assemblyEntity = assemblyDataSource.getAssemblyByAccession(accession);
if (!assemblyEntity.isPresent()) {
logger.error("Could not fetch Assembly Report from NCBI for accession: " + accession);
logger.error("Could not fetch Assembly Report from NCBI for assembly accession: " + accession);
return Optional.empty();
} else if (!(assemblyEntity.get().getChromosomes() != null && assemblyEntity.get().getChromosomes().size() > 0)) {
} else if (!(assemblyEntity.get().getChromosomes() != null && !assemblyEntity.get().getChromosomes().isEmpty())) {
logger.error("No chromosome in assembly " + accession + ". Aborting");
return Optional.empty();
}
Optional<AssemblySequenceEntity> sequenceEntity = assemblySequenceDataSource.getAssemblySequencesByAccession(accession);
if (!sequenceEntity.isPresent()) {
logger.error("Could not fetch Sequences FASTA file from NCBI for accession: " + accession);
return Optional.empty();
}
List<SeqColExtendedDataEntity> extendedDataEntities = constructExtendedSeqColDataList(
assemblyEntity.get(), sequenceEntity.get(), namingConvention, accession);
return Optional.of(extendedDataEntities);
}

@Override
/**
* Download both the Assembly Report and the Sequences FASTA file for the given accession
* and return the seqCol Level one entity for the given naming convention*/
public Optional<SeqColLevelOneEntity> getSeqColL1ByAssemblyAccession(
String accession, SeqColEntity.NamingConvention namingConvention) throws IOException {
Optional<AssemblyEntity> assemblyEntity = assemblyDataSource.getAssemblyByAccession(accession);
if (!assemblyEntity.isPresent()) {
logger.error("Could not fetch Assembly Report from NCBI for accession: " + accession);
return Optional.empty();
}
Optional<AssemblySequenceEntity> sequenceEntity = assemblySequenceDataSource.getAssemblySequencesByAccession(accession);
if (!sequenceEntity.isPresent()) {
logger.error("Could not fetch Sequences FASTA file from NCBI for accession: " + accession);
logger.error("Could not fetch Sequences FASTA file from NCBI for assembly accession: " + accession);
return Optional.empty();
}
List<SeqColExtendedDataEntity> extendedDataEntities = constructExtendedSeqColDataList(
assemblyEntity.get(), sequenceEntity.get(), namingConvention, accession);
SeqColLevelOneEntity levelOneEntity = constructSeqColLevelOne(extendedDataEntities, namingConvention);
return Optional.of(levelOneEntity);
}

/**
 * Assemble a seqCol level 1 entity from the given extended (level 2) data entities.
 * The digest of each extended entity is stored in the level 1 JSON object under the slot
 * matching its attribute type (names, lengths, sequences or md5DigestsOfSequences).
 * The entity's own digest is then computed with {@code digestCalculator.getSha512Digest}
 * over the entity's string form, and the naming convention is recorded last.
 *
 * @param extendedDataEntities extended data entities providing the attribute digests
 * @param convention naming convention to record on the resulting entity
 * @return the populated level 1 entity
 * @throws IOException declared by the original signature
 */
public SeqColLevelOneEntity constructSeqColLevelOne(List<SeqColExtendedDataEntity> extendedDataEntities,
        SeqColEntity.NamingConvention convention) throws IOException {
    SeqColLevelOneEntity result = new SeqColLevelOneEntity();
    JSONLevelOne levelOneJson = new JSONLevelOne();
    for (SeqColExtendedDataEntity extendedData : extendedDataEntities) {
        // Attribute types are mutually exclusive, so the order of the cases is irrelevant.
        switch (extendedData.getAttributeType()) {
            case names:
                levelOneJson.setNames(extendedData.getDigest());
                break;
            case lengths:
                levelOneJson.setLengths(extendedData.getDigest());
                break;
            case md5DigestsOfSequences:
                levelOneJson.setMd5DigestsOfSequences(extendedData.getDigest());
                break;
            case sequences:
                levelOneJson.setSequences(extendedData.getDigest());
                break;
        }
    }
    result.setSeqColLevel1Object(levelOneJson);
    // The digest is taken from the string form before the digest and convention are set on the entity.
    result.setDigest(digestCalculator.getSha512Digest(result.toString()));
    result.setNamingConvention(convention);
    return result;
}

/**
* Return the 3 extended data objects (names, lengths and sequences) of the given naming convention*/
public List<SeqColExtendedDataEntity> constructExtendedSeqColDataList(AssemblyEntity assemblyEntity, AssemblySequenceEntity assemblySequenceEntity,
SeqColEntity.NamingConvention convention, String assemblyAccession) throws IOException {
// Sorting the chromosomes' list (assemblyEntity) and the sequences' list (sequencesEntity) in the same order
return Arrays.asList(
SeqColExtendedDataEntity.constructSeqColSequencesObject(assemblySequenceEntity),
SeqColExtendedDataEntity.constructSeqColSequencesMd5Object(assemblySequenceEntity),
SeqColExtendedDataEntity.constructSeqColNamesObject(assemblyEntity, convention),
SeqColExtendedDataEntity.constructSeqColLengthsObject(assemblyEntity)
);
seqColResultData.put(
"sameValueAttributes",
SeqColExtendedDataEntity.constructSameValueExtendedSeqColData(assemblyEntity.get(), sequenceEntity.get()));
seqColResultData.put(
"namesAttributes",
SeqColExtendedDataEntity.constructAllPossibleExtendedNamesSeqColData(assemblyEntity.get()));
return Optional.of(seqColResultData);
}

}
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
package uk.ac.ebi.eva.evaseqcol.datasource;

import uk.ac.ebi.eva.evaseqcol.entities.SeqColEntity;
import uk.ac.ebi.eva.evaseqcol.entities.SeqColExtendedDataEntity;
import uk.ac.ebi.eva.evaseqcol.entities.SeqColLevelOneEntity;
import uk.ac.ebi.eva.evaseqcol.entities.SeqColLevelTwoEntity;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Optional;


/**
 * A source of seqCol data that is looked up by assembly accession.
 */
public interface SeqColDataSource {

    /**
     * Retrieve the seqCol level one entity of an assembly for a single naming convention.
     *
     * @param accession assembly accession to look up
     * @param namingConvention naming convention to use for the sequences' names
     * @return the level one entity; implementations are expected to return an empty
     *         {@link Optional} when the underlying data cannot be retrieved
     * @throws IOException if retrieving the underlying data fails
     */
    Optional<SeqColLevelOneEntity> getSeqColL1ByAssemblyAccession(
            String accession, SeqColEntity.NamingConvention namingConvention) throws IOException;

    /**
     * Retrieve the extended data needed to build every seqCol object of an assembly.
     *
     * @param accession assembly accession to look up
     * @return a map of extended data entity lists keyed by attribute group; implementations
     *         are expected to return an empty {@link Optional} when the underlying data
     *         cannot be retrieved
     * @throws IOException if retrieving the underlying data fails
     */
    Optional<Map<String, List<SeqColExtendedDataEntity>>> getAllPossibleSeqColExtendedData(String accession) throws IOException;
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
import javax.persistence.Id;
import javax.persistence.Table;
import javax.persistence.Transient;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

Expand All @@ -40,6 +41,11 @@ public class SeqColExtendedDataEntity {
@Transient
private AttributeType attributeType;

@Transient
// This is needed when constructing multiple seqCol objects from the datasource to
// identify the naming convention used for the sequences.
private SeqColEntity.NamingConvention namingConvention;

public enum AttributeType {
names, sequences, md5DigestsOfSequences, lengths
}
Expand All @@ -56,9 +62,11 @@ public SeqColExtendedDataEntity setExtendedSeqColData(JSONExtData object) {

/**
* Return the seqCol names array object*/
public static SeqColExtendedDataEntity constructSeqColNamesObject(AssemblyEntity assemblyEntity, SeqColEntity.NamingConvention convention) throws IOException {
public static SeqColExtendedDataEntity constructSeqColNamesObjectByNamingConvention(
AssemblyEntity assemblyEntity, SeqColEntity.NamingConvention convention) {
SeqColExtendedDataEntity seqColNamesObject = new SeqColExtendedDataEntity().setAttributeType(
SeqColExtendedDataEntity.AttributeType.names);
seqColNamesObject.setNamingConvention(convention);
JSONExtData seqColNamesArray = new JSONExtData();
List<String> namesList = new LinkedList<>();

Expand All @@ -84,7 +92,7 @@ public static SeqColExtendedDataEntity constructSeqColNamesObject(AssemblyEntity

/**
* Return the seqCol lengths array object*/
public static SeqColExtendedDataEntity constructSeqColLengthsObject(AssemblyEntity assemblyEntity) throws IOException {
public static SeqColExtendedDataEntity constructSeqColLengthsObject(AssemblyEntity assemblyEntity) {
SeqColExtendedDataEntity seqColLengthsObject = new SeqColExtendedDataEntity().setAttributeType(
SeqColExtendedDataEntity.AttributeType.lengths);
JSONExtData seqColLengthsArray = new JSONExtData();
Expand All @@ -102,7 +110,7 @@ public static SeqColExtendedDataEntity constructSeqColLengthsObject(AssemblyEnti

/**
* Return the seqCol sequences array object*/
public static SeqColExtendedDataEntity constructSeqColSequencesObject(AssemblySequenceEntity assemblySequenceEntity) throws IOException {
public static SeqColExtendedDataEntity constructSeqColSequencesObject(AssemblySequenceEntity assemblySequenceEntity) {
SeqColExtendedDataEntity seqColSequencesObject = new SeqColExtendedDataEntity().setAttributeType(
SeqColExtendedDataEntity.AttributeType.sequences);
JSONExtData seqColSequencesArray = new JSONExtData();
Expand All @@ -120,7 +128,7 @@ public static SeqColExtendedDataEntity constructSeqColSequencesObject(AssemblySe

/**
* Return the seqCol md5DigestsOfSequences array object*/
public static SeqColExtendedDataEntity constructSeqColSequencesMd5Object(AssemblySequenceEntity assemblySequenceEntity) throws IOException {
public static SeqColExtendedDataEntity constructSeqColSequencesMd5Object(AssemblySequenceEntity assemblySequenceEntity) {
SeqColExtendedDataEntity seqColSequencesObject = new SeqColExtendedDataEntity().setAttributeType(
AttributeType.md5DigestsOfSequences);
JSONExtData seqColSequencesArray = new JSONExtData();
Expand All @@ -135,4 +143,40 @@ public static SeqColExtendedDataEntity constructSeqColSequencesMd5Object(Assembl
seqColSequencesObject.setDigest(sha512Calculator.calculateChecksum(seqColSequencesArray.toString()));
return seqColSequencesObject;
}

/**
 * Build the extended data entities whose values are shared by every seqCol object created
 * for the same assembly accession: "sequences", "md5Sequences" and "lengths".
 *
 * @param assemblyEntity assembly used to build the "lengths" entity
 * @param assemblySequenceEntity sequences used to build the "sequences" and "md5Sequences" entities
 * @return a fixed-size list holding, in order, the sequences, md5 sequences and lengths entities
 */
public static List<SeqColExtendedDataEntity> constructSameValueExtendedSeqColData(
        AssemblyEntity assemblyEntity, AssemblySequenceEntity assemblySequenceEntity) {
    SeqColExtendedDataEntity sequences = constructSeqColSequencesObject(assemblySequenceEntity);
    SeqColExtendedDataEntity md5Sequences = constructSeqColSequencesMd5Object(assemblySequenceEntity);
    SeqColExtendedDataEntity lengths = constructSeqColLengthsObject(assemblyEntity);
    return Arrays.asList(sequences, md5Sequences, lengths);
}

/**
 * Build one "names" extended data entity for each naming convention that can be extracted
 * from the given assembly. A naming convention is considered present when the first
 * chromosome of the assembly has a non-null value for it: the ENA sequence name for ENA,
 * the INSDC accession for GENBANK and the UCSC name for UCSC.
 *
 * @param assemblyEntity assembly whose chromosomes provide the sequences' names
 * @return the names entities, one per detected naming convention, in ENA, GENBANK, UCSC order;
 *         an empty list when the assembly has no chromosomes
 */
public static List<SeqColExtendedDataEntity> constructAllPossibleExtendedNamesSeqColData(AssemblyEntity assemblyEntity) {
    List<SeqColExtendedDataEntity> allPossibleExtendedNamesData = new ArrayList<>();
    // Detection inspects the first chromosome only; without one, get(0) would throw.
    if (assemblyEntity.getChromosomes() == null || assemblyEntity.getChromosomes().isEmpty()) {
        return allPossibleExtendedNamesData;
    }

    List<SeqColEntity.NamingConvention> existingNamingConventions = new ArrayList<>();
    if (assemblyEntity.getChromosomes().get(0).getEnaSequenceName() != null) {
        existingNamingConventions.add(SeqColEntity.NamingConvention.ENA);
    }
    if (assemblyEntity.getChromosomes().get(0).getInsdcAccession() != null) {
        existingNamingConventions.add(SeqColEntity.NamingConvention.GENBANK);
    }
    if (assemblyEntity.getChromosomes().get(0).getUcscName() != null) {
        existingNamingConventions.add(SeqColEntity.NamingConvention.UCSC);
    }

    for (SeqColEntity.NamingConvention convention : existingNamingConventions) {
        SeqColExtendedDataEntity extendedNamesEntity = constructSeqColNamesObjectByNamingConvention(assemblyEntity, convention);
        extendedNamesEntity.setNamingConvention(convention);
        allPossibleExtendedNamesData.add(extendedNamesEntity);
    }
    return allPossibleExtendedNamesData;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@
import uk.ac.ebi.eva.evaseqcol.entities.SeqColExtendedDataEntity;
import uk.ac.ebi.eva.evaseqcol.exception.ExtendedDataNotFoundException;
import uk.ac.ebi.eva.evaseqcol.exception.SeqColNotFoundException;
import uk.ac.ebi.eva.evaseqcol.refget.ChecksumCalculator;
import uk.ac.ebi.eva.evaseqcol.refget.MD5Calculator;
import uk.ac.ebi.eva.evaseqcol.refget.SHA512Calculator;
import uk.ac.ebi.eva.evaseqcol.repo.SeqColExtendedDataRepository;

import java.io.IOException;
Expand All @@ -26,9 +23,6 @@ public class SeqColExtendedDataService {
@Autowired
private SeqColExtendedDataRepository repository;

private ChecksumCalculator sha512Calculator = new SHA512Calculator();
private ChecksumCalculator md5Calculator = new MD5Calculator();

/**
* Add a seqCol's attribute; names, lengths or sequences, to the database*/
public Optional<SeqColExtendedDataEntity> addSeqColExtendedData(SeqColExtendedDataEntity seqColExtendedData){
Expand Down Expand Up @@ -83,7 +77,7 @@ public List<SeqColExtendedDataEntity> constructExtendedSeqColDataList(AssemblyEn
return Arrays.asList(
SeqColExtendedDataEntity.constructSeqColSequencesObject(assemblySequenceEntity),
SeqColExtendedDataEntity.constructSeqColSequencesMd5Object(assemblySequenceEntity),
SeqColExtendedDataEntity.constructSeqColNamesObject(assemblyEntity, convention),
SeqColExtendedDataEntity.constructSeqColNamesObjectByNamingConvention(assemblyEntity, convention),
SeqColExtendedDataEntity.constructSeqColLengthsObject(assemblyEntity)
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public List<SeqColLevelOneEntity> getAllSeqColLevelOneObjects(){
* Construct a seqCol level 1 entity out of three seqCol level 2 entities that
* hold names, lengths and sequences objects*/
public SeqColLevelOneEntity constructSeqColLevelOne(List<SeqColExtendedDataEntity> extendedDataEntities,
SeqColEntity.NamingConvention convention) throws IOException {
SeqColEntity.NamingConvention convention) throws IOException {
SeqColLevelOneEntity levelOneEntity = new SeqColLevelOneEntity();
JSONLevelOne jsonLevelOne = new JSONLevelOne();
for (SeqColExtendedDataEntity dataEntity: extendedDataEntities) {
Expand Down
Loading

0 comments on commit cc12985

Please sign in to comment.