Skip to content

Commit

Permalink
EVA-3501 - adding new endpoint for parsing fasta file data (#80)
Browse files Browse the repository at this point in the history
* adding new endpoint for parsing fasta file data
  • Loading branch information
nitin-ebi authored Feb 23, 2024
1 parent a77f137 commit 07a38d8
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PutMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

Expand Down Expand Up @@ -76,4 +77,32 @@ public ResponseEntity<?> fetchAndInsertSeqColByAssemblyAccession(
return new ResponseEntity<>(e.getMessage(), HttpStatus.CONFLICT);
}
}

/**
 * Ingests seqCol objects built from FASTA content supplied in the request body.
 *
 * <p>Delegates to {@code seqColService.fetchAndInsertAllSeqColInFastaFile} and maps the outcome to an
 * HTTP status: 201 with the ingestion result on success, 500 on {@code IOException}, 409 on
 * {@code DuplicateSeqColException} or {@code AssemblyAlreadyIngestedException}, and 404 on
 * {@code AssemblyNotFoundException}. On failure the response body is the exception message.
 *
 * @param accession        accession read from the request path
 * @param fastaFileContent raw request body, passed to the service as the FASTA content
 * @return the response entity described above
 */
@Operation(summary = "Add new sequence collection objects",
        description = "Given FASTA file content, this endpoint will parse the content and use it to construct " +
                "seqCol objects with naming convention TEST and eventually save these seqCol objects into the database. " +
                "This is an authenticated endpoint, so it requires admin privileges to run it.")
@ApiResponses(value = {
        @ApiResponse(responseCode = "201", description = "seqCol object(s) successfully inserted"),
        @ApiResponse(responseCode = "409", description = "seqCol object(s) already exist(s)"),
        @ApiResponse(responseCode = "404", description = "Assembly not found"),
        @ApiResponse(responseCode = "400", description = "Bad request. (It can be a bad accession value)"),
        @ApiResponse(responseCode = "500", description = "Server Error")
})
@PutMapping(value = "/seqcols/fasta/{accession}")
public ResponseEntity<?> fetchAndInsertSeqColByParsingFastaFile(
        @PathVariable(value = "accession") String accession,
        @RequestBody String fastaFileContent) {
    try {
        IngestionResultEntity inserted =
                seqColService.fetchAndInsertAllSeqColInFastaFile(accession, fastaFileContent);
        return ResponseEntity.status(HttpStatus.CREATED).body(inserted);
    } catch (IOException ioFailure) {
        // Parsing or I/O failure while handling the FASTA content: reported as a server error.
        ioFailure.printStackTrace();
        return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(ioFailure.getMessage());
    } catch (DuplicateSeqColException duplicate) {
        return ResponseEntity.status(HttpStatus.CONFLICT).body(duplicate.getMessage());
    } catch (AssemblyNotFoundException notFound) {
        return ResponseEntity.status(HttpStatus.NOT_FOUND).body(notFound.getMessage());
    } catch (AssemblyAlreadyIngestedException alreadyIngested) {
        return ResponseEntity.status(HttpStatus.CONFLICT).body(alreadyIngested.getMessage());
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import uk.ac.ebi.eva.evaseqcol.entities.AssemblySequenceEntity;
import uk.ac.ebi.eva.evaseqcol.utils.GzipCompress;

import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
Expand Down Expand Up @@ -43,6 +44,17 @@ public NCBIAssemblySequenceDataSource(NCBIBrowserFactory factory,
this.readerFactory = readerFactory;
}

/**
 * Builds an {@link AssemblySequenceEntity} from FASTA content that the caller already holds in memory,
 * instead of downloading the FASTA file.
 *
 * @param accession        accession passed to the reader factory and used in the log message
 * @param fastaFileContent FASTA text to parse; must not be {@code null}
 * @return the entity produced by the reader, or an empty {@code Optional} if the reader yields {@code null}
 * @throws IOException if reading the in-memory stream or parsing fails
 */
public Optional<AssemblySequenceEntity> getAssemblySequencesByAccession(String accession, String fastaFileContent) throws IOException {
    AssemblySequenceEntity assemblySequenceEntity;
    // Encode with an explicit charset: the no-arg getBytes() uses the platform default, which can
    // silently replace characters on hosts whose default is not UTF-8.
    byte[] fastaBytes = fastaFileContent.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    try (InputStream stream = new ByteArrayInputStream(fastaBytes)) {
        NCBIAssemblySequenceReader reader = readerFactory.build(stream, accession);
        assemblySequenceEntity = reader.getAssemblySequencesEntity();
        logger.info("FASTA file content with accession " + accession + " has been parsed successfully");
    }

    // ofNullable rather than of: callers test isPresent(), so a null result must surface as an empty
    // Optional instead of a NullPointerException.
    return Optional.ofNullable(assemblySequenceEntity);
}

@Override
public Optional<AssemblySequenceEntity> getAssemblySequencesByAccession(String accession) throws IOException, IllegalArgumentException {
NCBIBrowser ncbiBrowser = factory.build();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import uk.ac.ebi.eva.evaseqcol.utils.JSONLevelOne;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -86,4 +87,27 @@ public Optional<Map<String, Object>> getAllPossibleSeqColExtendedData(String acc

return Optional.of(seqColResultData);
}

/**
 * Builds the seqCol extended data for an assembly from caller-supplied FASTA content.
 *
 * <p>The returned map has two entries:
 * <ul>
 *   <li>{@code "sameValueAttributes"}: a map holding the {@code extendedLengths},
 *       {@code extendedSequences} and {@code extendedMd5Sequences} objects;</li>
 *   <li>{@code "namesAttributes"}: a single-element list holding the names object built with the
 *       TEST naming convention.</li>
 * </ul>
 *
 * @param accession        accession associated with the FASTA content
 * @param fastaFileContent FASTA text to parse
 * @return the data map, or an empty {@code Optional} when no sequence entity could be obtained
 * @throws IOException if parsing the content or computing a digest fails
 */
public Optional<Map<String, Object>> getAllPossibleSeqColExtendedData(String accession, String fastaFileContent) throws IOException {
    // Fetching Sequence Entity (FASTA File)
    Optional<AssemblySequenceEntity> sequenceEntity = assemblySequenceDataSource.getAssemblySequencesByAccession(accession, fastaFileContent);
    if (!sequenceEntity.isPresent()) {
        // Name the accession so the failure can be traced; the message previously ended in a dangling ": ".
        logger.error("Could not parse FASTA file content for accession: " + accession);
        return Optional.empty();
    }
    logger.info("FASTA file have been parsed successfully");

    // Unwrap once instead of calling get() for every attribute.
    AssemblySequenceEntity parsedSequences = sequenceEntity.get();

    Map<String, Object> sameValueAttributesMap = new HashMap<>();
    sameValueAttributesMap.put("extendedLengths", SeqColExtendedDataEntity.constructSeqColLengthsObject(parsedSequences));
    sameValueAttributesMap.put("extendedSequences", SeqColExtendedDataEntity.constructSeqColSequencesObject(parsedSequences));
    sameValueAttributesMap.put("extendedMd5Sequences", SeqColExtendedDataEntity.constructSeqColSequencesMd5Object(parsedSequences));

    // Seqcol Result Data Map
    Map<String, Object> seqColResultData = new HashMap<>();
    seqColResultData.put("sameValueAttributes", sameValueAttributesMap);
    seqColResultData.put("namesAttributes", Collections.singletonList(SeqColExtendedDataEntity
            .constructSeqColNamesObjectWithRefSeqAndTESTNamingConvention(parsedSequences)));
    return Optional.of(seqColResultData);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ protected void parseFile() throws IOException, NullPointerException {
while (line != null){
if (line.startsWith(">")){
SeqColSequenceEntity sequence = new SeqColSequenceEntity();
String refSeq = line.substring(1, line.indexOf(' '));
String refSeq = line.substring(1).split(" ")[0];
sequence.setRefseq(refSeq);
line = reader.readLine();
StringBuilder sequenceValue = new StringBuilder();
Expand All @@ -45,6 +45,7 @@ protected void parseFile() throws IOException, NullPointerException {
String sha512Checksum = sha512ChecksumCalculator.calculateRefgetChecksum(sequenceValue.toString());
sequence.setSequenceMD5(md5checksum);
sequence.setSequence(sha512Checksum);
sequence.setLength(sequenceValue.length());
sequences.add(sequence);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,6 @@ public abstract class SeqColEntity {


public enum NamingConvention {
ENA, GENBANK, UCSC
ENA, GENBANK, UCSC, TEST
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Collectors;

@Entity
@TypeDefs({
Expand Down Expand Up @@ -117,6 +118,20 @@ public static SeqColExtendedDataEntity<List<String>> constructSeqColNamesObjectB
return seqColNamesObject;
}

/**
 * Return the seqCol names array object built from the refseq value of every sequence in the given
 * entity, tagged with the TEST naming convention. The digest is the SHA-512 digest of the names
 * array's string form.
 */
public static SeqColExtendedDataEntity<List<String>> constructSeqColNamesObjectWithRefSeqAndTESTNamingConvention(
        AssemblySequenceEntity sequenceEntity) throws IOException {
    SeqColExtendedDataEntity<List<String>> namesEntity = new SeqColExtendedDataEntity<List<String>>()
            .setAttributeType(SeqColExtendedDataEntity.AttributeType.names);
    namesEntity.setNamingConvention(SeqColEntity.NamingConvention.TEST);

    // One name per sequence, in the order the sequences are stored.
    List<String> refSeqNames = sequenceEntity.getSequences()
            .stream()
            .map(sequence -> sequence.getRefseq())
            .collect(Collectors.toList());

    JSONExtData<List<String>> namesData = new JSONStringListExtData();
    namesData.setObject(refSeqNames);
    namesEntity.setExtendedSeqColData(namesData);

    // The digest is taken after the list has been set on the array object.
    String namesDigest = new DigestCalculator().getSha512Digest(namesData.toString());
    namesEntity.setDigest(namesDigest);
    return namesEntity;
}

/**
* Return the seqCol lengths array object*/
public static SeqColExtendedDataEntity<List<Integer>> constructSeqColLengthsObject(AssemblyEntity assemblyEntity) throws IOException {
Expand All @@ -136,6 +151,21 @@ public static SeqColExtendedDataEntity<List<Integer>> constructSeqColLengthsObje
return seqColLengthsObject;
}


/**
 * Return the seqCol lengths array object built from the length of every sequence in the given
 * entity. The digest is the SHA-512 digest of the lengths array's string form.
 */
public static SeqColExtendedDataEntity<List<Integer>> constructSeqColLengthsObject(AssemblySequenceEntity sequenceEntity) throws IOException {
    SeqColExtendedDataEntity<List<Integer>> lengthsEntity = new SeqColExtendedDataEntity<List<Integer>>()
            .setAttributeType(SeqColExtendedDataEntity.AttributeType.lengths);

    // One length per sequence, in the order the sequences are stored.
    List<Integer> sequenceLengths = sequenceEntity.getSequences()
            .stream()
            .map(sequence -> sequence.getLength())
            .collect(Collectors.toList());

    JSONExtData<List<Integer>> lengthsData = new JSONIntegerListExtData();
    lengthsData.setObject(sequenceLengths);
    lengthsEntity.setExtendedSeqColData(lengthsData);

    // The digest is taken after the list has been set on the array object.
    String lengthsDigest = new DigestCalculator().getSha512Digest(lengthsData.toString());
    lengthsEntity.setDigest(lengthsDigest);
    return lengthsEntity;
}

/**
* Return the seqCol sequences array object*/
public static SeqColExtendedDataEntity<List<String>> constructSeqColSequencesObject(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ public class SeqColSequenceEntity {
private String sequenceMD5;
@ApiModelProperty(value = "Sequence's defalut (ga4gh) checksum value")
private String sequence;
@ApiModelProperty(value = "Sequence's length")
private Integer length;

public SeqColSequenceEntity setRefseq(String refseq) {
this.refseq = refseq;
Expand All @@ -26,4 +28,9 @@ public SeqColSequenceEntity setSequence(String sequence) {
this.sequence = sequence;
return this;
}

/**
 * Sets the sequence's length.
 *
 * @param length the length to store
 * @return this entity, so that setter calls can be chained
 */
public SeqColSequenceEntity setLength(Integer length) {
this.length = length;
return this;
}
}
18 changes: 13 additions & 5 deletions src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColService.java
Original file line number Diff line number Diff line change
Expand Up @@ -154,16 +154,24 @@ public void removeAllSeqCol() {
extendedDataService.removeAllSeqColExtendedEntities();
}

/**
 * Build the seqCol data from the given FASTA file content and insert the resulting seqCol objects.
 * The data map is obtained from the NCBI seqCol data source and handed, together with the accession,
 * to {@code createSeqColObjectsAndInsert}.
 */
public IngestionResultEntity fetchAndInsertAllSeqColInFastaFile(String accession, String fastaFileContent) throws IOException {
    return createSeqColObjectsAndInsert(
            ncbiSeqColDataSource.getAllPossibleSeqColExtendedData(accession, fastaFileContent),
            accession);
}

/**
* Fetch and insert all possible seqCol objects for the given assembly accession.
* NOTE: All possible seqCol objects means with all possible/provided naming conventions that could be found in the
* assembly report.
* Return the list of level 0 digests of the inserted seqcol objects*/
public IngestionResultEntity fetchAndInsertAllSeqColByAssemblyAccession(
String assemblyAccession) throws IOException, DuplicateSeqColException, AssemblyNotFoundException,
AssemblyAlreadyIngestedException{
Optional<Map<String, Object>> seqColDataMap = ncbiSeqColDataSource
.getAllPossibleSeqColExtendedData(assemblyAccession);
public IngestionResultEntity fetchAndInsertAllSeqColByAssemblyAccession(String assemblyAccession) throws IOException {
Optional<Map<String, Object>> seqColDataMap = ncbiSeqColDataSource.getAllPossibleSeqColExtendedData(assemblyAccession);
return createSeqColObjectsAndInsert(seqColDataMap, assemblyAccession);
}


public IngestionResultEntity createSeqColObjectsAndInsert(Optional<Map<String, Object>> seqColDataMap,
String assemblyAccession) throws IOException {
if (!seqColDataMap.isPresent()) {
logger.warn("No seqCol data corresponding to assemblyAccession " + assemblyAccession + " could be found on NCBI datasource");
throw new AssemblyNotFoundException(assemblyAccession);
Expand Down

0 comments on commit 07a38d8

Please sign in to comment.