Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.IOUtil;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
Expand Down Expand Up @@ -146,6 +147,41 @@ protected static void sanityCheckDictionaryAgainstIndex(final String fastaFile,
}
}

/** Do some basic checking to make sure the fasta and the index match.
* <p>
* checks that the length of the fasta file is at least as long as the index proclaims
* and that beyond the last position references in the index there is only one line followed by whitespaces
*
* @param fastaFile Used for error reporting only.
* @param index index file to check against the dictionary.
*/
public static void sanityCheckFastaAgainstIndex(final String fastaFile,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why String and not Path or File?

final FastaSequenceIndex FastaSequenceIndex) {


final Iterator<FastaSequenceIndexEntry> iterator = FastaSequenceIndex.iterator();
FastaSequenceIndexEntry fastaSequenceIndex = null;
while (iterator.hasNext()) {
fastaSequenceIndex = iterator.next();
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It kinda sucks to have to do this iteration. I wonder if it would make sense to expand the PR a little bit, and modify FastaSequenceIndex to either store index entries in a pair of (Array/ArrayList, HashMap (instead of LinkedHashMap))? Or just to hold an extra pointer to the final entry in the index?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are several operations that explicitly take advantage of the linked-list aspect...come to think of it, I my sanity-check only works on a "freshly make" object, since operations such as "rename" remove and add entries, which moved the renamed entry to the "end" of the linked-list.... I only need it to work upon initialization, but I need to add some protections. upshot is that a list would be overkill, I'll go with the reference to the last element.

assert fastaSequenceIndex != null;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Asserts can be disabled at runtime - are you ok with that, or do you want to always check this?

final long lastSequenceLength = fastaSequenceIndex.getSize();
final long lastSequenceStart = fastaSequenceIndex.getLocation();
final long lastSequenceEnd = lastSequenceStart + fastaSequenceIndex.getOffset(lastSequenceLength);

final long fastaLength = new File(fastaFile).length();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this safe to do? Does AbstractIndexedFastaSequenceFile support or have sub-classes that support URL based access (like http:// or ftp://)? If so ... is there a method to call to get the file/object length?


//Question: should we worry about files with lots of whitespace in their end?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't worry about "lots of whitespace" at the end. I think that is still technically valid fasta.

if (lastSequenceEnd > fastaLength) {
throw new IllegalArgumentException("The fasta file is shorter (%d) than its index claims (%d). Please reindex the fasta.".formatted(fastaLength, lastSequenceEnd));
}
// not sure why need to add 1 here.
if (lastSequenceEnd + fastaSequenceIndex.getTerminatorLength() + 1 < fastaLength) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Honestly I wonder if we should allow for more than 1? What if there are a handful of blank lines at the end? Maybe we could try a nested solution, that if the file is longer than you have here ... then we read the content after the last base and ensure it's all whitespace?

throw new IllegalArgumentException("The fasta file is too long (%d) given the claims of its index (%d). Please reindex the fasta.".formatted(fastaLength, lastSequenceEnd + fastaSequenceIndex.getTerminatorLength()));
}
}


public FastaSequenceIndex getIndex() {
return index;
}
Expand Down Expand Up @@ -210,7 +246,8 @@ public ReferenceSequence getSubsequenceAt( String contig, long start, long stop
final int bytesPerLine = indexEntry.getBytesPerLine();
final int terminatorLength = bytesPerLine - basesPerLine;

long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine;
long startOffset = indexEntry.getOffset(start);

// Cast to long so the second argument cannot overflow a signed integer.
final long minBufferSize = Math.min((long) Defaults.NON_ZERO_BUFFER_SIZE, (long)(length / basesPerLine + 2) * (long)bytesPerLine);
if (minBufferSize > Integer.MAX_VALUE) throw new SAMException("Buffer is too large: " + minBufferSize);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,26 @@ public int getSequenceIndex() {
return sequenceIndex;
}

/** Return the offset to pos as determined by the number of bases and bytes per line
*
* @param pos the (1-based) position in the contig that is requested
* @return the offset (0-based) from 'location' where pos is located in the file.
*/
public long getOffset(long pos) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

final int basesPerLine = this.getBasesPerLine();
final int bytesPerLine = this.getBytesPerLine();

return ((pos - 1) / basesPerLine) * bytesPerLine + (pos - 1) % basesPerLine;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return ((pos - 1) / basesPerLine) * bytesPerLine + (pos - 1) % basesPerLine;
return ((pos - 1) / basesPerLine) * bytesPerLine + ((pos - 1) % basesPerLine);

Just for clarity.

}

/** get the terminator length from the bytes per line and the bases per line
*
* @return the length of the terminator of each line (1 or 2)
*/
public int getTerminatorLength() {
return bytesPerLine - basesPerLine;
}

/**
* For debugging. Emit the contents of each contig line.
* @return A string representation of the contig line.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ public IndexedFastaSequenceFile(final Path path, final FastaSequenceIndex index)
if (IOUtil.isBlockCompressed(path, true)) {
throw new SAMException("Indexed block-compressed FASTA file cannot be handled: " + path);
}
sanityCheckFastaAgainstIndex(path.toAbsolutePath().toString(),index);
this.channel = Files.newByteChannel(path);
} catch (IOException e) {
throw new SAMException("FASTA file should be readable but is not: " + path, e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ public class AbstractIndexedFastaSequenceFileTest extends HtsjdkTest {
private static final File SEQUENCE_FILE_BGZ = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.fasta.gz");
private static final File SEQUENCE_FILE_GZI = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.fasta.gz.gzi");
private static final File SEQUENCE_FILE_NODICT = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.nodict.fasta");
private static final Path HEADER_WITH_WHITESPACE = new File(TEST_DATA_DIR,"header_with_white_space.fasta").toPath();
private static final Path CRLF = new File(TEST_DATA_DIR,"crlf.fasta").toPath();
private static final Path HEADER_WITH_WHITESPACE_index = AbstractIndexedFastaSequenceFile.findFastaIndex(HEADER_WITH_WHITESPACE);
private static final Path CRLF_index = AbstractIndexedFastaSequenceFile.findFastaIndex(CRLF);

private final String firstBasesOfChrM = "GATCACAGGTCTATCACCCT";
private final String extendedBasesOfChrM = "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT" +
Expand All @@ -68,6 +72,15 @@ public class AbstractIndexedFastaSequenceFileTest extends HtsjdkTest {
private final String lastBasesOfChr20 = "ttgtctgatgctcatattgt";
private final int CHR20_LENGTH = 1000000;

@DataProvider(name="mismatched_indexes")
public Object[][] provideMismatchedIndexes() {
return new Object[][]{
new Object[]{HEADER_WITH_WHITESPACE,CRLF_index},
new Object[]{CRLF,HEADER_WITH_WHITESPACE_index}
};
}


@DataProvider(name="homosapiens")
public Object[][] provideSequenceFile() throws FileNotFoundException {
return new Object[][] { new Object[]
Expand Down Expand Up @@ -470,4 +483,13 @@ public void testBlockCompressedIndexedFastaSequenceFileFromNio() throws IOExcept
}
}

@Test(expectedExceptions = IllegalArgumentException.class,
dataProvider = "mismatched_indexes")
public void testWrongIndex(Path fasta, Path index){
try (IndexedFastaSequenceFile indexedFastaSequenceFile = new IndexedFastaSequenceFile(fasta, new FastaSequenceIndex(index))) {

} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
Loading