-
Notifications
You must be signed in to change notification settings - Fork 243
feat: Add sanity check to indexed FASTA file #1745
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
dcd8b41
c2a6a6b
e75f493
f12890e
b6a1842
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,7 @@ | |
| import htsjdk.samtools.SAMSequenceRecord; | ||
| import htsjdk.samtools.util.IOUtil; | ||
|
|
||
| import java.io.File; | ||
| import java.io.FileNotFoundException; | ||
| import java.io.IOException; | ||
| import java.nio.ByteBuffer; | ||
|
|
@@ -146,6 +147,41 @@ protected static void sanityCheckDictionaryAgainstIndex(final String fastaFile, | |
| } | ||
| } | ||
|
|
||
| /** Do some basic checking to make sure the fasta and the index match. | ||
| * <p> | ||
| * checks that the length of the fasta file is at least as long as the index proclaims | ||
| * and that beyond the last position references in the index there is only one line followed by whitespaces | ||
| * | ||
| * @param fastaFile Used for error reporting only. | ||
| * @param index index file to check against the dictionary. | ||
| */ | ||
| public static void sanityCheckFastaAgainstIndex(final String fastaFile, | ||
| final FastaSequenceIndex FastaSequenceIndex) { | ||
|
|
||
|
|
||
| final Iterator<FastaSequenceIndexEntry> iterator = FastaSequenceIndex.iterator(); | ||
| FastaSequenceIndexEntry fastaSequenceIndex = null; | ||
| while (iterator.hasNext()) { | ||
| fastaSequenceIndex = iterator.next(); | ||
| } | ||
|
||
| assert fastaSequenceIndex != null; | ||
|
||
| final long lastSequenceLength = fastaSequenceIndex.getSize(); | ||
| final long lastSequenceStart = fastaSequenceIndex.getLocation(); | ||
| final long lastSequenceEnd = lastSequenceStart + fastaSequenceIndex.getOffset(lastSequenceLength); | ||
|
|
||
| final long fastaLength = new File(fastaFile).length(); | ||
|
||
|
|
||
| //Question: should we worry about files with lots of whitespace in their end? | ||
|
||
| if (lastSequenceEnd > fastaLength) { | ||
| throw new IllegalArgumentException("The fasta file is shorter (%d) than its index claims (%d). Please reindex the fasta.".formatted(fastaLength, lastSequenceEnd)); | ||
| } | ||
| // not sure why need to add 1 here. | ||
| if (lastSequenceEnd + fastaSequenceIndex.getTerminatorLength() + 1 < fastaLength) { | ||
|
||
| throw new IllegalArgumentException("The fasta file is too long (%d) given the claims of its index (%d). Please reindex the fasta.".formatted(fastaLength, lastSequenceEnd + fastaSequenceIndex.getTerminatorLength())); | ||
| } | ||
| } | ||
|
|
||
|
|
||
| public FastaSequenceIndex getIndex() { | ||
| return index; | ||
| } | ||
|
|
@@ -210,7 +246,8 @@ public ReferenceSequence getSubsequenceAt( String contig, long start, long stop | |
| final int bytesPerLine = indexEntry.getBytesPerLine(); | ||
| final int terminatorLength = bytesPerLine - basesPerLine; | ||
|
|
||
| long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine; | ||
| long startOffset = indexEntry.getOffset(start); | ||
|
|
||
| // Cast to long so the second argument cannot overflow a signed integer. | ||
| final long minBufferSize = Math.min((long) Defaults.NON_ZERO_BUFFER_SIZE, (long)(length / basesPerLine + 2) * (long)bytesPerLine); | ||
| if (minBufferSize > Integer.MAX_VALUE) throw new SAMException("Buffer is too large: " + minBufferSize); | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -112,6 +112,26 @@ public int getSequenceIndex() { | |||||
| return sequenceIndex; | ||||||
| } | ||||||
|
|
||||||
| /** Return the offset to pos as determined by the number of bases and bytes per line | ||||||
| * | ||||||
| * @param pos the (1-based) position in the contig that is requested | ||||||
| * @return the offset (0-based) from 'location' where pos is located in the file. | ||||||
| */ | ||||||
| public long getOffset(long pos) { | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 👍 |
||||||
| final int basesPerLine = this.getBasesPerLine(); | ||||||
| final int bytesPerLine = this.getBytesPerLine(); | ||||||
|
|
||||||
| return ((pos - 1) / basesPerLine) * bytesPerLine + (pos - 1) % basesPerLine; | ||||||
|
||||||
| return ((pos - 1) / basesPerLine) * bytesPerLine + (pos - 1) % basesPerLine; | |
| return ((pos - 1) / basesPerLine) * bytesPerLine + ((pos - 1) % basesPerLine); |
Just for clarity.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why String and not Path or File?