Refactor BB reader to conform more closely to spec -- no assumptions …

…on the layout of the file, particularly the relative location of the chromosome B+ tree.
igvteam · Sep 8, 2024 · 1bb1bd1 · 1bb1bd1
1 parent 8b364e7
commit 1bb1bd1
Show file tree

Hide file tree

Showing 6 changed files with 155 additions and 71 deletions.
diff --git a/src/main/java/org/broad/igv/ucsc/bb/BBDataSource.java b/src/main/java/org/broad/igv/ucsc/bb/BBDataSource.java
@@ -69,7 +69,7 @@ public BBDataSource(BBFile reader, Genome genome) throws IOException {
      */
     private void initMinMax() {
 
-        BBTotalSummary totalSummary = reader.totalSummary;
+        BBTotalSummary totalSummary = reader.getTotalSummary();
         if (totalSummary == null) {
             dataMin = 0;
             dataMax = 100;
@@ -144,7 +144,7 @@ public double getDataMin() {
     protected DataTile getRawData(String chr, int start, int end) {
 
         try {
-            long rTreeOffset = reader.header.fullIndexOffset;
+            long rTreeOffset = reader.getHeader().fullIndexOffset;
             List<byte[]> chunks = this.reader.getLeafChunks(chr, start, chr, end, rTreeOffset);
 
             Integer chrIdx = reader.getIdForChr(chr);

diff --git a/src/main/java/org/broad/igv/ucsc/bb/BBFeatureSource.java b/src/main/java/org/broad/igv/ucsc/bb/BBFeatureSource.java
@@ -65,7 +65,7 @@ public BBFeatureSource(BBFile reader, Genome genome) throws IOException {
         this.reader = reader;
 
         // viz window to load average of 10,000 features per screen
-        featureVisiblityWindow = reader.featureDensity > 0 ? (int) (10000 / reader.featureDensity) : -1;
+        featureVisiblityWindow = reader.getFeatureDensity() > 0 ? (int) (10000 / reader.getFeatureDensity()) : -1;
 
     }
 
@@ -93,7 +93,7 @@ public void close() {
      */
     public Iterator<BasicFeature> getFeatures(String chr, int start, int end) throws IOException {
 
-        long rTreeOffset = reader.header.fullIndexOffset;
+        long rTreeOffset = reader.getHeader().fullIndexOffset;
         Integer chrIdx = reader.getIdForChr(chr);
         if(chrIdx == null) {
             return Collections.emptyIterator();

diff --git a/src/main/java/org/broad/igv/ucsc/bb/BBFile.java b/src/main/java/org/broad/igv/ucsc/bb/BBFile.java
@@ -12,7 +12,6 @@
 import org.broad.igv.ucsc.twobit.UnsignedByteBuffer;
 import org.broad.igv.ucsc.bb.codecs.BBCodec;
 import org.broad.igv.ucsc.bb.codecs.BBCodecFactory;
-import org.broad.igv.ucsc.twobit.UnsignedByteBufferDynamic;
 import org.broad.igv.ucsc.twobit.UnsignedByteBufferImpl;
 import org.broad.igv.util.CompressionUtils;
 import org.broad.igv.util.stream.IGVSeekableStreamFactory;
@@ -83,71 +82,83 @@
  */
 public class BBFile {
 
+    enum Type {BIGWIG, BIGBED}
+
     static public final int BBFILE_HEADER_SIZE = 64;
     static public final long BIGWIG_MAGIC = 2291137574l; // BigWig Magic
     static public final long BIGBED_MAGIC = 2273964779l; // BigBed Magic
     static public final int BBFILE_EXTENDED_HEADER_HEADER_SIZE = 64;
     private Trix trix;
-    String autosql;
+    private String autosql;
     private ChromTree chromTree;
     private String[] chrNames;
-    double featureDensity;
-
-    Map<String, String> chrAliasTable;
-    public BBTotalSummary totalSummary;
+    private double featureDensity;
+    private Map<String, String> chrAliasTable;
+    private BBTotalSummary totalSummary;
     private BPTree[] _searchTrees;
     private Map<Long, RPTree> rTreeCache;
-    BBCodec bedCodec;
+    private BBCodec bedCodec;
+    private String path;
+    private Type type;
+    private BBHeader header = null;
+    private BBZoomHeader[] zoomHeaders;
+    private ByteOrder byteOrder;
+    private Genome genome;
 
-    public boolean isBigWigFile() {
-        return type == Type.BIGWIG;
+    public BBFile(String path, Genome genome) throws IOException {
+        this.path = path;
+        this.genome = genome;
+        this.chrAliasTable = new HashMap<>();
+        this.rTreeCache = new HashMap<>();
+        init();
     }
 
-    public boolean isBigBedFile() {
-        return type == Type.BIGBED;
+    public BBFile(String path, Genome genome, String trixPath) throws IOException {
+        this(path, genome);
+        this.trix = new Trix(trixPath + "x", trixPath);
     }
 
-    public String getAutoSql() {
-        return autosql;
+    void init() throws IOException {
+        this.header = readHeader();
     }
 
-    public String[] getChromosomeNames() {
-        return chrNames;
+    public BBTotalSummary getTotalSummary() {
+        return totalSummary;
     }
 
-    public void setTrix(Trix trix) {
+    public Type getType() {
+        return type;
     }
 
+    public BBHeader getHeader() {
+        return header;
+    }
 
-    enum Type {BIGWIG, BIGBED}
-
-    String path;
-    Type type;
-    BBHeader header = null;
-
-    BBZoomHeader[] zoomHeaders;
-    ByteOrder byteOrder;
-    Genome genome;
+    public Genome getGenome() {
+        return genome;
+    }
+    public boolean isBigWigFile() {
+        return type == Type.BIGWIG;
+    }
 
-    public BBFile(String path, Genome genome) throws IOException {
-        this.path = path;
-        this.genome = genome;
-        this.chrAliasTable = new HashMap<>();
-        this.rTreeCache = new HashMap<>();
-        init();
+    public boolean isBigBedFile() {
+        return type == Type.BIGBED;
     }
 
-    public BBFile(String path, Genome genome, String trixPath) throws IOException {
-        this(path, genome);
-        this.trix = new Trix(trixPath + "x", trixPath);
+    public double getFeatureDensity() {
+        return featureDensity;
+    }
+    public String getAutoSQL() {
+        return autosql;
     }
 
-    void init() throws IOException {
-        this.header = readHeader();
+    public String[] getChromosomeNames() {
+        return chrNames;
     }
 
     BBHeader readHeader() throws IOException {
 
+        // The common header
         ByteOrder order = ByteOrder.LITTLE_ENDIAN;
         UnsignedByteBuffer buffer = UnsignedByteBufferImpl.loadBinaryBuffer(this.path, order, 0, BBFILE_HEADER_SIZE);
         long magic = buffer.getUInt();
@@ -183,8 +194,12 @@ BBHeader readHeader() throws IOException {
         header.uncompressBuffSize = buffer.getInt();
         header.extensionOffset = buffer.getLong();
 
-        // Read rest of fields up to full data offset
-        buffer = UnsignedByteBufferImpl.loadBinaryBuffer(this.path, order, BBFILE_HEADER_SIZE, (int) (header.fullDataOffset - BBFILE_HEADER_SIZE + 4));
+        // Zoom headers, autosql, and total summary if present
+        int size = (int) (header.totalSummaryOffset > 0 ?
+                header.totalSummaryOffset - BBFILE_HEADER_SIZE + 40 :
+                Math.min(header.fullDataOffset, header.chromTreeOffset) - BBFILE_HEADER_SIZE);
+
+        buffer = UnsignedByteBufferImpl.loadBinaryBuffer(this.path, order, BBFILE_HEADER_SIZE, size);
 
         // Zoom headers -- immediately follows the common header
         this.zoomHeaders = new BBZoomHeader[header.nZoomLevels];
@@ -212,38 +227,28 @@ BBHeader readHeader() throws IOException {
             this.totalSummary = BBTotalSummary.parseSummary(buffer);
         }
 
-        //Total data count -- for bigbed this is the number of features, for bigwig it is number of sections
-        buffer.position((int) (header.fullDataOffset - BBFILE_HEADER_SIZE));
-        header.dataCount = buffer.getInt();
-
         // Chromosome tree -- this normally preceeds fullDataOffset so will be within the buffer.  However, this
         // isn't guaranteed, we have to check
-        int chromtreeBufferPosition = (int) (header.chromTreeOffset - startOffset);
-        if(chromtreeBufferPosition > 0 && chromtreeBufferPosition < buffer.position() + buffer.remaining()) {
-            buffer.position((int) (header.chromTreeOffset - startOffset));
-       } else {
-            buffer = UnsignedByteBufferDynamic.loadBinaryBuffer(this.path, order, header.chromTreeOffset, 1000);
-        }
+        int chromtreeBufferSize = (int) Math.min(1000000, Math.max(10000, header.fullDataOffset - header.chromTreeOffset));
+        buffer = UnsignedByteBufferDynamic.loadBinaryBuffer(this.path, order, header.chromTreeOffset, chromtreeBufferSize);
         this.chromTree = ChromTree.parseTree(buffer, startOffset, this.genome);
         this.chrNames = this.chromTree.names();
 
-        this.header = header;
 
-        //extension
-        if (header.extensionOffset > 0) {
-            this.loadExtendedHeader(header.extensionOffset);
-        }
+        if (type == Type.BIGBED) {
+            //Total data count -- for bigbed this is the number of features, for bigwig it is number of sections
+            buffer = UnsignedByteBufferImpl.loadBinaryBuffer(this.path, order, header.fullDataOffset, 4);
+            header.dataCount = buffer.getInt();
+            this.featureDensity = ((double) header.dataCount) / chromTree.sumLengths;
 
-        // total summary stats
-        if (header.version > 1) {
-            buffer = UnsignedByteBufferImpl.loadBinaryBuffer(this.path, order, header.totalSummaryOffset, 40);
-            this.totalSummary = BBTotalSummary.parseSummary(buffer);
+            bedCodec = BBCodecFactory.getCodec(autosql, header.definedFieldCount);
         }
 
-        this.featureDensity = ((double) header.dataCount) / chromTree.sumLengths;
+        this.header = header;
 
-        if (type == Type.BIGBED) {
-            bedCodec = BBCodecFactory.getCodec(autosql, header.definedFieldCount);
+        //extension
+        if (header.extensionOffset > 0) {
+            this.loadExtendedHeader(header.extensionOffset);
         }
 
 

diff --git a/...csc/twobit/UnsignedByteBufferDynamic.java → ...gv/ucsc/bb/UnsignedByteBufferDynamic.java b/...csc/twobit/UnsignedByteBufferDynamic.java → ...gv/ucsc/bb/UnsignedByteBufferDynamic.java
@@ -1,6 +1,7 @@
-package org.broad.igv.ucsc.twobit;
+package org.broad.igv.ucsc.bb;
 
 import htsjdk.samtools.seekablestream.SeekableStream;
+import org.broad.igv.ucsc.twobit.UnsignedByteBuffer;
 import org.broad.igv.util.stream.IGVSeekableStreamFactory;
 
 import java.io.ByteArrayOutputStream;
@@ -12,7 +13,7 @@
 /**
  * A UnsignedByteBuffer that refills its backing buffer as needed from the underlying file resource. This class was
  * created specifically to load the chromTree of a BB file, where the start position is known but total size is
- * not.
+ * not.  It might have general utility but has not been tested with any other use case.
  */
 public class UnsignedByteBufferDynamic implements UnsignedByteBuffer {
 

diff --git a/src/test/java/org/broad/igv/ucsc/bb/BBFeatureSourceTest.java b/src/test/java/org/broad/igv/ucsc/bb/BBFeatureSourceTest.java
@@ -3,7 +3,6 @@
 import htsjdk.tribble.Feature;
 import org.broad.igv.feature.BasicFeature;
 import org.broad.igv.feature.genome.Genome;
-import org.broad.igv.ucsc.Trix;
 import org.broad.igv.util.TestUtils;
 import org.junit.Ignore;
 import org.junit.Test;
@@ -76,7 +75,7 @@ public void testBigBed() throws IOException {
 
         String path = TestUtils.DATA_DIR + "bb/chr21.refseq.bb";
         BBFile bbReader = new BBFile(path, null);
-        assertTrue(bbReader.type == BBFile.Type.BIGBED);
+        assertTrue(bbReader.getType() == BBFile.Type.BIGBED);
 
         BBFeatureSource bbSource = new BBFeatureSource(bbReader, null);
 
@@ -97,6 +96,34 @@ public void testBigBed() throws IOException {
 
     }
 
+    /**
+     * Test the 'dataCount' value, which should equal the total count of feature records in the file.
+     *
+     * @throws IOException
+     */
+    @Test
+    public void testDataCount() throws IOException {
+
+        String path = TestUtils.DATA_DIR + "bb/chr21.refseq.bb";
+        BBFile bbReader = new BBFile(path, null);
+
+        assertTrue(bbReader.getType() == BBFile.Type.BIGBED);
+
+        BBFeatureSource bbSource = new BBFeatureSource(bbReader, null);
+        String [] chrNames = bbReader.getChromosomeNames();
+
+        int count = 0;
+        for(String chr : chrNames) {
+            Iterator<BasicFeature> iter = bbSource.getFeatures(chr, 0, Integer.MAX_VALUE);
+            while (iter.hasNext()) {
+                iter.next();
+                count++;
+            }
+        }
+        assertEquals("Feature count", bbReader.getHeader().dataCount, count);
+
+    }
+
 
     @Test
     @Ignore

diff --git a/src/test/java/org/broad/igv/ucsc/bb/BBFileTest.java b/src/test/java/org/broad/igv/ucsc/bb/BBFileTest.java
@@ -1,25 +1,76 @@
 package org.broad.igv.ucsc.bb;
 
-import htsjdk.tribble.Feature;
 import org.broad.igv.feature.BasicFeature;
 import org.broad.igv.util.TestUtils;
 import org.junit.Test;
 
 import java.io.IOException;
-import java.util.Iterator;
 
 import static org.junit.Assert.*;
 import static org.junit.Assert.assertTrue;
 
 public class BBFileTest {
 
+    /**
+     * Test a BW file with an unusual layout (chromTree after full data).
+     */
     @Test
-    public void testGene() throws IOException {
+    public void testChromTree() throws IOException {
+        String bbFile = "https://data.broadinstitute.org/igvdata/test/data/bb/chromTreeTest.bigwig";
+        BBFile reader = new BBFile(bbFile, null);
+        reader.readHeader();
+        String [] chrNames = reader.getChromosomeNames();
+        assertEquals(6, chrNames.length);
+    }
+
+    /**
+     * Test a BW file with an typical layout (chromTree before full data).
+     */
+    @Test
+    public void testChromTree2() throws IOException {
 
         String bbFile = TestUtils.DATA_DIR + "bb/GCF_000009045.1_ASM904v1.ncbiGene.bb";
         BBFile reader = new BBFile(bbFile, null);
         BBHeader header = reader.readHeader();
         assertNotNull(header);
+        String [] chrNames = reader.getChromosomeNames();
+        assertEquals(1, chrNames.length);
+    }
+
+    @Test
+    public void testAutoSQL() throws IOException {
+
+        String bbFile = TestUtils.DATA_DIR + "bb/GCF_000009045.1_ASM904v1.ncbiGene.bb";
+        BBFile reader = new BBFile(bbFile, null);
+        reader.readHeader();
+
+        String autoSql = "table bigGenePred\n" +
+                "\"bigGenePred gene models\"\n" +
+                "   (\n" +
+                "   string chrom;       \"Reference sequence chromosome or scaffold\"\n" +
+                "   uint   chromStart;  \"Start position in chromosome\"\n" +
+                "   uint   chromEnd;    \"End position in chromosome\"\n" +
+                "   string name;        \"Name or ID of item, ideally both human readable and unique\"\n" +
+                "   uint score;         \"Score (0-1000)\"\n" +
+                "   char[1] strand;     \"+ or - for strand\"\n" +
+                "   uint thickStart;    \"Start of where display should be thick (start codon)\"\n" +
+                "   uint thickEnd;      \"End of where display should be thick (stop codon)\"\n" +
+                "   uint reserved;       \"RGB value (use R,G,B string in input file)\"\n" +
+                "   int blockCount;     \"Number of blocks\"\n" +
+                "   int[blockCount] blockSizes; \"Comma separated list of block sizes\"\n" +
+                "   int[blockCount] chromStarts; \"Start positions relative to chromStart\"\n" +
+                "   string name2;       \"Alternative/human readable name\"\n" +
+                "   string cdsStartStat; \"Status of CDS start annotation (none, unknown, incomplete, or complete)\"\n" +
+                "   string cdsEndStat;   \"Status of CDS end annotation (none, unknown, incomplete, or complete)\"\n" +
+                "   int[blockCount] exonFrames; \"Exon frame {0,1,2}, or -1 if no frame for exon\"\n" +
+                "   string type;        \"Transcript type\"\n" +
+                "   string geneName;    \"Primary identifier for gene\"\n" +
+                "   string geneName2;   \"Alternative/human readable gene name\"\n" +
+                "   string geneType;    \"Gene type\"\n" +
+                "   )\n" +
+                "\n";
+
+        assertEquals(autoSql.trim(), reader.getAutoSQL().trim());
     }