Skip to content

Commit

Permalink
Refactor BB reader to conform more closely to spec -- no assumptions …
Browse files Browse the repository at this point in the history
…on the layout of the file, particularly the relative location of the chromosome B+ tree.
  • Loading branch information
jrobinso committed Sep 8, 2024
1 parent 8b364e7 commit 1bb1bd1
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 71 deletions.
4 changes: 2 additions & 2 deletions src/main/java/org/broad/igv/ucsc/bb/BBDataSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public BBDataSource(BBFile reader, Genome genome) throws IOException {
*/
private void initMinMax() {

BBTotalSummary totalSummary = reader.totalSummary;
BBTotalSummary totalSummary = reader.getTotalSummary();
if (totalSummary == null) {
dataMin = 0;
dataMax = 100;
Expand Down Expand Up @@ -144,7 +144,7 @@ public double getDataMin() {
protected DataTile getRawData(String chr, int start, int end) {

try {
long rTreeOffset = reader.header.fullIndexOffset;
long rTreeOffset = reader.getHeader().fullIndexOffset;
List<byte[]> chunks = this.reader.getLeafChunks(chr, start, chr, end, rTreeOffset);

Integer chrIdx = reader.getIdForChr(chr);
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/broad/igv/ucsc/bb/BBFeatureSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ public BBFeatureSource(BBFile reader, Genome genome) throws IOException {
this.reader = reader;

// viz window to load average of 10,000 features per screen
featureVisiblityWindow = reader.featureDensity > 0 ? (int) (10000 / reader.featureDensity) : -1;
featureVisiblityWindow = reader.getFeatureDensity() > 0 ? (int) (10000 / reader.getFeatureDensity()) : -1;

}

Expand Down Expand Up @@ -93,7 +93,7 @@ public void close() {
*/
public Iterator<BasicFeature> getFeatures(String chr, int start, int end) throws IOException {

long rTreeOffset = reader.header.fullIndexOffset;
long rTreeOffset = reader.getHeader().fullIndexOffset;
Integer chrIdx = reader.getIdForChr(chr);
if(chrIdx == null) {
return Collections.emptyIterator();
Expand Down
125 changes: 65 additions & 60 deletions src/main/java/org/broad/igv/ucsc/bb/BBFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import org.broad.igv.ucsc.twobit.UnsignedByteBuffer;
import org.broad.igv.ucsc.bb.codecs.BBCodec;
import org.broad.igv.ucsc.bb.codecs.BBCodecFactory;
import org.broad.igv.ucsc.twobit.UnsignedByteBufferDynamic;
import org.broad.igv.ucsc.twobit.UnsignedByteBufferImpl;
import org.broad.igv.util.CompressionUtils;
import org.broad.igv.util.stream.IGVSeekableStreamFactory;
Expand Down Expand Up @@ -83,71 +82,83 @@
*/
public class BBFile {

enum Type {BIGWIG, BIGBED}

static public final int BBFILE_HEADER_SIZE = 64;
static public final long BIGWIG_MAGIC = 2291137574l; // BigWig Magic
static public final long BIGBED_MAGIC = 2273964779l; // BigBed Magic
static public final int BBFILE_EXTENDED_HEADER_HEADER_SIZE = 64;
private Trix trix;
String autosql;
private String autosql;
private ChromTree chromTree;
private String[] chrNames;
double featureDensity;

Map<String, String> chrAliasTable;
public BBTotalSummary totalSummary;
private double featureDensity;
private Map<String, String> chrAliasTable;
private BBTotalSummary totalSummary;
private BPTree[] _searchTrees;
private Map<Long, RPTree> rTreeCache;
BBCodec bedCodec;
private BBCodec bedCodec;
private String path;
private Type type;
private BBHeader header = null;
private BBZoomHeader[] zoomHeaders;
private ByteOrder byteOrder;
private Genome genome;

public boolean isBigWigFile() {
return type == Type.BIGWIG;
public BBFile(String path, Genome genome) throws IOException {
this.path = path;
this.genome = genome;
this.chrAliasTable = new HashMap<>();
this.rTreeCache = new HashMap<>();
init();
}

public boolean isBigBedFile() {
return type == Type.BIGBED;
public BBFile(String path, Genome genome, String trixPath) throws IOException {
this(path, genome);
this.trix = new Trix(trixPath + "x", trixPath);
}

public String getAutoSql() {
return autosql;
void init() throws IOException {
this.header = readHeader();
}

public String[] getChromosomeNames() {
return chrNames;
public BBTotalSummary getTotalSummary() {
return totalSummary;
}

public void setTrix(Trix trix) {
public Type getType() {
return type;
}

public BBHeader getHeader() {
return header;
}

enum Type {BIGWIG, BIGBED}

String path;
Type type;
BBHeader header = null;

BBZoomHeader[] zoomHeaders;
ByteOrder byteOrder;
Genome genome;
public Genome getGenome() {
return genome;
}
public boolean isBigWigFile() {
return type == Type.BIGWIG;
}

public BBFile(String path, Genome genome) throws IOException {
this.path = path;
this.genome = genome;
this.chrAliasTable = new HashMap<>();
this.rTreeCache = new HashMap<>();
init();
public boolean isBigBedFile() {
return type == Type.BIGBED;
}

public BBFile(String path, Genome genome, String trixPath) throws IOException {
this(path, genome);
this.trix = new Trix(trixPath + "x", trixPath);
public double getFeatureDensity() {
return featureDensity;
}
public String getAutoSQL() {
return autosql;
}

void init() throws IOException {
this.header = readHeader();
public String[] getChromosomeNames() {
return chrNames;
}

BBHeader readHeader() throws IOException {

// The common header
ByteOrder order = ByteOrder.LITTLE_ENDIAN;
UnsignedByteBuffer buffer = UnsignedByteBufferImpl.loadBinaryBuffer(this.path, order, 0, BBFILE_HEADER_SIZE);
long magic = buffer.getUInt();
Expand Down Expand Up @@ -183,8 +194,12 @@ BBHeader readHeader() throws IOException {
header.uncompressBuffSize = buffer.getInt();
header.extensionOffset = buffer.getLong();

// Read rest of fields up to full data offset
buffer = UnsignedByteBufferImpl.loadBinaryBuffer(this.path, order, BBFILE_HEADER_SIZE, (int) (header.fullDataOffset - BBFILE_HEADER_SIZE + 4));
// Zoom headers, autosql, and total summary if present
int size = (int) (header.totalSummaryOffset > 0 ?
header.totalSummaryOffset - BBFILE_HEADER_SIZE + 40 :
Math.min(header.fullDataOffset, header.chromTreeOffset) - BBFILE_HEADER_SIZE);

buffer = UnsignedByteBufferImpl.loadBinaryBuffer(this.path, order, BBFILE_HEADER_SIZE, size);

// Zoom headers -- immediately follows the common header
this.zoomHeaders = new BBZoomHeader[header.nZoomLevels];
Expand Down Expand Up @@ -212,38 +227,28 @@ BBHeader readHeader() throws IOException {
this.totalSummary = BBTotalSummary.parseSummary(buffer);
}

//Total data count -- for bigbed this is the number of features, for bigwig it is number of sections
buffer.position((int) (header.fullDataOffset - BBFILE_HEADER_SIZE));
header.dataCount = buffer.getInt();

// Chromosome tree -- this normally preceeds fullDataOffset so will be within the buffer. However, this
// isn't guaranteed, we have to check
int chromtreeBufferPosition = (int) (header.chromTreeOffset - startOffset);
if(chromtreeBufferPosition > 0 && chromtreeBufferPosition < buffer.position() + buffer.remaining()) {
buffer.position((int) (header.chromTreeOffset - startOffset));
} else {
buffer = UnsignedByteBufferDynamic.loadBinaryBuffer(this.path, order, header.chromTreeOffset, 1000);
}
int chromtreeBufferSize = (int) Math.min(1000000, Math.max(10000, header.fullDataOffset - header.chromTreeOffset));
buffer = UnsignedByteBufferDynamic.loadBinaryBuffer(this.path, order, header.chromTreeOffset, chromtreeBufferSize);
this.chromTree = ChromTree.parseTree(buffer, startOffset, this.genome);
this.chrNames = this.chromTree.names();

this.header = header;

//extension
if (header.extensionOffset > 0) {
this.loadExtendedHeader(header.extensionOffset);
}
if (type == Type.BIGBED) {
//Total data count -- for bigbed this is the number of features, for bigwig it is number of sections
buffer = UnsignedByteBufferImpl.loadBinaryBuffer(this.path, order, header.fullDataOffset, 4);
header.dataCount = buffer.getInt();
this.featureDensity = ((double) header.dataCount) / chromTree.sumLengths;

// total summary stats
if (header.version > 1) {
buffer = UnsignedByteBufferImpl.loadBinaryBuffer(this.path, order, header.totalSummaryOffset, 40);
this.totalSummary = BBTotalSummary.parseSummary(buffer);
bedCodec = BBCodecFactory.getCodec(autosql, header.definedFieldCount);
}

this.featureDensity = ((double) header.dataCount) / chromTree.sumLengths;
this.header = header;

if (type == Type.BIGBED) {
bedCodec = BBCodecFactory.getCodec(autosql, header.definedFieldCount);
//extension
if (header.extensionOffset > 0) {
this.loadExtendedHeader(header.extensionOffset);
}


Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.broad.igv.ucsc.twobit;
package org.broad.igv.ucsc.bb;

import htsjdk.samtools.seekablestream.SeekableStream;
import org.broad.igv.ucsc.twobit.UnsignedByteBuffer;
import org.broad.igv.util.stream.IGVSeekableStreamFactory;

import java.io.ByteArrayOutputStream;
Expand All @@ -12,7 +13,7 @@
/**
* A UnsignedByteBuffer that refills its backing buffer as needed from the underlying file resource. This class was
* created specifically to load the chromTree of a BB file, where the start position is known but total size is
* not.
* not. It might have general utility but has not been tested with any other use case.
*/
public class UnsignedByteBufferDynamic implements UnsignedByteBuffer {

Expand Down
31 changes: 29 additions & 2 deletions src/test/java/org/broad/igv/ucsc/bb/BBFeatureSourceTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import htsjdk.tribble.Feature;
import org.broad.igv.feature.BasicFeature;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.ucsc.Trix;
import org.broad.igv.util.TestUtils;
import org.junit.Ignore;
import org.junit.Test;
Expand Down Expand Up @@ -76,7 +75,7 @@ public void testBigBed() throws IOException {

String path = TestUtils.DATA_DIR + "bb/chr21.refseq.bb";
BBFile bbReader = new BBFile(path, null);
assertTrue(bbReader.type == BBFile.Type.BIGBED);
assertTrue(bbReader.getType() == BBFile.Type.BIGBED);

BBFeatureSource bbSource = new BBFeatureSource(bbReader, null);

Expand All @@ -97,6 +96,34 @@ public void testBigBed() throws IOException {

}

/**
* Test the 'dataCount' value, which should equal the total count of feature records in the file.
*
* @throws IOException
*/
@Test
public void testDataCount() throws IOException {

String path = TestUtils.DATA_DIR + "bb/chr21.refseq.bb";
BBFile bbReader = new BBFile(path, null);

assertTrue(bbReader.getType() == BBFile.Type.BIGBED);

BBFeatureSource bbSource = new BBFeatureSource(bbReader, null);
String [] chrNames = bbReader.getChromosomeNames();

int count = 0;
for(String chr : chrNames) {
Iterator<BasicFeature> iter = bbSource.getFeatures(chr, 0, Integer.MAX_VALUE);
while (iter.hasNext()) {
iter.next();
count++;
}
}
assertEquals("Feature count", bbReader.getHeader().dataCount, count);

}


@Test
@Ignore
Expand Down
57 changes: 54 additions & 3 deletions src/test/java/org/broad/igv/ucsc/bb/BBFileTest.java
Original file line number Diff line number Diff line change
@@ -1,25 +1,76 @@
package org.broad.igv.ucsc.bb;

import htsjdk.tribble.Feature;
import org.broad.igv.feature.BasicFeature;
import org.broad.igv.util.TestUtils;
import org.junit.Test;

import java.io.IOException;
import java.util.Iterator;

import static org.junit.Assert.*;
import static org.junit.Assert.assertTrue;

public class BBFileTest {

/**
* Test a BW file with an unusual layout (chromTree after full data).
*/
@Test
public void testGene() throws IOException {
public void testChromTree() throws IOException {
String bbFile = "https://data.broadinstitute.org/igvdata/test/data/bb/chromTreeTest.bigwig";
BBFile reader = new BBFile(bbFile, null);
reader.readHeader();
String [] chrNames = reader.getChromosomeNames();
assertEquals(6, chrNames.length);
}

/**
* Test a BW file with an typical layout (chromTree before full data).
*/
@Test
public void testChromTree2() throws IOException {

String bbFile = TestUtils.DATA_DIR + "bb/GCF_000009045.1_ASM904v1.ncbiGene.bb";
BBFile reader = new BBFile(bbFile, null);
BBHeader header = reader.readHeader();
assertNotNull(header);
String [] chrNames = reader.getChromosomeNames();
assertEquals(1, chrNames.length);
}

@Test
public void testAutoSQL() throws IOException {

String bbFile = TestUtils.DATA_DIR + "bb/GCF_000009045.1_ASM904v1.ncbiGene.bb";
BBFile reader = new BBFile(bbFile, null);
reader.readHeader();

String autoSql = "table bigGenePred\n" +
"\"bigGenePred gene models\"\n" +
" (\n" +
" string chrom; \"Reference sequence chromosome or scaffold\"\n" +
" uint chromStart; \"Start position in chromosome\"\n" +
" uint chromEnd; \"End position in chromosome\"\n" +
" string name; \"Name or ID of item, ideally both human readable and unique\"\n" +
" uint score; \"Score (0-1000)\"\n" +
" char[1] strand; \"+ or - for strand\"\n" +
" uint thickStart; \"Start of where display should be thick (start codon)\"\n" +
" uint thickEnd; \"End of where display should be thick (stop codon)\"\n" +
" uint reserved; \"RGB value (use R,G,B string in input file)\"\n" +
" int blockCount; \"Number of blocks\"\n" +
" int[blockCount] blockSizes; \"Comma separated list of block sizes\"\n" +
" int[blockCount] chromStarts; \"Start positions relative to chromStart\"\n" +
" string name2; \"Alternative/human readable name\"\n" +
" string cdsStartStat; \"Status of CDS start annotation (none, unknown, incomplete, or complete)\"\n" +
" string cdsEndStat; \"Status of CDS end annotation (none, unknown, incomplete, or complete)\"\n" +
" int[blockCount] exonFrames; \"Exon frame {0,1,2}, or -1 if no frame for exon\"\n" +
" string type; \"Transcript type\"\n" +
" string geneName; \"Primary identifier for gene\"\n" +
" string geneName2; \"Alternative/human readable gene name\"\n" +
" string geneType; \"Gene type\"\n" +
" )\n" +
"\n";

assertEquals(autoSql.trim(), reader.getAutoSQL().trim());
}


Expand Down

0 comments on commit 1bb1bd1

Please sign in to comment.