Merge pull request #22 from yhoogstrate/ensembl_bed_conversion

Added utility for GTF to BED conversion
yhoogstrate · Mar 11, 2016 · 89217af · 89217af
2 parents 3fb94ac + a05234b
commit 89217af
Show file tree

Hide file tree

Showing 10 changed files with 227 additions and 2 deletions.
diff --git a/Changelog b/Changelog
@@ -1,3 +1,7 @@
+2016-03-11  Youri Hoogstrate
+
+	* Version 2.11.5: Added utility to create appropriate BED files from GTF files
+
 2016-03-11  Youri Hoogstrate
 
 	* Version 2.11.4: Reduces memory footprint for high number of samples

diff --git a/README.md b/README.md
@@ -266,6 +266,17 @@ In this case the alias of the BED-file, hg19, will later be used to link it to d
 
 	-a "hg18:somefile_hg18.bed" "hg19:somefile_hg19.bed"
 
+#### Obtain BED file -> fuma-gencode-gtf-to-bed ####
+
+Because obtaining such files turns out to be more difficult than expected, we have provided an extra utility named `fuma-gencode-gtf-to-bed`.
+The user should start by downloading a GTF file from GenCode (the only source the utility has been tested with) and then run the following command:
+
+	fuma-gencode-gtf-to-bed -o converted.bed input.gtf
+
+The utility uses all annotations in the GTF file and aggregates all exons per `transcript_id`, while it uses the `gene_id` as identifier in the BED file. The reason for this is that transcripts belonging to the same gene can be quite distant from each other (or homologues can share the same name, which happens); by annotating per transcript, the long distance between such transcripts is not unnecessarily marked as part of that gene. In case multiple transcripts from the same gene overlap each other, FuMa will treat them as the same gene as long as their identifier is the same, which is the case since the `gene_id` is being used for this.
+
+This tool should work for all GTF files in which every entry has a proper, unique and correct definition of the `gene_id` and `transcript_id`.
+
 #### -s ADD_SAMPLE  ####
 To provide FuMa a fusion gene detection experiment, it should be provided with the "-s" argument which should follow the following syntax:
 

diff --git a/bin/fuma-gencode-gtf-to-bed b/bin/fuma-gencode-gtf-to-bed
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+
+import fuma,sys
+
+from fuma.CLI import CLI_ensmble_gtf_to_bed_converter
+
+args = CLI_ensmble_gtf_to_bed_converter()
+
+## GTF
+##  - start: 1-based
+##  - end:   1-based
+##
+## BED
+##  - start: 0-based
+##  - end:   1-based
+
+#print args
+#print args.genecode_gtf_file
+
+idx = {}
+
+# Parse the GTF file
+with open(args.genecode_gtf_file[0],"r") as fh:
+	for line in fh:
+		line = line.strip()
+		if len(line) > 0:
+			if line[0] != "#":
+				params = line.split("\t")
+
+				gene_id = params[8].split("gene_id",1)[1].split(";",1)[0].strip(" ").strip('"')
+				transcript_id = params[8].split("transcript_id",1)[1].split(";",1)[0].strip(" ").strip('"')
+				#exon_number = params[8].split("exon_number",1)[1].split(";",1)[0].strip(" ").strip('"')
+
+				start = int(params[3])
+				end = int(params[4])
+				inversed = (end < start)
+
+				min_pos = min(start,end)
+				max_pos = max(start,end)
+
+				if not idx.has_key(transcript_id):
+					idx[transcript_id] = {}
+
+				if not idx[transcript_id].has_key(params[0]):
+					idx[transcript_id][params[0]] = [min_pos,max_pos,inversed,params[6],gene_id,params[0]]
+
+				if min_pos < idx[transcript_id][params[0]][0]:
+					# if inversion and strand is identical, overwrite
+					if inversed == idx[transcript_id][params[0]][2] and params[6] == idx[transcript_id][params[0]][3]:
+						idx[transcript_id][params[0]][0] = min_pos
+					else:
+						raise Exception("Error: transcript annotated in different directions:\n"+line)
+
+				if max_pos > idx[transcript_id][params[0]][1]:
+					# if inversion and strand is identical, overwrite
+					if inversed == idx[transcript_id][params[0]][2] and params[6] == idx[transcript_id][params[0]][3]:
+						idx[transcript_id][params[0]][1] = max_pos
+					else:
+						raise Exception("Error: transcript annotated in different directions:\n"+line)
+
+# Export to BED
+lines = set()
+for t in sorted(idx.keys()):# Sorted is essential to get the same output and for functional testing
+	if len(idx[t].keys()) != 1:
+		raise Exception("Error: " + t + " has either no annotated chromosomes or multiple")
+	else:
+		data = idx[t][idx[t].keys()[0]]
+
+		out =  data[5]# chr
+		out += "\t"+str(data[0]-1)# start
+		out += "\t"+str(data[1])# end
+		out += "\t"+data[4]#.split(".",1)[0] << trick to get rid of suffixes of ensembl ID's
+
+		lines.update([out])
+
+# Lines are unique by using sets
+for line in sorted(lines):
+	print line
diff --git a/fuma/CLI.py b/fuma/CLI.py
@@ -71,6 +71,8 @@ def show_formats():
 
 def CLI(argv=None):
 	"""Command Line Interface
+	
+	base command line interface of FuMa
 	"""
 	parser = argparse.ArgumentParser()
 
@@ -106,3 +108,23 @@ def CLI(argv=None):
 	else:
 		# Argumented parameters are used in the unit tests.
 		return parser.parse_args(argv)
+
+
+def CLI_ensmble_gtf_to_bed_converter(argv=None):
+	"""
+		CLI for gtf to bed converter
+	"""
+
+	parser = argparse.ArgumentParser()
+
+	parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,epilog="For more info please visit:\n<https://github.com/yhoogstrate/fuma>")
+	parser.add_argument('-V','--version', action='version', version=textwrap.dedent("%(prog)s "+fuma.__version__+"\n\nCopyright (C) 2013-"+str(datetime.datetime.now().year)+" Youri Hoogstrate.\n\nLicense GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\nThis is free software: you are free to change and redistribute it.\nThere is NO WARRANTY, to the extent permitted by law.\n"))
+
+	parser.add_argument("-o","--output",help="output filename; '-' for stdout",default="-")
+	parser.add_argument("genecode_gtf_file",nargs=1,help="Input GTF file, e.g. 'gencode_gtf_file.gtf' - not as .gz")
+
+	if(argv == None):
+		return parser.parse_args()
+	else:
+		# Argumented parameters are used in the unit tests.
+		return parser.parse_args(argv)
diff --git a/fuma/__init__.py b/fuma/__init__.py
@@ -21,7 +21,7 @@
  <http://epydoc.sourceforge.net/manual-fields.html#fields-synonyms>
 """
 
-__version_info__ = ('2', '11', '4')
+__version_info__ = ('2', '11', '5')
 __version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3])+"-"+__version_info__[3]
 __author__ = 'Youri Hoogstrate'
 __homepage__ = 'https://github.com/yhoogstrate/fuma'

diff --git a/setup.py b/setup.py
@@ -33,7 +33,7 @@
 		maintainer=fuma.__author__,
 		license=fuma.__license__,
 		url=fuma.__homepage__,
-		scripts=["bin/fuma","bin/defuse-clusters-to-CG",'bin/chimerascan-exclude-transcriptome-events',"bin/fusioncatcher-to-CG","bin/chimerascan-relative-bedpe-to-CG","bin/fuma-list-to-boolean-list"],
+		scripts=["bin/fuma","bin/defuse-clusters-to-CG",'bin/chimerascan-exclude-transcriptome-events',"bin/fusioncatcher-to-CG","bin/chimerascan-relative-bedpe-to-CG","bin/fuma-list-to-boolean-list","bin/fuma-gencode-gtf-to-bed"],
 		packages=['fuma'],
 		test_suite="tests",
 		install_requires=['HTSeq >= 0.6.1','numpy'],

diff --git a/tests/data/gencode_hg19.subset.bed b/tests/data/gencode_hg19.subset.bed
@@ -0,0 +1,10 @@
+chr1	110952	129173	ENSG00000238009.2
+chr1	129080	133566	ENSG00000238009.2
+chr1	29553	31097	ENSG00000243485.2
+chr1	30266	31109	ENSG00000243485.2
+chr1	30365	30503	ENSG00000243485.2
+chr1	34553	36081	ENSG00000237613.2
+chr1	35244	36073	ENSG00000237613.2
+chr1	69090	70008	ENSG00000186092.4
+chr1	89294	120932	ENSG00000238009.2
+chr1	92229	129217	ENSG00000238009.2
diff --git a/tests/data/gencode_hg19.subset.gtf b/tests/data/gencode_hg19.subset.gtf
@@ -0,0 +1,27 @@
+##description: test file
+##provider: STAR Fusion wiki
+chr1	HAVANA	exon	29554	30039	.	+	.	gene_id "ENSG00000243485.2"; transcript_id "ENST00000473358.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "MIR1302-11"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-001"; exon_number 1;  exon_id "ENSE00001947070.1";  level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1";
+chr1	HAVANA	exon	30564	30667	.	+	.	gene_id "ENSG00000243485.2"; transcript_id "ENST00000473358.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "MIR1302-11"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-001"; exon_number 2;  exon_id "ENSE00001922571.1";  level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1";
+chr1	HAVANA	exon	30976	31097	.	+	.	gene_id "ENSG00000243485.2"; transcript_id "ENST00000473358.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "MIR1302-11"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-001"; exon_number 3;  exon_id "ENSE00001827679.1";  level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002840.1";
+chr1	HAVANA	exon	30267	30667	.	+	.	gene_id "ENSG00000243485.2"; transcript_id "ENST00000469289.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "MIR1302-11"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-002"; exon_number 1;  exon_id "ENSE00001841699.1";  level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2";
+chr1	HAVANA	exon	30976	31109	.	+	.	gene_id "ENSG00000243485.2"; transcript_id "ENST00000469289.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "MIR1302-11"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-002"; exon_number 2;  exon_id "ENSE00001890064.1";  level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000000959.2"; havana_transcript "OTTHUMT00000002841.2";
+chr1	ENSEMBL	exon	30366	30503	.	+	.	gene_id "ENSG00000243485.2"; transcript_id "ENST00000607096.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "MIR1302-11"; transcript_type "miRNA"; transcript_status "KNOWN"; transcript_name "MIR1302-11-201"; exon_number 1;  exon_id "ENSE00003695741.1";  level 3; tag "basic"; havana_gene "OTTHUMG00000000959.2";
+chr1	HAVANA	exon	35721	36081	.	-	.	gene_id "ENSG00000237613.2"; transcript_id "ENST00000417324.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138A"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138A-001"; exon_number 1;  exon_id "ENSE00001656588.1";  level 2; tag "basic"; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002842.1";
+chr1	HAVANA	exon	35277	35481	.	-	.	gene_id "ENSG00000237613.2"; transcript_id "ENST00000417324.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138A"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138A-001"; exon_number 2;  exon_id "ENSE00001669267.1";  level 2; tag "basic"; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002842.1";
+chr1	HAVANA	exon	34554	35174	.	-	.	gene_id "ENSG00000237613.2"; transcript_id "ENST00000417324.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138A"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138A-001"; exon_number 3;  exon_id "ENSE00001727627.1";  level 2; tag "basic"; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002842.1";
+chr1	HAVANA	exon	35721	36073	.	-	.	gene_id "ENSG00000237613.2"; transcript_id "ENST00000461467.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138A"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138A-002"; exon_number 1;  exon_id "ENSE00001618781.2";  level 2; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002843.1";
+chr1	HAVANA	exon	35245	35481	.	-	.	gene_id "ENSG00000237613.2"; transcript_id "ENST00000461467.1"; gene_type "lincRNA"; gene_status "KNOWN"; gene_name "FAM138A"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "FAM138A-002"; exon_number 2;  exon_id "ENSE00001874421.1";  level 2; havana_gene "OTTHUMG00000000960.1"; havana_transcript "OTTHUMT00000002843.1";
+chr1	HAVANA	exon	69091	70008	.	+	.	gene_id "ENSG00000186092.4"; transcript_id "ENST00000335137.3"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "OR4F5"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "OR4F5-001"; exon_number 1;  exon_id "ENSE00002319515.1";  level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; ccdsid "CCDS30547.1"; havana_gene "OTTHUMG00000001094.1"; havana_transcript "OTTHUMT00000003223.1";
+chr1	HAVANA	exon	120775	120932	.	-	.	gene_id "ENSG00000238009.2"; transcript_id "ENST00000466430.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-001"; exon_number 1;  exon_id "ENSE00001606755.2";  level 2; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003225.1";
+chr1	HAVANA	exon	112700	112804	.	-	.	gene_id "ENSG00000238009.2"; transcript_id "ENST00000466430.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-001"; exon_number 2;  exon_id "ENSE00001957285.1";  level 2; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003225.1";
+chr1	HAVANA	exon	92091	92240	.	-	.	gene_id "ENSG00000238009.2"; transcript_id "ENST00000466430.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-001"; exon_number 3;  exon_id "ENSE00001944529.1";  level 2; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003225.1";
+chr1	HAVANA	exon	89295	91629	.	-	.	gene_id "ENSG00000238009.2"; transcript_id "ENST00000466430.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-001"; exon_number 4;  exon_id "ENSE00001846804.1";  level 2; tag "not_best_in_genome_evidence"; tag "basic"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003225.1";
+chr1	HAVANA	exon	129055	129217	.	-	.	gene_id "ENSG00000238009.2"; transcript_id "ENST00000477740.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-003"; exon_number 1;  exon_id "ENSE00001919246.1";  level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003688.1";
+chr1	HAVANA	exon	120721	120932	.	-	.	gene_id "ENSG00000238009.2"; transcript_id "ENST00000477740.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-003"; exon_number 2;  exon_id "ENSE00001171005.3";  level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003688.1";
+chr1	HAVANA	exon	112700	112804	.	-	.	gene_id "ENSG00000238009.2"; transcript_id "ENST00000477740.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-003"; exon_number 3;  exon_id "ENSE00001957285.1";  level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003688.1";
+chr1	HAVANA	exon	92230	92240	.	-	.	gene_id "ENSG00000238009.2"; transcript_id "ENST00000477740.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-003"; exon_number 4;  exon_id "ENSE00001896976.1";  level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003688.1";
+chr1	HAVANA	exon	129055	129173	.	-	.	gene_id "ENSG00000238009.2"; transcript_id "ENST00000471248.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-002"; exon_number 1;  exon_id "ENSE00001934975.1";  level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003687.1";
+chr1	HAVANA	exon	112700	112804	.	-	.	gene_id "ENSG00000238009.2"; transcript_id "ENST00000471248.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-002"; exon_number 2;  exon_id "ENSE00001957285.1";  level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003687.1";
+chr1	HAVANA	exon	110953	111357	.	-	.	gene_id "ENSG00000238009.2"; transcript_id "ENST00000471248.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-002"; exon_number 3;  exon_id "ENSE00001879696.1";  level 2; tag "not_best_in_genome_evidence"; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003687.1";
+chr1	HAVANA	exon	133374	133566	.	-	.	gene_id "ENSG00000238009.2"; transcript_id "ENST00000453576.2"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-004"; exon_number 1;  exon_id "ENSE00001737600.2";  level 2; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003689.1";
+chr1	HAVANA	exon	129081	129223	.	-	.	gene_id "ENSG00000238009.2"; transcript_id "ENST00000453576.2"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-004"; exon_number 2;  exon_id "ENSE00001827073.1";  level 2; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003689.1";
diff --git a/tests/data/gencode_hg19.subset.sorted.bed b/tests/data/gencode_hg19.subset.sorted.bed
@@ -0,0 +1,10 @@
+chr1	29553	31097	ENSG00000243485.2
+chr1	30266	31109	ENSG00000243485.2
+chr1	30365	30503	ENSG00000243485.2
+chr1	34553	36081	ENSG00000237613.2
+chr1	35244	36073	ENSG00000237613.2
+chr1	69090	70008	ENSG00000186092.4
+chr1	89294	120932	ENSG00000238009.2
+chr1	92229	129217	ENSG00000238009.2
+chr1	110952	129173	ENSG00000238009.2
+chr1	129080	133566	ENSG00000238009.2
diff --git a/tests/test_GencodeGTF.py b/tests/test_GencodeGTF.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+
+"""[License: GNU General Public License v3 (GPLv3)]
+ 
+ This file is part of FuMa.
+ 
+ FuMa is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+ 
+ FuMa is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ Documentation as defined by:
+ <http://epydoc.sourceforge.net/manual-fields.html#fields-synonyms>
+"""
+
+import unittest,logging,sys,os
+logging.basicConfig(level=logging.INFO,format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",stream=sys.stdout)
+
+from fuma.ParseBED import ParseBED
+
+class TestParseBED(unittest.TestCase):
+	def test_01(self):
+		inputfile = "tests/data/gencode_hg19.subset.gtf"
+		outputfile = "tests/data/gencode_hg19.subset.bed"
+
+		command = "export PYTHONPATH=$PYTHONPATH\":fuma:../fuma\" ;\n\n"	# ensure the fuma lib is accessible for testing (also without installation)
+		command += ("bin/fuma-gencode-gtf-to-bed\\\n"
+					"   "+inputfile
+					)
+
+		result = os.popen(command).read()
+		validation = open(outputfile,"r").read()
+
+		self.assertEqual(result, validation)
+
+	def test_02(self):
+		inputfile = "tests/data/gencode_hg19.subset.gtf"
+		outputfile = "tests/data/gencode_hg19.subset.sorted.bed"
+
+		command = "export PYTHONPATH=$PYTHONPATH\":fuma:../fuma\" ;\n\n"	# ensure the fuma lib is accessible for testing (also without installation)
+		command += ("bin/fuma-gencode-gtf-to-bed \\\n"
+					"   "+inputfile+" | sort -k1,1V -k2,2g -k3,3g "
+					)
+
+
+		result = os.popen(command).read()
+		validation = open(outputfile,"r").read()
+
+		self.assertEqual(result, validation)
+
+def main():
+	"""Entry point when this file is executed directly: hands control to unittest.main()"""
+	unittest.main()
+
+if __name__ == '__main__':
+	main()