# 7_1_filterDb.py

# -*- coding:utf-8 -*-

# -----------------------------------------
# Updated Date: 2014/03/24
# Input: The annotated data (.gtf) was downloaded from GENCODE project.
# Output: A smaller database containing only the annotation entries that match the gene names listed in the file generated by getTranscript.r.
# Environment: Linux
# Description: Because the annotated data is large, this script reduces its size by extracting only
#              the potentially relevant entries. This step is optional; it is needed only when there
#              is not enough memory to hold the complete annotated database.
# -----------------------------------------

import os
import subprocess
import sys

def extract_gene_name(row):
    """Return the gene name stored in the second comma-separated field of *row*.

    A field that starts with a double quote has its first and last characters
    removed (the surrounding quotes). Returns None when the row has no second
    field or when the resulting name is empty, because an empty sed pattern
    is an error.
    """
    fields = row.split(',')
    if len(fields) < 2:
        return None
    name = fields[1]
    if name.startswith('"'):
        name = name[1:-1]
    return name or None


def build_sed_script(gene_name):
    """Return the sed script that prints every line matching *gene_name*.

    The name is still interpreted as a sed regular expression, as it always
    was; only the '/' delimiter is escaped so that a name containing a slash
    cannot terminate the address early.
    """
    return "/" + gene_name.replace("/", "\\/") + "/p"


def main(argv):
    """Filter the annotated GTF file down to the lines matching listed genes.

    argv[1] is the annotated .gtf file, argv[2] is the CSV file whose second
    column holds gene names (its first line is a header), and argv[3] is the
    output file. For every gene name, the matching lines of the GTF file are
    appended to the output file by sed.

    Returns the process exit status: 0 on success, 1 when the arguments are
    missing, the CSV file cannot be opened, or any sed invocation failed.
    """
    # Three arguments are required, so argv must hold at least four items.
    if len(argv) < 4:
        print("Usage: python filteringDatabase.py <annotated.gtf> <bothGeTr.csv> <small_database.csv>\n")
        return 1

    annotated_file, gene_list_file, output_file = argv[1:4]

    try:
        fin = open(gene_list_file, "r")
    except IOError:
        print("Error: Make sure %s exist." % gene_list_file)
        return 1

    all_ok = True
    with fin:
        # Start from a clean slate (equivalent of "rm -f").
        if os.path.lexists(output_file):
            os.remove(output_file)

        rows = (line.strip() for line in fin)

        # The first line is the header; an empty file produces no output file.
        if not next(rows, ""):
            return 0

        # Append mode: each sed child process writes straight to this handle.
        with open(output_file, "ab") as out:
            for row in rows:
                # A blank line ends processing, as it always has.
                if not row:
                    break
                gene_name = extract_gene_name(row)
                if gene_name is None:
                    sys.stderr.write("Warning: no gene name in row, skipped: %s\n" % row)
                    continue
                # Argument list instead of a shell string: paths and names
                # with spaces or quotes can no longer break the command.
                status = subprocess.call(
                    ["sed", "-n", "-e", build_sed_script(gene_name), "--", annotated_file],
                    stdout=out,
                )
                if status != 0:
                    all_ok = False

    return 0 if all_ok else 1


if __name__ == "__main__":
    sys.exit(main(sys.argv))