This repository has been archived by the owner on Jun 14, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_orthologs.py
54 lines (41 loc) · 1.98 KB
/
parse_orthologs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python
#### Description: Parse gene set of each sample into separate genes
#### ... and group orthologous genes of all samples in a fasta per gene.
#### ... Takes GffRead gene sets as input and generates orthologs for downstream local realignment with PRANK.
#### List file: The absolute path for the gene set of each sample in order, one per line.
#### Use: parse_orthologs.py DATASETXXX.list
#### Written by: Henrique V. Figueiró - [email protected] as prankAlignment.py
#### Adapted by Jonas Lescroart on 24 May 2020.
#### Following changes were made:
#### 1/ number of input gene sets is determined from list file rather than hardcoded in script;
#### 2/ If genomic coordinates are available in header from preceding gffread analysis,
#### ... these are now propagated. Switch comment flag on lines 52-53 to disable.
from Bio import SeqIO
import sys
import os
#Open dataset.list file
dataset = open(sys.argv[1], 'r')
lines = dataset.readlines()
dataset_name = sys.argv[1].split('.')[0]
#Create dataset and windows folders
if not os.path.exists(os.getcwd() + '/fasta/'):
os.makedirs(os.getcwd() + '/fasta')
chr_dir = os.getcwd() + '/fasta/'
#Read gene set filenames and get species/sample names
names = list()
for i in range(len(lines)):
line = str(lines[i].rstrip())
name = str(line.split('/')[-1].split('.')[0])
names.append(name)
print(names[i])
#Create each orthologous gene fasta file and append ortholog from subsequent samples
for i in range(len(lines)):
with open(str(lines[i].rstrip()), 'r') as handle:
for record in SeqIO.parse(handle, 'fasta'):
with open(chr_dir + record.id + ".fa", 'a') as out_file:
coordinates = ' '.join(record.description.split(' ')[1:])
seq_name = str('>' + names[i])
seq = str(record.seq)
#out_file.write(seq_name + '\n' + seq + '\n')
out_file.write(seq_name + ' ' + coordinates + '\n' + seq + '\n')
out_file.close()