-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExp3.py
executable file
·107 lines (106 loc) · 4.84 KB
/
Exp3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 13 13:10:08 2020
@author: pc
"""
# =============================================================================
# a,b=eval(input("key in 2 numbers as a,b:"))
# print(a,'+',b,'=',a+b)
# print(a,'-',b,'=',a-b)
# print(a,'*',b,'=',a*b)
# print(a,'/',b,'=',round(a/b,2))
# =============================================================================
# =============================================================================
# a=eval(input('input a int less than 255'))
# print('binary:{:08b}'.format(a))
# =============================================================================
# =============================================================================
# insulin_seq="MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN"
# for amino_acid in "ACDEFGHIKLMNPQRSTVWY":
# number=insulin_seq.count(amino_acid)
# print(amino_acid,number)
# =============================================================================
# =============================================================================
# a=eval(input("Input a int between 0,255:"))
# print(bin(a)+'\n'+hex(a)+'\n'+oct(a))
# =============================================================================
# =============================================================================
# a="a b c d e"
# print(a)
# print(a.replace(' ','\n'))
# =============================================================================
# =============================================================================
# a='123456789'
# n=len(a)
# m=2
# while m<=n:
# print(' '*(21-m),end='')
# print('{}{}'.format(a[:m], a[m-2::-1]))
# m+=1
# =============================================================================
# =============================================================================
# n=5;m=6
# while n>=1:
# print('{}{}'.format(' '*m,'*'*(2*n-1)))
# n-=1
# m+=1
# =============================================================================
# =============================================================================
# temp='rfh4289ogiqwhg21r9w8hg[0843hB4Gm20mrdiewruwuf2inut24n89n34803'
# macchar=max(list(temp))
# print(macchar)
# templist=[]
# for i in range(len(temp)):
# if temp[i]==macchar:
# templist.append(str(i))
# print(templist)
# =============================================================================
# =============================================================================
# a=input('Input a word:')
# print('{:*^30}'.format(a))
# =============================================================================
# =============================================================================
# ATP=3.5;ADP=1.8;Pi=5.0;R=0.00831;T=298;ΔG0=-30.5;
# import math
# print('ΔG=%.2f kcal/mol'%float((ΔG0+R*T*math.log(ADP*Pi/ATP))/4.184))
# =============================================================================
import re
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
# Interactive script: search NCBI Gene for a human gene name, collect every
# RefSeq protein link (/protein/NP_...) found on the result page, download the
# GenPept view of each protein, then print its amino-acid sequence followed by
# a per-residue count.

# One-letter codes of the 20 standard amino acids, each listed exactly once so
# that no residue is reported twice.
AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"
NCBI_ROOT = 'https://www.ncbi.nlm.nih.gov'
# Seconds to wait on any single HTTP request before giving up instead of
# hanging forever.
REQUEST_TIMEOUT = 30

# Accession links look like /protein/NP_000198.1; the dot is escaped and the
# version may have several digits (or be absent).
PROTEIN_LINK_RE = re.compile(r'/protein/NP_[0-9]+(?:\.[0-9]+)?')
# Numeric id that the protein page publishes in a <meta> tag; it is the `id`
# argument of the sequence-viewer request below.
UID_RE = re.compile(r'(?<=<meta name="ncbi_uidlist" content=")[0-9]+(?=" />)')
# One captured group per sequence line of the GenPept HTML view (lower-case
# residues separated by whitespace).
SEQ_LINE_RE = re.compile(r'<span class="ff_line" id="\S+_\S+">([a-z\s]+)</span>')

sch_gene_name = input('Input a homo gene name you wanna search:').strip()
# Percent-encode the user's text so characters such as ' ', '&' or '#' cannot
# break or alter the query string.
sch_url = (NCBI_ROOT + '/gene?term=(homo%5BOrganism%5D)%20AND%20'
           + quote(sch_gene_name, safe='')
           + '%5BGene%20Name%5D#reference-sequences')
print('Searching...\nThis may take few centuries...')
res = requests.get(sch_url, timeout=REQUEST_TIMEOUT)
res.raise_for_status()
# NOTE(review): 'gbk' is kept from the original script; every pattern matched
# below is plain ASCII, so the choice of decoder does not affect the matches.
res.encoding = 'gbk'

# The same accession can be linked several times on one page; keep the first
# occurrence of each so that no protein is downloaded twice.
match_pts_url = list(dict.fromkeys(PROTEIN_LINK_RE.findall(res.text)))
print('matched protein number: ')
print(len(match_pts_url))
# The accession is the last path component of each link.
pt_name = [link.rsplit('/', 1)[-1] for link in match_pts_url]
print(pt_name)

print('Start downloading protein sequences')
for pt_path, accession in zip(match_pts_url, pt_name):
    print('Searching protein sequences...protein name= ' + accession)
    try:
        pt_res = requests.get(NCBI_ROOT + pt_path, timeout=REQUEST_TIMEOUT)
        pt_res.raise_for_status()
        pt_res.encoding = 'gbk'

        uid_match = UID_RE.search(pt_res.text)
        if uid_match is None:
            # Without the uid the viewer URL cannot be built; move on to the
            # next protein rather than crash on `.group()` of None.
            print('No NCBI uid found for ' + accession + ', skipped')
            continue
        match_pts = uid_match.group()

        pt_sch_url_re = (NCBI_ROOT + '/sviewer/viewer.fcgi?id=' + match_pts
                         + '&db=protein&report=genpept&conwithfeat=on&show-cdd=on&retmode=html&withmarkup=on&tool=portal&log$=seqview&maxdownloadsize=1000000')
        pt_res_re = requests.get(pt_sch_url_re, timeout=REQUEST_TIMEOUT)
        pt_res_re.raise_for_status()
        pt_res_re.encoding = 'gbk'
    except requests.RequestException as err:
        # A failure for one protein should not abort the remaining downloads.
        print('Request failed for ' + accession + ': ' + str(err))
        continue

    match_ptseqs = SEQ_LINE_RE.findall(pt_res_re.text)
    # Join the per-line chunks and drop every kind of whitespace (the pattern
    # admits tabs/newlines as well as spaces), then normalise to upper case.
    seq_content = "".join("".join(match_ptseqs).split()).upper()
    print(seq_content)
    for amino_acid in AMINO_ACIDS:
        number = seq_content.count(amino_acid)
        print(amino_acid, number)