FastQInfoSoupScraper.py
#! /usr/bin/python
"""FastQInfoSoupScraper
A script for scraping the SRA information from the search engine of ngdc.cncb.ac.cn.
To run this script, the python packages requests and bs4 are required.
"""
import sys
import csv
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
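# Assumption about the local environment (not part of the original script): if the
# imports above fail, the dependencies can usually be installed with
#   pip install requests beautifulsoup4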
# Define the URL to the first page of the search results
start_url = "https://ngdc.cncb.ac.cn/search/?dbId=gsa&q=cancer%20cell%20free%20methylation&page=1"
# A dictionary storing the SRA information: the keys are BioProject accessions, the values are
# lists containing the accession numbers of the individual runs. Each BioProject may refer to
# several runs, while one run refers to exactly one BioProject.
sra_info = defaultdict(list)
# A list containing the URLs of all result pages
all_urls = []
# A list containing the experiment URLs collected from the result pages
urls_in_one_page = []
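# Illustrative sketch only (the accessions below are made up, not scraped): after a
# successful run, sra_info is expected to look like
#   {"PRJCA000001": ["CRR000001", "CRR000002"], "PRJCA000002": ["CRR000010"]}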
def find_page_numbers(url):
    """Find out how many pages the search results occupy

    Parameters
    ----------
    url : str
        A URL to any page of the results

    Returns
    -------
    int
        The number of result pages
    """
    # Get the page object
    page = requests.get(url)
    # Build a soup object from page.content
    soup = BeautifulSoup(page.content, "html.parser")
    # Find the element which contains the page-count information by its class name
    element = soup.find(class_="disabled")
    # The pager text has the form "current/total"; return the total
    return int(element.get_text().split("/")[1])
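# Usage sketch for find_page_numbers (assumes the pager element still renders its
# text as "current/total", e.g. "1/12"); illustrative, not part of the pipeline:
#   total_pages = find_page_numbers(start_url)  # -> e.g. 12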
def extract_all_urls(page_numbers):
    """Generate the URLs for all result pages

    Parameters
    ----------
    page_numbers : int
        The number of result pages
    """
    for i in range(1, page_numbers + 1):
        tmp_url = f"https://ngdc.cncb.ac.cn/search/?dbId=gsa&q=cancer%20cell%20free%20methylation&page={i}"
        if tmp_url not in all_urls:
            all_urls.append(tmp_url)
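# For example, extract_all_urls(3) would fill all_urls with the "&page=1",
# "&page=2" and "&page=3" variants of the query URL defined in start_url.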
def urls_in_page(curr_url):
    """One result page lists URLs to different experiments. This function extracts those
    URLs and stores them in urls_in_one_page.

    Parameters
    ----------
    curr_url : str
        A URL to one result page
    """
    # Get the current page
    curr_page = requests.get(curr_url)
    # Build a soup object from curr_page.content
    curr_soup = BeautifulSoup(curr_page.content, "html.parser")
    # Since all the URLs are wrapped in <h4> tags, use find_all
    elements = curr_soup.find_all("h4")
    # Iterate over the <h4> elements
    for ele in elements:
        # Take out the <a> wrapped in the <h4>
        child = ele.find("a")
        # Extract the hyperlink from the <a> tag
        href = child.get("href")
        if href not in urls_in_one_page:
            print(href)
            urls_in_one_page.append(href)
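# Note: the hrefs collected above are assumed to be absolute URLs. If the site ever
# returned relative links, they would need to be resolved first, e.g.:
#   from urllib.parse import urljoin
#   href = urljoin(curr_url, child.get("href"))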
def extract_sra_info(curr_url):
    """Extract the SRA information from an experiment page and store it in the dictionary sra_info

    Parameters
    ----------
    curr_url : str
        A URL to an experiment page
    """
    print(curr_url)
    # Get the page
    sra_page = requests.get(curr_url)
    # Build a BeautifulSoup object
    sra_soup = BeautifulSoup(sra_page.content, "html.parser")
    # Find the table which contains the useful information
    table = sra_soup.find("table", class_="table table-left table2-border")
    # Initialise the fields so they are defined even if a row is missing
    project = None
    is_hs = False
    hiseq = False
    # Iterate over the table rows
    for row in table.find_all("tr"):
        # Since there are nested tables, we have to use find_all to get all
        # the <th> tags
        ths = row.find_all("th")
        for th in ths:
            text = th.get_text()
            if text == "BioProject":
                project = row.find("td").get_text()
            if text == "Organism":
                # A flag indicating whether this sample is Homo sapiens
                is_hs = (row.find("td").get_text() == "Homo sapiens")
            if text == "Platform":
                # A flag indicating whether this sample was generated on an Illumina platform
                hiseq = ("Illumina" in row.find("td").get_text())
            if text == "Run":
                sra_number = row.find("a").get_text()
                if is_hs and hiseq and project:
                    print(project)
                    print(sra_number)
                    try:
                        sra_info[project].append(sra_number)
                        print(sra_info)
                    except AttributeError:
                        #print(sra_info)
                        sys.exit(1)
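# Illustrative only: an experiment page is expected to contain header/value rows such as
#   BioProject | PRJCA0xxxxx
#   Organism   | Homo sapiens
#   Platform   | Illumina HiSeq X Ten
#   Run        | CRR0xxxxx
# so the Organism and Platform flags are set before the Run row is reached.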
# Extract the URLs of all the result pages
extract_all_urls(find_page_numbers(start_url))
# Extract the experiment URLs listed on each page
for url in all_urls:
    urls_in_page(url)
# Extract the SRA information from each experiment page
for url in urls_in_one_page:
    extract_sra_info(url)
# Write the values (the run lists) into files named after the keys (the BioProjects)
for key, value in sra_info.items():
    with open(f"{key}.csv", "w") as file:
        wr = csv.writer(file, delimiter=",")
        wr.writerow(value)
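# Resulting output (illustrative): one CSV per BioProject, e.g. PRJCA000001.csv,
# containing a single comma-separated row of that project's run accessions, such as
#   CRR000001,CRR000002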