FastQInfoSoupScraper.py
#! /usr/bin/python
"""FastQInfoSoupScraper
A script for scraping the SRA information from the search engine of ngdc.cncb.ac.cn.
To run this script, the python packages requests and bs4 are required.
"""
import sys
import csv
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
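# Assumption about the local environment (not part of the original script): if the
# imports above fail, the dependencies can usually be installed with
#   pip install requests beautifulsoup4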
# Define the URL to the first page of the search results
start_url = "https://ngdc.cncb.ac.cn/search/?dbId=gsa&q=cancer%20cell%20free%20methylation&page=1"
# A dictionary storing the SRA information: the keys are BioProject accessions, the values are
# lists containing the accession numbers of the individual runs. Each BioProject may refer to
# several runs, while one run refers to exactly one BioProject.
sra_info = defaultdict(list)
# A list containing the URLs of all result pages
all_urls = []
# A list containing the experiment URLs collected from the result pages
urls_in_one_page = []
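# Illustrative sketch only (the accessions below are made up, not scraped): after a
# successful run, sra_info is expected to look like
#   {"PRJCA000001": ["CRR000001", "CRR000002"], "PRJCA000002": ["CRR000010"]}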
def find_page_numbers(url):
    """Find out how many pages the search results occupy

    Parameters
    ----------
    url : str
        A URL to any page of the results

    Returns
    -------
    int
        The number of result pages
    """
    # Get the page object
    page = requests.get(url)
    # Build a soup object from page.content
    soup = BeautifulSoup(page.content, "html.parser")
    # Find the element which contains the page-count information by its class name
    element = soup.find(class_="disabled")
    # The pager text has the form "current/total"; return the total
    return int(element.get_text().split("/")[1])
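# Usage sketch for find_page_numbers (assumes the pager element still renders its
# text as "current/total", e.g. "1/12"); illustrative, not part of the pipeline:
#   total_pages = find_page_numbers(start_url)  # -> e.g. 12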
def extract_all_urls(page_numbers):
    """Generate the URLs for all result pages

    Parameters
    ----------
    page_numbers : int
        The number of result pages
    """
    for i in range(1, page_numbers + 1):
        tmp_url = f"https://ngdc.cncb.ac.cn/search/?dbId=gsa&q=cancer%20cell%20free%20methylation&page={i}"
        if tmp_url not in all_urls:
            all_urls.append(tmp_url)
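# For example, extract_all_urls(3) would fill all_urls with the "&page=1",
# "&page=2" and "&page=3" variants of the query URL defined in start_url.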
def urls_in_page(curr_url):
    """One result page lists URLs to different experiments. This function extracts those
    URLs and stores them in urls_in_one_page.

    Parameters
    ----------
    curr_url : str
        A URL to one result page
    """
    # Get the current page
    curr_page = requests.get(curr_url)
    # Build a soup object from curr_page.content
    curr_soup = BeautifulSoup(curr_page.content, "html.parser")
    # Since all the URLs are wrapped in <h4> tags, use find_all
    elements = curr_soup.find_all("h4")
    # Iterate over the <h4> elements
    for ele in elements:
        # Take out the <a> wrapped in the <h4>
        child = ele.find("a")
        # Extract the hyperlink from the <a> tag
        href = child.get("href")
        if href not in urls_in_one_page:
            print(href)
            urls_in_one_page.append(href)
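# Note: the hrefs collected above are assumed to be absolute URLs. If the site ever
# returned relative links, they would need to be resolved first, e.g.:
#   from urllib.parse import urljoin
#   href = urljoin(curr_url, child.get("href"))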
def extract_sra_info(curr_url):
    """Extract the SRA information from an experiment page and store it in the dictionary sra_info

    Parameters
    ----------
    curr_url : str
        A URL to an experiment page
    """
    print(curr_url)
    # Get the page
    sra_page = requests.get(curr_url)
    # Build a BeautifulSoup object
    sra_soup = BeautifulSoup(sra_page.content, "html.parser")
    # Find the table which contains the useful information
    table = sra_soup.find("table", class_="table table-left table2-border")
    # Initialise the fields so they are defined even if a row is missing
    project = None
    is_hs = False
    hiseq = False
    # Iterate over the table rows
    for row in table.find_all("tr"):
        # Since there are nested tables, we have to use find_all to get all
        # the <th> tags
        ths = row.find_all("th")
        for th in ths:
            text = th.get_text()
            if text == "BioProject":
                project = row.find("td").get_text()
            if text == "Organism":
                # A flag indicating whether this sample is Homo sapiens
                is_hs = (row.find("td").get_text() == "Homo sapiens")
            if text == "Platform":
                # A flag indicating whether this sample was generated on an Illumina platform
                hiseq = ("Illumina" in row.find("td").get_text())
            if text == "Run":
                sra_number = row.find("a").get_text()
                if is_hs and hiseq and project:
                    print(project)
                    print(sra_number)
                    try:
                        sra_info[project].append(sra_number)
                        print(sra_info)
                    except AttributeError:
                        #print(sra_info)
                        sys.exit(1)
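# Illustrative only: an experiment page is expected to contain header/value rows such as
#   BioProject | PRJCA0xxxxx
#   Organism   | Homo sapiens
#   Platform   | Illumina HiSeq X Ten
#   Run        | CRR0xxxxx
# so the Organism and Platform flags are set before the Run row is reached.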
# Extract the URLs of all the result pages
extract_all_urls(find_page_numbers(start_url))
# Extract the experiment URLs listed on each page
for url in all_urls:
    urls_in_page(url)
# Extract the SRA information from each experiment page
for url in urls_in_one_page:
    extract_sra_info(url)
# Write the values (the run lists) into files named after the keys (the BioProjects)
for key, value in sra_info.items():
    with open(f"{key}.csv", "w") as file:
        wr = csv.writer(file, delimiter=",")
        wr.writerow(value)
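# Resulting output (illustrative): one CSV per BioProject, e.g. PRJCA000001.csv,
# containing a single comma-separated row of that project's run accessions, such as
#   CRR000001,CRR000002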