-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathFastQInfoScraper.py
More file actions
86 lines (69 loc) · 2.91 KB
/
FastQInfoScraper.py
File metadata and controls
86 lines (69 loc) · 2.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import selenium.webdriver as webdriver
from selenium.webdriver.common.by import By
def get_links(search_string):
"""Gets the links to the databases based on the search_string
Parameters
----------
search_string : str
The searching keywords. Multiple keywords should be separated
by whitespaces.
Returns
-------
dictionary
a dictionary with the key values are the names of the database,
the values are links to the corresponding databases
"""
url = "https://ngdc.cncb.ac.cn/"
# We use the Google Chrome as our driver
browser = webdriver.Chrome()
browser.get(url)
# Locates the search box using id
search_box = browser.find_element(By.ID, "q")
# Fills in our search_string and submit the string to the search
# engine
search_box.send_keys(search_string)
search_box.submit()
# Fetches the rows of the table with id 'DataTables_Table_0'
entries = browser.find_elements(By.XPATH, "//*[@id='DataTables_Table_0']/tbody/tr")
# Define a dictionary to store the links
results = {}
# A for loop iterates the rows. Each row is a WebElement
for entry in entries:
# An entry in fact is a <tr> in a table, and since a hyperlink locates in an
# <a> tag, we have to fetch the <a> WebElement using find_element by tag name
cell = entry.find_element(By.TAG_NAME, "a")
# A cell is also a WebElement. It has an attribute "href". So, we can extract
# out the link using get_attribute
href = cell.get_attribute("href")
results[cell.text] = href
browser.close()
return results
def gsa_iteration(gsa_link):
"""Iterates the pages of GSA (Gene Sequence Archive) database and extract the experiment
information as well as the SRA accession numbers
Parameters
----------
gsa_link : str
The link to the page for GSA searching results
Returns
-------
dictionary
a dictionary stores the experimental accession numbers and the SRA accession numbers.
with the keys are the experimental accession numbers, the values are lists contain
SRA accession numbers. one experimental accession numbers could refer to multiple SRA
accession numbers, but one SRA accession number could only refer to one experimental
accession number.
"""
# We use the Google Chrome as our driver
browser = webdriver.Chrome()
browser.get(gsa_link)
# Extract all the <h4> elements. Because the links to the experiments are put in <h4> elements
# as <h4><a href="link">text</a></h4>
elements = browser.find_elements(By.XPATH, "//*[@class='ui segment']/h4")
for ele in elements:
info = ele.find_element(By.TAG_NAME, "a")
print(info.text)
href = info.get_attribute("href")
print(href)
results = get_links("cancer cell free methylation")
gsa_iteration(results["GSA"])