希望大家一切都好!我正在嘗試抓取這份譜系列表(https://cov-lineages.org/lineage_list.html),其中各譜系之間存在父子關係。我需要做的事情如下:
- 遍歷串列(這個https://cov-lineages.org/lineage_list.html)并單擊每個元素刮取其資料
- 然後轉到(同一頁面中)包含每個譜系突變表的鏈接,並抓取該表的資料,
- 向下滾動到列出該譜系子代的表格,循環遍歷這些子代,單擊其中的每一個並抓取其資料;如果某個子代還有自己的子代,也應對它們執行相同的流程並抓取資料。我在 PDF 檔案中用螢幕截圖作了說明,請看一下,看看您能否想出如何用樹或嵌套字典來實作。
uj5u.com熱心網友回復:
你不需要 Selenium 來執行這個任務,requests就可以完成這項作業。
此代碼將獲取串列中的所有行:
import requests
from bs4 import BeautifulSoup
# Download the lineage list page. The timeout stops a hung connection from
# blocking forever, and raise_for_status() turns an HTTP error into an
# exception instead of letting an error page be parsed.
res = requests.get('https://cov-lineages.org/lineage_list.html', timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')
# Collect every <tr> element in the document and print each one.
rows = soup.find_all('tr')
for row in rows:
    print(row)
從這裡您可以使用 row.find_all('td') 取得各個儲存格。使用瀏覽器的檢查器(CTRL+SHIFT+I)來識別所需的 HTML 元素。
uj5u.com熱心網友回復:
資料都在網站用來渲染頁面的 JSON 來源中。直接獲取這些資料即可,效率高得多:只需幾秒鐘就能取得您原本要用 Selenium 抓取的全部資料,而不必花上幾個小時讓 Selenium 逐一點擊 1907 個父鏈接,再點擊其下的子鏈接(我甚至不確定確切數量,但看起來總共要讓 Selenium 點擊 2181 個左右的鏈接)。
就將其轉換為該輸出而言,計算邏輯并確定哪些血統是哪些父母的后代,然后從葉節點向上構造它有點棘手。而且我確信有更好的方法來編碼它,但我認為這可以做到:
import requests
import pandas as pd
import re
# Source data
# Download the per-lineage records and reshape each one into a tree node
# (a dict with an initially empty 'children' list), keyed by lineage id.
_url = 'https://raw.githubusercontent.com/cov-lineages/lineages-website/master/_data/lineage_data.json'
_json_response = requests.get(_url, timeout=30)
# Fail on an HTTP error here rather than with a confusing JSON decode error.
_json_response.raise_for_status()
# Only the record values are needed; the top-level JSON keys are discarded.
jsonData = list(_json_response.json().values())
sourceData = {}
for _each in jsonData:
    _lineage = _each['Lineage']
    sourceData[_lineage] = {
        'id': _lineage,
        'description': _each['Description'],
        'most_common_countries': _each['Countries'],
        'earliest_date': _each['Earliest date'],
        'number_designated': _each['Number designated'],
        'number_assigned': _each['Number assigned'],
        'children': [],
    }
# This parses the yml file to work out which child belongs to which parent
_url = 'https://cov-lineages.org/data/lineages.yml'
_yml_response = requests.get(_url, timeout=30)
_yml_response.raise_for_status()
_response = _yml_response.text
# Every match is a (key, value) tuple in document order, where key is
# 'name: ' or 'parent: ' and value is the rest of that line.
_lineages = re.findall('(name: |parent: )(.*)', _response)
parent_children = {}

# Create dictionary of all parent lineages
for _idx, (_key, _value) in enumerate(_lineages):
    if _key == 'parent: ' and _value != '':
        parent_children.setdefault(_value, {'children': []})
    # An empty value registers the preceding entry as a parent too
    # (presumably a root lineage whose own 'parent: ' line is blank).
    # The _idx > 0 guard stops index -1 from wrapping round to the last entry.
    if _value == '' and _idx > 0:
        parent_children.setdefault(_lineages[_idx - 1][1], {'children': []})

# Match parent with appropriate children: an entry directly followed by a
# non-empty 'parent: ' line is a child of the lineage named on that line.
for (_key, _value), (_next_key, _next_value) in zip(_lineages, _lineages[1:]):
    if _next_key == 'parent: ' and _next_value != '':
        parent_children[_next_value]['children'].append(_value)
# Creates a list and dictionary so that I can call out the parent
# given a child by its key/lineage id
parent_child_relations = [
    [parent, child]
    for parent, children in parent_children.items()
    for child in children['children']
]
# If a child is listed under several parents, the last pair seen wins,
# exactly as repeated dict.update() calls would behave.
child_parent_relations = {child: parent for parent, child in parent_child_relations}
# Creates the "family tree" of each child to then iterate through:
# maps every child to the list of its ancestors, nearest parent first.
nested_child_parent = {}
for each in child_parent_relations:
    familyOrder = []
    # Lineages already met on this walk; guards against a cyclic
    # parent/child relation, which would otherwise never terminate.
    _seen = {each}
    current = each
    # Climb until we reach a lineage that is nobody's child.
    while current in child_parent_relations:
        belong_to = child_parent_relations[current]
        if belong_to in _seen:
            break
        _seen.add(belong_to)
        familyOrder.append(belong_to)
        current = belong_to
    nested_child_parent[each] = familyOrder
# Sorts that list from the "deepest" branches so that I can
# reconstruct from bottom leaf
sorted_nested_child_parent = {}
for each in nested_child_parent.items():
    # each is a (child, ancestors) pair; group the pairs by ancestor count.
    sorted_nested_child_parent.setdefault(len(each[-1]), []).append(each)
# Ancestor counts in descending order, so the deepest branches come first.
lengthKeys = sorted(sorted_nested_child_parent, reverse=True)
def _ensure_node(lineage_id):
    """Return the sourceData node for lineage_id, adding an 'NA' placeholder if absent."""
    if lineage_id not in sourceData:
        sourceData[lineage_id] = {
            'id': lineage_id,
            'description': 'NA',
            'most_common_countries': 'NA',
            'earliest_date': 'NA',
            'number_designated': 'NA',
            'number_assigned': 'NA',
            'children': [],
        }
    return sourceData[lineage_id]


# Starts to add the children lineage data into appropriate parent's children list
# in the source data
for x in lengthKeys:
    for each in sorted_nested_child_parent[x]:
        # each is (child, ancestors); walk upwards linking every node to
        # the node directly above it.
        current = each[0]
        for parent in each[1]:
            # Both ends of the link get a placeholder when the JSON has no
            # record for them, so a missing leaf no longer raises KeyError.
            lineageData = _ensure_node(current)
            parent_node = _ensure_node(parent)
            # if lineageData not already in children, add it
            if lineageData not in parent_node['children']:
                parent_node['children'].append(lineageData)
            current = parent
# Gets the list of the main/top lineages
mainNodes = []
parent_list = list(pd.read_html('https://cov-lineages.org/lineage_list.html')[0]['Lineage'])
for each in parent_list:
    # A lineage that never appears as a child has no parent: it is a top node.
    # An explicit membership test replaces the bare except used as control flow.
    if each not in child_parent_relations:
        print(f'{each} is not a child.')
        mainNodes.append(each)
# Gets the main/top lineages from the source data
# and puts into the output list
output = [sourceData[each] for each in mainNodes]
樣本輸出:
[
{
"id": "A",
"description": "Root of the pandemic lies within lineage A. Many sequences originating from China and many global exports; including to South East Asia Japan South Korea Australia the USA and Europe represented in this lineage",
"most_common_countries": "United States of America 27.0%, United_Arab_Emirates 12.0%, China 9.0%, Germany 8.0%, Canada 5.0%",
"earliest_date": "2019-12-30",
"number_designated": 1698,
"number_assigned": 2317,
"children": [
{
"id": "B",
"description": "Second major haplotype (and first to be discovered)",
"most_common_countries": "United States of America 37.0%, United Kingdom 20.0%, China 7.0%, Mexico 6.0%, Germany 3.0%",
"earliest_date": "2019-12-24",
"number_designated": 4009,
"number_assigned": 9162,
"children": [
{
"id": "B.1",
"description": "A large European lineage the origin of which roughly corresponds to the Northern Italian outbreak early in 2020.",
"most_common_countries": "United States of America 46.0%, United Kingdom 8.0%, Turkey 8.0%, Canada 4.0%, France 4.0%",
"earliest_date": "2020-01-03",
"number_designated": 46252,
"number_assigned": 95711,
"children": [
{
"id": "B.1.1",
"description": "European lineage with 3 clear SNPs `28881GA`,`28882GA`,`28883GC`",
"most_common_countries": "United Kingdom 27.0%, United States of America 14.0%, Japan 7.0%, Russia 5.0%, Turkey 4.0%",
"earliest_date": "2020-01-08",
"number_designated": 22834,
"number_assigned": 49224,
"children": [
{
"id": "B.1.1.1",
"description": "England",
"most_common_countries": "United Kingdom 53.0%, Peru 10.0%, Belgium 4.0%, United States of America 3.0%, Italy 2.0%",
"earliest_date": "2020-03-02",
"number_designated": 1745,
"number_assigned": 2913,
"children": [
{
"id": "C.36",
"description": "Alias of B.1.1.1.36, Egypt mainly and other countries",
"most_common_countries": "Egypt 33.0%, Germany 11.0%, United Kingdom 10.0%, United States of America 7.0%, Denmark 6.0%",
"earliest_date": "2020-03-13",
"number_designated": 220,
"number_assigned": 1042,
"children": [
{
"id": "C.36.3",
"description": "Alias of B.1.1.1.36.3, Europe and USA lineage, from pango-designation issue #80",
"most_common_countries": "Germany 18.0%, United States of America 18.0%, Switzerland 9.0%, Italy 8.0%, United Kingdom 7.0%",
"earliest_date": "2021-01-04",
"number_designated": 493,
"number_assigned": 1681,
"children": [
{
"id": "C.36.3.1",
"description": "Alias of B.1.1.1.36.3.1, Europe and USA lineage, from pango-designation issue #80",
"most_common_countries": "Germany 64.0%, United States of America 18.0%, Belgium 9.0%, Bulgaria 3.0%, Netherlands 3.0%",
"earliest_date": "2021-03-29",
"number_designated": 54,
"number_assigned": 324,
"children": []
}
]
},
{
"id": "C.36.1",
"description": "Alias of B.1.1.1.36.1, Canada",
"most_common_countries": "Canada 97.0%, United States of America 2.0%, Burkina_Faso 1.0%, Egypt 1.0%",
"earliest_date": "2020-06-24",
"number_designated": 21,
"number_assigned": 199,
"children": []
},
{
"id": "C.36.2",
"description": "Alias of B.1.1.1.36.2, Switzerland",
"most_common_countries": "Switzerland 80.0%, Norway 7.0%, Germany 3.0%, United States of America 3.0%, Sweden 3.0%",
"earliest_date": "2020-10-16",
"number_designated": 18,
"number_assigned": 30,
"children": []
}
]
},
{
"id": "C.1",
"description": "Alias of B.1.1.1.1, South Africa",
"most_common_countries": "South_Africa 91.0%, Zambia 4.0%, United States of America 3.0%, Mozambique 1.0%, Zimbabwe 0.0%",
"earliest_date": "2020-01-03",
"number_designated": 242,
"number_assigned": 351,
"children": [
{
"id": "C.1.1",
"description": "Alias of B.1.1.1.1.1, Mozambique",
"most_common_countries": "Mozambique 100.0%",
"earliest_date": "2020-11-25",
"number_designated": 12,
"number_assigned": 13,
"children": []
},
{
"id": "C.1.2",
"description": "Alias of B.1.1.1.1.2, mostly South Africa, from pango-designation issue #139",
"most_common_countries": "South_Africa 88.0%, Eswatini 4.0%, Russia 2.0%, United Kingdom 1.0%, Botswana 1.0%",
"earliest_date": "2021-04-07",
"number_designated": 15,
"number_assigned": 281,
"children": []
}
]
},
{
"id": "C.2",
"description": "Alias of B.1.1.1.2, South Africa and some European",
"most_common_countries": "South_Africa 44.0%, Zimbabwe 32.0%, Denmark 8.0%, United Kingdom 8.0%, Australia 6.0%",
"earliest_date": "2020-06-09",
"number_designated": 25,
"number_assigned": 50,
"children": [
{
"id": "C.2.1",
"description": "Alias of B.1.1.1.2.1, Aruba and Curacao",
"most_common_countries": "Aruba 60.0%, United States of America 28.0%, Cura\u00e7ao 9.0%, Netherlands 3.0%, Finland 1.0%",
"earliest_date": "2020-12-18",
"number_designated": 58,
"number_assigned": 150,
"children": []
}
]
}
uj5u.com熱心網友回復:
這是我的代碼:我先照常抓取表格,然後進入每個譜系的頁面,只取該譜系的名稱,再轉到 Outbreak.info 網站抓取突變資料。之後,我嘗試用這些資料構建樹,但我不確定缺少了什麼。
import json
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
# Raw string: the backslashes in the Windows path are literal characters,
# not (invalid) escape sequences.
driverPath = r"C:\Program Files (x86)\chromedriver.exe"
# NOTE(review): passing the driver path positionally may not be accepted by
# newer Selenium releases — confirm against the installed version.
driver = webdriver.Chrome(driverPath)
driver.get("https://cov-lineages.org/lineage_list.html")
# Locate data from the table, one element list per column.
lineages = driver.find_elements(By.XPATH, '//tbody/tr/td[1]/a')
mostCommonCountries = driver.find_elements(By.XPATH, '//tbody/tr/td[2]')
earliestDate = driver.find_elements(By.XPATH, '//tbody/tr/td[3]')
designatedNumbers = driver.find_elements(By.XPATH, '//tbody/tr/td[4]')
assignedNumbers = driver.find_elements(By.XPATH, '//tbody/tr/td[5]')
descriptions = driver.find_elements(By.XPATH, '//tbody/tr/td[6]')
# One dict per table row, pairing the cells column by column.
# zip stops at the shortest column list.
result = [
    {
        'Lineages': lineage.text,
        'mostCommonCountries': countries.text,
        'earliestDate': earliest.text,
        'designatedNumbers': designated.text,
        'assignedNumbers': assigned.text,
        'descriptions': description.text,
    }
    for lineage, countries, earliest, designated, assigned, description in zip(
        lineages,
        mostCommonCountries,
        earliestDate,
        designatedNumbers,
        assignedNumbers,
        descriptions,
    )
]
parent_xpath = "//*[@id='pageTitle']"
outbreak_id = "outbreakLink"
show_button_xpath = "//*[@id='definition']/div/div[3]/button"
outbreak_data_xpath = "//*[@id='mutation-table']/div/div/div/table"
# Collect the hrefs up front, before the driver navigates away from the
# list page.
links = [item.get_attribute('href') for item in driver.find_elements(By.XPATH, '//tbody/tr/td[1]/a')]
# The wait helper is loop-invariant, so build it once.
wait = WebDriverWait(driver, 100)
# result and links are both built from the same XPath in the same order, so
# zip pairs every scraped row with its own detail page. The scraped values
# are written into that row's dict; the original indexed the href string
# itself, which raises TypeError.
for record, link in zip(result, links):
    driver.get(link)
    try:
        parent = driver.find_element(By.XPATH, parent_xpath).text
        lineage_string = 'Lineage'
        # strip() drops the whitespace left behind once the word is removed.
        name = parent.replace(lineage_string, '').strip()
        print(name)
    except Exception as e:
        # Best effort: a page without the title element is still scraped,
        # it just ends up without a name (and never reuses a stale one).
        parent = None
        name = None
        print(e)
    # Locate View more information at Outbreak.info- href
    outbreak_link = driver.find_element(By.ID, outbreak_id)
    # click on the href
    outbreak_link.click()
    # now we're on the outbreak website
    # View mutation table
    button_to_show = wait.until(EC.element_to_be_clickable((By.XPATH, show_button_xpath)))
    button_to_show.click()
    outbreak_data = driver.find_element(By.XPATH, outbreak_data_xpath)
    # NOTE(review): pageTitle is read from the lineage's own page, so `name`
    # is presumably that lineage's own name rather than its parent's —
    # confirm before relying on 'parent' to build the tree.
    record['parent'] = name
    # Keep the table's text, not the WebElement: a WebElement cannot be
    # serialised by json.dumps.
    record['outbreak_data'] = outbreak_data.text
# # ------- Building the tree
def generate_tree(root_lineage):
    """Build the nested tree rooted at ``root_lineage``.

    Looks up the record in the module-level ``result`` list whose
    'Lineages' value equals ``root_lineage``, then recursively attaches,
    under its 'children' key, the trees of every record whose 'parent'
    value equals ``root_lineage``. Records in ``result`` are modified in
    place.

    Args:
        root_lineage: Lineage name to use as the root of the (sub)tree.

    Returns:
        The matching record dict from ``result`` with 'children' filled in.

    Raises:
        IndexError: If no record in ``result`` is named ``root_lineage``.
    """
    matches = [item for item in result if item.get('Lineages') == root_lineage]
    if not matches:
        # Same exception type as indexing an empty list, but with a message
        # that says which lineage was missing.
        raise IndexError(f'no lineage named {root_lineage!r} in result')
    parent_dict = matches[0]
    parent_dict['children'] = []
    children_names = [
        item.get('Lineages')
        for item in result
        if item.get('parent') == root_lineage
    ]
    for child_name in children_names:
        parent_dict['children'].append(generate_tree(child_name))
    return parent_dict
# Every record without a (truthy) 'parent' value is the root of its own tree.
root_lineages_names = [
    item.get('Lineages')
    for item in result
    if not item.get('parent')
]
my_tree = [
    generate_tree(root_lineage_name)
    for root_lineage_name in root_lineages_names
]
# Serialise my_tree to JSON text and print it (nothing is written to disk).
json_format = json.dumps(my_tree)
print(json_format)
轉載請註明出處,本文鏈接:https://www.uj5u.com/qiye/444123.html
