用selenium，beautifulsoup抓取父子關系資料-有解無憂

希望大家一切都好！我正在嘗試抓取這個譜系串列（https://cov-lineages.org/lineage_list.html），其中各譜系之間存在父子關係。我需要做的是：

遍歷串列（這個https://cov-lineages.org/lineage_list.html）并單擊每個元素刮取其資料
然后轉到包含每個譜系的突變表的鏈接（在同一頁面中）并抓取其內容，
向下滾動到列出該譜系子代的表，回圈遍歷它們，單擊其中的每一個并抓取其資料；如果某個子代還有子代，也應該對它們執行相同的流程并抓取其資料。我在這里通過 pdf 檔案中的螢屏截圖進行了說明，請看一下，看看你是否能想出如何實作樹或嵌套字典。

uj5u.com熱心網友回復：

你不需要 Selenium 來執行這個任務，requests就可以完成這項作業。

此代碼將獲取串列中的所有行：

import requests
from bs4 import BeautifulSoup

# Download the lineage list page. The timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() stops it from
# parsing an HTTP error page as if it were the lineage table.
res = requests.get('https://cov-lineages.org/lineage_list.html', timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')

# Every <tr> element in the document, in document order.
rows = soup.find_all('tr')

for row in rows:
    print(row)

從這里您可以使用 row.find_all('td')。使用檢查器（CTRL+SHIFT+I）來識別所需的 html 元素。

uj5u.com熱心網友回復：

資料都在站點用來呈現頁面的 json 源中，直接獲取這些資料效率更高，很短時間內就能得到您用 Selenium 抓取的所有資料：只需要幾秒鐘，而不是幾個小時——否則 Selenium 要點擊 1907 個父鏈接中的每一個，再點擊其下的子鏈接（我什至不知道有多少……但看起來總共要點擊 2181 個左右的鏈接）。

就將其轉換為該輸出而言，計算邏輯并確定哪些血統是哪些父母的后代，然后從葉節點向上構造它有點棘手。而且我確信有更好的方法來編碼它，但我認為這可以做到：

import requests
import pandas as pd
import re


# Source data
# Download the per-lineage records and index them by lineage id, reshaped
# into the node layout used for the final tree (each node starts with an
# empty 'children' list that is filled in later).
_url = 'https://raw.githubusercontent.com/cov-lineages/lineages-website/master/_data/lineage_data.json'
_json_response = requests.get(_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to decode an error page.
_json_response.raise_for_status()
# The payload is a JSON object; only its values (the records) are needed.
jsonData = list(_json_response.json().values())
sourceData = {}

for _each in jsonData:
    _lineage = _each['Lineage']

    sourceData[_lineage] = {
            'id': _lineage,
            'description': _each['Description'],
            'most_common_countries': _each['Countries'],
            'earliest_date': _each['Earliest date'],
            'number_designated': _each['Number designated'],
            'number_assigned': _each['Number assigned'],
            'children': []}



# Pull every "name: ..." / "parent: ..." line out of the lineages YAML file.
# The file is scanned with a regex rather than a YAML parser, so _lineages is
# a flat list of (field, value) tuples in file order.
_url = 'https://cov-lineages.org/data/lineages.yml'
_response = requests.get(_url).text
_lineages = re.findall('(name: |parent: )(.*)', _response)

parent_children = {}
# Register one {'children': []} slot per lineage that can act as a parent
for _idx, (_field, _value) in enumerate(_lineages):
    # A non-empty "parent:" value names a parent lineage directly.
    if _field == 'parent: ' and _value != '' and _value not in parent_children:
        parent_children[_value] = {'children': []}

    # An empty value registers the entry just before it instead
    # (presumably a lineage that has no parent of its own -- confirm).
    if _value == '':
        _previous_value = _lineages[_idx - 1][1]
        if _previous_value not in parent_children:
            parent_children[_previous_value] = {'children': []}


# Match parent with appropriate children.
# Each entry is compared with the one that follows it: when the next entry is
# a non-empty "parent:" line, the current entry's value is recorded as a child
# of that parent.
for _idx, _lineage in enumerate(_lineages):
    _next_idx = _idx + 1
    # The last entry has no successor to look at.
    if _next_idx == len(_lineages):
        continue

    _next = _lineages[_next_idx]
    # Two consecutive "name:" entries, or an empty following value, carry no
    # parent information for the current entry.
    if (_lineage[0] == 'name: ' and _next[0] == 'name: ') or _next[-1] == '':
        continue

    if _next[0] == 'parent: ':
        parent_children[_next[-1]]['children'].append(_lineage[-1])



# Flatten parent_children into two lookup structures:
#   parent_child_relations -- list of [parent, child] pairs
#   child_parent_relations -- dict mapping a child's lineage id to its parent
parent_child_relations = []
child_parent_relations = {}
for parent, children in parent_children.items():
    for child in children['children']:
        parent_child_relations.append([parent, child])
        child_parent_relations[child] = parent


# Creates the "family tree" of each child: the list of its ancestors ordered
# from the direct parent upwards, ending at the first lineage that has no
# recorded parent.
nested_child_parent = {}
for each in child_parent_relations:
    familyOrder = []
    current = each
    # Walk up one generation at a time. An explicit membership test replaces
    # the previous bare `except:`, which would also have hidden unrelated
    # errors (including KeyboardInterrupt).
    while current in child_parent_relations:
        current = child_parent_relations[current]
        familyOrder.append(current)
    nested_child_parent[each] = familyOrder

# Bucket the (child, ancestors) pairs by how many ancestors each child has,
# then list the depths from deepest to shallowest so the tree can be rebuilt
# starting at the bottom leaves.
sorted_nested_child_parent = {}
for each in nested_child_parent.items():
    sorted_nested_child_parent.setdefault(len(each[-1]), []).append(each)

lengthKeys = sorted(sorted_nested_child_parent, reverse=True)


# Walk every child's ancestor chain, deepest chains first, and hang each
# node's record under its parent's 'children' list in sourceData.
for x in lengthKeys:
    for each in sorted_nested_child_parent[x]:
        current, ancestors = each

        for parent in ancestors:
            lineageData = sourceData[current]

            # A parent without a source record gets a placeholder node.
            if parent not in sourceData:
                sourceData[parent] = {
                    'id': parent,
                    'description': 'NA',
                    'most_common_countries': 'NA',
                    'earliest_date': 'NA',
                    'number_designated': 'NA',
                    'number_assigned': 'NA',
                    'children': []}

            # Only attach the record once per parent.
            siblings = sourceData[parent]['children']
            if lineageData not in siblings:
                siblings.append(lineageData)

            # Move one level up: the parent becomes the node to attach next.
            current = parent
            

# Gets the list of the main/top lineages: every lineage in the first table of
# the lineage list page that is not recorded as somebody's child.
mainNodes = []
parent_list = list(pd.read_html('https://cov-lineages.org/lineage_list.html')[0]['Lineage'])
for each in parent_list:
    # Plain membership test instead of a bare `except:` around a lookup whose
    # result was never used.
    if each not in child_parent_relations:
        print(f'{each} is not a child.')
        mainNodes.append(each)

# Look up the fully populated record of each main/top lineage; these records
# form the output list.
output = [sourceData[each] for each in mainNodes]

樣本輸出：

[
  {
    "id": "A",
    "description": "Root of the pandemic lies within lineage A. Many sequences originating from China and many global exports; including to South East Asia Japan South Korea Australia the USA and Europe represented in this lineage",
    "most_common_countries": "United States of America 27.0%, United_Arab_Emirates 12.0%, China 9.0%, Germany 8.0%, Canada 5.0%",
    "earliest_date": "2019-12-30",
    "number_designated": 1698,
    "number_assigned": 2317,
    "children": [
      {
        "id": "B",
        "description": "Second major haplotype (and first to be discovered)",
        "most_common_countries": "United States of America 37.0%, United Kingdom 20.0%, China 7.0%, Mexico 6.0%, Germany 3.0%",
        "earliest_date": "2019-12-24",
        "number_designated": 4009,
        "number_assigned": 9162,
        "children": [
          {
            "id": "B.1",
            "description": "A large European lineage the origin of which roughly corresponds to the Northern Italian outbreak early in 2020.",
            "most_common_countries": "United States of America 46.0%, United Kingdom 8.0%, Turkey 8.0%, Canada 4.0%, France 4.0%",
            "earliest_date": "2020-01-03",
            "number_designated": 46252,
            "number_assigned": 95711,
            "children": [
              {
                "id": "B.1.1",
                "description": "European lineage with 3 clear SNPs `28881GA`,`28882GA`,`28883GC`",
                "most_common_countries": "United Kingdom 27.0%, United States of America 14.0%, Japan 7.0%, Russia 5.0%, Turkey 4.0%",
                "earliest_date": "2020-01-08",
                "number_designated": 22834,
                "number_assigned": 49224,
                "children": [
                  {
                    "id": "B.1.1.1",
                    "description": "England",
                    "most_common_countries": "United Kingdom 53.0%, Peru 10.0%, Belgium 4.0%, United States of America 3.0%, Italy 2.0%",
                    "earliest_date": "2020-03-02",
                    "number_designated": 1745,
                    "number_assigned": 2913,
                    "children": [
                      {
                        "id": "C.36",
                        "description": "Alias of B.1.1.1.36, Egypt mainly and other countries",
                        "most_common_countries": "Egypt 33.0%, Germany 11.0%, United Kingdom 10.0%, United States of America 7.0%, Denmark 6.0%",
                        "earliest_date": "2020-03-13",
                        "number_designated": 220,
                        "number_assigned": 1042,
                        "children": [
                          {
                            "id": "C.36.3",
                            "description": "Alias of B.1.1.1.36.3, Europe and USA lineage, from pango-designation issue #80",
                            "most_common_countries": "Germany 18.0%, United States of America 18.0%, Switzerland 9.0%, Italy 8.0%, United Kingdom 7.0%",
                            "earliest_date": "2021-01-04",
                            "number_designated": 493,
                            "number_assigned": 1681,
                            "children": [
                              {
                                "id": "C.36.3.1",
                                "description": "Alias of B.1.1.1.36.3.1, Europe and USA lineage, from pango-designation issue #80",
                                "most_common_countries": "Germany 64.0%, United States of America 18.0%, Belgium 9.0%, Bulgaria 3.0%, Netherlands 3.0%",
                                "earliest_date": "2021-03-29",
                                "number_designated": 54,
                                "number_assigned": 324,
                                "children": []
                              }
                            ]
                          },
                          {
                            "id": "C.36.1",
                            "description": "Alias of B.1.1.1.36.1, Canada",
                            "most_common_countries": "Canada 97.0%, United States of America 2.0%, Burkina_Faso 1.0%, Egypt 1.0%",
                            "earliest_date": "2020-06-24",
                            "number_designated": 21,
                            "number_assigned": 199,
                            "children": []
                          },
                          {
                            "id": "C.36.2",
                            "description": "Alias of B.1.1.1.36.2, Switzerland",
                            "most_common_countries": "Switzerland 80.0%, Norway 7.0%, Germany 3.0%, United States of America 3.0%, Sweden 3.0%",
                            "earliest_date": "2020-10-16",
                            "number_designated": 18,
                            "number_assigned": 30,
                            "children": []
                          }
                        ]
                      },
                      {
                        "id": "C.1",
                        "description": "Alias of B.1.1.1.1, South Africa",
                        "most_common_countries": "South_Africa 91.0%, Zambia 4.0%, United States of America 3.0%, Mozambique 1.0%, Zimbabwe 0.0%",
                        "earliest_date": "2020-01-03",
                        "number_designated": 242,
                        "number_assigned": 351,
                        "children": [
                          {
                            "id": "C.1.1",
                            "description": "Alias of B.1.1.1.1.1, Mozambique",
                            "most_common_countries": "Mozambique 100.0%",
                            "earliest_date": "2020-11-25",
                            "number_designated": 12,
                            "number_assigned": 13,
                            "children": []
                          },
                          {
                            "id": "C.1.2",
                            "description": "Alias of B.1.1.1.1.2, mostly South Africa, from pango-designation issue #139",
                            "most_common_countries": "South_Africa 88.0%, Eswatini 4.0%, Russia 2.0%, United Kingdom 1.0%, Botswana 1.0%",
                            "earliest_date": "2021-04-07",
                            "number_designated": 15,
                            "number_assigned": 281,
                            "children": []
                          }
                        ]
                      },
                      {
                        "id": "C.2",
                        "description": "Alias of B.1.1.1.2, South Africa and some European",
                        "most_common_countries": "South_Africa 44.0%, Zimbabwe 32.0%, Denmark 8.0%, United Kingdom 8.0%, Australia 6.0%",
                        "earliest_date": "2020-06-09",
                        "number_designated": 25,
                        "number_assigned": 50,
                        "children": [
                          {
                            "id": "C.2.1",
                            "description": "Alias of B.1.1.1.2.1, Aruba and Curacao",
                            "most_common_countries": "Aruba 60.0%, United States of America 28.0%, Cura\u00e7ao 9.0%, Netherlands 3.0%, Finland 1.0%",
                            "earliest_date": "2020-12-18",
                            "number_designated": 58,
                            "number_assigned": 150,
                            "children": []
                          }
                        ]
                      }

uj5u.com熱心網友回復：

這是代碼：我先正常抓取表格，然后轉到每一頁，只取譜系的名稱，再轉到 outbreak.info 網站抓取突變資料。之后，我嘗試用這些資料構建樹，但我不確定缺少什么。

import json
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

# Raw string: the Windows path contains backslashes, and "\P" / "\c" are
# invalid escape sequences in a normal string literal (Python warns about
# them). The resulting path value is unchanged.
driverPath = r"C:\Program Files (x86)\chromedriver.exe"
# NOTE(review): newer Selenium releases expect the driver path to be passed
# through a Service object -- confirm against the installed version.
driver = webdriver.Chrome(driverPath)
driver.get("https://cov-lineages.org/lineage_list.html")
# Locate data from the table, one list of elements per column.
lineages = driver.find_elements(By.XPATH, '//tbody/tr/td[1]/a')
mostCommonCountries = driver.find_elements(By.XPATH, '//tbody/tr/td[2]')
earliestDate = driver.find_elements(By.XPATH, '//tbody/tr/td[3]')
designatedNumbers = driver.find_elements(By.XPATH, '//tbody/tr/td[4]')
assignedNumbers = driver.find_elements(By.XPATH, '//tbody/tr/td[5]')
descriptions = driver.find_elements(By.XPATH, '//tbody/tr/td[6]')

# Locators used on each lineage page and on the outbreak page it links to.
parent_xpath = "//*[@id='pageTitle']"
outbreak_id = "outbreakLink"
show_button_xpath = "//*[@id='definition']/div/div[3]/button"
outbreak_data_xpath = "//*[@id='mutation-table']/div/div/div/table"

# Step 1: copy the text of every table row into plain dicts while the list
# page is still loaded. Once the driver navigates away, the element handles
# gathered from this page can no longer be read.
result = []
for i in range(len(lineages)):
    result.append({
        'Lineages': lineages[i].text,
        'mostCommonCountries': mostCommonCountries[i].text,
        'earliestDate': earliestDate[i].text,
        'designatedNumbers': designatedNumbers[i].text,
        'assignedNumbers': assignedNumbers[i].text,
        'descriptions': descriptions[i].text
    })

# Same XPath as `lineages`, so links[k] belongs to result[k].
links = [item.get_attribute('href') for item in driver.find_elements(By.XPATH, '//tbody/tr/td[1]/a')]

# Step 2: visit each lineage page exactly once and store what is found there
# on the matching record. Previously this loop was nested inside the row
# loop and the values were assigned into the href string, which raises
# TypeError.
wait = WebDriverWait(driver, 100)
for record, item in zip(result, links):
    driver.get(item)
    try:
        parent = driver.find_element(By.XPATH, parent_xpath).text
        lineage_string = 'Lineage'
        # Strip any whitespace left around the name once the label is removed.
        name = parent.replace(lineage_string, '').strip()
        print(name)

    except Exception as e:
        parent = None
        name = None
        print(e)
    # Locate View more information at Outbreak.info- href
    outbreak_link = driver.find_element(By.ID, outbreak_id)
    # click on the href
    outbreak_link.click()
    # now we're on the outbreak website
    # View mutation table
    button_to_show = wait.until(EC.element_to_be_clickable((By.XPATH, show_button_xpath)))
    button_to_show.click()
    outbreak_data = driver.find_element(By.XPATH, outbreak_data_xpath)

    # NOTE(review): this value is the text of the page-title element minus the
    # 'Lineage' label. The tree builder matches 'parent' against other
    # records' 'Lineages' names -- confirm the title really names the parent
    # rather than the lineage itself.
    record['parent'] = name
    # Keep the table's text rather than the WebElement so the record stays
    # JSON-serialisable.
    record['outbreak_data'] = outbreak_data.text


# # ------- Building the tree

def generate_tree(root_lineage, records=None):
    """Return the record for ``root_lineage`` with its descendants nested.

    The matching record is modified in place: it receives a ``'children'``
    list holding the (recursively built) records whose ``'parent'`` equals
    ``root_lineage``.

    Args:
        root_lineage: Value of the ``'Lineages'`` key identifying the node to
            build the subtree for.
        records: Iterable of record dicts to search. Defaults to the
            module-level ``result`` list, which keeps existing one-argument
            calls working.

    Returns:
        The first record whose ``'Lineages'`` equals ``root_lineage``, with
        its ``'children'`` list populated.

    Raises:
        IndexError: If no record has ``'Lineages'`` equal to
            ``root_lineage``.
    """
    if records is None:
        records = result

    parent_dict = next(
        (item for item in records if item.get('Lineages') == root_lineage),
        None)
    if parent_dict is None:
        raise IndexError(f'no record found for lineage {root_lineage!r}')

    # Collect the child names first, then recurse, so the scan over the
    # records is finished before any record is modified.
    children_names = [item.get('Lineages') for item in records
                      if item.get('parent') == root_lineage]
    parent_dict['children'] = [generate_tree(child_name, records)
                               for child_name in children_names]

    return parent_dict


# Every record without a 'parent' value is the root of its own tree.
root_lineages_names = [record.get('Lineages') for record in result
                       if not record.get('parent')]
my_tree = [generate_tree(root_lineage_name)
           for root_lineage_name in root_lineages_names]

# Serialise the list of trees to a JSON string and print it.
json_format = json.dumps(my_tree)
print(json_format)

轉載請註明出處，本文鏈接：https://www.uj5u.com/qiye/444123.html

標籤：Python Selenium 網頁抓取 BeautifulSoup Selenium ChromeDriver

上一篇：如何使用SeleniumPython按標題單擊元素

下一篇：Symfony6將物件持久化到資料庫