使用BeautifulSoup動態抓取分頁表并將結果存盤在csv中？-有解無憂

代碼可以運行，但得到的資料框是空的。在下面的 URL 中，YEAR 和 PAGE 都是動態的。我想遍歷這兩個引數，取得表格中各 td 的內容，并（如果可能的話）取得「acc. date」連結下的相關資料，然後將每一年的結果分別存盤到對應的 year.csv 中。

import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request

# Asker's original attempt: walk the paginated 1916 listing and dump every
# table row into a CSV file. Reported result: the script runs but no data
# rows are collected.
url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
# NOTE(review): this urllib Request object (carrying the User-Agent header)
# is built here but never used by the code below.
req = Request(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})


with open('1916_aviation-safety.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    # Header row: ten columns, several of them intentionally blank (" ").
    writer.writerow(["acc. date", "Type", "Registration","operator", "fat", "Location", " ", "dmg", " ", " "])

    # Keep fetching pages until no "next page" link is found.
    while True:
        print(url)
        # NOTE(review): no headers are passed here, so the User-Agent
        # configured in `req` above is not sent with this request.
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')

        # Go through the table rows and write the text of each 'td' cell.
        # NOTE(review): the selector requires an explicit <tbody> element in
        # the parsed markup; if the page has none, no rows match -- confirm.
        for row in soup.select('table > tbody > tr'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
            print(row)

        # If there is more than one page, move on to the next one.
        # NOTE(review): the run of spaces inside this selector looks like a
        # lost `+` (adjacent-sibling combinator) -- confirm against the
        # original post.
        if soup.select_one('div.pagenumbers > span.current   div.a'):
            # The href is used verbatim as the next URL.
            # NOTE(review): if it is a relative link this will not be
            # fetchable as-is -- confirm.
            url = soup.select_one('div.pagenumbers > span.current   div.a')['href']
        else:
            break

uj5u.com熱心網友回復：

我對您的腳本進行了一些更改，這應該使其更易于除錯和維護。它使用 pandas 使寫入 CSV 變得更容易，并使用 concurrent.futures 來加快速度。如果您有問題請告訴我。基本上，各個年份是同時（并行）抓取的：我先抓取第一頁以獲取該年份總共要抓取的頁數，然后回圈遍歷每個頁面并解析 HTML。關鍵資訊被放入字典，然后添加到串列中（這樣通過 pandas 寫入 csv 更容易，因為字典串列基本上已經是一個資料框）。

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures

def _parse_acc_date(text):
    """Convert a listing date string to ISO ``YYYY-MM-DD``, or return None.

    The text is first tried as a complete ``%d-%b-%Y`` date. If that fails,
    a default day (``01``) and then a default day and month (``01-Jan``) are
    prepended, so that partial dates fall back to the first day of the
    month or year.

    The earlier year-only prefix ``"01-01"`` could never satisfy ``%b``
    (which needs a month *name*), so that fallback never matched; it now
    uses ``"01-Jan"``. ``%b`` is locale-dependent; this assumes English
    month abbreviations.
    """
    for prefix in ('', '01', '01-Jan'):
        try:
            parsed = datetime.strptime(prefix + text, '%d-%b-%Y')
        except ValueError:
            continue
        return parsed.strftime('%Y-%m-%d')
    return None


def _count_pages(soup):
    """Return the highest page number linked in the pagination bar (min. 1)."""
    page_container = soup.find('div', {'class': 'pagenumbers'})
    if page_container is None:
        # No pagination bar at all: treat the listing as a single page.
        return 1

    numbers = []
    for link in page_container.find_all('a'):
        # The page number is whatever follows the last '=' in the href.
        last_value = link.get('href', '').split('=')[-1]
        if last_value.isdigit():
            numbers.append(int(last_value))
    return max(numbers, default=1)


def scrape_year(year):
    """Scrape every listing page for *year* and write ``{year}_aviation-safety.csv``.

    The first page is fetched once to find out how many pages exist, then
    each page is requested and the rows of the ``table.hp`` result table
    are turned into dictionaries. Rows without a link, with fewer than
    eight cells, or with an unparseable date are skipped. Pages without a
    result table are reported and skipped instead of aborting the year.

    Args:
        year: The year to put into the ``Year=`` query parameter.

    Returns:
        None. The result is written to a CSV file in the working directory.
    """
    headers = {
        'accept': '*/*',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    columns = [
        'acc_link', 'acc_date', 'acc_type', 'acc_reg',
        'acc_operator', 'acc_fat', 'acc_location', 'acc_dmg',
    ]

    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    req = requests.get(url, headers=headers)
    pages = _count_pages(BeautifulSoup(req.text, 'html.parser'))

    # Compiled once; matches any row whose class starts with "list".
    row_class = re.compile('list.*')

    info = []
    for page in range(1, pages + 1):
        new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
        print(new_url)

        data = requests.get(new_url, headers=headers)
        soup = BeautifulSoup(data.text, 'html.parser')

        table = soup.find('table', {'class': 'hp'})
        if table is None:
            # E.g. a blocked or error response: keep the other pages.
            print(f'No result table found at {new_url}; skipping page')
            continue

        for index, row in enumerate(table.find_all('tr', {'class': row_class})):
            # The first matched row is skipped, as before.
            # NOTE(review): presumably a header row -- confirm it is not data.
            if index == 0:
                continue

            link = row.find('a')
            cells = row.find_all('td')
            if link is None or len(cells) < 8:
                continue

            acc_date = _parse_acc_date(link.text.strip())
            if acc_date is None:
                continue

            info.append({
                'acc_link': 'https://aviation-safety.net/' + link['href'],
                'acc_date': acc_date,
                'acc_type': cells[1].text,
                'acc_reg': cells[2].text,
                'acc_operator': cells[3].text,
                'acc_fat': cells[4].text,
                'acc_location': cells[5].text,
                'acc_dmg': cells[7].text,
            })

    # Explicit columns keep the CSV header even when no rows were collected.
    df = pd.DataFrame(info, columns=columns)
    df.to_csv(f'{year}_aviation-safety.csv', index=False)


if __name__ == "__main__":
    # Inclusive range of years to scrape; one CSV file is written per year.
    START = 1916
    STOP = 2022

    years = list(range(START, STOP + 1))

    print(f'Scraping {len(years)} years of data')

    # The work is network-bound, so the years are scraped in parallel threads.
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        futures = {executor.submit(scrape_year, year): year for year in years}
        # Report failures per year instead of silently dropping them; one
        # failing year does not stop the others.
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                print(f'Scraping {futures[future]} failed: {error!r}')

uj5u.com熱心網友回復：

怎么了？

首先，一定要先檢查 soup（實際抓回來的 HTML）的內容——真相就在其中。

您在 while 回圈的請求中缺少標頭，這會導致 403 錯誤；此外，表格的選擇器也不正確。

如何實作？

在 while 回圈中正確設定您的請求的標頭：

html = requests.get(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

更具體地選擇資料列——請注意 HTML 中沒有 tbody：

        # Go through the table rows and extract the data under the 'td' tag
        for row in soup.select('table tr.list'):

還要檢查分頁的選擇器：

# If more than one page then iterate through all of them        
if soup.select_one('div.pagenumbers span.current + a'):
    url = 'https://aviation-safety.net/wikibase/dblist.php' + soup.select_one('div.pagenumbers span.current + a')['href']
else:
    break

例子

import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request

# Walk every result page of the 1916 listing and dump the table rows to CSV.
url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
# A browser-like User-Agent is sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

# newline='' lets the csv module control line endings itself, which avoids
# blank lines between rows on Windows.
with open('1916_aviation-safety.csv', "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["acc. date", "Type", "Registration","operator", "fat", "Location", " ", "dmg", " ", " "])

    while True:
        print(url)
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'html.parser')

        # Write the text of every 'td' cell of each data row (tr.list).
        for row in soup.select('table tr.list'):
            writer.writerow([c.text for c in row.select('td')])
            print(row)

        # The link right after the current-page marker is the next page
        # ('+' is the CSS adjacent-sibling combinator). No such link means
        # the last page has been reached.
        next_link = soup.select_one('div.pagenumbers span.current + a')
        if next_link is None:
            break
        url = 'https://aviation-safety.net/wikibase/dblist.php' + next_link['href']

以防萬一

使用 pandas.read_html() 并遍歷所有年份的替代解決方案：

import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import Request

# Collect the listing tables of all years into one CSV using pandas.read_html().
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
url = 'https://aviation-safety.net/wikibase/'
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')

# Every link on the overview page that points at a dblist.php listing.
listing_urls = [
    'https://aviation-safety.net/' + a['href']
    for a in soup.select('a[href*="/wikibase/dblist.php"]')
]

data = []

for url in listing_urls:
    while True:
        html = requests.get(url, headers=headers)
        page_soup = BeautifulSoup(html.text, 'html.parser')

        # The first table on the page is taken as the listing.
        # NOTE: pandas >= 2.1 deprecates passing literal HTML to read_html();
        # wrap the string in io.StringIO if that warning appears.
        data.append(pd.read_html(page_soup.prettify())[0])

        # The link right after the current-page marker is the next page
        # ('+' is the CSS adjacent-sibling combinator). No such link means
        # this listing is finished.
        next_link = page_soup.select_one('div.pagenumbers span.current + a')
        if next_link is None:
            break
        url = 'https://aviation-safety.net/wikibase/dblist.php' + next_link['href']

        # Short random pause between page requests.
        time.sleep(random.random())

df = pd.concat(data)
# Drop the unnamed columns before writing.
df.loc[:, ~df.columns.str.contains('^Unnamed')].to_csv('aviation-safety.csv', index=False)

轉載請註明出處，本文鏈接：https://www.uj5u.com/caozuo/407882.html

標籤：

上一篇：Request.GetClientCertificate有時會在WebAPI中回傳不正確的資訊

下一篇：無法加載<tbody>，python表