代碼可以運行,但資料框為空。在下面的 URL 中,YEAR 和 PAGE 都是動態的。我想遍歷這兩個參數,獲取表格中 td 的內容,以及(如果可能的話)「acc. date」連結底下的相關資料,并將每年的結果寫入對應的 year.csv。
import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request
# The asker's original script, re-indented. Its known problems are kept on
# purpose because the answers further down diagnose them.
url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
# NOTE: this urllib Request (carrying the User-Agent) is built but never used;
# the requests.get() call inside the loop is sent without any headers.
req = Request(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})
with open('1916_aviation-safety.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["acc. date", "Type", "Registration","operator", "fat", "Location", " ", "dmg", " ", " "])
    while True:
        print(url)
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')
        # Go through table > tbody and extract the data under the 'td' tag
        for row in soup.select('table > tbody > tr'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
            print(row)
        # If more than one page then iterate through all of them
        if soup.select_one('div.pagenumbers > span.current + div.a'):
            url = soup.select_one('div.pagenumbers > span.current + div.a')['href']
        else:
            break
uj5u.com熱心網友回復:
我對您的腳本進行了一些更改,這應該使其更易于除錯和維護。它使用 pandas 讓寫入 CSV 更容易,并使用 concurrent.futures 來加快速度。如果您有問題請告訴我。基本上,各年份是并行抓取的:先抓取第一頁以獲取該年份的總頁數,然后用迴圈遍歷每個頁面并解析 HTML。關鍵資訊被放入字典,再添加到列表中(字典列表基本上已經是一個資料框,因此通過 pandas 寫入 CSV 更容易)。
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
def _parse_acc_date(text):
    """Return *text* reformatted as ``YYYY-MM-DD``, or ``None`` if unparseable.

    The full ``%d-%b-%Y`` form is tried first; then a missing day is padded
    with ``01`` and a missing day and month with ``01-Jan``.
    """
    # NOTE(review): presumably partial dates look like "-Jan-1916" / "-1916";
    # confirm against the live site. A numeric month prefix ("01-01") could
    # never match %b, so a month name is used for the last fallback.
    for prefix in ('', '01', '01-Jan'):
        try:
            parsed = datetime.strptime(prefix + text, '%d-%b-%Y')
        except ValueError:
            continue
        return parsed.strftime('%Y-%m-%d')
    return None


def scrape_year(year):
    """Scrape all listing pages for *year* and write them to a CSV file.

    The first page is fetched to read the page count from the
    ``div.pagenumbers`` links, then every page is fetched and each matching
    table row is turned into a dict. The result is written to
    ``{year}_aviation-safety.csv`` in the current directory.

    Args:
        year: Year used for the ``Year=`` query parameter and the file name.

    Returns:
        None. The only output is the CSV file (and progress printed to stdout).
    """
    headers = {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    columns = [
        'acc_link', 'acc_date', 'acc_type', 'acc_reg',
        'acc_operator', 'acc_fat', 'acc_location', 'acc_dmg',
    ]
    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    req = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(req.text,'html.parser')

    # The highest page number among the pagination links is the page count.
    # A year with a single page may have no container or no links at all,
    # so fall back to one page instead of crashing.
    page_container = soup.find('div',{'class':'pagenumbers'})
    page_links = page_container.find_all('a') if page_container is not None else []
    pages = max(
        (int(link['href'].split('=')[-1]) for link in page_links),
        default=1,
    )

    row_class = re.compile('list.*')  # loop-invariant, compiled once
    info = []
    for page in range(1, pages + 1):
        new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
        print(new_url)
        data = requests.get(new_url, headers=headers, timeout=30)
        soup = BeautifulSoup(data.text,'html.parser')
        table = soup.find('table',{'class':'hp'})
        if table is None:
            continue
        # The first matching row is skipped, as in the original loop.
        for row in table.find_all('tr',{'class':row_class})[1:]:
            link = row.find('a')
            cells = row.find_all('td')
            # Skip rows that lack the link or the expected number of cells.
            if link is None or len(cells) < 8:
                continue
            acc_date = _parse_acc_date(link.text.strip())
            if acc_date is None:
                continue
            info.append({
                'acc_link': 'https://aviation-safety.net/' + link['href'],
                'acc_date': acc_date,
                'acc_type': cells[1].text,
                'acc_reg': cells[2].text,
                'acc_operator': cells[3].text,
                'acc_fat': cells[4].text,
                'acc_location': cells[5].text,
                'acc_dmg': cells[7].text,
            })

    # Explicit columns keep the header row even when nothing was scraped.
    df = pd.DataFrame(info, columns=columns)
    df.to_csv(f'{year}_aviation-safety.csv',index=False)
if __name__ == "__main__":
    # Scrape every year in [START, STOP] concurrently, one task per year.
    START = 1916
    STOP = 2022
    years = list(range(START, STOP + 1))
    print(f'Scraping {len(years)} years of data')
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        futures = {executor.submit(scrape_year, year): year for year in years}
        # Collect each result so that an exception raised inside a worker is
        # reported instead of being silently dropped.
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                print(f'Year {futures[future]} failed: {exc!r}')
uj5u.com熱心網友回復:
怎么了?
首先,一定要先查看 soup 的實際內容——它反映了伺服器真正回傳的東西。
您在 while 迴圈內發出的請求缺少標頭,這會導致 403 錯誤;此外,表格的選擇器也不正確。
如何實作?
在 while 回圈中正確設定您的請求的標頭:
html = requests.get(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})
更精確地選取資料列——請注意 HTML 中沒有 tbody:
# Go throught table = tbody and extract the data under the 'td' tag
for row in soup.select('table tr.list'):
還要檢查分頁的選擇器:
# If more than one page then iterate through all of them
if soup.select_one('div.pagenumbers span.current a'):
url = 'https://aviation-safety.net/wikibase/dblist.php' soup.select_one('div.pagenumbers span.current a')['href']
else:
break
例子
import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request
# Scrape every listing page for 1916 into a CSV, following the pagination
# links until there is no next page.
url = 'https://aviation-safety.net/wikibase/dblist.php?Year=1916&sorteer=datekey&page=1'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
# newline='' lets the csv module control line endings, which avoids blank
# rows between records on Windows.
with open('1916_aviation-safety.csv', "w", encoding="utf-8", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["acc. date", "Type", "Registration","operator", "fat", "Location", " ", "dmg", " ", " "])
    while True:
        print(url)
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'html.parser')
        # Select the data rows by class rather than through a tbody element
        for row in soup.select('table tr.list'):
            writer.writerow([c.text for c in row.select('td')])
            print(row)
        # The link directly after the current-page marker is the next page;
        # when there is none, the last page has been reached.
        next_link = soup.select_one('div.pagenumbers span.current + a')
        if next_link is None:
            break
        url = 'https://aviation-safety.net/wikibase/dblist.php' + next_link['href']
以防萬一
使用 pandas.read_html() 并逐年迭代的替代解決方案:
import requests,time,random
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import Request
from io import StringIO

# Collect the listing table of every page of every year linked from the
# wikibase index, then write them all to a single CSV.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
url = 'https://aviation-safety.net/wikibase/'
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')
data = []
# Build the list of per-year start URLs once, before soup is reassigned below.
year_urls = ['https://aviation-safety.net/' + a['href'] for a in soup.select('a[href*="/wikibase/dblist.php"]')]
for url in year_urls:
    while True:
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'html.parser')
        # read_html expects a file-like object; passing a literal HTML string
        # is deprecated in recent pandas, so wrap it in StringIO.
        data.append(pd.read_html(StringIO(soup.prettify()))[0])
        # If more than one page then iterate through all of them
        next_link = soup.select_one('div.pagenumbers span.current + a')
        if next_link is None:
            break
        url = 'https://aviation-safety.net/wikibase/dblist.php' + next_link['href']
        # Short random pause between page requests.
        time.sleep(random.random())
df = pd.concat(data)
# Drop the unnamed filler columns before writing.
df.loc[:, ~df.columns.str.contains('^Unnamed')].to_csv('aviation-safety.csv', index=False)
轉載請註明出處,本文鏈接:https://www.uj5u.com/caozuo/407882.html
標籤:
