在Python中使用selenium從WeatherUnderground抓取風資料-有解無憂

我正在嘗試從 Weather Underground 中獲取第二天的時間、風速和風向預報。我修改了本教程中的代碼，我的 MWE 是

import numpy as np
import pandas as pd
from datetime import datetime
from datetime import date, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService

# Scrape tomorrow's hourly forecast (time, wind speed, wind direction) for
# Sydney from Weather Underground into a DataFrame and print its head.

# define future date (tomorrow)
start_date = date.today() + pd.Timedelta(days=1)

# get data for Sydney
page = 'https://www.wunderground.com/hourly/au/sydney/date/{}-{}-{}'

df = pd.DataFrame()

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.set_capability("loggingPrefs", {'performance': 'ALL'})
service = ChromeService(executable_path='chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)

# CSS class strings of the <td> cells to read; wind speed and wind direction
# are looked up in the same "wind" cell, hence the repeated entry.
classlist = ["mat-cell cdk-cell cdk-column-timeHour mat-column-timeHour ng-star-inserted",
             "mat-cell cdk-cell cdk-column-wind mat-column-wind ng-star-inserted",
             "mat-cell cdk-cell cdk-column-wind mat-column-wind ng-star-inserted",]
name = ['time', 'windspeed_mph', 'winddirection']

print('gathering data from: ', start_date)

formatted_lookup_URL = page.format(start_date.year,
                                   start_date.month,
                                   start_date.day)

# try/finally so the headless browser is shut down even when a wait times
# out or an element lookup fails.
try:
    driver.get(formatted_lookup_URL)

    # Wait (up to 20 s) until every time-of-day cell is visible.
    rows = WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located(
            (By.XPATH, '//td[@class="' + classlist[0] + '"]')))

    for row in rows:

        # NOTE(review): the span's class value was lost in this copy of the
        # snippet; "ng-star-inserted" is a reconstruction - verify against
        # the live page markup.
        time = row.find_element(
            By.XPATH, './/span[@class="ng-star-inserted"]').text

        # append new row to table (DataFrame.append was removed in pandas 2.0)
        df = pd.concat(
            [df, pd.DataFrame({"Day": [str(start_date)], "time": [time]})],
            ignore_index=True)

    del classlist[0]
    # NOTE: `name` still has three entries here while `classlist` now has two,
    # so name[ii] below no longer lines up with classlist[ii] and the
    # 'winddirection' branch is never reached. This is the behaviour the
    # question asks about; deleting name[0] as well realigns the two lists.

    for ii in range(len(classlist)):

        rows = WebDriverWait(driver, 20).until(
            EC.visibility_of_all_elements_located(
                (By.XPATH, '//td[@class="' + classlist[ii] + '"]')))

        for row in rows:

            # NOTE(review): both span class values below were lost in this
            # copy of the snippet and are reconstructions - verify against
            # the live page markup.
            if name[ii] == 'winddirection':
                data = row.find_element(
                    By.XPATH,
                    './/span[@class="wu-suffix ng-star-inserted"]').text
                print(data)

            else:
                data = row.find_element(
                    By.XPATH,
                    './/span[@class="wu-value wu-value-to ng-star-inserted"]').text

            # append new row to table
            df = pd.concat([df, pd.DataFrame({name[ii]: [data]})],
                           ignore_index=True)
finally:
    driver.quit()

# remove NaN: each column was filled in its own run of rows, so drop the NaN
# padding per column to realign the values side by side
df = df.apply(lambda x: pd.Series(x.dropna().values))
print(df.head())

最終的資料幀 df 包含時間和風速，但不包含風向。我懷疑問題出在 data = row.find_element(By.XPATH, './/span[@]').text 這一行，但我不知道如何解決它。

uj5u.com熱心網友回復：

似乎在 del classlist[0] 這一行之後，classlist 和 name 的長度就不一樣了。在刪除 classlist 的第一個元素之後加上下面這一行即可修復：

del name[0]

轉載請註明出處，本文鏈接：https://www.uj5u.com/qita/457781.html

標籤：Python 硒网页抓取

上一篇：使用BeautifulSoup統一標簽后的#text

下一篇：決議（抓取）網頁時如何將“display：flex”更改為“display：none”？|Python（電報機器人）|硒