我放置了我擁有的完整且功能正常的抓取代碼。成功抓取頁面上的所有元素。
但是,我只想使用與抓取相同的元素抓取頁面的一小部分。這個有限的部分已經與頁面的所有元素一起被正確地抓取,但我只想抓取它而不是“全部 它”。鏈接在
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get("url")
driver.implicitly_wait(12)
#driver.minimize_window()
wait = WebDriverWait(driver, 10)
all_rows = driver.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")
current_round = '?'
for bundesliga in all_rows:
classes = bundesliga.get_attribute('class')
#print(classes)
if 'event__round' in classes:
#round = row.find_elements(By.CSS_SELECTOR, "[class^='event__round event__round--static']")
#current_round = row.text # full text `Round 20`
current_round = bundesliga.text.split(" ")[-1] # only `20` without `Round`
else:
datetime = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__time']")
#Divide la data e l'ora
date, time = datetime.text.split(" ")
date = date.rstrip('.') # right-strip to remove `.` at the end of date
team_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--home']")
team_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--away']")
score_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--home']")
score_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--away']")
bundesliga = [current_round, date, time, team_home.text, team_away.text, score_home.text, score_away.text]
bundesliga.append(bundesliga)
print(bundesliga)
uj5u.com熱心網友回復:
我認為您需要做的就是限制all_rows變數。一種方法是使用文本找到您要查找的選項卡,然后獲取父元素。
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
driver = webdriver.Firefox()
driver.get("https://www.someurl/some/other/page")
driver.implicitly_wait(12)
#driver.minimize_window()
wait = WebDriverWait(driver, 10)
# all_rows = driver.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")
############### UPDATE ####################
def parent_element(element):
return element.find_element(By.XPATH, './..')
programma_element = WebDriverWait(driver, 10).until(
EC.visibility_of_element_located((By.XPATH, "//div[text()='Programma']")))
programma_element_p1 = parent_element(programma_element)
programma_element_p2 = parent_element(programma_element_p1)
programma_element_p3 = parent_element(programma_element_p2)
all_rows = programma_element_p3.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")
filter_rows = []
for row in all_rows:
if "event__match--last" in row.get_attribute('class'):
filter_rows.append(row)
break
else:
filter_rows.append(row)
############### UPDATE ####################
current_round = '?'
for bundesliga in filter_rows:
classes = bundesliga.get_attribute('class')
#print(classes)
if 'event__round' in classes:
#round = row.find_elements(By.CSS_SELECTOR, "[class^='event__round event__round--static']")
#current_round = row.text # full text `Round 20`
current_round = bundesliga.text.split(" ")[-1] # only `20` without `Round`
else:
datetime = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__time']")
#Divide la data e l'ora
date, time = datetime.text.split(" ")
date = date.rstrip('.') # right-strip to remove `.` at the end of date
team_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--home']")
team_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--away']")
# score_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--home']")
# score_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--away']")
try:
score_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--home']")
except (TimeoutException, NoSuchElementException):
MyObject = type('MyObject', (object,), {})
score_home = MyObject()
score_home.text = "-"
try:
score_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--away']")
except (TimeoutException, NoSuchElementException):
MyObject = type('MyObject', (object,), {})
score_away = MyObject()
score_away.text = "-"
bundesliga = [current_round, date, time, team_home.text, team_away.text, score_home.text, score_away.text]
bundesliga.append(bundesliga)
print(bundesliga)
轉載請註明出處,本文鏈接:https://www.uj5u.com/shujuku/372566.html
標籤:Python 蟒蛇-3.x 硒 硒网络驱动程序 网页抓取
