我正在嘗試爬取 Udemy 的課程資料,但抓到的課程標題數量似乎比課程價格多,我不知道為什麼會發生這種情況。我知道有一些方法可以避開 pandas 的錯誤,但那不是我要的解決方案;真正的解決方案必須是抓到數量完全相同的課程名稱和課程價格:
from selenium import webdriver
import pandas as pd
import time
import selenium
# I set all of these options so that Udemy does not detect Selenium as a bot
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Configure Chrome, start it through the local chromedriver and open the
# Udemy search page that is scraped further down.
options = Options()
options.add_argument("start-maximized")
options.add_argument('--disable-blink-features=AutomationControlled')
# Both switches have to be passed in ONE list: add_experimental_option keeps a
# single value per option name, so a second "excludeSwitches" call replaces
# the first one and "enable-automation" would no longer be excluded.
options.add_experimental_option(
    "excludeSwitches", ["enable-automation", "enable-logging"]
)
options.add_experimental_option('useAutomationExtension', False)

website = "https://www.udemy.com/courses/search/?src=ukw&q=python"
s = Service('C:\\Users\\Albin Rodriguez\\Documents\\Aprendiendo\\web_scraping\\chromedriver.exe')
driver = webdriver.Chrome(service=s, options=options)
driver.get(website)
# Fixed pause between loading the page and querying it.
time.sleep(5)
# Collect every course title and every price text from the loaded page and
# write them side by side to an Excel file.
#
# NOTE(review): the attribute test of the title XPath was lost in the original
# ('//h3[@]' is not valid XPath). data-purpose="course-title-url" is assumed
# here -- confirm against the live page.
TITLE_XPATH = '//h3[@data-purpose="course-title-url"]'
PRICE_XPATH = '//div[@data-purpose="price-text-container"]//span/span'

try:
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
    # Selenium 4 deprecated and later removed.
    titles = driver.find_elements(By.XPATH, TITLE_XPATH)
    prices = driver.find_elements(By.XPATH, PRICE_XPATH)

    # Read the texts while the browser session is still alive.
    courses = [title.text for title in titles]
    prices_courses = [price.text for price in prices]

    input()  # I used this to check if all the prices are shown.
finally:
    # Always shut the browser and the chromedriver process down.
    driver.quit()

# pandas
# The two queries above are independent of each other, so the two lists can
# end up with different lengths; pd.DataFrame raises ValueError when they do.
df = pd.DataFrame({'cursos': courses, 'precios': prices_courses})
df.to_excel("precio_cursos2.xlsx", index=False)
uj5u.com熱心網友回復:
請改為執行以下程式碼:
from selenium import webdriver
import pandas as pd
import time
import selenium
# I set all of these options so that Udemy does not detect Selenium as a bot
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Build the Chrome options, launch the browser via the local chromedriver and
# navigate to the Udemy search results page.
options = Options()
options.add_argument("start-maximized")
options.add_argument('--disable-blink-features=AutomationControlled')
# "excludeSwitches" is set once with both switches. Calling
# add_experimental_option twice with the same name keeps only the last value,
# which would silently drop "enable-automation".
options.add_experimental_option(
    "excludeSwitches", ["enable-automation", "enable-logging"]
)
options.add_experimental_option('useAutomationExtension', False)

website = "https://www.udemy.com/courses/search/?src=ukw&q=python"
s = Service('C:\\Users\\Albin Rodriguez\\Documents\\Aprendiendo\\web_scraping\\chromedriver.exe')
driver = webdriver.Chrome(service=s, options=options)
driver.get(website)
# Fixed pause between loading the page and querying it.
time.sleep(5)
# Scrape one (title, price) pair per course and export the pairs to Excel.
#
# Each price is looked up relative to its own title element instead of being
# collected in a second page-wide query, so both columns always have the same
# number of entries and a course without a price cannot shift later rows.
#
# NOTE(review): the attribute test of the title XPath was lost in the original
# ('//h3[@]/a' is not valid XPath). data-purpose="course-title-url" is assumed
# here -- confirm against the live page.
TITLE_XPATH = '//h3[@data-purpose="course-title-url"]/a'
PRICE_BOX_XPATH = './/div[@data-purpose="price-text-container"]'
# Nearest ancestor <div> of a title that contains a price container.
PRICE_SCOPE_XPATH = (
    './ancestor::div[.//div[@data-purpose="price-text-container"]][1]'
)


def _price_near(title_link):
    """Return the price text that belongs to *title_link*, or "" if none.

    The price is searched inside the closest ancestor that contains a price
    container. When that ancestor holds more than one title it spans several
    courses, which means this course has no price of its own.
    """
    scopes = title_link.find_elements(By.XPATH, PRICE_SCOPE_XPATH)
    if not scopes:
        return ""
    card = scopes[0]
    if len(card.find_elements(By.XPATH, "." + TITLE_XPATH)) != 1:
        return ""
    # NOTE(review): the first non-empty span is taken as the price; confirm
    # that it is the current price rather than a label or the list price.
    for span in card.find_elements(By.XPATH, PRICE_BOX_XPATH + '//span/span'):
        if span.text:
            return span.text
    return ""


try:
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
    # Selenium 4 deprecated and later removed.
    title_links = driver.find_elements(By.XPATH, TITLE_XPATH)
    titles = [link.text for link in title_links]
    prices = [_price_near(link) for link in title_links]
finally:
    # Always shut the browser and the chromedriver process down.
    driver.quit()

#input() #I used this to check if all the prices are shown.
#pandas
df = pd.DataFrame(data=list(zip(titles, prices)), columns=['cursos', 'precios'])
df.to_excel("precio_cursos2.xlsx", index=False)
轉載請註明出處,本文鏈接:https://www.uj5u.com/shujuku/449241.html
