如何將抓取的資訊寫入 csv 檔案,然後關閉選項卡,並在新的選項卡中繼續寫入,如此迴圈遍歷,直到論壇中的所有頁面都被抓取?我仍在學習網路抓取,在這個問題上完全卡住了。需要抓取的 div 的類別名稱是「post-content」,但是當我測試時,它沒有顯示正確的資訊。
import driver as driver
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common import window
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import csv
# Question script (indentation was lost in this listing): collect thread links
# from the forum index page, open each link in a new browser tab and write the
# elements with class "post-content" to a CSV file.
options = webdriver.ChromeOptions()
options.add_experimental_option("detach", True)
options.add_argument("start-maximized")
# NOTE(review): `driver` is only assigned on the next statement, so here the
# name still refers to whatever `import driver as driver` brought in. `wait`
# is also never used again in this script.
wait = WebDriverWait(driver, 100)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get("https://navalcommand.enjin.com/forum/viewforum/2989694/m/11178354/page/1")
# Every <a> with an href inside the table whose class is 'structure small-cells'.
elems = driver.find_elements(By.XPATH, "//table[@class='structure small-cells']//a[@href]")
links = []
# create csv file
# NOTE(review): the file is opened here but never closed in this script.
f = open(r"C:\Users\jammi\OneDrive\Desktop\Navcom\test.csv", 'w', encoding='UTF8')
csvWriter = csv.writer(f)
# to open every thread link
for ele in elems:
if "viewthread" in ele.get_attribute("href"):
links.append(ele.get_attribute("href"))
# dict.fromkeys drops duplicate URLs while keeping first-seen order.
links = list(dict.fromkeys(links))
# Prints the element objects, not the collected links.
print(elems)
# to open every link into a new tab
for link in links:
driver.switch_to.new_window(window.WindowTypes.TAB)
driver.get(link)
# write the scraped information to a csv file
# NOTE(review): `content` is the list returned by find_elements, so the row
# written below holds the string form of that list in a single cell rather
# than the posts' text - presumably the "wrong information" the question
# describes.
content = driver.find_elements(By.CLASS_NAME, "post-content")
print(content)
csvWriter.writerow([content])
uj5u.com熱心網友回復:
這是一種可能的解決方案:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Scrape every thread of the forum, page by page, and append the posts of each
# thread to its own "<thread_id>_navalcommand.csv" file (one row per post).
options = webdriver.ChromeOptions()
# disable chromedriver log message in cmd
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 10)

forum_page_url = "https://navalcommand.enjin.com/forum/viewforum/2989694/m/11178354/page/{}"

try:
    driver.get(forum_page_url.format(1))
    # get number of pages: second space-separated token of the pager label
    num_pages = int(driver.find_element(By.CSS_SELECTOR, "span.text.rightmost").text.split(' ')[1])

    # Visit pages 1..num_pages inclusive. Navigating *before* scraping (rather
    # than after, with range(2, num_pages)) ensures the last pages are not
    # skipped.
    for page in range(1, num_pages + 1):
        if page > 1:
            # Page 1 is already loaded; later pages must be (re)opened because
            # the browser is left on the last visited thread.
            driver.get(forum_page_url.format(page))
        # find all threads on the current page
        threads = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.thread-view.thread-subject")))
        # get links to threads (collected up front: the elements go stale once
        # the browser navigates away from the listing)
        thread_links = [x.get_attribute('href') for x in threads]
        # open each link and get all the posts in thread
        for link in thread_links:
            driver.get(link)
            thread_content = driver.find_elements(By.CSS_SELECTOR, "div.post-content")
            if not thread_content:
                # Nothing to save; do not create an empty file.
                continue
            # get thread id: the part between "viewthread/" and the first "-"
            thread_id = driver.current_url.split('d/')[1].split('-')[0]
            # save received data in csv; open the file once per thread instead
            # of once per post
            with open(file=f'{thread_id}_navalcommand.csv', mode='a', encoding="utf-8") as f:
                writer = csv.writer(f, lineterminator='\n')
                for post in thread_content:
                    post_content = post.text
                    if not post_content:
                        # Image-only post: fall back to the image URL.
                        # find_elements returns [] instead of raising when the
                        # post has no <img> either.
                        images = post.find_elements(By.TAG_NAME, 'img')
                        post_content = images[0].get_attribute('src') if images else ''
                    writer.writerow([post_content])
finally:
    # Always shut the browser down, even if scraping fails part-way.
    driver.quit()
輸出是 csv 檔案串列:
32694465_navalcommand.csv
33053469_navalcommand.csv
33079839_navalcommand.csv
每個檔案對應一個單獨的討論串(thread)。
uj5u.com熱心網友回復:
您可以改用 XPath 來搜尋 post-content 元素:
# Look up the post bodies with an XPath locator instead of By.CLASS_NAME.
# NOTE(review): @class='post-content' compares the whole class attribute, so a
# div that carries additional classes would not match - confirm against the
# page markup.
content = driver.find_elements(By.XPATH, "//div[@class='post-content']")
uj5u.com熱心網友回復:
您的 URL 在通過 Selenium 訪問時被阻止。因此,我使用了另一個 URL 并修改了您的代碼,只需從以下代碼中獲取邏輯,并根據您的需要修改定位器:
此代碼將從主頁獲取所有 URL,遍歷并在新選項卡中打開每個鏈接,列印內容并將內容保存到“.csv”檔案,關閉選項卡然后移動到下一個鏈接。
# Collect the thread links from the forum index, open each one in a new tab,
# print its posts and append them to a CSV file, then close the tab and return
# to the index tab. Relies on `driver`, `By`, `window` and `csv` already being
# set up as in the question's script.
import time  # needed for the fixed pauses below

driver.get("https://ubuntuforums.org/forumdisplay.php?f=326")  # Change this URL to your URL
elems = driver.find_elements(By.XPATH, "//*[@class='threadtitle']//a[@href]")  # Change this XPath as per your website
print("Length: ", len(elems))

# Keep only thread links. Read each href once, and collect them before any
# navigation so the elements cannot go stale.
hrefs = [elem.get_attribute("href") for elem in elems]
links = [href for href in hrefs if href and "showthread" in href]  # Change 'showthread' to the original - 'viewthread'
# dict.fromkeys removes duplicate URLs while preserving their order.
links = list(dict.fromkeys(links))
# print("Elements: ", elems)
print("Links: ", links)
print("")

# Remember the index tab explicitly; the order of driver.window_handles is not
# guaranteed, so window_handles[0] may not be the original tab.
main_window = driver.current_window_handle

# create csv file
# newline='' is what the csv module requires (otherwise Windows output gets a
# blank line after every row); the `with` block guarantees the file is closed.
with open(r"C:\Users\<user name>\Downloads\test.csv", 'w', encoding='UTF8', newline='') as f:  # modify this path
    csvWriter = csv.writer(f)
    # to open every link into a new tab
    for link in links:
        driver.switch_to.new_window(window.WindowTypes.TAB)
        driver.get(link)
        time.sleep(3)  # crude pause to let the page render
        print("Contents of '", driver.title, "' page")
        print("----")
        # write the scraped information to a csv file
        posts = driver.find_elements(By.CSS_SELECTOR, ".postcontent.restore")  # Change '.postcontent.restore' to the original - '.post-content', but use CSS_SELECTOR
        for post in posts:
            text = post.text
            print("Content: ", text)
            csvWriter.writerow([text])
        time.sleep(1)
        print("=============End of the page=================")
        print("")
        time.sleep(1)
        # Close the thread tab and hand control back to the index tab.
        driver.close()
        driver.switch_to.window(main_window)
轉載請註明出處,本文鏈接:https://www.uj5u.com/shujuku/515483.html
標籤:Python、Selenium、網頁抓取
