這是一個與「為什麼 Python 多行程(multiprocessing)腳本會在一段時間後變慢?」類似的問題。
使用 Pool 的代碼示例:
from multiprocessing import Pool
# Apply some_func to every element of array using a pool of six worker processes.
Pool(processes=6).map(some_func, array)
經過幾次迭代後,程式開始變慢,最後甚至比不使用多行程時還慢。也許問題出在與 Selenium 相關的函式上?以下是完整的程式碼:
# libraries
import os
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
from multiprocessing import Pool
# Required variables
url = "https://eldorado.ua/"
# Folder that contains this script; every data path below is built from it.
directory = os.path.dirname(os.path.realpath(__file__))
# os.path.join supplies the path separator itself, so no hand-written "\"
# escapes (and no string concatenation) are needed.
env_path = os.path.join(directory, "chromedriver")
chromedriver_path = os.path.join(env_path, "chromedriver.exe")
# Section name -> eldorado.ua node URL. The main guard passes the keys to
# processing_goods_pages, which also uses them as folder names.
dict1 = {
    "Смартфоны и телефоны": "https://eldorado.ua/node/c1038944/",
    "Телевизоры и аудиотехника": "https://eldorado.ua/node/c1038957/",
    "Ноутбуки, ПК и Планшеты": "https://eldorado.ua/node/c1038958/",
    "Техника для кухни": "https://eldorado.ua/node/c1088594/",
    "Техника для дома": "https://eldorado.ua/node/c1088603/",
    "Игровая зона": "https://eldorado.ua/node/c1285101/",
    "Гаджеты и аксесуары": "https://eldorado.ua/node/c1215257/",
    "Посуда": "https://eldorado.ua/node/c1039055/",
    "Фото и видео": "https://eldorado.ua/node/c1038960/",
    "Красота и здоровье": "https://eldorado.ua/node/c1178596/",
    "Авто и инструменты": "https://eldorado.ua/node/c1284654/",
    "Спорт и туризм": "https://eldorado.ua/node/c1218544/",
    "Товары для дома и сада": "https://eldorado.ua/node/c1285161/",
    "Товары для детей": "https://eldorado.ua/node/c1085100/",
}
def openChrome_headless(url1, name):
    """Save the rendered HTML of ``url1`` to the file ``name + ".html"``.

    A new headless Chrome session is started for this one call and is always
    shut down again before the function returns or raises.

    Args:
        url1: Address of the page to load.
        name: Output file path without the ``.html`` extension.
    """
    options = webdriver.ChromeOptions()
    options.headless = True
    options.add_experimental_option("excludeSwitches", ['enable-automation'])
    options.add_argument(
        '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"')
    driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)
    try:
        driver.get(url=url1)
        sleep(1)  # fixed one-second pause before page_source is read
        try:
            with open(name + ".html", "w", encoding="utf-8") as file:
                file.write(driver.page_source)
        except Exception as ex:
            # Best effort: report the failed save and let the caller continue.
            print(ex)
    finally:
        # The outer try/finally guarantees the session is ended even when
        # get() raises. quit() alone is enough: it closes every window of the
        # session, so a preceding close() adds nothing and could itself raise
        # and prevent quit() from running.
        driver.quit()
def processing_goods_pages(name):
    """Download every goods page linked from the saved brand pages of ``name``.

    Each file in the ``brand_pages/<name>`` folder under ``directory`` is
    parsed; for every ``<header class="good-description">`` element the page
    behind its first ``<a>`` is fetched with ``openChrome_headless`` and stored
    in ``goods_pages/<name>/<brand page file name minus its last 5 chars>``.

    Args:
        name: Folder name used under both ``brand_pages`` and ``goods_pages``.
    """
    # List the same absolute folder the files are opened from, so the result
    # does not depend on the current working directory.
    brand_dir = f"{directory}\\brand_pages\\{name}"
    for n in os.listdir(brand_dir):
        with open(f"{brand_dir}\\{n}", encoding="utf-8") as file:
            soup = BeautifulSoup(file.read(), "lxml")

        # n[:-5] presumably strips a ".html" suffix from the file name.
        target_dir = f"{directory}\\goods_pages\\{name}\\{n[:-5]}"
        # Creates missing parent folders and is a no-op if the folder exists.
        os.makedirs(target_dir, exist_ok=True)

        for li in soup.find_all("header", class_="good-description"):
            ref = url + li.find('a').get('href')
            print(li.text)
            openChrome_headless(ref, f"{target_dir}\\{li.text}")
if __name__ == "__main__":
    # One task per key of dict1, spread over six worker processes. The
    # with-block shuts the pool down once map() has returned all results.
    with Pool(processes=6) as pool:
        pool.map(processing_goods_pages, list(dict1))
uj5u.com熱心網友回復:
您建立了 6 個行程來處理 14 個 URL——到目前為止一切順利。但是,池中的每個行程在處理某個 URL 時,會針對從該 URL 對應檔案中讀到的每一個連結各啟動一次無頭 Chrome 瀏覽器。我不知道平均每個 URL 要處理多少個連結,也無法斷定如此頻繁地開啟和關閉 Chrome 就是最終變慢的原因。但在我看來,如果您想要的多行程平行度是 6,那麼您就不應該需要啟動超過 6 個 Chrome 工作階段。不過,要做到這一點,需要對程式碼進行一些重構。
我要指出的第一件事是,這項工作大概也可以改用多執行緒而不是多行程來完成。BeautifulSoup 和 lxml 解析器確實會做一些 CPU 密集型的處理,但我相當懷疑,與啟動 6 次 Chrome 並擷取各個 URL 的頁面相比,這點開銷微不足道——尤其是您在擷取 URL 之後還寫死了 1 秒的等待(稍後會再談這一點)。
這個想法是:把多執行緒池中每個執行緒目前開啟的 Chrome 驅動程式存放在該執行緒的執行緒區域儲存(thread-local storage)中,並且在程式結束之前絕不對驅動程式呼叫 quit。原本在函式 openChrome_headless 中的邏輯現在需要移到一個新的專用函式 create_driver 中,processing_goods_pages 可以呼叫它來取得目前執行緒的 Chrome 驅動程式(如果目前還沒有,就建立一個)。但這也意味著,原本在 openChrome_headless 中與特定 URL 相關的程式碼現在需要移到 processing_goods_pages 中。
最後,刪除執行緒區域儲存並執行垃圾回收器,以確保 Driver 類別所有實例的解構函式(__del__)都會被執行,從而確保所有 Chrome 驅動程式實例都已「退出」(quit)。
由於我無法存取您的檔案,這段程式碼顯然無法經過完整測試,因此其中可能有一個、甚至十個錯字。祝您好運。
還有一點要補充:與其在呼叫 driver.get(ref) 之後呼叫 sleep(1),不如研究一下改為呼叫 driver.implicitly_wait(1),接著再用驅動程式去定位某個元素——該元素一旦存在,就能保證頁面上您需要寫出的所有內容都已載入(如果存在這樣的元素的話)。這樣一來,您只需等待連結出現所需的最短時間。當然,如果頁面初次載入之後 DOM 不會再被 AJAX 呼叫修改,那就根本不需要 sleep。
import os
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
# Use multithreading instead of multiprocessing
from multiprocessing.pool import ThreadPool
import threading
# Required variables
url = "https://eldorado.ua/"
# Folder that contains this script; every data path below is built from it.
directory = os.path.dirname(os.path.realpath(__file__))
# os.path.join supplies the path separator itself, so no hand-written "\"
# escapes (and no string concatenation) are needed.
env_path = os.path.join(directory, "chromedriver")
chromedriver_path = os.path.join(env_path, "chromedriver.exe")
class Driver:
    """Holds one headless Chrome WebDriver and quits it when finalized.

    Attributes are documented inline: ``driver`` is the ``webdriver.Chrome``
    instance created in ``__init__``.
    """

    def __init__(self):
        """Start a headless Chrome session using ``chromedriver_path``."""
        options = webdriver.ChromeOptions()
        options.headless = True
        options.add_experimental_option("excludeSwitches", ['enable-automation'])
        options.add_argument(
            '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"')
        self.driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)

    def __del__(self):
        """Quit the wrapped driver, if one was ever created."""
        # __del__ also runs for an instance whose __init__ raised before
        # self.driver was assigned; the guard keeps that case from producing
        # a second, misleading AttributeError.
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()  # clean up driver when we are cleaned up
# Per-thread storage: each thread sees its own ``the_driver`` attribute.
threadLocal = threading.local()


def create_driver():
    """Return the calling thread's Chrome driver, creating it on first use.

    The ``Driver`` wrapper is cached on ``threadLocal`` so later calls from the
    same thread get the same underlying driver back.
    """
    try:
        holder = threadLocal.the_driver
    except AttributeError:
        # Nothing stored for this thread yet.
        holder = None
    if holder is None:
        holder = Driver()
        threadLocal.the_driver = holder
    return holder.driver
# Section name -> eldorado.ua node URL. The main guard passes the keys to
# processing_goods_pages, which also uses them as folder names.
dict1 = {
    "Смартфоны и телефоны": "https://eldorado.ua/node/c1038944/",
    "Телевизоры и аудиотехника": "https://eldorado.ua/node/c1038957/",
    "Ноутбуки, ПК и Планшеты": "https://eldorado.ua/node/c1038958/",
    "Техника для кухни": "https://eldorado.ua/node/c1088594/",
    "Техника для дома": "https://eldorado.ua/node/c1088603/",
    "Игровая зона": "https://eldorado.ua/node/c1285101/",
    "Гаджеты и аксесуары": "https://eldorado.ua/node/c1215257/",
    "Посуда": "https://eldorado.ua/node/c1039055/",
    "Фото и видео": "https://eldorado.ua/node/c1038960/",
    "Красота и здоровье": "https://eldorado.ua/node/c1178596/",
    "Авто и инструменты": "https://eldorado.ua/node/c1284654/",
    "Спорт и туризм": "https://eldorado.ua/node/c1218544/",
    "Товары для дома и сада": "https://eldorado.ua/node/c1285161/",
    "Товары для детей": "https://eldorado.ua/node/c1085100/",
}
def processing_goods_pages(name):
    """Download every goods page linked from the saved brand pages of ``name``.

    Each file in the ``brand_pages/<name>`` folder under ``directory`` is
    parsed; for every ``<header class="good-description">`` element the page
    behind its first ``<a>`` is loaded in the calling thread's shared Chrome
    driver and written to
    ``goods_pages/<name>/<brand page file name minus its last 5 chars>``.

    Args:
        name: Folder name used under both ``brand_pages`` and ``goods_pages``.
    """
    # List the same absolute folder the files are opened from, so the result
    # does not depend on the current working directory.
    brand_dir = f"{directory}\\brand_pages\\{name}"
    for n in os.listdir(brand_dir):
        with open(f"{brand_dir}\\{n}", encoding="utf-8") as file:
            soup = BeautifulSoup(file.read(), "lxml")

        # n[:-5] presumably strips a ".html" suffix from the file name.
        target_dir = f"{directory}\\goods_pages\\{name}\\{n[:-5]}"
        # Creates missing parent folders and is a no-op if the folder exists.
        os.makedirs(target_dir, exist_ok=True)

        links = soup.find_all("header", class_="good-description")
        driver = create_driver()
        for li in links:
            ref = url + li.find('a').get('href')
            print(li.text)
            driver.get(ref)
            sleep(1)  # fixed one-second pause before page_source is read
            # Keep the output path in its own variable: rebinding ``name``
            # here would corrupt every path built for the following links
            # and brand pages.
            page_path = f"{target_dir}\\{li.text}"
            try:
                with open(page_path + ".html", "w", encoding="utf-8") as file:
                    file.write(driver.page_source)
            except Exception as ex:
                # Best effort: report the failed save and go on to the next link.
                print(ex)
if __name__ == "__main__":
    import gc

    pool = ThreadPool(processes=6)
    try:
        # One task per key of dict1, spread over six worker threads.
        pool.map(processing_goods_pages, list(dict1))
    finally:
        # close() + join() wait for the worker threads to exit before the
        # drivers are torn down, and this also runs when map() raises.
        pool.close()
        pool.join()
        # Quit all the Selenium drivers:
        del threadLocal
        gc.collect()  # a little extra insurance
轉載請註明出處,本文鏈接:https://www.uj5u.com/yidong/364311.html
