我撰寫了一段從網站提取文本並分析其可讀性的程式碼。但是,我的程式有時會卡在某個網站上。如果處理某個網站花費的時間超過 x 秒,有什麼方法可以讓程式跳到 for 迴圈的下一次迭代?如果有任何問題或需要澄清之處,請在評論中告訴我。
import time
import numpy as np
import pandas as pd
import openpyxl
import reqto as rq
from bs4 import BeautifulSoup
# from SpacySylGetter import *
# import readability
import selenium
from selenium import webdriver
# Load the spreadsheet that lists the pages to analyse, one row per URL.
TextIn = pd.read_excel('C:\\Users\\Max von Klemperer\\Desktop\\KeywordLinks\\Aus2.xlsx')

# Pull the four columns used by the script out as plain Python lists.
WebURLs, Region, Keywords, Rankings = (
    list(TextIn[column].values)
    for column in ("URL", "Region", "Keyword", "Ranking")
)

# Running counters, all starting from zero.
spaces = syls = counter = characters = sentences = 0

# Per-page results; each name is bound to its own, separate list.
CLIs, FL, FLAuto, WebTexts = [], [], [], []
goodurl, goodKW, goodRegion, goodRanking = [], [], [], []

# One Chrome session is shared by every page fetch in the script.
driver = webdriver.Chrome('C:\\Users\\Max von Klemperer\\Downloads\\chromedriver.exe')
# Longest time, in seconds, that a single page may take to load.  When the
# limit is exceeded driver.get() raises, the handler below reports the page as
# "Bounced", and the loop moves on to the next URL instead of hanging.
PAGE_LOAD_TIMEOUT = 30
driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)

# Characters kept from the page text; everything else is dropped.
ALLOWED_CHARS = set('0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPGQRSTUVWXYZ-.?! \n')

# Walk the spreadsheet columns in lockstep so each URL always travels with its
# own region/keyword/ranking, even when earlier URLs were skipped.
for url, region, keyword, ranking in zip(WebURLs, Region, Keywords, Rankings):
    try:
        time.sleep(1)  # small pause between requests
        driver.get(url)
        # NOTE(review): the find_element_by_* helpers were removed in
        # Selenium 4.3; use find_element(By.TAG_NAME, 'body') when upgrading.
        el = driver.find_element_by_tag_name('body')
        initText = el.text
    except Exception:
        # Covers load timeouts as well as any other WebDriver failure.
        print("Bounced")
        continue

    TextPros = ''.join(ch for ch in initText if ch in ALLOWED_CHARS)
    # Collapse every run of whitespace (including newlines) to one space.
    cleanedStr = ' '.join(TextPros.split())
    print(url)

    # Drop the first and last 600 characters of the page text.
    textToProc = cleanedStr[600:len(cleanedStr) - 600]
    # str.replace returns a new string, so the result has to be reassigned.
    textToProc = textToProc.replace("...", ".")
    textToProc = textToProc.replace("-", " ")
    textToProc = textToProc.replace(".com", " ")

    # Only keep pages whose remaining text is a usable size.
    if 1000 < len(textToProc) < 100000:
        print(textToProc)
        WebTexts.append(textToProc)
        goodurl.append(url)
        goodRegion.append(region)
        goodKW.append(keyword)
        goodRanking.append(ranking)
# Compute the Coleman-Liau index (CLI) of every collected text:
#     CLI = 0.0588 * L - 0.296 * S - 15.8
# where L is the number of letters per 100 words and S is the number of
# sentences per 100 words.
for text in WebTexts:
    words = len(text.split())
    commas = text.count(",")
    spaces = text.count(" ")
    Hyphens = text.count("-")
    # Every '.', '?' or '!' is counted as one sentence terminator.
    sentences = text.count(".") + text.count("?") + text.count("!")
    # Letters/digits only: total length minus spaces and punctuation.
    characters = len(text) - spaces - sentences - commas - Hyphens

    letters_per_100_words = characters / words * 100
    sentences_per_100_words = sentences / words * 100
    CLI = (0.0588 * letters_per_100_words) - (0.296 * sentences_per_100_words) - 15.8
    CLIs.append(CLI)
    print(CLI)
    # The Flesch reading-ease score is not computed here: it needs syllable
    # counts from sylsGet (SpacySylGetter), whose import is commented out.
# quit() ends the whole WebDriver session, including the chromedriver
# process; close() would only close the current browser window.
driver.quit()

# Assemble one row per successfully scraped page.  The lists were filled
# together, one entry each per kept page, so they share the same length.
CLIExcel = pd.DataFrame({
    "URL": np.array(goodurl),
    "CLI's": np.array(CLIs),
    "Region": np.array(goodRegion),
    "Keyword": np.array(goodKW),
    "Ranking": np.array(goodRanking),
})
print(CLIExcel)
CLIExcel.to_excel('C:\\Users\\Max von Klemperer\\Desktop\\WorkedCLI.xlsx')
uj5u.com熱心網友回復:
將您的這一行 el = driver.find_element_by_tag_name('body') 替換為:
timeout = 10
try:
el = WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
except TimeoutException:
continue
這將最多等待 10 秒讓該元素出現;如果逾時仍未找到,continue 會跳到 for 迴圈中的下一個 url 繼續執行。
您還需要以下匯入:
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
轉載請註明出處,本文鏈接:https://www.uj5u.com/gongcheng/457814.html
