我想撰寫一個網路爬蟲,需要把從頁面上抓到的鏈接加入執行緒池所處理的陣列中;但執行緒池只會處理一開始給定的 url,不會處理我在 def 函式中附加進去的鏈接。
from concurrent import futures
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from urllib.request import urlopen
def linksSearchAndAppend(url):
    """Fetch *url*, collect the targets of its <a> tags and add them to ``urls``.

    Relative links (for example ``/about``) are resolved against *url*, so
    every collected entry is an absolute address that can be fetched later.

    Args:
        url: Address of the page to download and scan.

    Errors raised by ``urlopen`` (for example ``urllib.error.URLError``)
    propagate to the caller.
    """
    # Imported locally so the function brings its own dependency into scope.
    from urllib.parse import urljoin

    global urls
    req = Request(url)
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(req) as html_page:
        soup = BeautifulSoup(html_page, "lxml")
    links = []
    for link in soup.findAll('a'):
        href = link.get('href')
        # Anchors without an href attribute yield None; skip those.
        if not href:
            continue
        # urljoin turns "/path" style links into absolute URLs based on the
        # page address and leaves already-absolute links untouched.
        links.append(urljoin(url, href))
    # extend() adds every link as its own entry; append() would store the
    # whole list as one nested element.
    urls.extend(links)
    print(urls)
# Load the seed URLs, one per line. The context manager closes the file even
# if reading fails.
with open("urlList.txt", "r") as urlListend:
    # Blank lines are skipped because an empty string is not a fetchable URL.
    urls = [line.rstrip() for line in urlListend if line.strip()]

# Fetch the seed pages concurrently on up to 8 worker threads. Leaving the
# with-block waits for all submitted tasks, like an explicit e.shutdown().
with futures.ThreadPoolExecutor(max_workers=8) as e:
    # Submit from a snapshot: the workers add entries to ``urls`` while this
    # runs, and iterating a list that other threads are growing is racy.
    submitted = {e.submit(linksSearchAndAppend, seed): seed for seed in list(urls)}

# An exception raised inside a worker is stored on its future and would
# otherwise go unnoticed, so report each failure explicitly.
for future, url in submitted.items():
    error = future.exception()
    if error is not None:
        print("failed:", url, repr(error))
uj5u.com熱心網友回復:
from concurrent import futures
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from urllib.request import urlopen
def linksSearchAndAppend(url):
    """Fetch *url*, collect the targets of its <a> tags and add them to ``urls``.

    Relative links are resolved against *url* so that each collected entry
    is an absolute address. The links found on this page are printed.

    Args:
        url: Address of the page to download and scan.

    Errors raised by ``urlopen`` (for example ``urllib.error.URLError``)
    propagate to the caller.
    """
    # Imported locally so the function brings its own dependency into scope.
    from urllib.parse import urljoin

    global urls
    req = Request(url)
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(req) as html_page:
        soup = BeautifulSoup(html_page, "lxml")
    links = []
    for link in soup.findAll('a'):
        href = link.get('href')
        # Anchors without an href attribute yield None; skip those.
        if href:
            links.append(urljoin(url, href))
    # extend() adds every link as its own entry; append() would store the
    # whole list as one nested element.
    urls.extend(links)
    print(links)
# Load the seed URLs, one per line. The context manager closes the file even
# if reading fails.
with open("urlList.txt", "r") as urlListend:
    # Blank lines are skipped because an empty string is not a fetchable URL.
    urls = [line.rstrip() for line in urlListend if line.strip()]

# Fetch the seed pages concurrently on up to 8 worker threads. Leaving the
# with-block waits for all submitted tasks, like an explicit e.shutdown().
with futures.ThreadPoolExecutor(max_workers=8) as e:
    # Submit from a snapshot: the workers add entries to ``urls`` while this
    # runs, and iterating a list that other threads are growing is racy.
    submitted = {e.submit(linksSearchAndAppend, seed): seed for seed in list(urls)}

# An exception raised inside a worker is stored on its future and would
# otherwise go unnoticed, so report each failure explicitly.
for future, url in submitted.items():
    error = future.exception()
    if error is not None:
        print("failed:", url, repr(error))
uj5u.com熱心網友回復:
這樣可以運作,但它仍然需要一個“alreadysearchedUrls”陣列,這樣才不會重複搜索已經搜索過的“urls”。
from concurrent import futures
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from urllib.request import urlopen
def linksSearchAndAppend(url):
    """Fetch *url*, collect the targets of its <a> tags and add them to ``urls``.

    Relative links are resolved against *url* so that each collected entry
    is an absolute address. The complete ``urls`` list is printed afterwards.

    Args:
        url: Address of the page to download and scan.

    Errors raised by ``urlopen`` (for example ``urllib.error.URLError``)
    propagate to the caller.
    """
    # Imported locally so the function brings its own dependency into scope.
    from urllib.parse import urljoin

    global urls
    req = Request(url)
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(req) as html_page:
        soup = BeautifulSoup(html_page, "lxml")
    # Keep only anchors that actually carry an href (get() returns None
    # otherwise) and make each one absolute.
    links = [
        urljoin(url, anchor.get('href'))
        for anchor in soup.findAll('a')
        if anchor.get('href')
    ]
    # extend() adds every link as its own entry; append() would store the
    # whole list as one nested element.
    urls.extend(links)
    print(urls)
# Load the seed URLs, one per line. The context manager closes the file even
# if reading fails.
with open("urlList.txt", "r") as urlListend:
    # Blank lines are skipped because an empty string is not a fetchable URL.
    urls = [line.rstrip() for line in urlListend if line.strip()]

# Crawl in rounds. The outer loop runs once per entry of ``urls``; because
# the workers add entries during each round, it keeps going for as long as
# the list keeps growing.
# NOTE: nothing records which pages were already fetched, so every round
# submits all URLs collected so far again.
for i in urls:
    # Take the snapshot before any worker of this round is started, so the
    # list being submitted is never modified while it is iterated.
    batch = list(urls)
    # One pool of up to 8 threads per round; leaving the with-block waits
    # for all submitted tasks, like an explicit e.shutdown().
    with futures.ThreadPoolExecutor(max_workers=8) as e:
        submitted = {e.submit(linksSearchAndAppend, target): target for target in batch}
    # An exception raised inside a worker is stored on its future and would
    # otherwise go unnoticed, so report each failure explicitly.
    for future, url in submitted.items():
        error = future.exception()
        if error is not None:
            print("failed:", url, repr(error))
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/371827.html
