我正在嘗試撰寫一個程式來獲取網頁上的所有鏈接,甚至來自子目錄。我有這個與請求包一起作業,但是當你必須從很多子目錄中獲取鏈接時它很慢。這是我的作業代碼,從https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/收集所有鏈接大約需要 4 分鐘。
import requests
import re
from bs4 import BeautifulSoup
def get_html(base_url):
req = requests.get(base_url)
return req.text if (req.status_code == 200) else ''
def get_links(html_page):
soup = BeautifulSoup(html_page, "lxml") # removed "html.parser"
regex = r'(.nc$)|(/$)'
links = [f"{base_url}{link.get('href')}" for link in soup.findAll('a', attrs={'href': re.compile(regex)})]
return links
def get_sub_dirs(links):
sub_dirs = [link for link in links if re.search(r'/$', link)]
return sub_dirs
def get_files(links):
file_links = [link for link in links if re.search(r'.nc$', link)]
return file_links
def main(base_url):
files = []
html_page = get_html(base_url)
links = get_links(html_page)
sub_dirs = get_sub_dirs(links)
base_files = get_files(links)
files.append(base_files)
for sub in sub_dirs:
sub_files = main(sub)
files.append(sub_files)
return files
# Run programe
base_url = 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/'
files = main(base_url)
我認為代碼中的瓶頸是get_html()函式,需要幾秒鐘才能取回html。我認為可以使用異步函式優化此代碼,但我正在努力使其作業。這是我對代碼的異步版本的嘗試:
import aiohttp
import asyncio
import re
from bs4 import BeautifulSoup
async def get_html_async(base_url):
async with aiohttp.ClientSession() as client:
async with client.get(base_url) as resp:
return await resp.text() if (resp.status == 200) else ''
def get_links(html_page):
soup = BeautifulSoup(html_page, "lxml") # removed "html.parser"
regex = r'(.nc$)|(/$)'
links = [f"{base_url}{link.get('href')}" for link in soup.findAll('a', attrs={'href': re.compile(regex)})]
return links
def get_sub_dirs(links):
sub_dirs = [link for link in links if re.search(r'/$', link)]
return sub_dirs
def get_files(links):
file_links = [link for link in links if re.search(r'.nc$', link)]
return file_links
async def get_tasks(session):
async with aiohttp.ClientSession() as client:
async with client.get(url) as resp:
return await resp.text() if (resp.status == 200) else ''
async def main(base_url):
files = []
html_page = await asyncio.gather(get_html_async(base_url))
links = get_links(html_page[0])
sub_dirs = get_sub_dirs(links)
base_files = get_files(links)
files.append(base_files)
for sub in sub_dirs:
sub_files = await asyncio.gather(main(sub))
files.append(sub_files)
return files
# Run program
base_url = 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/'
files = asyncio.gather(main(base_url))
任何幫助將不勝感激。謝謝!
uj5u.com熱心網友回復:
通過呼叫asyncio.gather()您的方式,您可以像以前一樣按順序運行您的請求。asyncio.gather()將多個可迭代物件作為引數來同時運行它們。asyncio.gather()只用一個 awaitable呼叫是沒有意義的,因為那樣你就可以簡單地等待它了。通過在main()不等待它們的情況下創建所有 coros,然后將它們全部傳遞給asyncio.gather()您,您將獲得顯著的加速:
# some minor fixes added
import asyncio
import re
from itertools import chain
import aiohttp
from bs4 import BeautifulSoup
async def get_html_async(base_url):
async with aiohttp.ClientSession(
connector=aiohttp.TCPConnector(ssl=False) # I got ssl errors on my machine
) as client:
async with client.get(base_url) as resp:
return await resp.text() if (resp.status == 200) else ""
def get_links(html_page):
soup = BeautifulSoup(html_page, "lxml") # removed "html.parser"
regex = r"(.nc$)|(/$)"
links = [
f"{base_url}{link.get('href')}"
for link in soup.findAll("a", attrs={"href": re.compile(regex)})
]
return links
def get_sub_dirs(links):
sub_dirs = [link for link in links if re.search(r"/$", link)]
return sub_dirs
def get_files(links):
file_links = [link for link in links if re.search(r".nc$", link)]
return file_links
async def main(base_url):
files = []
html_page = await get_html_async(base_url)
links = get_links(html_page) # removed indexing 'html_page[0]'
sub_dirs = get_sub_dirs(links)
base_files = get_files(links)
files.extend(base_files) # extend list to get "cleaner" output, keep using 'append' if your downstream code requires it
coros = [main(sub) for sub in sub_dirs] # create all requests
new_files = await asyncio.gather(*coros) # run all requests concurrently
files.extend(chain(*new_files)) # again, add to list as needed
return files
# Run program
base_url = "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/"
files = asyncio.run(main(base_url)) # or simply 'await main(base_url)' in IPython
print(files)
轉載請註明出處,本文鏈接:https://www.uj5u.com/qiye/403059.html
標籤:
