我想獲取 http://www.delixi-electric.com/cpzx/index.htm 這個網站的電氣說明書 PDF,代碼如下:
import requests
import urllib.request
from bs4 import BeautifulSoup
import re
import os
def collect_category(url):
    """Collect the category page URLs linked from a listing page.

    Fetches ``url``, selects every anchor matching
    ``div.pro_menu > dl > dd > a`` and turns each ``href`` into an absolute
    URL on ``http://www.delixi-electric.com/``.  The resulting list is
    printed before it is returned.

    Args:
        url: Address of the page whose category menu is scraped.

    Returns:
        list[str]: Absolute category URLs in document order; empty when the
        selector matches nothing.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    base_url = "http://www.delixi-electric.com/"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")

    category_urls = []
    for link in soup.select("div.pro_menu > dl > dd > a"):
        href = link.get("href")
        if not href:
            # An anchor without href would make string concatenation raise.
            continue
        # urljoin keeps already-absolute links intact and avoids the "//"
        # that plain concatenation produces for hrefs starting with "/".
        absolute = urljoin(base_url, href)
        if not absolute.startswith(("http://", "https://")):
            # Skip pseudo-links such as "javascript:void(0)".
            continue
        category_urls.append(absolute)
    print(category_urls)
    return category_urls
def collect_items(url):
    """Collect the product page URLs listed on a category page.

    Fetches ``url``, selects every anchor matching ``#pro_list > li > a`` and
    turns each ``href`` into an absolute URL on
    ``http://www.delixi-electric.com/``.

    Args:
        url: Address of the category page to scrape.

    Returns:
        list[str]: Absolute product page URLs in document order; empty when
        the selector matches nothing.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    base_url = "http://www.delixi-electric.com/"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")

    items_urls = []
    for link in soup.select("#pro_list > li > a"):
        href = link.get("href")
        if not href:
            # An anchor without href would make string concatenation raise.
            continue
        # urljoin keeps already-absolute links intact and avoids the "//"
        # that plain concatenation produces for hrefs starting with "/".
        absolute = urljoin(base_url, href)
        if not absolute.startswith(("http://", "https://")):
            # Skip pseudo-links such as "javascript:void(0)".
            continue
        items_urls.append(absolute)
    return items_urls
def download_pdf(url):
    """Download every PDF linked from a product page into ``D:/delixi``.

    Fetches ``url``, selects the anchors matching
    ``body > div:nth-child(10) > div > div:nth-child(3) > ul > li > a`` and
    saves each target as ``D:/delixi/<link text>.pdf``.  Links whose file
    already exists are skipped.  Progress messages are printed, and the
    "downloaded" message reads the module-level counter ``n``, which the
    caller must have defined.

    Args:
        url: Address of the product page whose PDF links are downloaded.

    Returns:
        None.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    base_url = "http://www.delixi-electric.com/"
    save_dir = 'D:/delixi'
    block_sz = 8192
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")
    urls = soup.select("body > div:nth-child(10) > div > div:nth-child(3) > ul > li > a")

    # open() does not create missing directories, so make sure it exists.
    os.makedirs(save_dir, exist_ok=True)

    for link in urls:
        name = link.get_text()
        # Drop every character Windows forbids in file names, not only "/".
        name = re.sub(r'[\\/:*?"<>|\r\n\t]', "", name)
        print(name)
        target = '%s/%s.pdf' % (save_dir, name)
        if os.path.exists(target):
            print("檔案已存在")
            continue

        href = link.get("href")
        if not href:
            # An anchor without href would make string concatenation raise.
            continue
        # urljoin keeps already-absolute links intact and avoids the "//"
        # that plain concatenation produces for hrefs starting with "/".
        pdf_url = urljoin(base_url, href)
        if not pdf_url.startswith(("http://", "https://")):
            # Skip pseudo-links such as "javascript:void(0)".
            continue
        print(pdf_url)

        # Send the same browser User-Agent as the page request.
        request = urllib.request.Request(pdf_url, headers=headers)
        # Write to a temporary name first: an interrupted download must not
        # leave a truncated file that the exists-check above would then skip.
        partial = target + '.part'
        # The context manager closes the HTTP response even on errors.
        with urllib.request.urlopen(request, timeout=60) as u:
            print("進入成功,正在下載......")
            with open(partial, 'wb') as f:
                for buffer in iter(lambda: u.read(block_sz), b""):
                    f.write(buffer)
        os.replace(partial, target)
        print('第%d個檔案已下載' % n)
    # NOTE(review): the pasted source lost its indentation; this separator is
    # assumed to be printed once per product page - confirm.
    print("=====================")
# Driver: walk category pages -> product pages -> PDF links.
# NOTE(review): the request text above mentions .../cpzx/index.htm while the
# crawl starts from .../dcyb/index.htm - confirm which entry page is intended.
url = "http://www.delixi-electric.com/dcyb/index.htm"
category_urls = collect_category(url)
print("目錄鏈接收集完畢")

# Module-level counter; download_pdf() reads it when reporting progress.
n = 0
for category_url in category_urls:
    items_urls = collect_items(category_url)
    print("準備開始下載PDF")
    for item_url in items_urls:
        # Incremented once per product page, before its PDFs are fetched.
        n += 1
        download_pdf(item_url)
print("全部檔案下載完畢")
求大佬幫我看看應該怎麼改。
uj5u.com熱心網友回復:
我也是剛開始看 requests,所以也不是很懂,以下純屬瞎說 0.0。首先,試了一下樓主的代碼,這一行完全沒有獲取到任何資料:
urls = soup.select("div.pro_menu > dl > dd > a")
在瀏覽器中F12看到的 pdf 下載部分原始碼
<!--http://www.delixi-electric.com/dcyb/index.htm-->
<li class="down_right_bd">
<h2>DZ47M 系列斷路器樣本.pdf</h2>
<span>2.90 Mb</span>
<a href="https://bbs.csdn.net/u/cms/delixi/201905/30101510v12d.pdf" class="down_bd">查 看</a>
<a href="https://bbs.csdn.net/u/cms/delixi/201905/30101510v12d.pdf" class="down_hd"><img src="https://img.uj5u.com/2020/10/02/146682020626351.png"></a>
<a href="javascript:void(0)" onclick="downLoadWinShow('/u/cms/delixi/201905/30101510v12d.pdf','DZ47M 系列斷路器樣本.pdf')" class="down_hd">
<img src="https://img.uj5u.com/2020/10/02/146682020626352.jpg">
</a>
</li>
之後嘗試匹配這段也沒有成功,原來這一段不在這個頁面的原始碼中。
通過瀏覽器工具,在頁面加載過程中抓到了這個請求(對應的結果是 PDF 串列):

然後……我嘗試使用 python 獲取這個請求,結果回傳了 500 錯誤……
最終也沒成功,不過希望這些東西可能會對樓主有用……
uj5u.com熱心網友回復:
好的,謝謝,我再看看吧。
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/147510.html
