python爬取守望先鋒英雄圖片以及描述-有解無憂

先爬取英雄圖片

匯入庫

import requests
import re
from bs4 import BeautifulSoup 
import urllib

爬取網頁html

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36 Edg/94.0.992.47'
    } 
def get_html(url): 
 # 模擬瀏覽器訪問
    response = requests.get(url, headers=headers)  # 訪問網站
    response.encoding = response.apparent_encoding #字符編碼格式
    html = response.text  # 獲取網頁原始碼
    return html

再找出html里面所有以png結尾的圖片鏈接，保存到本地

r = get_html('https://ow.blizzard.cn/heroes/')
bs=BeautifulSoup(r,'html.parser')
#正則運算式爬取img以png結尾的鏈接
img_urls=bs.find_all('img', {'src': re.compile('.+(.png)$')})
for i in range(len(img_urls)):
    url = img_urls[i]['src']
    # 下載圖片
    urllib.request.urlretrieve(url, 'C:/Users/15584/Desktop/overwatch/picture/' +"hero"+str(i) + '.jpg')

再獲取每個英雄的具體描述，

首先找出所有的內鏈接并且以/heroes開頭

#正則運算式爬取以/heroes開頭的內鏈接
for link in bs.find_all('a',href=re.compile('^((\/heroes.))')):
    a.append(link.text)
    if 'href' in link.attrs:
        b.append(link['href'])

再訪問內鏈接，找出所有的p標簽

url="https://ow.blizzard.cn"
for link1 in b:
    c.append(url+link1)
for link2 in c:
    r1 = get_html(link2)
    bs1=BeautifulSoup(r1,'html.parser')
    #print(bs1.p.text)
    d.append(bs1.find_all('p'))

再對找出來的p標簽進行去除多余的html

for content in d:
    content = str(content)
    # 使用正則運算式匹配找到的文本內容中的所有<.*?>的標簽
    pattern = re.compile(r'<.*?>|\n|\s', re.S)
    # 將匹配到的內容用空來替換
    content_list = pattern.sub('', content)
    e.append(content_list)
#洗掉串列中第一個元素

最后保存為檔案

#洗掉串列中第一個元素
del(e[0])
#依次保存檔案
for j in range(len(e)):    
    with open("C:\\Users\\15584\\Desktop\\overwatch\\hero describe\\"+"hero"+str(j)+'.txt', mode='w', encoding='utf-8') as f:
        f.write(e[j])

結果展示如下：

會在overwatch下產生兩個檔案夾

產看picture為英雄圖片，hero describe為英雄描述

隨機查看一個英雄的描述

完整代碼如下：

（代碼比較粗糙，還需后期修改）

import requests
import re
from bs4 import BeautifulSoup 
import urllib
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36 Edg/94.0.992.47'
    } 
def get_html(url): 
 # 模擬瀏覽器訪問
    response = requests.get(url, headers=headers)  # 訪問網站
    response.encoding = response.apparent_encoding #字符編碼格式
    html = response.text  # 獲取網頁原始碼
    return html  
a=[]
b=[]
c=[]
d=[]
e=[]
r = get_html('https://ow.blizzard.cn/heroes/')
bs=BeautifulSoup(r,'html.parser')
#正則運算式爬取img以png結尾的鏈接
img_urls=bs.find_all('img', {'src': re.compile('.+(.png)$')})
for i in range(len(img_urls)):
    url = img_urls[i]['src']
    # 下載圖片
    urllib.request.urlretrieve(url, 'C:/Users/15584/Desktop/overwatch/picture/' +"hero"+str(i) + '.jpg')
#正則運算式爬取以/heroes開頭的內鏈接
for link in bs.find_all('a',href=re.compile('^((\/heroes.))')):
    a.append(link.text)
    if 'href' in link.attrs:
        b.append(link['href'])
url="https://ow.blizzard.cn"
for link1 in b:
    c.append(url+link1)
for link2 in c:
    r1 = get_html(link2)
    bs1=BeautifulSoup(r1,'html.parser')
    #print(bs1.p.text)
    d.append(bs1.find_all('p'))
for content in d:
    content = str(content)
    # 使用正則運算式匹配找到的文本內容中的所有<.*?>的標簽
    pattern = re.compile(r'<.*?>|\n|\s', re.S)
    # 將匹配到的內容用空來替換
    content_list = pattern.sub('', content)
    e.append(content_list)
#洗掉串列中第一個元素
del(e[0])
#依次保存檔案
for j in range(len(e)):    
    with open("C:\\Users\\15584\\Desktop\\overwatch\\hero describe\\"+"hero"+str(j)+'.txt', mode='w', encoding='utf-8') as f:
        f.write(e[j])

轉載請註明出處，本文鏈接：https://www.uj5u.com/ruanti/319871.html

標籤：其他

上一篇：2021-10-16我的第一篇博客——從跌倒處爬起

下一篇：【思維導圖】廠長爆肝計算機網路(第七版)(?建議收藏)