多執行緒與爬蟲
- 目標url
- json中查找url
- 訪問url
- 讀取json
- 查看json的list陣列
- 全部圖片
- 粗暴的單執行緒獲取
- 多執行緒執行
目標url
查看http://pvp.qq.com/web201605/wallpaper.shtml
有一個需要注意的就是圖片url在html原始碼中找不到,直接被js渲染了,可以仔細查找訪問服務器的url

查看html原始碼找不到該鏈接

json中查找url
在一個workList中

使用csdn的json插件打開,查看鏈接

這些鏈接明顯不正常需要處理
參考urllib庫的parse
按照標準, URL 只允許一部分 ASCII 字符(數字字母和部分符號),其他的字符(如漢字)是不符合 URL 標準的
使用unquote進行解碼

可以訪問,明顯看出這解析度不是高清壁紙

對比高清壁紙

對比高清壁紙:
python獲取的url
http://shp.qpic.cn/ishow/2735040716/1617785834_84828260_32746_sProdImgNo_8.jpg/200
高清圖的url
http://shp.qpic.cn/ishow/2735040716/1617785834_84828260_32746_sProdImgNo_8.jpg/0
url結尾的數字是0,處理一下url就可以啦!
訪問url
查看RequestURL

添加ua

import requests
from urllib import parse
from lxml import etree
import xlwt
def requests_get(url, timeout=10):
    """Fetch ``url`` with browser-like headers and return the body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as a string (the endpoint does not return HTML),
        or an empty string when the request fails.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
        "Referer": "http://pvp.qq.com/web201605/wallpaper.shtml",
        "Host": "apps.game.qq.com",
    }
    try:
        # A timeout keeps a stalled connection from blocking the script forever.
        resp = requests.get(url, headers=headers, timeout=timeout)
        return resp.text
    except requests.RequestException as e:
        print(e)
        return ''
def deal_url(text):
    """Decode the percent-escapes in ``text`` and return the readable string."""
    decoded = parse.unquote(text)
    return decoded
if __name__ == '__main__':
    # Work-list endpoint as captured from the browser, including the
    # jsoncallback parameter of the original request.
    endpoint = 'http://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi'
    query = '&'.join([
        'activityId=2735',
        'sVerifyCode=ABCD',
        'sDataType=JSON',
        'iListNum=4',
        'totalpage=0',
        'page=0',
        'iOrder=0',
        'iSortNumClose=1',
        'jsoncallback=jQuery17107221495925873804_1618046009669',
        'iAMSActivityId=51991',
        '_everyRead=true',
        'iTypeId=1',
        'iFlowId=267733',
        'iActId=2735',
        'iModuleId=2735',
        '_=1618046009859',
    ])
    url = endpoint + '?' + query
    body = requests_get(url)
    print(body)
查看列印的內容,回應成功拿到了,但是資料被包在jQuery的回呼函式(JSONP)中
去掉url中jsoncallback=jQuery
- 原始
url='http://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=4&totalpage=0&page=0&iOrder=0&iSortNumClose=1&jsoncallback=jQuery17107221495925873804_1618046009669&iAMSActivityId=51991&_everyRead=true&iTypeId=1&iFlowId=267733&iActId=2735&iModuleId=2735&_=1618046009859'
- 去掉之後
url='http://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=4&totalpage=0&page=0&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=1&iFlowId=267733&iActId=2735&iModuleId=2735&_=1618046009859'
回傳json的格式
讀取json
查看json的list陣列
List陣列
- sProdImgNo_[1-8] 對應8張不同解析度的圖片
- sProdName 圖片名字


初步獲取圖片的名字和8張解析度不同的圖片
import requests
from urllib import parse
from lxml import etree
import xlwt
def requests_get(url, cookie=None, timeout=10):
    """Request the work list as JSON and print each entry's name and image URLs.

    Args:
        url: Work-list endpoint URL whose response body is plain JSON.
        cookie: Optional ``Cookie`` header value. A personal login cookie
            must not be hard-coded in source, so it is opt-in.
            NOTE(review): the first listing of this article fetches the same
            endpoint with the cookie commented out, so it is presumably not
            required -- confirm.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always an empty string; the results are printed, not returned.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
        "Referer": "http://pvp.qq.com/web201605/wallpaper.shtml",
        "Host": "apps.game.qq.com",
    }
    if cookie:
        headers["Cookie"] = cookie
    try:
        # A timeout keeps a stalled connection from blocking the script forever.
        resp = requests.get(url, headers=headers, timeout=timeout)
        payload = resp.json()
        for entry in payload['List']:
            img_data = img_url(entry)
            img_name = parse.unquote(entry['sProdName'])
            print('-' * 30)
            print(img_name)
            print(img_data)
            print('-' * 30)
    except Exception as e:
        # Best effort: report network, JSON and schema problems and carry on.
        print(e)
    return ''
def img_url(data):
    """Return the eight decoded, full-resolution image URLs of one entry.

    Args:
        data: Mapping that holds the percent-encoded image URLs under the
            keys ``sProdImgNo_1`` .. ``sProdImgNo_8``.

    Returns:
        A list of eight decoded URLs. A URL ending in ``/200`` has that
        trailing segment replaced by ``/0``; any other URL is returned as
        decoded.

    Raises:
        KeyError: If one of the eight keys is missing from ``data``.
    """
    urls = []
    for x in range(1, 9):
        decoded = parse.unquote(data['sProdImgNo_%d' % x])
        # Only the trailing size segment may change: replacing every "200"
        # would also corrupt IDs or timestamps that happen to contain it.
        if decoded.endswith('/200'):
            decoded = decoded[:-len('/200')] + '/0'
        urls.append(decoded)
    return urls
if __name__ == '__main__':
    # Same work-list request as before, without the jsoncallback parameter.
    endpoint = 'http://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi'
    query = '&'.join([
        'activityId=2735',
        'sVerifyCode=ABCD',
        'sDataType=JSON',
        'iListNum=4',
        'totalpage=0',
        'page=0',
        'iOrder=0',
        'iSortNumClose=1',
        'iAMSActivityId=51991',
        '_everyRead=true',
        'iTypeId=1',
        'iFlowId=267733',
        'iActId=2735',
        'iModuleId=2735',
        '_=1618046009859',
    ])
    url = endpoint + '?' + query
    result = requests_get(url)
    print(result)

全部圖片
只是獲取4個英雄的圖片

修改iListNum=20獲取20個英雄的
修改page實作翻頁
url='''http://apps.game.qq.com/cgi-
bin/ams/module/ishow/V1.0/query/workList_inc.cgi?
activityId=2735&sVerifyCode=ABCD&sDataType=JSON
&iListNum=4
&totalpage=0
&page=0
&iOrder=0
&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=1&iFlowId=267733&iActId=2735&iModuleId=2735&_=1618046009859'''
粗暴的單執行緒獲取
import requests
from urllib import parse,request
import os
def requests_get(url, cookie=None, timeout=10):
    """Download every image listed on one work-list page into ``images/<name>/``.

    Args:
        url: Work-list endpoint URL whose response body is plain JSON.
        cookie: Optional ``Cookie`` header value. A personal login cookie
            must not be hard-coded in source, so it is opt-in.
            NOTE(review): the first listing of this article fetches the same
            endpoint with the cookie commented out, so it is presumably not
            required -- confirm.
        timeout: Seconds to wait for the work-list request before giving up.

    Returns:
        Always ``True``; failures are printed instead of raised.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
        "Referer": "http://pvp.qq.com/web201605/wallpaper.shtml",
        "Host": "apps.game.qq.com",
    }
    if cookie:
        headers["Cookie"] = cookie
    try:
        # A timeout keeps a stalled connection from blocking the script forever.
        resp = requests.get(url, headers=headers, timeout=timeout)
        payload = resp.json()
        for entry in payload['List']:
            img_data = img_url(entry)
            # ':' is stripped via the "1:1" marker before the name is used
            # as a directory name.
            img_name = parse.unquote(entry['sProdName']).replace("1:1", '').strip()
            dirpath = os.path.join('images', img_name)
            # makedirs also creates the parent "images" directory and does
            # not fail when the target already exists.
            os.makedirs(dirpath, exist_ok=True)
            for index, image_url in enumerate(img_data, start=1):
                target = os.path.join(dirpath, '%d.jpg' % index)
                try:
                    request.urlretrieve(image_url, target)
                    print('%s下載完成' % (image_url))
                except OSError:
                    # One broken image must not abort the rest of the page.
                    print('%s下載失敗' % (image_url))
    except Exception as e:
        # Best effort: report network, JSON and schema problems and carry on.
        print(e)
    return True
def img_url(data):
    """Return the eight decoded, full-resolution image URLs of one entry.

    Args:
        data: Mapping that holds the percent-encoded image URLs under the
            keys ``sProdImgNo_1`` .. ``sProdImgNo_8``.

    Returns:
        A list of eight decoded URLs. A URL ending in ``/200`` has that
        trailing segment replaced by ``/0``; any other URL is returned as
        decoded.

    Raises:
        KeyError: If one of the eight keys is missing from ``data``.
    """
    urls = []
    for x in range(1, 9):
        decoded = parse.unquote(data['sProdImgNo_%d' % x])
        # Only the trailing size segment may change: replacing every "200"
        # would also corrupt IDs or timestamps that happen to contain it.
        if decoded.endswith('/200'):
            decoded = decoded[:-len('/200')] + '/0'
        urls.append(decoded)
    return urls
if __name__ == '__main__':
    # Keep the template separate from the formatted URL: formatting it in
    # place removes the {page} placeholder after the first iteration, so
    # every later request would fetch the same page again.
    url_template = 'http://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=40&totalpage=0&page={page}&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=1&iFlowId=267733&iActId=2735&iModuleId=2735&_=1618046009859'
    # NOTE(review): the request captured from the browser uses page=0 for the
    # first page, so numbering starts at 0 here -- confirm the page count.
    for page in range(18):
        requests_get(url_template.format(page=page))
多執行緒執行
模擬生產者消費者
import requests
from urllib import parse,request
import os
import threading
from queue import Queue
# Request headers shared by the producer threads.
# No Cookie header is sent: a personal login cookie must not be hard-coded in
# source. NOTE(review): the first listing of this article fetches the same
# endpoint with the cookie commented out, so it is presumably not required --
# confirm; if it is, load it from outside the source file.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    "Referer": "http://pvp.qq.com/web201605/wallpaper.shtml",
    "Host": "apps.game.qq.com",
}
class Producer(threading.Thread):
    """Producer thread: turns page URLs into image download jobs.

    Each page URL taken from ``page_que`` is requested as JSON. For every
    entry of its ``List`` array a directory ``images/<name>`` is created and
    one job dict ``{'image_url': ..., 'img_path': ...}`` per image is put on
    ``img_que``.
    """

    def __init__(self, page_que, img_que, *args, **kwargs):
        """Store both queues and pass the remaining arguments to ``Thread``.

        Args:
            page_que: Queue of page URLs to fetch.
            img_que: Queue that receives the image download jobs.
            *args: Extra positional arguments for ``threading.Thread``.
            **kwargs: Extra keyword arguments for ``threading.Thread``,
                e.g. ``name=``.
        """
        # Forwarding the extras is what makes options such as name= take effect.
        super().__init__(*args, **kwargs)
        self.page_que = page_que
        self.img_que = img_que

    def run(self) -> None:
        """Process page URLs until ``page_que`` is empty."""
        from queue import Empty  # local import: the file only imports Queue

        while True:
            try:
                # Checking empty() and then calling get() lets another
                # producer take the last URL in between, leaving this thread
                # blocked forever; a non-blocking get avoids that.
                page_url = self.page_que.get_nowait()
            except Empty:
                break
            try:
                # A timeout keeps a stalled connection from hanging the thread.
                resp = requests.get(page_url, headers=headers, timeout=10)
                payload = resp.json()
                for entry in payload['List']:
                    img_data = img_url(entry)
                    img_name = parse.unquote(entry['sProdName']).replace("1:1", '').strip()
                    dirpath = os.path.join('images', img_name)
                    # makedirs(exist_ok=True) creates the parent directory
                    # too and tolerates another thread creating it first.
                    os.makedirs(dirpath, exist_ok=True)
                    for index, image_url in enumerate(img_data, start=1):
                        # Queue the image URL together with its target path.
                        self.img_que.put({
                            'image_url': image_url,
                            'img_path': os.path.join(dirpath, '%d.jpg' % index),
                        })
            except Exception as e:
                # Best effort: a failing page is reported and skipped.
                print(e)
class Consumer(threading.Thread):
    """Consumer thread: downloads the images described by queued jobs.

    Each job is a dict with the keys ``image_url`` and ``img_path``.
    """

    def __init__(self, img_que, *args, timeout=8, **kwargs):
        """Store the job queue and pass the remaining arguments to ``Thread``.

        Args:
            img_que: Queue of download jobs.
            *args: Extra positional arguments for ``threading.Thread``.
            timeout: Seconds to wait for a new job before the thread stops.
            **kwargs: Extra keyword arguments for ``threading.Thread``,
                e.g. ``name=``.
        """
        # Forwarding the extras is what makes options such as name= take effect.
        super().__init__(*args, **kwargs)
        self.img_que = img_que
        self.idle_timeout = timeout

    def run(self) -> None:
        """Download queued images until no job arrives within the timeout."""
        from queue import Empty  # local import: the file only imports Queue

        while True:
            try:
                job = self.img_que.get(timeout=self.idle_timeout)
            except Empty:
                # No job arrived in time: treat the work as finished.
                break
            image_url = job.get('image_url')
            img_path = job.get('img_path')
            try:
                request.urlretrieve(image_url, img_path)
                print('%s下載完成' % (img_path))
            except Exception:
                # A failed download is reported and the thread keeps going.
                print('%s下載失敗' % (img_path))
def img_url(data):
    """Return the eight decoded, full-resolution image URLs of one entry.

    Args:
        data: Mapping that holds the percent-encoded image URLs under the
            keys ``sProdImgNo_1`` .. ``sProdImgNo_8``.

    Returns:
        A list of eight decoded URLs. A URL ending in ``/200`` has that
        trailing segment replaced by ``/0``; any other URL is returned as
        decoded.

    Raises:
        KeyError: If one of the eight keys is missing from ``data``.
    """
    urls = []
    for x in range(1, 9):
        decoded = parse.unquote(data['sProdImgNo_%d' % x])
        # Only the trailing size segment may change: replacing every "200"
        # would also corrupt IDs or timestamps that happen to contain it.
        if decoded.endswith('/200'):
            decoded = decoded[:-len('/200')] + '/0'
        urls.append(decoded)
    return urls
if __name__ == '__main__':
    # Keep the template separate from the formatted URL: formatting it in
    # place removes the {page} placeholder after the first iteration, so
    # every queued URL would point at the same page.
    url_template = 'http://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=40&totalpage=0&page={page}&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=1&iFlowId=267733&iActId=2735&iModuleId=2735&_=1618046009859'
    page_queue = Queue(18)  # 18 pages
    img_queue = Queue(1000)
    # NOTE(review): the request captured from the browser uses page=0 for the
    # first page, so numbering starts at 0 here -- confirm the page count.
    for page in range(18):
        page_queue.put(url_template.format(page=page))
    # Three producers collect image URLs; five consumers download them.
    for i in range(3):
        producer = Producer(page_queue, img_queue, name='生產者%s號' % i)
        producer.start()
    for i in range(5):
        consumer = Consumer(img_queue, name='消費者%s號' % i)
        consumer.start()

抓取成功!

轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/275156.html
標籤:python

