文章目錄
- 爬蟲前的準備
- python爬蟲的三元素
- 使用到的python第三方庫
- requests安裝的方法
- 爬蟲步驟
- 網頁分析
- json格式
- 代碼示例
- 代碼決議
- 反反爬
爬蟲前的準備
python爬蟲的三元素
資料抓取、資料解析、資料儲存
使用到的python第三方庫
json(標準庫,不需要安裝)、requests(第三方庫,需要安裝)
requests安裝的方法
在命令列中執行:pip install requests




爬蟲步驟
網頁分析
爬蟲的第一步就是進行網頁分析,找到要爬取的值

通過訪問該鏈接,得到json格式如下:

json格式
外面用{}包起來,里面是鍵值對的形式
{"key":"value"}
存在嵌套現象
{"key":{"key":"value"}}
重點是在嵌套結構中爬取到value值
代碼示例
爬好看視頻網的代碼示例:
import json
import os
import re

import requests
# Step 1: fetch the data.
# Feed endpoint that returns the list of videos as JSON.
base_url = 'https://haokan.baidu.com/web/video/feed?tab=yingshi_new&act=pcFeed&pd=pc&num=5&shuaxin_id=1623571294796'

# Directory the downloaded videos are written to (created on demand).
DOWNLOAD_DIR = r'./視頻/'

# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang the script forever.
REQUEST_TIMEOUT = 30

# Name of the environment variable that may carry the browser's Cookie header.
COOKIE_ENV_VAR = 'HAOKAN_COOKIE'

# Anti-anti-scraping: send the headers a real browser would send.
headers = {
    # User-Agent identifies the browser / operating system making the request.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36',
    # Referer tells the site which page the request came from.
    'referer': 'https://haokan.baidu.com/?fr=pc_pz',
}

# Cookie tells the site whether you are logged in.  A logged-in cookie is a
# credential (it contains the session token), so it must never be pasted into
# source code.  Copy the Cookie header from the browser's developer tools into
# the HAOKAN_COOKIE environment variable instead; when it is unset the request
# is simply sent without a cookie.
_cookie = os.environ.get(COOKIE_ENV_VAR)
if _cookie:
    # A header value must be a single line; strip stray whitespace/newlines
    # that often sneak in when copying from the browser.
    headers['cookie'] = _cookie.strip()


def safe_filename(title: str) -> str:
    """Return a file name for *title* that is safe to create on disk.

    Characters that are illegal in file names on common file systems (path
    separators, ``: * ? " < > |`` and control characters) are replaced with
    ``_``; leading/trailing spaces and dots are removed.  An empty result
    falls back to ``video``.  The ``.mp4`` extension is always appended.
    """
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title).strip(' .')
    return (cleaned or 'video') + '.mp4'


def download_video(video_url: str, target_path: str) -> None:
    """Stream the video at *video_url* into the file *target_path*.

    The body is written chunk by chunk so a large video is never held in
    memory as a whole.  Raises ``requests.HTTPError`` on a non-2xx response.
    """
    with requests.get(video_url, headers=headers, stream=True,
                      timeout=REQUEST_TIMEOUT) as video_response:
        video_response.raise_for_status()
        with open(target_path, 'wb') as f:
            for chunk in video_response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)


def main() -> None:
    """Fetch the video feed, parse it, and download every listed video."""
    # Step 2: request the feed (first request).
    response = requests.get(base_url, headers=headers, timeout=REQUEST_TIMEOUT)
    print(response.status_code)  # 200 on success
    # Stop here with a clear error instead of failing later while parsing.
    response.raise_for_status()

    # The response body is JSON text, identical to what the Request URL shows
    # in the browser.
    data = response.text
    print(data)

    # Step 3: parse the JSON.
    # json.loads turns a JSON string into Python data structures;
    # json.dumps does the reverse.
    json_data = json.loads(data)
    print(json_data)
    # Index with the keys in nesting order (outermost first).
    json_list = json_data['data']['response']['videos']
    print(json_list)

    # open() does not create missing directories, so make sure it exists.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    # Handle every video entry in turn.
    for video in json_list:
        video_title = safe_filename(video['title'])
        video_url = video['play_url']
        print(video_title, video_url)
        print("正在下載:", video_title)
        # Second request: fetch the binary video data and save it locally.
        download_video(video_url, os.path.join(DOWNLOAD_DIR, video_title))
        print("下載完成!\n")


if __name__ == '__main__':
    main()
運行后的結果


代碼決議
通過把json資料轉為python資料結構,從而找到值,根據嵌套結構把值取出來

由圖可知,有三層嵌套結構
所以有代碼:
# Step 3: parse the JSON-formatted data
# json.loads -- converts a JSON-encoded string into a Python data structure
# json.dump -- serialises a Python data structure to JSON
json_data = json.loads(data)
print(json_data)
# Index with the keys in nesting order (outermost first) to reach the video list
json_list = json_data['data']['response']['videos']
使用for回圈去遍歷每一個視頻的值(視頻名稱、視頻鏈接)
# Use a for loop to pull out the values of each video one by one.
# The loop variable is named `video` so it does not overwrite `data`,
# which holds the response text.
for video in json_list:
    # Pre-processing: build the file name and take the playback URL
    video_title = video['title'] + '.mp4'
    video_url = video['play_url']


列印結果

點擊鏈接可以在瀏覽器中播放視頻
反反爬
有一些網站有反爬機制,這時候我們就會用到反反爬機制
1. 找到cookie、User-Agent、Referer的值



2. 反反爬的代碼
# Anti-anti-scraping: send the headers a real browser would send.
headers = {
    # User-Agent -- an identifier that tells the site which browser type,
    # operating system and version, CPU type, rendering engine, language,
    # plug-ins, etc. you are using; the browser sends it with every HTTP
    # request.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36',
    # Referer -- tells the site which page the request came from.
    'referer': 'https://haokan.baidu.com/?fr=pc_pz',
}

# Cookie -- tells the site whether you are logged in (there is no login
# cookie when you are not).  A logged-in cookie is a credential, so it is
# read from the HAOKAN_COOKIE environment variable instead of being pasted
# into the source; copy the value of the Cookie request header from the
# browser's developer tools.  When the variable is unset no cookie is sent.
_cookie = os.environ.get('HAOKAN_COOKIE')
if _cookie:
    # A header value must be a single line; strip stray whitespace/newlines.
    headers['cookie'] = _cookie.strip()
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/287508.html
標籤:python
上一篇:Java 給圖片加 文字水印
下一篇:Mybatis-學習筆記
