問題描述：無法爬取淘寶商品頁面

案例如下：

import requests
import re

def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    The request uses a 30-second timeout. The body is decoded with the
    encoding that requests detects from the content (``apparent_encoding``).

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection problem, timeout, or a 4xx/5xx status code).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into an exception so they are handled below.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort download: only request failures are swallowed. A bare
        # ``except`` would also hide Ctrl+C and programming errors.
        return ""
        
def parsePage(ilt, html):
    """Extract (price, title) pairs from page source and append them to ``ilt``.

    Prices are read from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields; the n-th price is paired with the n-th
    title. If the two lists differ in length, the surplus entries are ignored.

    Args:
        ilt: List that receives one ``[price, title]`` entry per item. Both
            values are strings. The list is modified in place.
        html: Page source text. An empty or ``None`` value adds nothing.

    Returns:
        None.
    """
    import json  # local import keeps this function self-contained

    if not html:
        return

    # Capture the values directly instead of splitting the whole match on
    # ':' -- a title that itself contains ':' used to be cut in half.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    raw_titles = re.findall(r'"raw_title":"(.*?)"', html)

    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode escapes such as \uXXXX without running eval() on
            # text that came from the network.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as found.
            title = raw_title
        ilt.append([price, title])
    
def printGoodsList(ilt):
    """Print the collected goods as a numbered, tab-separated table.

    A header row is printed first, followed by one row per entry of ``ilt``
    numbered from 1.

    Args:
        ilt: Sequence of entries whose first element is the price and whose
            second element is the title.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    header = row_format.format("序號", "價格", "商品名稱")
    print(header)
    for index, entry in enumerate(ilt, start=1):
        print(row_format.format(index, entry[0], entry[1]))

def main():
    """Ask for a search keyword and a page count, crawl, and print the results.

    For every requested page the search URL is built from the keyword and an
    ``s`` offset, the page is downloaded with ``getHTMLText``, and the items
    found by ``parsePage`` are collected. The collected items are printed
    with ``printGoodsList`` at the end.

    Raises:
        ValueError: If the page count entered is not an integer.
    """
    goods = input("請輸入要搜索的商品：")
    depth = input("請輸入想要搜索的商品的頁數：")
    depth = int(depth)
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for page in range(depth):
        # The ``s`` query parameter advances by 44 for each page
        # (presumably the number of results per page -- confirm).
        url = start_url + '&s=' + str(44 * page)
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # A failure on one page must not abort the remaining pages.
            # Ctrl+C is deliberately not caught here, so the crawl can
            # still be interrupted.
            continue
    printGoodsList(infoList)
# Run the crawler only when this file is executed as a script, so that
# importing it does not prompt for input.
if __name__ == "__main__":
    main()

代碼運行時沒有報錯，以下是運行結果：
［圖片：運行結果截圖］
但內容並未爬取到。

原因分析：因為淘寶的反爬蟲機制導致爬取不了資料

查看淘寶的robots協議
https://www.taobao.com/robots.txt
［圖片：淘寶 robots.txt 內容截圖］
分析：第一行（User-agent）指明該規則針對的爬蟲是百度蜘蛛，第二行（Disallow）表示不允許該爬蟲爬取以「/」開頭的路徑，也就是整個網站。

解決方案：添加headers內容

進入淘寶頁面，輸入你想要搜索的內容：
［圖片：淘寶搜索頁面截圖］
這邊我搜索了「筆記本電腦」。

按F12打開瀏覽器的開發者工具（控制臺）
［圖片：開發者工具 Network 面板截圖］
依次點擊Network，選擇All，點擊search，找到上圖勾選的search請求，右擊選擇Copy，再選擇Copy as cURL (bash)。
然後進入下方鏈接：
https://curl.trillworks.com/

將內容複製到curl command框內
［圖片：curl 轉換頁面截圖］
將轉換結果（Python requests）中的headers內容複製到第一個函式getHTMLText中，並將
r = requests.get(url,timeout=30)修改為
r = requests.get(url,headers=headers,timeout=30)

import requests
import re

def getHTMLText(url):
    """Download ``url`` with browser-like headers and return the body as text.

    The request uses a 30-second timeout. The body is decoded with the
    encoding that requests detects from the content (``apparent_encoding``).

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection problem, timeout, or a 4xx/5xx status code).
    """
    # Headers copied from a real browser request. The 'referer' and 'cookie'
    # values are placeholders: replace them with the values from your own
    # browser session.
    headers = {
        'authority': 's.taobao.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': '********',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': '******',
    }

    try:
        r = requests.get(url, headers=headers, timeout=30)
        # Turn 4xx/5xx responses into an exception so they are handled below.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort download: only request failures are swallowed. A bare
        # ``except`` would also hide Ctrl+C and programming errors.
        return ""
        
def parsePage(ilt, html):
    """Extract (price, title) pairs from page source and append them to ``ilt``.

    Prices are read from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields; the n-th price is paired with the n-th
    title. If the two lists differ in length, the surplus entries are ignored.

    Args:
        ilt: List that receives one ``[price, title]`` entry per item. Both
            values are strings. The list is modified in place.
        html: Page source text. An empty or ``None`` value adds nothing.

    Returns:
        None.
    """
    import json  # local import keeps this function self-contained

    if not html:
        return

    # Capture the values directly instead of splitting the whole match on
    # ':' -- a title that itself contains ':' used to be cut in half.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    raw_titles = re.findall(r'"raw_title":"(.*?)"', html)

    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode escapes such as \uXXXX without running eval() on
            # text that came from the network.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as found.
            title = raw_title
        ilt.append([price, title])
    
def printGoodsList(ilt):
    """Print the collected goods as a numbered, tab-separated table.

    A header row is printed first, followed by one row per entry of ``ilt``
    numbered from 1.

    Args:
        ilt: Sequence of entries whose first element is the price and whose
            second element is the title.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    header = row_format.format("序號", "價格", "商品名稱")
    print(header)
    for index, entry in enumerate(ilt, start=1):
        print(row_format.format(index, entry[0], entry[1]))

def main():
    """Ask for a search keyword and a page count, crawl, and print the results.

    For every requested page the search URL is built from the keyword and an
    ``s`` offset, the page is downloaded with ``getHTMLText``, and the items
    found by ``parsePage`` are collected. The collected items are printed
    with ``printGoodsList`` at the end.

    Raises:
        ValueError: If the page count entered is not an integer.
    """
    goods = input("請輸入要搜索的商品：")
    depth = input("請輸入想要搜索的商品的頁數：")
    depth = int(depth)
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for page in range(depth):
        # The ``s`` query parameter advances by 44 for each page
        # (presumably the number of results per page -- confirm).
        url = start_url + '&s=' + str(44 * page)
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # A failure on one page must not abort the remaining pages.
            # Ctrl+C is deliberately not caught here, so the crawl can
            # still be interrupted.
            continue
    printGoodsList(infoList)
# Run the crawler only when this file is executed as a script, so that
# importing it does not prompt for input.
if __name__ == "__main__":
    main()

referer和cookie的內容太長，這邊我就用*代替了，大家複製自己瀏覽器中的headers即可。

［圖片：成功爬取後的運行結果截圖］
這樣就可以正常爬取了。

上述案例為嵩天老師「Python網路爬蟲與資訊提取」課程中的案例。（本博客方法只為學習討論，不做商業用途。）

轉載請註明出處，本文鏈接：https://www.uj5u.com/houduan/229197.html

標籤：python

上一篇：python筆記(二)

下一篇：python識別word檔案格式 ——（專欄：基于python撰寫簡單office閱卷程式①）