教你用python爬取唯品會商品資訊,詳細教程,僅供學習,
代碼展示
運行結束來個照
資訊保存到表格中
很多人學習python,不知道從何學起, 很多人學習python,掌握了基本語法過后,不知道在哪里尋找案例上手, 很多已經做案例的人,卻不知道如何去學習更加高深的知識, 那么針對這三類人,我給大家提供一個好的學習平臺,免費領取視頻教程,電子書籍,以及課程的源代碼! QQ群:701698587 歡迎加入,一起討論 一起學習!
工具使用
開發環境
python3.6
Windows10
開發工具
pycharm
工具包
re, os, tkinter, xlwt, xlrd, xlutils, requests_html
開發思路
1.進入百度搜索唯品會,進入唯品會的官網,默認女士T恤,直接點擊搜索進入如下
2.右側滾動條一直下拉,滑動到下面的時候頁面會自動重繪出商品的資料,這里就體現了ajax互動,說明商品的資訊是存放在json介面中,接著拉到底就可以發現翻頁的按鈕了,如下
3.右擊檢查,按照步驟操作, 6步驟里的方框為該頁商品的id
拿回
# Endpoint that returns the product ids (pids) for one page of search results.
start_url = r'https://mapi.vip.com/vips-mobile/rest/shopping/pc/search/product/rank?'
4.繼續抓包,找到商品資訊對應的資料包,拿回請求的url地址和引數(引數中加入商品的id,這也是為什么要拿回來商品的id)
# Endpoint that returns the product details for the pids passed as parameters.
goods_info_url = 'https://mapi.vip.com/vips-mobile/rest/shopping/pc/product/module/list/v2?'
5.翻頁:全部的商品的id又存放在第二個rank檔案中,首先請求一下這個鏈接檔案,獲取商品id資訊,然后再重新組合url,最後獲取商品詳細的資訊
可以發現pageOffset引數是變化的,每翻一頁增加120(第1頁為0,第2頁為120,依此類推)
6.保存資料,模板可以拿去用
def save_data(self, data, user_name):
    """Append one row per matching sheet to the keyword's Excel workbook.

    The workbook is stored at ``<cwd>/資料/<user_name>資料.xls``. On first
    use it is created with a single sheet and a styled header row. Every
    call then appends ``data[name]`` as a new row to each sheet whose name
    equals a key of ``data``.

    Args:
        data: Mapping of sheet name to the list of cell values of one row.
        user_name: Search keyword; used as the file-name prefix.
    """
    os_path_1 = os.getcwd() + '/資料/'
    # exist_ok avoids the race between an exists() check and mkdir().
    os.makedirs(os_path_1, exist_ok=True)
    os_path = os_path_1 + user_name + '資料.xls'

    if not os.path.exists(os_path):
        # First write for this keyword: create the workbook and its header.
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet1 = workbook.add_sheet("唯品會商品資訊", cell_overwrite_ok=True)

        # Thin solid border on all four sides of the header cells.
        borders = xlwt.Borders()
        borders.left = xlwt.Borders.THIN
        borders.right = xlwt.Borders.THIN
        borders.top = xlwt.Borders.THIN
        borders.bottom = xlwt.Borders.THIN
        borders.left_colour = 0x40
        borders.right_colour = 0x40
        borders.top_colour = 0x40
        borders.bottom_colour = 0x40

        # Centre the header text.
        al = xlwt.Alignment()
        al.horz = 0x02  # horizontal centre
        al.vert = 0x01  # vertical centre

        style = xlwt.XFStyle()
        style.borders = borders
        style.alignment = al

        excel_data_1 = ('商品名稱', '商品標簽', '市場價格', '商品折扣', '優惠后價格', '圖片地址')
        for col, title in enumerate(excel_data_1):
            worksheet1.col(col).width = 2560 * 3
            # write(row, column, content, style)
            worksheet1.write(0, col, title, style)
        workbook.save(os_path)

    if os.path.exists(os_path):
        # formatting_info=True keeps the header style and column widths when
        # the xlrd workbook is copied to xlwt; without it they are dropped on
        # the first append.
        workbook = xlrd.open_workbook(os_path, formatting_info=True)
        for sheet_index, sheet_name in enumerate(workbook.sheet_names()):
            for name in data:
                if sheet_name != name:
                    continue
                # Append after the rows that already exist in this sheet.
                rows_old = workbook.sheet_by_name(sheet_name).nrows
                # xlrd workbooks are read-only; copy to a writable xlwt one.
                new_workbook = copy(workbook)
                new_worksheet = new_workbook.get_sheet(sheet_index)
                for col, value in enumerate(data[name]):
                    new_worksheet.write(rows_old, col, value)
                new_workbook.save(os_path)
        print('保存完成----logging----!!!')
最后代碼打包一下就可以發給客戶了(pyinstaller)
原始碼展示:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Tkinter front end that scrapes vip.com search results into an .xls file."""
import os
import re
import tkinter as tk

import xlrd
import xlwt
from requests_html import HTMLSession
from xlutils.copy import copy

session = HTMLSession()


class WPHSpider(object):
    """GUI crawler: search vip.com for a keyword and save products to Excel."""

    def __init__(self):
        """Build the window, its widgets, and the request configuration."""
        # Main window.
        self.window = tk.Tk()
        self.window.title('唯品會資訊采集')
        self.window.geometry('800x600')

        # Keyword label and its entry box.
        self.label_user = tk.Label(self.window, text='需要爬取的商品名稱:',
                                   font=('Arial', 12), width=30, height=2)
        self.label_user.pack()
        self.entry_user = tk.Entry(self.window, show=None, font=('Arial', 14))
        self.entry_user.pack(after=self.label_user)

        # Page-count label and its entry box.
        self.label_passwd = tk.Label(self.window, text="爬取多少頁:(小于100)",
                                     font=('Arial', 12), width=30, height=2)
        self.label_passwd.pack()
        self.entry_passwd = tk.Entry(self.window, show=None, font=('Arial', 14))
        self.entry_passwd.pack(after=self.label_passwd)

        # Text box that shows progress messages.
        self.text1 = tk.Text(self.window, font=('Arial', 12), width=85, height=22)
        self.text1.pack()

        # "Crawl" button, packed above the text box.
        self.button_1 = tk.Button(self.window, text='爬取', font=('Arial', 12),
                                  width=10, height=1,
                                  command=self.parse_hit_click_1)
        self.button_1.pack(before=self.text1)

        # "Clear" button.
        self.button_2 = tk.Button(self.window, text='清除', font=('Arial', 12),
                                  width=10, height=1,
                                  command=self.parse_hit_click_2)
        self.button_2.pack(anchor="e")

        # Endpoint returning the product ids of one search-result page.
        self.start_url = r'https://mapi.vip.com/vips-mobile/rest/shopping/pc/search/product/rank?'
        # Endpoint returning the product details for given product ids.
        self.goods_info_url = 'https://mapi.vip.com/vips-mobile/rest/shopping/pc/product/module/list/v2?'
        # Request headers; the placeholders must be filled in by the user.
        self.headers = {
            'cookie': '輸入登錄后的你的cookie值',
            'referer': '',
            'user-agent': ''
        }

    def _log(self, message):
        """Append ``message`` plus a newline to the progress text box."""
        self.text1.insert("insert", message)
        self.text1.insert("insert", '\n')
        # The crawl runs inside a button callback, which blocks the Tk main
        # loop; flush pending redraws so progress is visible while it runs.
        self.window.update_idletasks()

    def parse_hit_click_1(self):
        """Handle the "crawl" button: validate the inputs and start crawling."""
        user_name = self.entry_user.get()
        if not user_name.strip():
            self._log('請輸入需要爬取的商品名稱')
            return
        try:
            pass_wd = int(self.entry_passwd.get())
        except ValueError:
            # Blank or non-numeric page count: report it instead of raising
            # inside the Tk callback.
            pass_wd = 0
        if pass_wd < 1:
            self._log('請輸入有效的頁數(正整數)')
            return
        self.request_start_url_get_pid(user_name, pass_wd)

    def request_start_url_get_pid(self, user_name, pass_wd):
        """Fetch the product ids of each result page and crawl every product.

        Args:
            user_name: Search keyword.
            pass_wd: Number of result pages to crawl, starting at page 1.
        """
        print('------------------------------------------' + '\n')
        for page in range(1, pass_wd+1):
            self.params_pid = {
                "callback": "getMerchandiseIds",
                "app_name": "shop_pc",
                "app_version": "4.0",
                "warehouse": "VIP_HZ",
                "fdc_area_id": "104103101",
                "client": "pc",
                "mobile_platform": "1",
                "province_id": "104103",
                "api_key": "70f71280d5d547b2a7bb370a529aeea1",
                "user_id": "",
                "mars_cid": "1613809051869_ecad06e028e7248cee802bb1c6414931",
                "wap_consumer": "a",
                "standby_id": "nature",
                "keyword": user_name,
                "lv3CatIds": "",
                "lv2CatIds": "",
                "lv1CatIds": "",
                "brandStoreSns": "",
                "props": "",
                "priceMin": "",
                "priceMax": "",
                "vipService": "",
                "sort": "0",
                # Each page holds 120 products, so the offset grows by 120.
                "pageOffset": (page-1)*120,
                "channelId": "1",
                "gPlatform": "PC",
                "batchSize": "120",
                "_": "1613809204088",
            }
            response_pid = session.get(self.start_url, headers=self.headers,
                                       params=self.params_pid).content.decode()
            # Pull the pids out of the raw response text with a regex.
            for pid in re.findall(r'"pid":"(.*?)"', response_pid):
                self.request_goods_info_url(pid, user_name, pass_wd)
            self._log(r'******第{}頁保存完成******'.format(page))

    def request_goods_info_url(self, pid, user_name, pass_wd):
        """Request the detail data of one product and hand it to the parser.

        Args:
            pid: Product id taken from the search-result response.
            user_name: Search keyword, forwarded to the parser.
            pass_wd: Requested page count, forwarded to the parser.
        """
        self.params_info = {
            "callback": "getMerchandiseDroplets3",
            "app_name": "shop_pc",
            "app_version": "4.0",
            "warehouse": "VIP_HZ",
            "fdc_area_id": "104103101",
            "client": "pc",
            "mobile_platform": "1",
            "province_id": "104103",
            "api_key": "70f71280d5d547b2a7bb370a529aeea1",
            "user_id": "449181505",
            "mars_cid": "1613809051869_ecad06e028e7248cee802bb1c6414931",
            "wap_consumer": "b",
            "productIds": "{}".format(pid),
            "scene": "search",
            "standby_id": "nature",
            "extParams": '{"stdSizeVids":"","preheatTipsVer":"3","couponVer":"v2","exclusivePrice":"1","iconSpec":"2x"}',
            "context": "",
            "_": "1613810807777",
        }
        response_info = session.get(self.goods_info_url, headers=self.headers,
                                    params=self.params_info).content.decode()
        self.parse_get_info(response_info, user_name, pass_wd)

    def parse_get_info(self, response_info, user_name, pass_wd):
        """Extract the product fields from a detail response and save them.

        Args:
            response_info: Decoded text of the product-detail response.
            user_name: Search keyword, used to name the output workbook.
            pass_wd: Requested page count (unused here).
        """
        names = re.findall(r'"title":"(.*?)"', response_info)
        price_labels = re.findall(r'"priceLabel":"(.*?)"', response_info)
        market_prices = re.findall(r'"marketPrice":"(.*?)"', response_info)
        discounts = re.findall(r'"saleDiscount":"(.*?)"', response_info)
        sale_prices = re.findall(r'"salePrice":"(.*?)"', response_info)
        image_urls = re.findall(r'"squareImage":"(.*?)"', response_info)

        if not names:
            # No "title" field in the response: skip this product instead of
            # failing on names[0] below.
            print('***未取得商品資料,已略過')
            self._log('***未取得商品資料,已略過')
            return

        print(r'***正在請求商品資料:{}'.format(names[0]))
        self._log(r'***正在請求商品資料:{}'.format(names[0]))

        rows = zip(names, price_labels, market_prices, discounts,
                   sale_prices, image_urls)
        for name, price_label, market_price, discount, sale_price, image_url in rows:
            a = [name, price_label, market_price+'元', discount,
                 sale_price+'元', image_url]
            b = {"唯品會商品資訊": a}
            self.save_data(b, user_name)

    def _create_workbook(self, os_path):
        """Create the workbook at ``os_path`` with a styled header row."""
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet1 = workbook.add_sheet("唯品會商品資訊", cell_overwrite_ok=True)

        # Thin solid border on all four sides of the header cells.
        borders = xlwt.Borders()
        borders.left = xlwt.Borders.THIN
        borders.right = xlwt.Borders.THIN
        borders.top = xlwt.Borders.THIN
        borders.bottom = xlwt.Borders.THIN
        borders.left_colour = 0x40
        borders.right_colour = 0x40
        borders.top_colour = 0x40
        borders.bottom_colour = 0x40

        # Centre the header text.
        al = xlwt.Alignment()
        al.horz = 0x02  # horizontal centre
        al.vert = 0x01  # vertical centre

        style = xlwt.XFStyle()
        style.borders = borders
        style.alignment = al

        excel_data_1 = ('商品名稱', '商品標簽', '市場價格', '商品折扣', '優惠后價格', '圖片地址')
        for col, title in enumerate(excel_data_1):
            worksheet1.col(col).width = 2560 * 3
            # write(row, column, content, style)
            worksheet1.write(0, col, title, style)
        workbook.save(os_path)

    def save_data(self, data, user_name):
        """Append one row per matching sheet to the keyword's Excel workbook.

        The workbook is stored at ``<cwd>/資料/<user_name>資料.xls`` and is
        created with a header row on first use. Every call appends
        ``data[name]`` as a new row to each sheet whose name equals a key of
        ``data``.

        Args:
            data: Mapping of sheet name to the list of cell values of one row.
            user_name: Search keyword; used as the file-name prefix.
        """
        os_path_1 = os.getcwd() + '/資料/'
        # exist_ok avoids the race between an exists() check and mkdir().
        os.makedirs(os_path_1, exist_ok=True)
        os_path = os_path_1 + user_name + '資料.xls'
        if not os.path.exists(os_path):
            self._create_workbook(os_path)

        if os.path.exists(os_path):
            # formatting_info=True keeps the header style and column widths
            # when the xlrd workbook is copied to xlwt; without it they are
            # dropped on the first append.
            workbook = xlrd.open_workbook(os_path, formatting_info=True)
            for sheet_index, sheet_name in enumerate(workbook.sheet_names()):
                for name in data:
                    if sheet_name != name:
                        continue
                    # Append after the rows that already exist in this sheet.
                    rows_old = workbook.sheet_by_name(sheet_name).nrows
                    # xlrd workbooks are read-only; copy to a writable one.
                    new_workbook = copy(workbook)
                    new_worksheet = new_workbook.get_sheet(sheet_index)
                    for col, value in enumerate(data[name]):
                        new_worksheet.write(rows_old, col, value)
                    new_workbook.save(os_path)
            print('保存完成----logging----!!!')
            self._log('保存完成----logging----!!!')

    def parse_hit_click_2(self):
        """Handle the "clear" button: empty both entries and the text box."""
        self.entry_user.delete(0, "end")
        self.entry_passwd.delete(0, "end")
        self.text1.delete("1.0", "end")

    def center(self):
        """Place the 800x600 window in the middle of the screen."""
        ws = self.window.winfo_screenwidth()
        hs = self.window.winfo_screenheight()
        x = int((ws / 2) - (800 / 2))
        y = int((hs / 2) - (600 / 2))
        self.window.geometry('{}x{}+{}+{}'.format(800, 600, x, y))

    def run_loop(self):
        """Lock the window size, centre it, and enter the Tk main loop."""
        # Disallow resizing in both directions.
        self.window.resizable(False, False)
        self.center()
        # Blocks until the window is closed.
        self.window.mainloop()


if __name__ == '__main__':
    w = WPHSpider()
    w.run_loop()
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/302898.html
標籤:Python
下一篇:FastAPI 學習之路(四)
