用python爬取豆瓣電影資訊,輸入類別和爬取頁數,想怎么爬就怎么爬,哎就是玩!
代碼操作展示:


開發環境
windows 10
python3.6
開發工具
pycharm
庫
tkinter、jsonpath、lxml、random、os、xlrd、xlwt、xlutils、requests_html
1.百度搜索豆瓣打開豆瓣電影,下面是我們要爬取的地址
start_url = https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=time&page_limit=20&page_start=0
界面如下:

2.在進入網頁的時候看到電影資料那里有加載中,確定資訊為異步加載,直接右擊檢查進行抓包,點擊Network,點擊XHR,如下圖所示

3.接下來我們看翻頁操作,點擊加載更多,查看網址變化

第二頁地址:https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=time&page_limit=20&page_start=20
第三頁地址:https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=time&page_limit=20&page_start=40
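規律找到了:每加載一頁,最后的 page_start 就增長 20(與 page_limit 相同)。
同時也發現 tag 后面是電影的分類(URL 編碼后的中文,例如 %E7%83%AD%E9%97%A8 就是「热门」),可以通過更改類名獲取其他類別的電影資訊。
抓包看到的 XHR 請求才是真正回傳資料的介面,下面的代碼請求的就是它:https://movie.douban.com/j/search_subjects?type=movie&tag={}&sort=time&page_limit=20&page_start={},回傳的是 JSON,電影串列在 subjects 欄位里。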

4.獲取電影的詳情頁地址

def main(self, user_name, pass_wd):
    """Fetch ``pass_wd`` listing pages for the category tag ``user_name``.

    Each page URL is built from ``self.start_url`` with the tag and an
    offset of ``page * 20``; the JSON reply is handed to ``self.respons``.

    :param user_name: category tag substituted into the listing URL.
    :param pass_wd: number of listing pages to request.
    :return: None
    """
    for page in range(pass_wd):
        # The listing is paged by offset, 20 entries per page.
        start_url = self.start_url.format(user_name, page * 20)
        try:
            response = session.get(start_url, headers=self.headers).json()
            self.respons(response, user_name)
        except Exception as exc:
            # A failing page must not abort the remaining pages, but the
            # failure is reported instead of being passed off as success.
            print(f'{user_name}====第{page + 1}頁爬取失敗: {exc!r}')
        else:
            print(f'{user_name}====保存完成正在翻頁')
# Walkthrough excerpt: the body of the final loop continues in the next snippet.
def respons(self, response, user_name):
'''
Extract data from one listing-page JSON response.
:return:
'''
subjects = response['subjects']
if subjects == []:
# Indexing an empty list raises IndexError here.
# NOTE(review): presumably deliberate, so the caller's bare except ends this page - confirm.
a = subjects[6]
else:
'''提取詳情頁'''
# Collect every "url" value found anywhere inside subjects.
url_list = jsonpath(subjects, '$..url')
for url in url_list:
5.用xpath語法提取需要的資料,并將提取的資料放到串列里
# Walkthrough excerpt: for each detail-page URL, download the page and pull
# the fields out with XPath.
for url in url_list:
# list_1 is filled below but never read in this fragment.
list_1 = []
res = session.get(url, headers=self.headers).content.decode()
res = etree.HTML(res)
# Movie title: first text node under <h1><span>; [0] raises IndexError if absent.
'''電影名稱'''
title = res.xpath('//h1/span/text()')[0]
list_1.append(title)
# Rating: first <strong> text node; [0] raises IndexError if absent.
'''電影評分'''
pf = res.xpath('//strong/text()')[0]
list_1.append(pf)
# dy_list is filled below but never read in this fragment.
dy_list = []
# Directors: link texts in the first span of div#info (kept as a list).
'''導演'''
dy = res.xpath('//div[@id="info"]/span[1]/span[2]/a/text()')
for d in dy:
dy_list.append(d)
# print(dy)
# Screenwriters: link texts in the second span of div#info (kept as a list).
'''編劇'''
bj = res.xpath('//div[@id="info"]/span[2]/span[2]/a/text()')
# This loop has no effect.
for b in bj:
pass
# print(bj)
# Lead actors: link texts inside span.actor > span.attrs (kept as a list).
'''主演'''
zy = res.xpath('//span[@class="actor"]/span[@class="attrs"]/a/text()')
# zy = ''.join(zy)
# print(zy)
# Genres: text of every span with property="v:genre" (kept as a list).
'''型別'''
lx = res.xpath('//span[@property="v:genre"]/text()')
# print(lx)
# Production country: join the direct text nodes of div#info, strip spaces
# and newlines, split on '/', drop empty pieces and keep the first one.
# NOTE(review): zp[0] raises IndexError when no piece is left - confirm pages always have one.
'''制片國家'''
zp_list = res.xpath('//div[@id="info"]/text()')
# print(zp_list)
zp = ''.join(zp_list)
zp = zp.replace(' ', '').replace('\n', '')
zp = zp.split('/')
zp = [i for i in zp if i != '']
zp = zp[0]
# One row keyed by the sheet name; dy, bj, zy and lx are still lists here.
data = {
'基本詳情': [title, pf, dy, bj, zy, lx, zp]
}
6.最后保存到表格,可以作為模板使用
def save_excel(self, data, title, f):
    """Append one row per matching sheet to ``資料/資料.xls``.

    The workbook lives in a ``資料`` folder under the current working
    directory. It is created with a styled header row on first use;
    afterwards each call appends ``data[sheet_name]`` below the last used
    row of the sheet with that name.

    :param data: mapping of sheet name to the list of cell values to append.
    :param title: movie title, used only in the progress message.
    :param f: category tag, used only in the progress message.
    :return: None
    """
    folder = os.path.join(os.getcwd(), '資料')
    os.makedirs(folder, exist_ok=True)
    os_path = os.path.join(folder, '資料.xls')

    if not os.path.exists(os_path):
        # First run: create the workbook with a bordered, centred header row.
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet1 = workbook.add_sheet("基本詳情", cell_overwrite_ok=True)

        borders = xlwt.Borders()
        borders.left = xlwt.Borders.THIN
        borders.right = xlwt.Borders.THIN
        borders.top = xlwt.Borders.THIN
        borders.bottom = xlwt.Borders.THIN
        borders.left_colour = 0x40
        borders.right_colour = 0x40
        borders.top_colour = 0x40
        borders.bottom_colour = 0x40

        al = xlwt.Alignment()
        al.horz = 0x02  # horizontally centred
        al.vert = 0x01  # vertically centred

        style = xlwt.XFStyle()
        style.borders = borders
        style.alignment = al

        excel_data_1 = ('電影名稱', '電影評分', '導演', '編劇', '主演', '型別', '制片國家')
        for column, heading in enumerate(excel_data_1):
            worksheet1.col(column).width = 2560 * 3
            worksheet1.write(0, column, heading, style)
        workbook.save(os_path)

    # formatting_info=True makes xlrd keep cell styles and column widths, so
    # the copy written back below does not lose the header formatting.
    workbook = xlrd.open_workbook(os_path, formatting_info=True)
    # Convert the read-only xlrd workbook into a writable xlwt one, once.
    new_workbook = copy(workbook)

    appended = False
    for index, sheet_name in enumerate(workbook.sheet_names()):
        if sheet_name not in data:
            continue
        # nrows is the count of used rows, i.e. the index of the next free row.
        next_row = workbook.sheet_by_index(index).nrows
        target = new_workbook.get_sheet(index)
        for column, value in enumerate(data[sheet_name]):
            target.write(next_row, column, value)
        appended = True

    if appended:
        new_workbook.save(os_path)
        print(f'{f}===={title}========保存完成')
原始碼展示:
#!/usr/bin/env python
# -*-coding:utf8-*-
# Pool of browser User-Agent strings; one is picked with random.choice for
# the request headers. Every entry is unique so the choice is uniform.
USER_AGENT_LIST = [
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Hot Lingo 2.0)',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3451.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:57.0) Gecko/20100101 Firefox/57.0',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.2999.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.70 Safari/537.36',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36 OPR/31.0.1889.174',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.1.4322; MS-RTC LM 8; InfoPath.2; Tablet PC 2.0)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 OPR/55.0.2994.61',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MATP; InfoPath.2; .NET4.0C; CIBA; Maxthon 2.0)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.814.0 Safari/535.1',
    'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; ja-jp) AppleWebKit/418.9.1 (KHTML, like Gecko) Safari/419.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0; Touch; MASMJS)',
    'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1041.0 Safari/535.21',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4093.3 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko; compatible; Swurl) Chrome/77.0.3865.120 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Goanna/4.7 Firefox/68.0 PaleMoon/28.16.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4086.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:75.0) Gecko/20100101 Firefox/75.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/91.0.146 Chrome/85.0.4183.146 Safari/537.36',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36 VivoBrowser/8.4.72.0 Chrome/62.0.3202.84',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:83.0) Gecko/20100101 Firefox/83.0',
    'Mozilla/5.0 (X11; CrOS x86_64 13505.63.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36 OPR/72.0.3815.400',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Safari/537.36',
]
import random, os, xlrd
import tkinter as tk
import jsonpath, xlwt
from lxml import etree
from xlutils.copy import copy
from jsonpath import jsonpath
from requests_html import HTMLSession
# Module-level HTTP session; DBSpider.main and DBSpider.respons issue all
# their GET requests through it.
session = HTMLSession()
class DBSpider(object):
    """Tkinter front end that scrapes Douban movie details into an .xls file.

    The user enters a category tag and a page count. Each listing page is
    requested from the ``search_subjects`` JSON endpoint, every listed
    movie's detail page is parsed with XPath, and one row per movie is
    appended to ``資料/資料.xls`` under the current working directory.
    """

    WINDOW_WIDTH = 800
    WINDOW_HEIGHT = 600
    # Offset step between listing pages (the URL asks for page_limit=20).
    PAGE_SIZE = 20
    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10
    SHEET_NAME = '基本詳情'
    HEADER = ('電影名稱', '電影評分', '導演', '編劇', '主演', '型別', '制片國家')
    # Separator used when several names share one spreadsheet cell.
    LIST_SEPARATOR = ' / '

    def __init__(self):
        """Build the window and its widgets and prepare the request settings."""
        # Main window: title and initial size.
        self.window = tk.Tk()
        self.window.title('豆瓣電影資訊采集')
        self.window.geometry(f'{self.WINDOW_WIDTH}x{self.WINDOW_HEIGHT}')

        # Category prompt and its entry box.
        self.label_user = tk.Label(
            self.window,
            text="請輸入需要爬取的分類('熱門','最新','經典','豆瓣高分','冷門佳片','華語','歐美','韓國',"
                 "'日本'):",
            font=('Arial', 12), width=150, height=2)
        self.label_user.pack()
        self.entry_user = tk.Entry(self.window, show=None, font=('Arial', 14))
        self.entry_user.pack(after=self.label_user)

        # Page-count prompt and its entry box.
        self.label_passwd = tk.Label(self.window, text="爬取多少頁:(小于100)",
                                     font=('Arial', 12), width=30, height=2)
        self.label_passwd.pack()
        self.entry_passwd = tk.Entry(self.window, show=None, font=('Arial', 14))
        self.entry_passwd.pack(after=self.label_passwd)

        # Text box that shows progress messages.
        self.text1 = tk.Text(self.window, font=('Arial', 12), width=85, height=22)
        self.text1.pack()

        # "Crawl" button, packed above the text box.
        self.button_1 = tk.Button(self.window, text='爬取', font=('Arial', 12),
                                  width=10, height=1,
                                  command=self.parse_hit_click_1)
        self.button_1.pack(before=self.text1)

        # "Clear" button, anchored to the right.
        self.button_2 = tk.Button(self.window, text='清除', font=('Arial', 12),
                                  width=10, height=1,
                                  command=self.parse_hit_click_2)
        self.button_2.pack(anchor="e")

        # Listing endpoint; the placeholders are the tag and the page offset.
        self.start_url = 'https://movie.douban.com/j/search_subjects?type=movie&tag={}&sort=time&page_limit=20&page_start={}'
        self.headers = {
            'User-Agent': random.choice(USER_AGENT_LIST)
        }

    def _log(self, message):
        """Print ``message`` and append it to the progress text box."""
        print(message)
        self.text1.insert("insert", message)
        self.text1.insert("insert", '\n ')
        self.text1.insert("insert", '\n ')
        # The crawl runs inside the button callback, so the event loop is not
        # spinning; flush pending redraws so progress appears as it happens.
        self.text1.see("end")
        self.text1.update_idletasks()

    def parse_hit_click_1(self):
        """Handle the crawl button: validate both inputs, then call ``main``."""
        user_name = self.entry_user.get().strip()
        try:
            pass_wd = int(self.entry_passwd.get().strip())
        except ValueError:
            # Non-numeric input used to raise inside the Tk callback.
            self._log('請輸入正確的頁數(正整數)')
            return
        if not user_name:
            self._log('請輸入需要爬取的分類')
            return
        if pass_wd <= 0:
            self._log('請輸入正確的頁數(正整數)')
            return
        self.main(user_name, pass_wd)

    def main(self, user_name, pass_wd):
        """Fetch up to ``pass_wd`` listing pages for the tag ``user_name``.

        Paging stops early as soon as a page lists no movies. A page that
        fails (network error, non-JSON reply, ...) is reported and skipped.

        :param user_name: category tag substituted into the listing URL.
        :param pass_wd: maximum number of listing pages to request.
        :return: None
        """
        for page in range(pass_wd):
            start_url = self.start_url.format(user_name, page * self.PAGE_SIZE)
            try:
                response = session.get(start_url, headers=self.headers,
                                       timeout=self.REQUEST_TIMEOUT).json()
                has_movies = self.respons(response, user_name)
            except Exception as exc:
                # Keep going with the next page, but report the real cause.
                self._log(f'{user_name}====第{page + 1}頁爬取失敗: {exc!r}')
                continue
            if not has_movies:
                # An empty page means the category is exhausted.
                self._log(f'{user_name}====沒有更多資料,停止翻頁')
                break
            print(f'{user_name}====保存完成正在翻頁')

    def respons(self, response, user_name):
        """Save every movie listed in one listing-page response.

        :param response: decoded JSON of a listing page; the movies are
            expected under its ``subjects`` key.
        :param user_name: category tag, used only in progress messages.
        :return: ``True`` if the page listed at least one movie, ``False``
            if ``subjects`` is missing or empty.
        """
        subjects = response.get('subjects') or []
        if not subjects:
            return False

        # jsonpath returns False when nothing matches; treat that as no URLs.
        url_list = jsonpath(subjects, '$..url') or []
        for url in url_list:
            try:
                html = session.get(url, headers=self.headers,
                                   timeout=self.REQUEST_TIMEOUT).content.decode()
                row = self._parse_detail(etree.HTML(html))
                if row is None:
                    self._log(f'{user_name}====頁面解析失敗,已跳過: {url}')
                    continue
                self.save_excel({self.SHEET_NAME: row}, row[0], user_name)
            except Exception as exc:
                # One broken movie must not cost the rest of the page.
                self._log(f'{user_name}====爬取失敗,已跳過: {url} ({exc!r})')
        return True

    def _parse_detail(self, tree):
        """Extract one spreadsheet row from a parsed movie detail page.

        :param tree: lxml element tree of the detail page, or ``None``.
        :return: ``[title, rating, directors, writers, actors, genres,
            country]`` as strings, or ``None`` when the page has no title.
        """
        if tree is None:
            return None
        titles = tree.xpath('//h1/span/text()')
        if not titles:
            return None
        ratings = tree.xpath('//strong/text()')
        # Multi-valued fields are joined into one string so the names stay
        # separated inside a single cell.
        join = self.LIST_SEPARATOR.join
        return [
            titles[0],
            ratings[0] if ratings else '',
            join(tree.xpath('//div[@id="info"]/span[1]/span[2]/a/text()')),
            join(tree.xpath('//div[@id="info"]/span[2]/span[2]/a/text()')),
            join(tree.xpath('//span[@class="actor"]/span[@class="attrs"]/a/text()')),
            join(tree.xpath('//span[@property="v:genre"]/text()')),
            self._first_info_fragment(tree.xpath('//div[@id="info"]/text()')),
        ]

    @staticmethod
    def _first_info_fragment(text_nodes):
        """Return the first non-empty '/'-separated piece of the info text.

        The text nodes are concatenated, spaces and newlines are removed,
        and the result is split on '/'. Returns '' when nothing is left.
        """
        # NOTE(review): used as the production country; whether the first
        # piece is always the country depends on the page layout - confirm.
        text = ''.join(text_nodes).replace(' ', '').replace('\n', '')
        pieces = [piece for piece in text.split('/') if piece != '']
        return pieces[0] if pieces else ''

    @classmethod
    def _create_workbook(cls, path):
        """Create the .xls file at ``path`` with a styled header row."""
        workbook = xlwt.Workbook(encoding='utf-8')
        sheet = workbook.add_sheet(cls.SHEET_NAME, cell_overwrite_ok=True)

        # Thin borders on all four sides.
        borders = xlwt.Borders()
        borders.left = xlwt.Borders.THIN
        borders.right = xlwt.Borders.THIN
        borders.top = xlwt.Borders.THIN
        borders.bottom = xlwt.Borders.THIN
        borders.left_colour = 0x40
        borders.right_colour = 0x40
        borders.top_colour = 0x40
        borders.bottom_colour = 0x40

        alignment = xlwt.Alignment()
        alignment.horz = 0x02  # horizontally centred
        alignment.vert = 0x01  # vertically centred

        style = xlwt.XFStyle()
        style.borders = borders
        style.alignment = alignment

        for column, heading in enumerate(cls.HEADER):
            sheet.col(column).width = 2560 * 3
            sheet.write(0, column, heading, style)
        workbook.save(path)

    def save_excel(self, data, title, f):
        """Append one row per matching sheet to ``資料/資料.xls``.

        The workbook is created with a header row on first use; afterwards
        ``data[sheet_name]`` is written below the last used row of the sheet
        with that name.

        :param data: mapping of sheet name to the list of cell values.
        :param title: movie title, used only in the progress message.
        :param f: category tag, used only in the progress message.
        :return: None
        """
        folder = os.path.join(os.getcwd(), '資料')
        os.makedirs(folder, exist_ok=True)
        os_path = os.path.join(folder, '資料.xls')
        if not os.path.exists(os_path):
            self._create_workbook(os_path)

        # formatting_info=True makes xlrd keep cell styles and column widths,
        # so the copy written back below keeps the header formatting.
        workbook = xlrd.open_workbook(os_path, formatting_info=True)
        # Convert the read-only xlrd workbook into a writable xlwt one, once.
        new_workbook = copy(workbook)

        appended = False
        for index, sheet_name in enumerate(workbook.sheet_names()):
            if sheet_name not in data:
                continue
            # nrows counts the used rows, i.e. the index of the next free row.
            next_row = workbook.sheet_by_index(index).nrows
            target = new_workbook.get_sheet(index)
            for column, value in enumerate(data[sheet_name]):
                target.write(next_row, column, value)
            appended = True

        if appended:
            new_workbook.save(os_path)
            self._log(f'{f}===={title}========保存完成')

    def parse_hit_click_2(self):
        """Handle the clear button: empty both entries and the text box."""
        self.entry_user.delete(0, "end")
        self.entry_passwd.delete(0, "end")
        self.text1.delete("1.0", "end")

    def center(self):
        """Place the window in the middle of the screen."""
        screen_width = self.window.winfo_screenwidth()
        screen_height = self.window.winfo_screenheight()
        x = int((screen_width / 2) - (self.WINDOW_WIDTH / 2))
        y = int((screen_height / 2) - (self.WINDOW_HEIGHT / 2))
        self.window.geometry('{}x{}+{}+{}'.format(
            self.WINDOW_WIDTH, self.WINDOW_HEIGHT, x, y))

    def run_loop(self):
        """Lock the window size, centre it and enter the Tk main loop."""
        self.window.resizable(False, False)
        self.center()
        self.window.mainloop()
if __name__ == '__main__':
    # Build the GUI, then hand control to the Tk event loop.
    spider = DBSpider()
    spider.run_loop()

代碼僅供學習
希望可以得到各位的一鍵三連,感謝各位支持,
祝大家學習python順利
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/289298.html
標籤:python
