為什么資料沒有爬取到?
程式沒出錯,不知道是哪個部分出了問題,還是資料沒抓到?
#coding:utf-8
import re
import random
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
wb=Workbook()
dest_filename = '電影.xlsx'
ws1 = wb.active
ws1.title = '電影top250'
DOWNLOAD_URL ='https://movie.douban.com/top250'
def download_page(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0'
}
data = requests.get(url,headers=headers).content
return data
def get_data(doc):
soup = BeautifulSoup(doc,'lxml')
ol = soup.find('ol',class_='grid_view')
name = []
star_con = []
score = []
info_list=[]
for i in ol.find_all('li'):
detail = i.find('div',class_='hd')
movie_name = detail.find('span',class_='title').get_text()
level_star = i.find('span',class_='rating_num').get_text()
star = i.find('div',class_='star')
star_num = star.find(test=re.compile('評價'))
info = i.find('span',class_='inq')
if info:
info_list.append(info.get_test())
else:
info_list.append('無')
score.append(level_star)
name.append(movie_name)
star_con.append(star_num)
page = soup.find('span',class_='next').find('a')
print(i)
if page:
return name,star_con,score,info_list,DOWNLOAD_URL + page['href']
return name,star_con,score,info_list,None
def main():
url = DOWNLOAD_URL
name=[]
star_con=[]
score = []
info=[]
while url :
doc = download_page(url)
movie,star,level_num,info_list,url=get_data(doc)
name = name + movie
star_con = star_con+ star
score = score + level_num
info = info_list+ info
for(i,m,o,p) in zip(name,star_con,score,info):
print(i,m,o,p)
col_A = 'A%S' % (name.index(i)+1)
col_B = 'B%S' % (name.index(i)+1)
col_C = 'C%S' % (name.index(i)+1)
col_D = 'D%S' % (name.index(i)+1)
ws1[col_A] = i
ws1[col_B] = m
ws1[col_C] = o
ws1[col_D] = p
wb.save(filename=dest_filename)
if __name__ == "__main__":
main()
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/113489.html
上一篇:matlab
