import os
import random
import re

import requests
from bs4 import BeautifulSoup
from lxml import etree
# Scrape the poems of the author(s) featured on the shicimingju.com home page
# (title, text and commentary of every poem) and save them to one text file.

BASE_URL = 'https://www.shicimingju.com'
OUTPUT_DIR = './詩詞'
OUTPUT_FILE = './詩詞.doc'
POEMS_PER_PAGE = 20   # page size the original script assumed for an author's poem list
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server hangs the script forever

# Browser User-Agent strings used to disguise the scraper as a browser.
# Adjacent literals are joined with an explicit space: the original backslash
# continuations glued tokens together (e.g. "Gecko)Chrome"). The device-label
# prefixes ("HTC One ", "Galaxy S4 ") were dropped because a browser's
# User-Agent starts with "Mozilla/5.0".
USER_AGENTS = (
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/31.0.1650.63 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 4.0.3; HTC One X Build/IML74K) AppleWebKit/535.19 "
    "(KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19",
    "Mozilla/5.0 (Linux; U; Android 4.2; xx-xx; GT-I9500 Build/JDQ39) "
    "AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
    "Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) "
    "Version/9.0 Mobile/13B143 Safari/601.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0",
)


def page_count(total, per_page=POEMS_PER_PAGE):
    """Return how many listing pages are needed for ``total`` poems.

    Uses ceiling division, so an exact multiple of ``per_page`` does not
    produce an extra, non-existent page (the original ``int(total/20+1)``
    returned 3 for 40 poems).
    """
    return (total + per_page - 1) // per_page


def extract_total(heading):
    """Return the first run of digits in ``heading`` as an int.

    Raises:
        ValueError: if ``heading`` contains no digits.
    """
    match = re.search(r'\d+', heading)
    if match is None:
        raise ValueError('no poem count found in heading: {!r}'.format(heading))
    return int(match.group())


def node_text(node):
    """Return ``node.text``, or an empty string when the node is missing.

    BeautifulSoup lookups return ``None`` when nothing matches (for example a
    poem page without a commentary block), which would otherwise raise
    ``AttributeError`` on ``.text``.
    """
    return '' if node is None else node.text


def fetch_html(page_url, headers):
    """Download ``page_url`` and return its body decoded as UTF-8.

    ``headers`` is passed by keyword: the second positional argument of
    ``requests.get`` is ``params``, so the original call sent the User-Agent
    as a query string instead of as a request header.
    """
    response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    # The original comment states the site's charset is UTF-8. Setting the
    # encoding explicitly replaces the `.encode('iso-8859-1').decode('UTF-8')`
    # round trip, which fails once requests already decodes the page as UTF-8.
    response.encoding = 'utf-8'
    return response.text


def scrape_author(author, author_url, headers, out):
    """Write every poem listed under ``author_url`` to the open file ``out``.

    Args:
        author: author name, written next to each poem title.
        author_url: absolute URL of the author's first listing page.
        headers: HTTP headers sent with every request.
        out: writable text file object that receives the poems.
    """
    # Listing pages are addressed as <name>_<n>.html.
    page_template = author_url.replace('.html', '_{}.html')

    author_soup = BeautifulSoup(fetch_html(author_url, headers), 'lxml')
    # The <h1> heading carries the total number of poems.
    total = extract_total(node_text(author_soup.h1))

    for page in range(1, page_count(total) + 1):
        listing = BeautifulSoup(fetch_html(page_template.format(page), headers), 'lxml')
        # Each poem title sits in an <h3> that links to the poem's own page.
        for heading in listing.select('h3'):
            link = heading.a
            if link is None or not link.get('href'):
                continue  # an <h3> without a link is not a poem entry
            poem_url = BASE_URL + link['href']
            print(poem_url)

            poem = BeautifulSoup(fetch_html(poem_url, headers), 'lxml')
            title = node_text(poem.h1)
            content = node_text(poem.find('div', class_='item_content'))
            # Some pages have no commentary; the poem is still written, with
            # an empty commentary, instead of crashing.
            shangxi = node_text(poem.find('div', class_='shangxi_content'))

            out.write(title.center(60) + author + ':' + content + '\n' + shangxi + '\n')
            print(title)


def main():
    """Scrape the featured author(s) from the home page into ``OUTPUT_FILE``."""
    # NOTE(review): this directory is created but, as in the original script,
    # the output goes to OUTPUT_FILE next to it - confirm the intended location.
    if not os.path.exists(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)
        print("檔案已建立!!")

    # One User-Agent is chosen at random per run.
    headers = {"User-Agent": random.choice(USER_AGENTS)}

    tree = etree.HTML(fetch_html(BASE_URL, headers))
    # li[1] restricts the selection to the first author in the list.
    author_items = tree.xpath('//*[@id="main_right"]/div[1]/ul/li[1]')

    # Open the file once, outside the loop, so authors do not overwrite each
    # other and the file is always closed.
    with open(OUTPUT_FILE, 'w', encoding='UTF-8') as out:
        for item in author_items:
            hrefs = item.xpath('./a/@href')
            names = item.xpath('./a/text()')
            if not hrefs or not names:
                continue
            author = names[0]
            print(author)
            scrape_author(author, BASE_URL + hrefs[0], headers, out)

    print('結束,快去看下下載的檔案!!')


if __name__ == '__main__':
    main()
# Please credit the source when reposting. Article link: https://www.uj5u.com/qita/283454.html
# Previous article: 小白求助-items
# Next article: PS的文字處理工具
