核心代碼
requests.get 下載html網頁
bs4.BeautifulSoup 分析html內容
from requests import get
from bs4 import BeautifulSoup as bs
from datetime import datetime as dt
def Today(style=1):
    """Return the current local date as a display string.

    Args:
        style: 1 (the default) yields ``YYYY-MM-DD`` with zero-padded month
            and day; any other value yields the unpadded Chinese short form
            ``<month>月<day>日``.

    Returns:
        The formatted date string.
    """
    now = dt.today()
    if style == 1:
        return f'{now.year}-{now.month:02}-{now.day:02}'
    return f'{now.month}月{now.day}日'
def SinaNews(style=1):
    """Fetch today's headlines from one of three news index pages.

    Args:
        style: 1 selects the ``world`` page, 2 the ``china`` page, and any
            other value the military-news page.

    Returns:
        A list of ``(title, url)`` tuples, one for every ``<a>`` tag that
        has an ``href`` and whose markup contains today's date in the
        ``YYYY-MM-DD`` form returned by ``Today()``.

    Raises:
        requests.exceptions.RequestException: if the page cannot be
            downloaded (including when the timeout expires).
    """
    url1 = 'http://news.***.com.cn/'
    if style == 1:
        url1 += 'world'
    elif style == 2:
        url1 += 'china'
    else:
        url1 = 'https://mil.news.sina.com.cn/'
    # A timeout keeps an unreachable server from blocking the caller forever.
    response = get(url1, timeout=10)
    # The codec name used to be misspelled as 'uft-8', which Python does not
    # recognise as an encoding.
    response.encoding = 'utf-8'
    soup = bs(response.text, 'html.parser')
    # Compute the date string once rather than once per anchor tag.
    today = Today()
    # href=True skips anchors without an href attribute, which would raise
    # KeyError when indexed below.
    return [
        (anchor.text, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if today in str(anchor)
    ]
爬取標題
>>> for i,news in enumerate(SinaNews(1)):
print(f'No{i+1}:',news[0])
No1: 外媒:*****
No2: 日媒:******
.............
內容已馬賽克!!!
>>>
首次做爬蟲,為了方便下手,找了一個不用破解網頁的某新聞網站,下載網頁就能直接取得內容。以其中的國際、國內和軍事新聞三個網頁作內容源,用requests.get下載網頁后,分析所得html文本可知:所有帶當天日期的<a href=...>標記,剛好就是所需要的新聞鏈接。
爬取正文
然后再根據url下載正文網頁,分析可知id='article'的<div>層就是正文所在位置,其.get_text()方法是取得文本的關鍵,取得文本后再適當做一些格式處理:
def NewsDownload(url):
    """Download one article page and return its body as tidied plain text.

    Args:
        url: Address of the article page.

    Returns:
        The text of the ``<div id="article">`` element with the topic label
        rewritten, every space turned into a line break plus a space, and
        runs of three or more newlines collapsed to two. An empty string is
        returned when the page has no such element.

    Raises:
        requests.exceptions.RequestException: if the page cannot be
            downloaded (including when the timeout expires).
    """
    # A timeout keeps an unreachable server from blocking the caller forever.
    html = get(url, timeout=10)
    # The codec name used to be misspelled as 'uft-8'.
    html.encoding = 'utf-8'
    soup = bs(html.text, 'html.parser')
    article = soup.find('div', id='article')
    if article is None:
        # find() returns None when the element is absent; calling
        # get_text() on it would raise AttributeError.
        return ''
    text = article.get_text().strip()
    text = text.replace('點擊進入專題:', '相關專題:')
    # NOTE(review): the search string is a plain space as it appears in this
    # file; a full-width paragraph indent may have been intended -- confirm.
    text = text.replace(' ', '\n ')
    # Collapse any run of blank lines down to a single blank line.
    while '\n\n\n' in text:
        text = text.replace('\n\n\n', '\n\n')
    return text
>>> url = 'https://******/w/2021-09-29/doc-iktzqtyt8811588.shtml'
>>> NewsDownload(url)
'原標題:******************************************************'
>>>
界面代碼
使用內置圖形界面庫 tkinter 的控制元件 Text、Listbox、Scrollbar、Button,設定基本屬性、放置位置、綁定命令,然后除錯到程式完工!
源代碼 News.pyw :其中涉及的網站名稱已馬賽克!
from requests import get
from bs4 import BeautifulSoup as bs
from datetime import datetime as dt
from os import path
import tkinter as tk
def Today(style=1):
    """Format the current local date for matching or display.

    Args:
        style: 1 (the default) gives ``YYYY-MM-DD`` with zero-padded month
            and day; every other value gives the unpadded Chinese short
            form ``<month>月<day>日``.

    Returns:
        The formatted date string.
    """
    current = dt.today()
    iso_like = style == 1
    if iso_like:
        return f'{current.year}-{current.month:02}-{current.day:02}'
    return f'{current.month}月{current.day}日'
def SinaNews(style=1):
    """Fetch today's headlines from one of three news index pages.

    Args:
        style: 1 selects the ``world`` page, 2 the ``china`` page, and any
            other value the military-news page.

    Returns:
        A list of ``(title, url)`` tuples, one for every ``<a>`` tag that
        has an ``href`` and whose markup contains today's date in the
        ``YYYY-MM-DD`` form returned by ``Today()``.

    Raises:
        requests.exceptions.RequestException: if the page cannot be
            downloaded (including when the timeout expires).
    """
    url1 = 'http://news.****.com.cn/'
    if style == 1:
        url1 += 'world'
    elif style == 2:
        url1 += 'china'
    else:
        url1 = 'https://mil.****.com.cn/'
    # A timeout keeps an unreachable server from freezing the GUI forever.
    response = get(url1, timeout=10)
    # The codec name used to be misspelled as 'uft-8', which Python does not
    # recognise as an encoding.
    response.encoding = 'utf-8'
    soup = bs(response.text, 'html.parser')
    # Compute the date string once rather than once per anchor tag.
    today = Today()
    # href=True skips anchors without an href attribute, which would raise
    # KeyError when indexed below.
    return [
        (anchor.text, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if today in str(anchor)
    ]
def NewsList(i):
    """Reload the headline list for one category and show its first article.

    Replaces the module-level ``news`` list with the result of
    ``SinaNews(i)``, refills the ``tList`` listbox with numbered titles,
    clears the ``tText`` pane and then displays the first article.

    Args:
        i: Category selector passed straight through to ``SinaNews``.
    """
    global news
    news = SinaNews(i)
    tList.delete(0, tk.END)
    for idx, item in enumerate(news):
        tList.insert(tk.END, f'{idx+1:03} {item[0]}')
    # Highlight the first row so the list agrees with the article shown
    # below, matching what SetControl does at start-up.
    tList.select_set(0)
    # The text widget is kept read-only, so it must be unlocked to clear it.
    tText.config(state=tk.NORMAL)
    # Text indices are "line.column" strings whose lines start at 1.
    tText.delete('1.0', tk.END)
    tText.config(state=tk.DISABLED)
    # With no headlines there is no first article; NewsShow(0) would index
    # into an empty list.
    if news:
        NewsShow(0)
def NewsList1():
    """Button callback: load the headline list with category selector 1."""
    NewsList(1)


def NewsList2():
    """Button callback: load the headline list with category selector 2."""
    NewsList(2)


def NewsList3():
    """Button callback: load the headline list with category selector 3."""
    NewsList(3)
def NewsShow(idx):
    """Download one article and display it in the read-only text pane.

    Reads the module-level ``news`` list and the ``tList`` / ``tText``
    widgets. Does nothing when there is no row to show.

    Args:
        idx: ``0`` shows the first entry of ``news``. Any other value,
            including the event object Tk passes when this function is
            bound as an event handler, means "show the row currently
            selected in ``tList``".

    Raises:
        requests.exceptions.RequestException: if the article page cannot
            be downloaded (including when the timeout expires).
    """
    if idx != 0:
        selection = tList.curselection()
        if not selection:
            # curselection() is an empty tuple when no row is selected.
            return
        idx = selection[0]
    if idx >= len(news):
        # Covers an empty headline list as well as a stale selection.
        return
    title, url = news[idx][0], news[idx][1]

    # A timeout keeps an unreachable server from freezing the GUI forever.
    html = get(url, timeout=10)
    # The codec name used to be misspelled as 'uft-8'.
    html.encoding = 'utf-8'
    soup = bs(html.text, 'html.parser')
    article = soup.find('div', id='article')
    # find() returns None when the page has no such element; show only the
    # title in that case instead of raising AttributeError.
    text = article.get_text().strip() if article is not None else ''
    text = text.replace('點擊進入專題:', '相關專題:')
    # NOTE(review): the search string is a plain space as it appears in this
    # file; a full-width paragraph indent may have been intended -- confirm.
    text = text.replace(' ', '\n ')
    # Collapse any run of blank lines down to a single blank line.
    while '\n\n\n' in text:
        text = text.replace('\n\n\n', '\n\n')

    # The text widget is kept read-only, so unlock it around the update.
    tText.config(state=tk.NORMAL)
    # Text indices are "line.column" strings whose lines start at 1.
    tText.delete('1.0', tk.END)
    tText.insert(tk.END, title + '\n\n' + text)
    tText.config(state=tk.DISABLED)
def InitWindow(self, W, H):
    """Configure the main window, build its widgets and run the event loop.

    The window is placed 8 pixels from the left screen edge with its top at
    ``screen height - H - 100``, is not resizable and stays on top of other
    windows. The call blocks in ``mainloop()``.

    Args:
        self: The top-level window to configure.
        W: Window width in pixels.
        H: Window height in pixels.
    """
    screen_height = self.winfo_screenheight()
    self.geometry(f'{W}x{H}+8+{screen_height - H - 100}')
    icoFile = 'favicon.ico'
    # The icon is optional: it is applied only when the file exists
    # relative to the current working directory.
    if path.exists(icoFile):
        # Use the window that was passed in; this used to reach for the
        # global `win` and ignore the parameter.
        self.iconbitmap(icoFile)
    self.resizable(False, False)
    self.wm_attributes('-topmost', True)
    self.title(bTitle[0])
    SetControl()
    self.update()
    self.mainloop()
def SetControl():
    """Create and place every widget of the main window.

    Builds, in this order, the headline listbox with its scrollbar, the
    three category buttons, and the read-only article pane with its
    scrollbar. The listbox and the article pane are published as the
    module-level names ``tList`` and ``tText``. Finally the first article
    is shown and double-clicking a headline is bound to ``NewsShow``.
    """
    global tList, tText

    # Headline list and its scrollbar.
    list_bar = tk.Scrollbar(win, orient=tk.VERTICAL)
    list_bar.place(x=450, y=320, height=300)
    tList = tk.Listbox(win, selectmode=tk.BROWSE, yscrollcommand=list_bar.set)
    list_bar.config(command=tList.yview)
    for row, entry in enumerate(news):
        tList.insert(tk.END, f'{row+1:03} {entry[0]}')
    tList.place(x=15, y=320, width=435, height=300)
    tList.select_set(0)
    tList.focus()

    # Category buttons, laid out left to right 100 pixels apart.
    btn_w, btn_h = 70, 35    # button width and height
    btn_x, btn_y = 95, 270   # position of the first button
    handlers = (NewsList1, NewsList2, NewsList3)
    for pos, handler in enumerate(handlers):
        button = tk.Button(win, text=bTitle[pos + 1], command=handler)
        button.place(x=btn_x + 100 * pos, y=btn_y, width=btn_w, height=btn_h)

    # Article pane and its scrollbar.
    text_bar = tk.Scrollbar(win, orient=tk.VERTICAL)
    text_bar.place(x=450, y=10, height=240)
    tText = tk.Text(win, yscrollcommand=text_bar.set)
    text_bar.config(command=tText.yview)
    tText.place(x=15, y=10, width=435, height=240)
    tText.config(state=tk.DISABLED, bg='azure', font=('宋體', '14'))

    NewsShow(0)
    tList.bind("<Double-Button-1>", NewsShow)
if __name__ == '__main__':
    # These three names are read as module-level globals by the functions
    # defined in this file, so they are assigned here rather than inside a
    # helper function.
    win = tk.Tk()
    bTitle = (
        '今日新聞',  # window title
        '國際新聞',  # caption of the first button
        '國內新聞',  # caption of the second button
        '軍事新聞',  # caption of the third button
    )
    news = SinaNews()
    InitWindow(win, 480, 640)
奉上全部代碼,在此就不作詳細分析了,如有需要請留言討論。我的使用環境 Win7+Python3.8.8 下可以無錯運行!文中涉及的網站名稱已打上馬賽克,猜不出名字的可以私下里問我。
軟體編譯
使用pyinstaller.exe編譯成單個運行檔案,注意原始碼檔案的后綴名應該用.pyw否則會有cmd黑視窗出現,還有一個小知識點,任意網站的Logo圖示icon檔案,一般都能在根目錄里下載到,即:
http(s)://websiteurl.com(.cn)/favicon.ico
編譯命令如下:
D:\>pyinstaller --onefile --noconsole --icon="D:\favicon.ico" News.pyw
編譯完成后,在dist檔案夾下生成一個News.exe可執行檔案,大小約15M,還能接受。
反正拿走就能直接用,臨走前給個一鍵三連吧,謝謝!

轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/304336.html
標籤:python
