# coding=gbk
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or the sentinel string '爬取失敗' when the
        request fails (connection error, timeout, or HTTP error status).
    """
    try:
        # The original wrote ``r.requests.get(...)``, which never bound ``r``.
        r = requests.get(url, timeout=15)
        # Must be *called*; a bare attribute reference checks nothing.
        r.raise_for_status()
        # Use the encoding detected from the body instead of the header guess.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures map to the sentinel; programming errors
        # are no longer silently swallowed by a bare ``except``.
        return '爬取失敗'
def parserHTML(urllst, html):
    """Parse the problem-list page and collect one entry per table row.

    For every ``<tr>`` element under the first ``<tbody>``, the string
    contents of its first two ``<td>`` cells are appended to *urllst* as a
    ``(first, second)`` tuple.

    Args:
        urllst: List that receives the collected tuples (mutated in place).
        html: Markup of the listing page.

    Returns:
        The second ``<td>`` tag of the last row processed, or ``None`` when
        the page has no ``<tbody>`` or no row with at least two cells.
    """
    # NOTE(review): the original indentation was lost, so it is assumed that
    # ``return turl`` sat after the loop (last row) -- confirm.
    soup = BeautifulSoup(html, 'html.parser')
    turl = None
    tbody = soup.find('tbody')
    if tbody is None:
        return turl
    for tr in tbody.children:
        # ``children`` also yields whitespace text nodes; keep real tags only.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 2:
            continue
        # ``list.append`` accepts exactly one argument, so store the pair
        # as a single tuple.
        urllst.append((tds[0].string, tds[1].string))
        turl = tds[1]
    return turl
'''注意爬取的題庫源代碼中,a里面有兩個url,第二個或者雙數個數的url不是目標url'''
def parsechildrenHTML(turl):
    """Extract the second ``<pre>`` element from a problem page.

    Args:
        turl: Markup of a single problem page. Despite the name, the value
            is handed to BeautifulSoup as markup, not fetched as a URL.

    Returns:
        The ``<pre>`` tag at index 1, or ``None`` when the page contains
        fewer than two ``<pre>`` elements.
    """
    soup = BeautifulSoup(turl, 'html.parser')
    # ``find`` returns only the first match; ``find_all`` is required before
    # indexing. The original also indexed an unbound name (``pre``).
    pres = soup.find_all('pre')
    if len(pres) < 2:
        return None
    return pres[1]
def filebank(path, pre=None):
    """Append scraped content to the file at *path*.

    Args:
        path: File to append to; created if it does not exist.
        pre: Iterable of pieces to write, each converted with ``str``.
            ``None`` (the default) writes nothing, so the one-argument call
            ``filebank(path)`` stays valid.
    """
    # The original read an undefined name ``pre``; it is now a parameter.
    if pre is None:
        pre = ()
    with open(path, 'a', encoding='utf-8') as f:
        # ``str`` lets both plain strings and parsed tag nodes be written.
        f.writelines(str(p) for p in pre)


def gan():
    """Scrape one problem from the pythontip problem list and save it.

    Fetches the listing page, takes the table cell returned by
    ``parserHTML``, follows the first link in that cell, extracts the
    problem text with ``parsechildrenHTML`` and appends it to a file under
    the output directory. Returns early without writing when any step
    yields nothing.
    """
    import os
    from urllib.parse import urljoin

    path = 'D://Bank of Pyhton//'
    ulst = []
    url = 'http://www.pythontip.com'
    start_url = url + '/coding/code_oj'

    html = getHTMLText(start_url)
    if html == '爬取失敗':  # failure sentinel returned by getHTMLText
        return

    # The original discarded this return value and then used an undefined
    # ``turl``.
    turl = parserHTML(ulst, html)

    # NOTE(review): assumes the returned cell holds an <a href=...> pointing
    # at the problem page -- confirm against the live markup.
    link = turl.find('a') if turl is not None else None
    href = link.get('href') if link is not None else None
    if not href:
        return

    problem_html = getHTMLText(urljoin(url, href))
    if problem_html == '爬取失敗':
        return

    # The original called the misspelled name ``parserchildrenHTML``.
    pre = parsechildrenHTML(problem_html)
    if pre is None:
        return

    # ``path`` names a directory, which ``open`` cannot append to, so write
    # to a file inside it. NOTE(review): the file name is a placeholder.
    os.makedirs(path, exist_ok=True)
    filebank(os.path.join(path, 'problems.txt'), pre)


if __name__ == '__main__':
    gan()
uj5u.com熱心網友回復:
哪裡抄的一段代碼,錯誤太多!
uj5u.com熱心網友回復:
啊啊啊,通用模板自己改編的
uj5u.com熱心網友回復:
求大佬指點,樓上大佬
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/144847.html
上一篇:面試網易嚴選Java開發工程師,太真實了,直接涼涼(涼經)
下一篇:[BUUCTF]第一天訓練
