Python爬蟲自動化獲取華圖和粉筆網站的錯題-有解無憂

基于Python的粉筆和華圖網站錯題爬蟲

- 粉筆網站
- 華圖網站
- 總結

這篇博客對于考公人或者其他用華圖或者粉筆做題的人比較友好：只要輸入練習的網址，就可以自動化獲取華圖以及粉筆練習中的錯題。

粉筆網站

我們從做過的題目組中獲取錯題
在這里插入圖片描述

打開某一次做題組，我們首先進行抓包看看資料在哪里
在這里插入圖片描述
我們發現現在資料已經被隱藏，事實上資料在這兩個包中：
https://tiku.fenbi.com/api/xingce/questions
https://tiku.fenbi.com/api/xingce/solutions
一個是題目的包，一個是解析的包。這兩個 url 都要傳入一個題目組參數才能獲取到當前練習的題目資料，而題目組參數在下面這個包中：
在這里插入圖片描述
這個包與練習網址中倒數第二個數字串有關。

url的規則為'https://tiku.fenbi.com/api/xingce/exercises/'+str(id_)+'?app=web&kav=12&version=3.0.0.0'，id_即為下劃線數字
通過請求這個包獲取到引數然后通過引數請求上面兩個包（

https://tiku.fenbi.com/api/xingce/questions
https://tiku.fenbi.com/api/xingce/solutions

）即可獲取到題目資料，而且自己的答案也在 'https://tiku.fenbi.com/api/xingce/exercises/'+str(id_)+'?app=web&kav=12&version=3.0.0.0' 這個包中。

不過粉筆的題目資料有些是圖片，而且圖片既會出現在題目中，也會出現在選項中。如果以 word 檔案存盤，用 docx 庫操作有些吃力，于是我想到了直接構造 HTML 代碼，然后通過 pdfkit 轉為 pdf（需要先下載 wkhtmltopdf.exe，具體如何下載可以參考百度），這樣就可以把錯題集放到平板或者其他設備中看。
（請求時一定要攜帶完整的headers，否則很可能獲取不到資料）

具體操作看代碼決議

###此函式用于決議題目和每道題的答案
def jiexi(liebiao):
    new = []
    timu_last = []
    for each in liebiao:
        new.append(re.sub(r'flag=\\"tex\\" ','',each))
    for each in new:
        timu_last.append(re.sub(r'\\','',each))
    return timu_last
###此函式用于決議選項
def xuanxiang(liebiao):
    xuanxiang_v2 = []
    xuanxiang_v3 = []
    for each in liebiao:
        a = re.sub('<p>','',each)
        a = re.sub('</p>','',a)
        xuanxiang_v2.append(a)
    for each in xuanxiang_v2:
        each = each+'</p>'
        xuanxiang_v3.append(each)
    return xuanxiang_v3
import requests
import re
import pdfkit
import os
url = str(input("請輸入練習的網址："))
###獲取本節練習id
id_ = re.findall(r'https://www.fenbi.com/spa/tiku.*?/xingce/xingce/(.*?)/',url,re.S)[0]
mid_url = 'https://tiku.fenbi.com/api/xingce/exercises/'+str(id_)+'?app=web&kav=12&version=3.0.0.0'
headers = {
#####完整的headers，自己添加
}
response = requests.get(url=mid_url,headers=headers)
response.encoding = 'utf-8'
page_text = response.text
###獲取題目組引數
id_list = re.findall('\"questionIds\"\:\[(.*?)\]\,',page_text,re.S)
###獲取自己的答案
your_answer = re.findall(r'"answer":{"choice":"(.*?)",',page_text,re.S)
###此練習名稱
name = re.findall(r'"name":"(.*?)",',page_text,re.S)[0]
###真正存盤資料的包
timu_url = 'https://tiku.fenbi.com/api/xingce/questions'
params = {
    'ids': id_list
}
response = requests.get(url=timu_url,headers=headers,params=params)
response.encoding = 'utf-8'
page_text = response.text
###獲取正確答案
true_answer = re.findall('"correctAnswer":{"choice":"(.*?)"',page_text,re.S)
###真正存盤資料的包
solution_url = 'https://tiku.fenbi.com/api/xingce/solutions'
response = requests.get(url=solution_url,headers=headers,params=params)
response.encoding = 'utf-8'
page_text = response.text
###獲取決議
solution_list = re.findall(r'"solution":"(.*?)","userAnswer"',page_text,re.S)
solution_last = jiexi(solution_list)
cailiao = []
timu = []
###獲取單選題題目和復合題的題目
for each in response.json():
    timu.append(each['content'])
    try:
        cailiao.append(each['material']['content'])
    except:
        cailiao.append('none')
###獲取選項資訊
A_option = re.findall('\"options\"\:\[\"(.*?)\"\,\".*?\"\,\".*?\"\,\".*?\"\]',page_text,re.S)
B_option = re.findall('\"options\"\:\[\".*?\"\,\"(.*?)\"\,\".*?\"\,\".*?\"\]',page_text,re.S)
C_option = re.findall('\"options\"\:\[\".*?\"\,\".*?\"\,\"(.*?)\"\,\".*?\"\]',page_text,re.S)
D_option = re.findall('\"options\"\:\[\".*?\"\,\".*?\"\,\".*?\"\,\"(.*?)\"\]',page_text,re.S)
A_option = xuanxiang(A_option)
B_option = xuanxiang(B_option)
C_option = xuanxiang(C_option)
D_option = xuanxiang(D_option)
A_option = jiexi(A_option)
B_option = jiexi(B_option)
C_option = jiexi(C_option)
D_option = jiexi(D_option)
###構造HTML代碼
count = 0
all_content = "<!DOCTYPE html>\n<meta charset='utf-8'>\n<html>"
for each in true_answer:
    if each != your_answer[count]:
        ###處理復合題
        if cailiao[count] != 'none' and cailiao[count] not in all_content:
            all_content += cailiao[count]
        all_content += str(count+1)
        all_content += '、'
        all_content += timu[count][3:]
        all_content += 'A、'
        all_content += A_option[count]
        all_content += 'B、'
        all_content += B_option[count]
        all_content += 'C、'
        all_content += C_option[count]
        all_content += 'D、'
        all_content += D_option[count]
        all_content += '<br>'
    count += 1
count = 0
all_content += '<br><br><br><br><br><br><br><br><br>'
for each in true_answer:
    if each != your_answer[count]:
        temp = '第'+str(count+1)+'題的正確答案為'
        all_content += temp
        if true_answer[count]=='0':
            all_content += 'A'
        elif true_answer[count]=='1':
            all_content += 'B'
        elif true_answer[count]=='2':
            all_content += 'C'
        elif true_answer[count]=='3':
            all_content += 'D'
        all_content += solution_last[count]
        all_content += '<br>'
    count += 1
all_content += '</html>'
path_name = name + '.html'
###保存為HTML檔案
with open(path_name,'w',encoding='utf-8') as fp:
    fp.write(all_content)
confg = pdfkit.configuration(wkhtmltopdf=r'wkhtmltopdf.exe保存的路徑')
pdfkit.from_url(path_name, name+'.pdf',configuration=confg)###把HTML檔案轉為pdf
print('錯題PDF保存成功')
###洗掉HTML檔案
os.remove(path_name)

華圖網站

在這里插入圖片描述
也是答題記錄中自己做過的題目
華圖網站稍微不一樣，他的資料直接抓包就可看到

通過請求這個包即可獲取到資料，接下來就是解析的事情了。這次我用 word 檔案進行存盤，如果覺得不方便，也可以像上文一樣構造 HTML。

##導包
import requests
import lxml.etree
import re
import time
import os
from docx import Document
from docx.shared import Inches
from docx.shared import Pt
from docx.shared import Inches
from docx.oxml.ns import qn
from docx.enum.text import WD_ALIGN_PARAGRAPH
url = str(input("請輸入練習的網址："))
headers={
###完整的headers，否則獲取不到資料
}
response = requests.get(url = url,headers = headers)
response.encoding='utf-8'
reptext = response.text
tree = lxml.etree.HTML(reptext) #決議網站獲取原始碼

dirName="考公圖片"
if not os.path.exists(dirName):
    os.mkdir(dirName)  #網站圖片保存路徑
    
jiexi = re.findall(r'<div class="jiexi-item-title">決議.*?，</div>.*?</div>', reptext,re.S) #獲取題目決議

imgg = []
for each in jiexi:
    imgg.append(re.findall(r'<img src="(.*?)".*?>', each)) #獲取決議里的圖片URL
    
imgt = []
for each in imgg:
    if each == []:
        imgt.append([1])
    else:
        imgt.append(each) #把決議里圖片URL美化整理一下
        
jiexilast = []
for qq in jiexi:
    jiexilast.append(re.sub(r'<[^>]+>', '', qq))  #美化題目決議
    
corrected = re.findall(r'<span class="g-right-answer-color">[a-zA-Z]{1,4}</span>', reptext)  #獲取正確答案
correct = []
for ee in corrected:
    correct.append(re.sub(r'<[^>]+>', '', ee)) #美化正確答案
    
yoursed = re.findall(r'<span class="yellowWord">[a-zA-Z]{1,4}</span>', reptext)  #獲取自己的答案
yours = []
for ee in yoursed:
    yours.append(re.sub(r'<[^>]+>', '', ee)) #美化自己的答案
    
timuleixing = re.findall(r'<span class="greenWord">(.*?)</span>.*?</div>',reptext,re.S) #獲取題目型別

find1 = re.findall(r'<span class="greenWord">.*?</span>(.*?)</div>',reptext,re.S)
for each in find1:
    re.sub(r'<.*?>','',each)
find5 = []  #最終的題目
for each in find1:
    find5.append(re.sub(r'<[^>]+>', '', each))
    
img = []
for each in find1:
    img.append(re.findall(r'<img src="(.*?)".*?>', each))
imgx = []
for each in img:
    if each == []:
        imgx.append([1])
    else:
        imgx.append(each)   #最終版題目圖片URL
        

v = tree.xpath('//div[@class="exercise-main-title"]//text()') #本次題目型別

try:
    ###這是既有復合題也有單選題的
    fuheti = re.findall(r'<!--復合題-->(.*?)<div class="exercise-main-topics"',reptext,re.S)[0].split('<!--復合題-->')
except:
    try:
        ###這是只有復合題或者復合題在最后幾題的
        fuheti = re.findall(r'<!--復合題-->(.*?)<!-- 糾錯的彈窗 -->',reptext,re.S)[0].split('<!--復合題-->')
    except:
        pass
count = 0

###匯入標題
document = Document()
p = document.add_paragraph()
p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run(v[0][5:-3])
run.font.size = Pt(14)
run.font.name=u'宋體'
r = run._element
r.rPr.rFonts.set(qn('w:eastAsia'),u'宋體')
choose = []

###處理題目選項
axuanxiang = []
bxuanxiang = []
cxuanxiang = []
dxuanxiang = []
xuanxiang = re.findall(r'<div class="main-topic-choices">(.*?)<div class="main-topic-letters clearfix pl14">',reptext,re.S)
for everything in xuanxiang:
    try: ##處理只有兩個選項
        axuanxiang.append(re.sub("<.*?>","",re.findall(r'<div.*?class.*?main-topic-choice.*?>(A.*?)</div>',everything,re.S)[0]))
    except:
        axuanxiang.append('--')
    try:
        bxuanxiang.append(re.sub("<.*?>","",re.findall(r'<div.*?class.*?main-topic-choice.*?>(B.*?)</div>',everything,re.S)[0]))
    except:
        bxuanxiang.append('--')
    try:
        cxuanxiang.append(re.sub("<.*?>","",re.findall(r'<div.*?class.*?main-topic-choice.*?>(C.*?)</div>',everything,re.S)[0]))
    except:
        cxuanxiang.append('--')
    try:
        dxuanxiang.append(re.sub("<.*?>","",re.findall(r'<div.*?class.*?main-topic-choice.*?>(D.*?)</div>',everything,re.S)[0]))
    except:
        dxuanxiang.append('--')
        

    
for every in correct:
    if every != yours[count]:
        ###處理復合題題目
        try:
            for eacy in fuheti:
                if find5[count] in eacy:
                    fuheti_URL = re.findall(r'<img src="(.*?)".*?>',re.findall(r'.*?<p>(.*?)</p>',eacy,re.S)[0],re.S)
                    fuheti_last = re.sub(r'<.*?>','',re.findall(r'.*?<p>(.*?)</p>',eacy,re.S)[0])
                    fuheti_last = re.sub(r'\xa0\xa0\xa0\xa0\xa0\xa0\xa0','\n',fuheti_last)
                    if fuheti_last not in choose:
                        p = document.add_paragraph()
                        run = p.add_run(fuheti_last)
                        run.font.size = Pt(14)
                        run.font.name=u'宋體'
                        r = run._element
                        r.rPr.rFonts.set(qn('w:eastAsia'),u'宋體')
                        headers  ={
                    'Use-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
                            }
                        for eacu in fuheti_URL:
                            img_data = requests.get(url = eacu,headers = headers).content
                            img_path = dirName+'/'+'tupian'+'.jpg'
                            with open(img_path,'wb') as fp:
                                fp.write(img_data)
                                print("保存成功")
                            document.add_picture(img_path, width=Inches(5))
                        choose.append(fuheti_last)
        except:
            pass
        
        ###匯入單選題題目
        p = document.add_paragraph()
        run = p.add_run(str(count+1)+"、"+timuleixing[count]+find5[count][3:])
        run.font.size = Pt(14)
        run.font.name=u'宋體'
        r = run._element
        r.rPr.rFonts.set(qn('w:eastAsia'),u'宋體')
        url = imgx[count][0]
        headers  ={
            'Use-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
        }
        try:
            img_data = requests.get(url = url,headers = headers).content
            img_path = dirName+'/'+'tupian'+'.jpg'
            with open(img_path,'wb') as fp:
                fp.write(img_data)
                print("保存成功")
            document.add_picture(img_path, width=Inches(5))
            count+=1
        except:
            count+=1
            
        ###匯入選項
        p = document.add_paragraph()
        run = p.add_run(axuanxiang[count-1])
        run.font.size = Pt(14)
        run.font.name=u'宋體'
        r = run._element
        r.rPr.rFonts.set(qn('w:eastAsia'),u'宋體')
        p = document.add_paragraph()
        run = p.add_run(bxuanxiang[count-1])
        run.font.size = Pt(14)
        run.font.name=u'宋體'
        r = run._element
        r.rPr.rFonts.set(qn('w:eastAsia'),u'宋體')
        p = document.add_paragraph()
        run = p.add_run(cxuanxiang[count-1])
        run.font.size = Pt(14)
        run.font.name=u'宋體'
        r = run._element
        r.rPr.rFonts.set(qn('w:eastAsia'),u'宋體')
        p = document.add_paragraph()
        run = p.add_run(dxuanxiang[count-1])
        run.font.size = Pt(14)
        run.font.name=u'宋體'
        r = run._element
        r.rPr.rFonts.set(qn('w:eastAsia'),u'宋體')
        p = document.add_paragraph()
        run = p.add_run("\n")
        run.font.size = Pt(14)
        run.font.name=u'宋體'
        r = run._element
        r.rPr.rFonts.set(qn('w:eastAsia'),u'宋體')
        
    else:
        count+=1

###美化界面
p = document.add_paragraph()
run = p.add_run("\n\n\n\n\n")
run.font.size = Pt(14)
run.font.name=u'宋體'
r = run._element
r.rPr.rFonts.set(qn('w:eastAsia'),u'宋體')

###美化決議
counting = 0
jiexilast2 = []
for ok in jiexilast:
    jiexilast2.append(re.sub(r'\n\t\t','：',ok))
for every in correct:
    if every != yours[counting]:
        ###匯入決議和答案
        p = document.add_paragraph()
        run = p.add_run(str(counting+1)+"、"+"正確答案為："+correct[counting]+"\n"+jiexilast2[counting])
        run.font.size = Pt(14)
        run.font.name=u'宋體'
        r = run._element
        r.rPr.rFonts.set(qn('w:eastAsia'),u'宋體')
        url = imgt[counting][0]
        headers  ={
            'Use-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
        }
        try:
            img_data = requests.get(url = url,headers = headers).content
            img_path = dirName+'/'+'tupian'+'.jpg'
            with open(img_path,'wb') as fp:
                fp.write(img_data)
                print("保存成功")
            document.add_picture(img_path, width=Inches(5))
            print("寫入成功")
            counting+=1
        except:
            counting+=1
    else:
        counting+=1
###保存檔案
document.save(v[0][5:-3]+'.docx')
print(v[0][5:-3]+'保存成功！')

總結

粉筆和華圖錯題爬蟲的主要區別是：華圖獲取資料簡單，但解析操作繁瑣；粉筆的資料比較隱秘，但可以用 json 解析，比較方便。

轉載請註明出處，本文鏈接：https://www.uj5u.com/houduan/246559.html

標籤：python

上一篇：python的Tesseract-OCR-04-識別，使用jTessBoxEditor 提高數字驗證碼識別準確率

下一篇：如何使用python or Shell 進行整點的自動任務監聽運行