記錄一下第二個爬取的試題網站，和上傳爬取的問答庫不同，這個網站連登錄都不需要，撿到寶了，雖然大部分的問卷庫都沒有反爬機制，但是像這樣不收費的網站屬實罕見，
這次爬取的是螞蟻學習網中的高中語文題庫，
先把原始碼給上：

import json
from bs4 import BeautifulSoup
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait  # 參考設定顯示等待時間


def init():
    # 定義為全域變數，方便其他模塊使用
    global url, browser, username, password, wait
    # 登錄界面的url
    url = 'https://k12.mayi173.com/exam/list/3-1-0-1.html'
    # 實體化一個chrome瀏覽器
    browser = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
    # 用戶名
    username = ""
    # 密碼
    password = ""
    # 設定等待超時
    wait = WebDriverWait(browser, 20)


def get_item_info(url1):
    browser.get(url1)
    soup = BeautifulSoup(browser.page_source.encode('utf-8'), 'lxml')
    question_type_list = soup.select('body > div.i-page > div.i-content.g-clear > div.i-left > div.i-panel.p-panel > '
                                     'div.header > label:nth-child(2)')
    question_type = question_type_list[0].text
    question_list = soup.select(
        'body > div.i-page > div.i-content.g-clear > div.i-left > div.i-panel.p-panel > '
        'div.content ')
    question = question_list[0].text
    answer_list = soup.select('#i-tab-content > div')
    answer = answer_list[0].text
    jiexi_list = soup.select(
        'body > div.i-page > div.i-content.g-clear > div.i-left > div:nth-child(3) > div:nth-child(2) > div')
    jiexi = jiexi_list[0].text
    data = https://www.cnblogs.com/lychan/p/{'question_type': question_type,
        'question': question,
        'answer': answer,
        'jiexi': jiexi
    }
    print(data)
    with open('mayi.json', 'a', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def get_all_link():
    for page in range(1, 50):
        if page == 50:
            break****
        if page == 1:
            browser.get('https://k12.mayi173.com/exam/list/3-1-0-1.html')
            sleep(5)
        else:
            browser.get('https://k12.mayi173.com/exam/list/3-1-0-' + str(page) + '.html')
            sleep(5)
        soup = BeautifulSoup(browser.page_source.encode('utf-8'), 'lxml')
        hrefs_list = soup.select(
            'body > div.i-page > div.i-content.g-clear > div.i-right.p-right > div.p-results > div > div.footer > a')
        for href in hrefs_list:
            link = href.get('href')
            url1 = 'https://k12.mayi173.com/' + link
            get_item_info(url1)


def main():
    # 初始化
    init()
    sleep(15)
    get_all_link()


if __name__ == '__main__':
    main()

下面簡單說一下爬取思路，其實與上一篇爬取問答庫如出一轍，還更為簡單

匯入第三方庫

import json
from bs4 import BeautifulSoup
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait  # 參考設定顯示等待時間

初始化 chromedriver

def init():
    # 定義為全域變數，方便其他模塊使用
    global url, browser, username, password, wait
    # 登錄界面的url
    url = 'https://k12.mayi173.com/exam/list/3-1-0-1.html'
    # 實體化一個chrome瀏覽器
    browser = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
    # 用戶名
    username = ""
    # 密碼
    password = ""
    # 設定等待超時
    wait = WebDriverWait(browser, 20)

查看頁面源代碼，以決議為例，復制選擇器（selector）

將試題型別、試題內容、參考答案以及試題決議爬取出來，存到字典中，
回到高中語文界面，爬取各網頁的鏈接，并把等到的鏈接回傳給get_item_info()函式
在主函式中呼叫各函式，進行測驗，將結果存到mayi.json檔案中，

這個頁面的爬取較為簡單，應該不會遇到什么困難，歡迎評論交流學習，

轉載請註明出處，本文鏈接：https://www.uj5u.com/houduan/88117.html

標籤：Python

上一篇：基于selenium的簡單爬蟲——爬取問答庫試題

下一篇：Python三方庫：Numpy（陣列處理）

基于selenium的簡單爬蟲——爬取螞蟻學習網

下面簡單說一下爬取思路，其實與上一篇爬取問答庫如出一轍，還更為簡單