用puppeteer抓取字典網站-有解無憂

我正在嘗試抓取一個字典網站（http://rjecnik.hr/），目標是取得所有字母下的所有單詞。目前只完成了一部分：我已經能遍歷各個頁面，但無法實現遍歷每個字母，然後將結果保存到檔案中。我在互聯網上搜索過，但找不到針對我這個問題的解決方案。補充一點，我是編程的初學者，還在學習中，所以可能有我沒看到的簡單解法。以下是代碼；它不是我自己撰寫的，但我能理解每個部分的作用。

// Browser
const puppeteer = require('puppeteer');

// funkcija se odnosi na puppeteer
(async () => {
    /**
     * Scrapes the words from `url` and, recursively, from every following
     * page of the letter "a" until a page without words is reached.
     *
     * Relies on `browser` (the Puppeteer browser launched by the enclosing
     * function) being initialised before the first call.
     *
     * @param {string} url - Page to scrape; must end in `page=<number>`.
     * @returns {Promise<string[]>} Words of this page followed by the words
     *     of all later pages.
     */
    const izvuciRijeci = async (url) => {
        // Scrape the data we want: the trimmed text of every `.word` element.
        const page = await browser.newPage();
        let rijeciNaStranici;
        try {
            await page.goto(url);
            //console.log(`Scraping: ${url}`); // Debugging
            rijeciNaStranici = await page.evaluate(() => Array.from(document.querySelectorAll('.word')).map((rijeci) => rijeci.innerText.trim()));
        } finally {
            // Close the tab even when navigation or evaluation fails, so
            // failed pages do not pile up in the browser.
            await page.close();
        }

        // Stop recursing once a page has no words.
        if (rijeciNaStranici.length < 1) {
            //console.log(`Terminate recursion on: ${url}`); // Debugging
            return rijeciNaStranici;
        }

        // Build the next page URL as "?page=X+1".
        const nextPageNumber = parseInt(url.match(/page=(\d+)$/)[1], 10) + 1;
        // The letter is hard-coded, so only the pages of "a" are ever visited.
        const nextUrl = `http://rjecnik.hr/?letter=a&page=${nextPageNumber}`;

        return rijeciNaStranici.concat(await izvuciRijeci(nextUrl));
    };

    // Launch the browser, scrape every page reachable from the start URL and
    // store the result in rijeci.txt.
    const browser = await puppeteer.launch();
    try {
        const url = "http://rjecnik.hr/?letter=a&page=1";
        const rijec = await izvuciRijeci(url);

        // TODO: update the database with the scraped words.
        console.log(rijec);

        // Save to file: Array#toString joins the words with commas.
        const content = rijec.toString();
        const fs = require('fs');
        try {
            // Await the write so its outcome is known before the browser is closed.
            await fs.promises.writeFile("rijeci.txt", content);
            console.log("File saved");
        } catch (err) {
            console.log(err);
        }
    } finally {
        // Always shut the browser down, even when scraping throws; otherwise
        // the browser process is left running.
        await browser.close();
    }
})();

uj5u.com熱心網友回復：

如果您覺得此解決方案有用且有幫助，請選擇此解決方案作為正確答案。

首先，您不需要在每次加載新 URL 時打開和關閉頁面。當瀏覽器啟動時，您可以簡單地使用已經打開的頁面。

// Opening a fresh tab for every URL is not efficient:
//     const page = await browser.newPage()
// and closing it again afterwards is unnecessary and heavy:
//     await page.close()
// Reuse the first tab the browser already has open instead; it is much lighter.
const [page] = await browser.pages()

然后你需要在一個陣列中列出所有可用的字母：

/**
 * Loads `url` in the browser's first tab and collects the text of every
 * `.alphabet ul > li` element found on that page.
 *
 * @param {string} url - Page to open.
 * @returns {Promise<string[]>} The `innerText` of each matched list item.
 */
const getLettersArray = async (url) => {
    // Reuse the first tab that is already open to keep things light.
    const [page] = await browser.pages()
    await page.goto(url)
    const letters = await page.evaluate(() => {
        const items = document.querySelectorAll('.alphabet ul > li')
        return Array.from(items, (item) => item.innerText)
    })
    return letters
}

然后，要確定目前選取（作用中）的字母，您可以使用下面的正則運算式從 URL 中取出（注意：由于字典使用了一些非英語 QWERTY 字符，我在運算式中加入了 {1,6}）

// Extract the letter from a URL ending in "letter=<letter>&page=<number>".
// The first capture group (1 to 6 characters) is the letter; `\d+` matches
// the page number, which must run up to the end of the URL.
const letterInUse = url.match(/letter=(.{1,6})&page=(\d+)$/)[1] // Get the letter used in the page

我添加了更多方法，因此您可以在下面運行這個完整的功能腳本：

// Browser
const puppeteer = require('puppeteer')
const fs = require('fs')

// funkcija se odnosi na puppeteer
;(async () => {
    /**
     * Loads `url` in the browser's first tab and collects the text of every
     * `.alphabet ul > li` element found on that page.
     *
     * @param {string} url - Page to open.
     * @returns {Promise<string[]>} The `innerText` of each matched list item.
     */
    const getLettersArray = async (url) => {
        // Reuse the first tab that is already open to keep things light.
        const [page] = await browser.pages()
        await page.goto(url)
        const letters = await page.evaluate(() => {
            const items = document.querySelectorAll('.alphabet ul > li')
            return Array.from(items, (item) => item.innerText)
        })
        return letters
    }
    /**
     * Scrapes every word starting at `url`: all result pages of the letter
     * named in the URL, then all pages of each letter that follows it in
     * `allLetters`. A letter is considered finished when a page without
     * words is reached.
     *
     * All navigation happens in the first tab of the outer `browser`.
     *
     * @param {string} url - Start page; must end in
     *     `letter=<letter>&page=<number>`.
     * @param {string[]} allLetters - Letters in the order they are visited.
     *     If the URL's letter is not in this list, only that letter is scraped.
     * @returns {Promise<string[]>} All scraped words, in visiting order.
     * @throws {Error} If `url` does not have the expected shape.
     */
    const izvuciRijeci = async (url, allLetters) => {
        const urlParts = url.match(/letter=(.{1,6})&page=(\d+)$/)
        if (urlParts === null) {
            throw new Error(`Unexpected URL format: ${url}`)
        }

        const page = (await browser.pages())[0] // Use the first page already opened, to keep it light
        // Compare letters ignoring case and surrounding whitespace.
        const normalize = (value) => value.trim().toUpperCase()

        let letter = urlParts[1]
        let pageNumber = parseInt(urlParts[2], 10)
        let letterIndex = allLetters.findIndex((value) => normalize(value) === normalize(letter))
        let target = url
        const words = []

        // Iterate instead of recursing: one pass per result page.
        while (true) {
            await page.goto(target)
            //console.log(`Scraping: ${target}`); // Debugging
            const rijeciNaStranici = await page.evaluate(() => Array.from(document.querySelectorAll('.word')).map((rijeci) => rijeci.innerText.trim())) // Getting the words from a page.

            if (rijeciNaStranici.length > 0) {
                // Page has words: keep them and move on to "?page=X+1".
                words.push(...rijeciNaStranici)
                pageNumber += 1
            } else {
                // No more words for this letter. Stop after the last letter
                // (or an unknown one) instead of requesting "letter=undefined"
                // or starting over from the first letter.
                const isLastLetter = letterIndex === -1 || letterIndex + 1 >= allLetters.length
                if (isLastLetter) {
                    break
                }
                // An empty letter is skipped rather than ending the whole crawl.
                letterIndex += 1
                letter = allLetters[letterIndex]
                pageNumber = 1
            }
            target = `http://rjecnik.hr/?letter=${letter}&page=${pageNumber}`
        }

        return words
    }

    // Launch the browser, read the list of letters, scrape all words starting
    // from the first page of "a", and store the result in rijeci.txt.
    const browser = await puppeteer.launch({headless: true})
    try {
        const url = "http://rjecnik.hr/?letter=a&page=1"
        const allLetters = await getLettersArray(url)
        const rijec = await izvuciRijeci(url, allLetters)

        // TODO: update the database with the scraped words.
        console.log(rijec)

        // Save to file: Array#toString joins the words with commas.
        const content = rijec.toString()
        try {
            // Await the write so its outcome is known before the browser is closed.
            await fs.promises.writeFile('rijeci.txt', content)
            console.log('File saved')
        } catch (error) {
            console.log(error)
        }
    } finally {
        // Always shut the browser down, even when scraping throws; otherwise
        // the browser process is left running.
        await browser.close()
    }
})()

轉載請註明出處，本文鏈接：https://www.uj5u.com/shujuku/433097.html

標籤：javascript 网页抓取 dom puppeteer

上一篇：我提交的表格沒有顯示在瀏覽器上

下一篇：我想在另一個音頻開始時暫停當前播放的音頻