無法實作相關邏輯：如何使用 Puppeteer 從最內層的頁面抓取內容 - 有解無憂

我使用 Puppeteer 寫了一個腳本，從網頁中抓取不同作者的連結，並透過點擊「下一頁」按鈕逐頁瀏覽多個頁面。該腳本看起來運作正常。

雖然這個網站的內容是靜態的，但我刻意在下面的腳本中使用 Puppeteer，只是為了學習如何解析內層頁面的內容。

現在我希望再深入一層，從各作者的頁面中抓取 description（簡介）。我該如何實現這個目標？

// Puppeteer drives the browser used by the scraper below.
const puppeteer = require('puppeteer');

/**
 * Scrapes quotes from https://quotes.toscrape.com, following the "Next"
 * pagination link until `pagesToScrape` listing pages have been read or the
 * site has no further page.
 *
 * @param {number} [pagesToScrape=1] - Maximum number of listing pages to read.
 *   Any falsy value (undefined, null, 0) falls back to a single page.
 * @returns {Promise<Array<{authorUrl: string, title: string}>>} One entry per
 *   quote: the absolute URL of the author's "about" page and the quote text.
 * @throws Rejects with whatever error Puppeteer raises (launch, navigation,
 *   selector timeout); the browser is closed before the rejection propagates.
 */
async function run(pagesToScrape) {
    const pageLimit = pagesToScrape || 1;
    const siteUrl = 'https://quotes.toscrape.com/';
    const browser = await puppeteer.launch({ headless: false });

    try {
        const [page] = await browser.pages();
        await page.goto(siteUrl);

        const urls = [];
        for (let currentPage = 1; currentPage <= pageLimit; currentPage++) {
            await page.waitForSelector('.quote');

            // The callback is serialized and executed inside the browser page,
            // so it cannot close over Node-side variables; the base URL is
            // passed in as an explicit argument instead.
            const pageQuotes = await page.evaluate(
                (baseUrl) => Array.from(
                    document.querySelectorAll('.quote'),
                    (item) => ({
                        // The "(about)" link is the sibling right after the author name.
                        authorUrl: new URL(
                            item.querySelector('small.author + a').getAttribute('href'),
                            baseUrl,
                        ).href,
                        title: item.querySelector('span.text').innerText,
                    }),
                ),
                siteUrl,
            );
            urls.push(...pageQuotes);

            if (currentPage === pageLimit) {
                break;
            }

            // Stop cleanly on the last page instead of timing out while
            // waiting for a "Next" link that will never appear.
            const nextLink = await page.$('li.next > a');
            if (nextLink === null) {
                break;
            }

            // Register the navigation wait before clicking so the page load
            // triggered by the click cannot be missed.
            await Promise.all([
                page.waitForNavigation(),
                nextLink.click(),
            ]);
        }

        return urls;
    } finally {
        // Always release the browser, including when scraping fails.
        await browser.close();
    }
}
// Scrape the first three listing pages and print the collected entries.
run(3).then(console.log).catch(console.error);

uj5u.com熱心網友回復：

我會這樣做：

const puppeteer = require('puppeteer');

// Kept outside main() so the final cleanup handler can close the browser
// even when main() fails part-way through.
let browser;

/**
 * Walks the first `pagesToScrape` listing pages of quotes.toscrape.com in one
 * tab and, for every author link not seen before, opens the author's "about"
 * page in a second tab. Prints the collected data as JSON:
 *   - quotes: author name -> array of quote block texts
 *   - abouts: author title -> text of the author-details block
 */
(async function main() {
  browser = await puppeteer.launch({ headless: false, defaultViewport: null });

  const [pageQuotes] = await browser.pages();
  const pageAbout = await browser.newPage();
  await pageQuotes.bringToFront(); // Otherwise, clicking the next-page link does not work.

  const pagesToScrape = 3;

  await pageQuotes.goto('https://quotes.toscrape.com/');
  let currentPage = 1;

  const data = { quotes: {}, abouts: {} };
  const visitedAbouts = new Set();

  while (currentPage <= pagesToScrape) {
    await pageQuotes.waitForSelector('.quote');

    // Runs inside the listing page: collect [author, quote text] pairs and
    // the absolute URLs of the "(about)" links next to each author name.
    const { quotes, aboutURLs } = await pageQuotes.evaluate(() => ({
      quotes: Array.from(
        document.querySelectorAll('.quote'),
        quote => [quote.querySelector('small.author').innerText, quote.innerText],
      ),
      aboutURLs: Array.from(
        document.querySelectorAll('.quote small.author + a[href]'),
        quote => quote.href,
      ),
    }));

    for (const [author, quote] of quotes) {
      if (data.quotes[author] === undefined) data.quotes[author] = [];
      data.quotes[author].push(quote);
    }

    for (const aboutURL of aboutURLs) {
      // Each author page is visited only once, however many quotes link to it.
      if (!visitedAbouts.has(aboutURL)) {
        visitedAbouts.add(aboutURL);

        await pageAbout.goto(aboutURL);
        await pageAbout.waitForSelector('div.author-details');

        const { title, about } = await pageAbout.evaluate(() => ({
          title: document.querySelector('div.author-details h3.author-title').innerText,
          about: document.querySelector('div.author-details').innerText,
        }));

        data.abouts[title] = about;
      }
    }

    if (currentPage < pagesToScrape) {
      const nextLink = await pageQuotes.waitForSelector('li.next > a');

      // Start waiting for the navigation together with the click so the
      // page load it triggers cannot be missed.
      await Promise.all([
        nextLink.click(),
        pageQuotes.waitForNavigation(),
      ]);
    }
    currentPage++;
  }

  console.log(JSON.stringify(data, null, ' '));
})().catch(console.error).finally(async () => { if (browser) await browser.close(); });

轉載請註明出處，本文鏈接：https://www.uj5u.com/yidong/331298.html

標籤：

上一篇：在用Rselenium進行網路搜索并保存為資料框架時創建"for"回圈

下一篇：x86_64匯編中的分段故障