如何使用Scrapy將抓取的資料匯出為可讀的json-有解無憂

根據如何使用 Scrapy 將抓取的資料匯出為可讀的 json

所需的輸出格式是：

{"content": "text", "scrape_date": "36456456456"}
{"content": "text", "scrape_date": "56445435435"}

我的 spider.py：

import scrapy
import time
import json
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse


# Maps a crawled domain (with any "www." removed) to the JSON file that the
# items scraped from that domain are written to.
DICT = {
    'quotes.toscrape.com': 'domain1.json',
    'stadt-koeln.de': 'domain2.json',
}


class PagingIncremental(CrawlSpider):
    """Follow the links found on the start pages and yield one item per page.

    Every item holds the paragraph texts of the page (``content``), the Unix
    timestamp of the scrape (``scrape_date``) and the output file chosen from
    ``DICT`` by the page's domain (``filename``).
    """

    name = "my_spider"

    allowed_domains = ['quotes.toscrape.com', 'stadt-koeln.de']

    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://www.stadt-koeln.de/leben-in-koeln/planen-bauen/bebauungsplaene/aufstellen-eines-bauleitplanes'
    ]

    # Native numbers/booleans instead of strings: Scrapy coerces both forms,
    # but real types cannot silently turn into an unintended value.
    custom_settings = {
        'DOWNLOAD_DELAY': 0,
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': 1,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 3,
    }
    # Visit all found sublinks
    # NOTE(review): Scrapy's CrawlSpider documentation advises against using
    # `parse` as a rule callback; the name is kept so the public method name
    # does not change -- confirm it behaves as intended on your Scrapy version.
    rules = (
        Rule(LinkExtractor(allow=r""), callback='parse', follow=False),
    )

    @staticmethod
    def _output_file(host):
        """Return the DICT file for *host* (or a subdomain of a key), else None."""
        for domain, filename in DICT.items():
            # "www.example.com" and "sub.example.com" both belong to
            # "example.com"; the leading dot avoids matching "notexample.com".
            if host == domain or host.endswith("." + domain):
                return filename
        return None

    def parse(self, response):
        """Yield the scraped item for *response*, or nothing if its domain is unmapped."""
        # hostname is lower-cased and carries no port, unlike netloc
        host = urlparse(response.url).hostname or ""

        # all sublinks of one domain are stored in the same output file
        filename = self._output_file(host)
        if filename is None:
            # allowed_domains also admits subdomains, so an unmapped host must
            # not abort the callback with a KeyError
            self.logger.warning(
                "No output file configured for %s; skipping %s", host, response.url
            )
            return

        yield {
            "filename": filename,
            "content": response.xpath("//p/text()").getall(),
            "scrape_date": int(time.time()),
        }


if __name__ == "__main__":
    # Local import: only needed when the spider is launched as a script.
    from scrapy.utils.project import get_project_settings

    # Load the project's settings.py (when run inside a Scrapy project) so the
    # ITEM_PIPELINES entry is honoured; an empty settings dict would leave the
    # pipeline disabled and no output file would be written.
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(PagingIncremental)
    # Blocks until the crawl has finished.
    process.start()

我的 pipelines.py：

from scrapy.exporters import JsonItemExporter

class SaveJsonPipeline:
    """Item pipeline that appends every item to the file named inside the item."""

    def process_item(self, item, spider):
        """Append *item* to its target file and return it.

        The ``filename`` key is removed from the item before exporting, so it
        does not show up in the written output.
        """
        filename = item.pop('filename')

        # The exporter writes bytes, so the file is opened in binary append
        # mode; the context manager closes the handle after each item instead
        # of leaving one open handle behind per scraped page.
        with open(filename, "ab") as output:
            JsonItemExporter(output).export_item(item)

        return item

我的settings.py：

# Enables SaveJsonPipeline for the project. Replace <project_name> with the
# name of the Scrapy project package; the number orders this pipeline relative
# to other enabled pipelines (Scrapy runs lower values first).
ITEM_PIPELINES = {
   '<project_name>.pipelines.SaveJsonPipeline': 300,
}

如果我改用 a 而不是 ab，以非二進位（文字）模式匯出資料，Scrapy 會在 pipelines.py 報出以下錯誤：

 JsonItemExporter(open(filename, "a")).export_item(item)
  File "c:\python\lib\site-packages\scrapy\exporters.py", line 135, in export_item
    self.file.write(to_bytes(data, self.encoding))
TypeError: write() argument must be str, not bytes

任何想法和解決方案都會被獎勵！

uj5u.com熱心網友回復：

您應該使用 JsonLinesItemExporter 而不是 JsonItemExporter，這樣每個項目（item）才會各自輸出在單獨的一行。

另外，不必為 bytes 費心——文件中提到檔案必須以 bytes 模式開啟。

在pandas.read_json()中，您可以使用選項lines=True來讀取JSONL（多行 JSON）：

df = pd.read_json('domain1.json', lines=True)

完整的可運行程式碼。

所有程式碼都放在同一個檔案中，因此任何人都可以直接複製並測試它。

我使用 '__main__.SaveJsonPipeline' 從當前檔案載入該類別。

我還添加了程式碼，用來去除 content 中的空白，並將其合併成一個字串：

" ".join([x.strip() for x in response.xpath("//p/text()").getall()]).strip()

import time
import scrapy
#import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse
from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter


class SaveJsonPipeline:
    """Item pipeline that appends every item as one JSON line to its file."""

    def process_item(self, item, spider):
        """Append *item* to the file named by its ``filename`` key and return it.

        The ``filename`` key is removed from the item before exporting, so it
        does not show up in the written output.
        """
        filename = item.pop('filename')

        # The exporter writes bytes, so the file is opened in binary append
        # mode; the context manager closes the handle after each item instead
        # of leaving one open handle behind per scraped page.
        with open(filename, "ab") as output:
            JsonLinesItemExporter(output).export_item(item)

        return item


# Maps a crawled domain (with any "www." removed) to the JSON file that the
# items scraped from that domain are appended to.
DICT = {
    'quotes.toscrape.com': 'domain1.json',
    'stadt-koeln.de': 'domain2.json',
}


class PagingIncremental(CrawlSpider):
    """Follow the links found on the start pages and yield one item per page.

    Every item holds the page's paragraph texts joined into one string
    (``content``), the Unix timestamp of the scrape (``scrape_date``) and the
    output file chosen from ``DICT`` by the page's domain (``filename``).
    """

    name = "my_spider"

    allowed_domains = ['quotes.toscrape.com', 'stadt-koeln.de']

    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://www.stadt-koeln.de/leben-in-koeln/planen-bauen/bebauungsplaene/aufstellen-eines-bauleitplanes'
    ]

    # Native numbers/booleans instead of strings: Scrapy coerces both forms,
    # but real types cannot silently turn into an unintended value.
    custom_settings = {
        'DOWNLOAD_DELAY': 0,
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': 1,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 3,
    }
    # Visit all found sublinks
    # NOTE(review): Scrapy's CrawlSpider documentation advises against using
    # `parse` as a rule callback; the name is kept so the public method name
    # does not change -- confirm it behaves as intended on your Scrapy version.
    rules = (
        Rule(LinkExtractor(allow=r""), callback='parse', follow=False),
    )

    @staticmethod
    def _output_file(host):
        """Return the DICT file for *host* (or a subdomain of a key), else None."""
        for domain, filename in DICT.items():
            # "www.example.com" and "sub.example.com" both belong to
            # "example.com"; the leading dot avoids matching "notexample.com".
            if host == domain or host.endswith("." + domain):
                return filename
        return None

    def parse(self, response):
        """Yield the scraped item for *response*, or nothing if its domain is unmapped."""
        # hostname is lower-cased and carries no port, unlike netloc
        host = urlparse(response.url).hostname or ""

        # all sublinks of one domain are stored in the same output file
        filename = self._output_file(host)
        if filename is None:
            # allowed_domains also admits subdomains, so an unmapped host must
            # not abort the callback with a KeyError
            self.logger.warning(
                "No output file configured for %s; skipping %s", host, response.url
            )
            return

        # strip every paragraph fragment and merge them into a single string
        paragraphs = response.xpath("//p/text()").getall()
        yield {
            "filename": filename,
            "content": " ".join(text.strip() for text in paragraphs).strip(),
            "scrape_date": int(time.time()),
        }


if __name__ == "__main__":
    # Settings for the in-script crawl. The pipeline class lives in this very
    # file, hence the '__main__.' prefix in its import path.
    crawl_settings = {
        'USER_AGENT': 'Mozilla/5.0',
        'ITEM_PIPELINES': {'__main__.SaveJsonPipeline': 1},
    }

    runner = CrawlerProcess(crawl_settings)
    runner.crawl(PagingIncremental)
    runner.start()

    # Read the line-delimited JSON output back to check that it is loadable.
    import pandas as pd
    frame = pd.read_json('domain1.json', lines=True)
    print(frame.head())

轉載請註明出處，本文鏈接：https://www.uj5u.com/net/455141.html

標籤：Python json pandas 網頁抓取 scrapy

上一篇：如何使用python從網頁中抓取視頻URL？

下一篇：如何使用XPath在兩個條件下進行選擇