根據
所需的輸出格式是:
{"content": "text", "scrape_date": "36456456456"}
{"content": "text", "scrape_date": "56445435435"}
我的 spider.py:
import scrapy
import time
import json
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse
# Maps each crawled domain (without a leading "www.") to the output file
# that collects the items scraped from that domain.
DICT = {
    "quotes.toscrape.com": "domain1.json",
    "stadt-koeln.de": "domain2.json",
}
class PagingIncremental(CrawlSpider):
    """Follow the links found on each start URL and yield one item per page.

    Each item holds the paragraph texts of the page (``content``), the scrape
    time as a Unix timestamp (``scrape_date``) and the output file the item
    belongs to (``filename``, looked up in ``DICT`` by the page's domain).
    """

    name = "my_spider"
    allowed_domains = ['quotes.toscrape.com', 'stadt-koeln.de']
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://www.stadt-koeln.de/leben-in-koeln/planen-bauen/bebauungsplaene/aufstellen-eines-bauleitplanes',
    ]
    # Numeric and boolean settings use native types instead of strings.
    custom_settings = {
        'DOWNLOAD_DELAY': 0,
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': 1,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 3,
    }
    # Visit all found sublinks.
    # The callback is deliberately not named ``parse``: the Scrapy docs warn
    # against using ``parse`` as a CrawlSpider rule callback because the
    # CrawlSpider relies on that method for its own logic.
    rules = (
        Rule(LinkExtractor(allow=r""), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Yield one item for *response*, tagged with its output file.

        Pages whose host matches no entry in ``DICT`` are skipped with a
        warning instead of raising ``KeyError``.
        """
        # ``hostname`` is lower-cased and carries no port, unlike ``netloc``.
        host = urlparse(response.url).hostname or ""
        # Match the registered domain itself or any subdomain of it
        # ("www." included), so all sublinks of one domain end up in the
        # same output file.
        filename = next(
            (
                target
                for known, target in DICT.items()
                if host == known or host.endswith("." + known)
            ),
            None,
        )
        if filename is None:
            self.logger.warning(
                "No output file configured for %s", response.url
            )
            return
        yield {
            "filename": filename,
            "content": response.xpath("//p/text()").getall(),
            "scrape_date": int(time.time()),
        }
if __name__ == "__main__":
    # Imported here because only the script entry point needs it.
    from scrapy.utils.project import get_project_settings

    # An empty settings dict would ignore the project's settings.py, and with
    # it the ITEM_PIPELINES entry that enables SaveJsonPipeline. Loading the
    # project settings keeps the pipeline active when run as a script.
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(PagingIncremental)
    # Starts the crawl; per the Scrapy docs this call blocks until the
    # crawling is finished.
    process.start()
我的pipelines.py:
from scrapy.exporters import JsonItemExporter
class SaveJsonPipeline:
    """Item pipeline that appends every item to the file named by the item."""

    def process_item(self, item, spider):
        """Export *item* to its target file and hand it to the next pipeline.

        The ``filename`` key is removed from the item first, so it is not
        written to the output file.

        Raises:
            KeyError: if the item has no ``filename`` key.
        """
        filename = item.pop('filename')
        # Binary append mode: existing data is kept, and the exporter writes
        # bytes, so a text-mode file would fail with a TypeError.
        # The ``with`` block closes the file after each item instead of
        # leaking one open handle per item.
        with open(filename, "ab") as output:
            JsonItemExporter(output).export_item(item)
        return item
我的settings.py:
ITEM_PIPELINES = {
'<project_name>.pipelines.SaveJsonPipeline': 300,
}
如果我在 pipelines.py 中改用文字模式 a(而不是二進位模式 ab)來匯出資料,Scrapy 會報錯:
JsonItemExporter(open(filename, "a")).export_item(item)
File "c:\python\lib\site-packages\scrapy\exporters.py", line 135, in export_item
self.file.write(to_bytes(data, self.encoding))
TypeError: write() argument must be str, not bytes
任何想法和解決方案都會被獎勵!
uj5u.com熱心網友回復:
您應該使用 JsonLinesItemExporter 而不是 JsonItemExporter,這樣每個 item 都會寫在單獨的一行。
另外不必為 bytes 煩惱,因為文件中提到檔案必須以 bytes 模式開啟。
在pandas.read_json()中,您可以使用選項lines=True來讀取JSONL(多行 JSON):
df = pd.read_json('domain1.json', lines=True)
完整可執行的程式碼。
所有代碼都在一個檔案中,因此每個人都可以直接複製並測試它。
我使用 '__main__.SaveJsonPipeline' 從當前檔案載入該類別。
我還添加了代碼,去除 content 中每段文字首尾的空白,並將它們合併成一個字串:
" ".join([x.strip() for x in response.xpath("//p/text()").getall()]).strip()
import time
import scrapy
#import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse
from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter
class SaveJsonPipeline:
    """Item pipeline that appends every item as one JSON line to its file."""

    def process_item(self, item, spider):
        """Export *item* to its target file and hand it to the next pipeline.

        The ``filename`` key is removed from the item first, so it is not
        written to the output file.

        Raises:
            KeyError: if the item has no ``filename`` key.
        """
        filename = item.pop('filename')
        # Binary append mode: existing data is kept, and the exporter needs
        # a file opened in bytes mode.
        # The ``with`` block closes the file after each item instead of
        # leaking one open handle per item.
        with open(filename, "ab") as output:
            JsonLinesItemExporter(output).export_item(item)
        return item
# Maps each crawled domain (without a leading "www.") to the output file
# that collects the items scraped from that domain.
DICT = {
    "quotes.toscrape.com": "domain1.json",
    "stadt-koeln.de": "domain2.json",
}
class PagingIncremental(CrawlSpider):
    """Follow the links found on each start URL and yield one item per page.

    Each item holds the page's paragraph text joined into one string
    (``content``), the scrape time as a Unix timestamp (``scrape_date``) and
    the output file the item belongs to (``filename``, looked up in ``DICT``
    by the page's domain).
    """

    name = "my_spider"
    allowed_domains = ['quotes.toscrape.com', 'stadt-koeln.de']
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://www.stadt-koeln.de/leben-in-koeln/planen-bauen/bebauungsplaene/aufstellen-eines-bauleitplanes',
    ]
    # Numeric and boolean settings use native types instead of strings.
    custom_settings = {
        'DOWNLOAD_DELAY': 0,
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': 1,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 3,
    }
    # Visit all found sublinks.
    # The callback is deliberately not named ``parse``: the Scrapy docs warn
    # against using ``parse`` as a CrawlSpider rule callback because the
    # CrawlSpider relies on that method for its own logic.
    rules = (
        Rule(LinkExtractor(allow=r""), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Yield one item for *response*, tagged with its output file.

        Pages whose host matches no entry in ``DICT`` are skipped with a
        warning instead of raising ``KeyError``.
        """
        # ``hostname`` is lower-cased and carries no port, unlike ``netloc``.
        host = urlparse(response.url).hostname or ""
        # Match the registered domain itself or any subdomain of it
        # ("www." included), so all sublinks of one domain end up in the
        # same output file.
        filename = next(
            (
                target
                for known, target in DICT.items()
                if host == known or host.endswith("." + known)
            ),
            None,
        )
        if filename is None:
            self.logger.warning(
                "No output file configured for %s", response.url
            )
            return
        # Strip each text node, then join them into a single string.
        paragraphs = response.xpath("//p/text()").getall()
        content = " ".join(text.strip() for text in paragraphs).strip()
        yield {
            "filename": filename,
            "content": content,
            "scrape_date": int(time.time()),
        }
if __name__ == "__main__":
    crawler_settings = {
        'USER_AGENT': 'Mozilla/5.0',
        # The pipeline class is defined in this very file, so it is
        # addressed through the `__main__` module.
        'ITEM_PIPELINES': {'__main__.SaveJsonPipeline': 1},
    }
    process = CrawlerProcess(crawler_settings)
    process.crawl(PagingIncremental)
    process.start()
import pandas as pd
# Read the exported file back as JSON Lines (one JSON object per line, hence
# lines=True) and print the first rows as a quick check of the output.
df = pd.read_json('domain1.json', lines=True)
print(df.head())
轉載請註明出處,本文鏈接:https://www.uj5u.com/net/455141.html
