我想建立一個 Scrapy 蜘蛛(spider),從某個網站下載一些 JSON 檔案。
以下是我的 Scrapy 蜘蛛。我先測試過它:當時只讓它輸出各個 JSON 檔案的鏈接(請參閱下面被注釋掉的代碼),這部分運作正常。但我現在想把這些 JSON 檔案下載到我電腦上的一個檔案夾中。
import scrapy
# Spider from the question, kept exactly as posted (its indentation was lost
# when the page was extracted, so it will not run as shown).
# It starts at the kaikki.org Spanish word index, follows two levels of links
# and tries to hand the JSON download link of each final page to a pipeline.
class spiderWords(scrapy.Spider):
name = 'spiderWords'
allowed_domains = ['kaikki.org']
start_urls = ['https://kaikki.org/dictionary/Spanish/words.html']
# Callback for the start URL: take every link in the second <ul> of the page
# and request it, with parseDetails as the callback.
def parse(self, response):
tmpLinks = response.xpath("(//ul)[2]/li/a/@href").getall()
for l in tmpLinks:
# hrefs may be relative; urljoin resolves them against the page URL.
l = response.urljoin(l)
request = scrapy.Request(l,
callback=self.parseDetails)
yield request
# Callback for the pages requested by parse: same link extraction again,
# this time with parseDownload as the callback.
def parseDetails(self, response):
tmpLinks2 = response.xpath("(//ul)[2]/li/a/@href").getall()
for l2 in tmpLinks2:
l2 = response.urljoin(l2)
request = scrapy.Request(l2,
callback=self.parseDownload)
yield request
# Callback for the final pages: pick the first link inside a <p> whose text
# contains 'JSON' and yield it in an item.
def parseDownload(self, response):
class DownfilesItem(scrapy.Item):
file_urls = scrapy.Field()
# NOTE(review): `scrapy.Field` is not called here, so `files` is bound to the
# Field class itself rather than a Field instance; the answer's version of
# this code uses `scrapy.Field()`.
files = scrapy.Field
tmpDownloadLink = response.xpath("//p[contains(text(), 'JSON')]/a/@href").get()
tmpDownloadLink = response.urljoin(tmpDownloadLink)
item = DownfilesItem()
# NOTE(review): a single string is stored here; per the answer below,
# 'file_urls' should be a list of URLs.
item['file_urls'] = tmpDownloadLink
yield item
# Earlier test variant that only emitted the link instead of an item:
# yield {
# "link": tmpDownloadLink,
# }
這是我在 settings.py 中所做的更改:
# Enable Scrapy's built-in FilesPipeline; the integer is its order value
# among the enabled item pipelines.
ITEM_PIPELINES = {
'scrapy.pipelines.files.FilesPipeline': 1,
}
# Target folder for the downloads (raw string so the backslashes stay literal).
# NOTE(review): this is the setting name as posted in the question; per the
# answer below, FilesPipeline needs FILES_STORE rather than IMAGES_STORE.
IMAGES_STORE = r'C:\DOWNLOAD\DATASETS\Spanish'
但不幸的是,json 檔案的下載不起作用。
如何將 json 檔案下載到定義的檔案夾?
uj5u.com熱心網友回復:
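你的代碼有兩個問題:
1. item['file_urls'] 應該是一個串列(list),而不是單一字串,也就是 item['file_urls'] = [tmpDownloadLink]。2. settings.py 中的 IMAGES_STORE 應該改為 FILES_STORE。另外,下面的代碼也把 files = scrapy.Field 改成了 files = scrapy.Field()(原本少了括號)。修正後的蜘蛛如下: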
import scrapy
class DownfilesItem(scrapy.Item):
    """Item handed to Scrapy's ``FilesPipeline``.

    ``file_urls`` carries the list of URLs to download; ``files`` is the
    field the pipeline uses to report the download results.
    """

    file_urls = scrapy.Field()
    files = scrapy.Field()


class spiderWords(scrapy.Spider):
    """Download the JSON files linked from the kaikki.org Spanish word index.

    The crawl has three steps: ``parse`` follows the links on the start page,
    ``parseDetails`` follows the links on each of those pages, and
    ``parseDownload`` yields a ``DownfilesItem`` with the JSON download URL
    found on each final page.

    The files are only saved when ``scrapy.pipelines.files.FilesPipeline`` is
    enabled in ``ITEM_PIPELINES`` and ``FILES_STORE`` points at the target
    folder.
    """

    name = 'spiderWords'
    allowed_domains = ['kaikki.org']
    start_urls = ['https://kaikki.org/dictionary/Spanish/words.html']

    # Both link-following steps read their links from the second <ul> on the page.
    _INDEX_LINKS_XPATH = "(//ul)[2]/li/a/@href"
    # The download link sits inside a <p> whose text mentions 'JSON'.
    _JSON_LINK_XPATH = "//p[contains(text(), 'JSON')]/a/@href"

    def parse(self, response):
        """Request every page linked from the start page's second ``<ul>``.

        Yields:
            scrapy.Request: one request per link, handled by ``parseDetails``.
        """
        for href in response.xpath(self._INDEX_LINKS_XPATH).getall():
            # hrefs may be relative; resolve them against the current page.
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parseDetails)

    def parseDetails(self, response):
        """Request every page linked from this page's second ``<ul>``.

        Yields:
            scrapy.Request: one request per link, handled by ``parseDownload``.
        """
        for href in response.xpath(self._INDEX_LINKS_XPATH).getall():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parseDownload)

    def parseDownload(self, response):
        """Yield an item holding the JSON download URL found on this page.

        Yields:
            DownfilesItem: ``file_urls`` is a one-element list with the
            absolute URL of the JSON file. Nothing is yielded when the page
            has no matching link.
        """
        href = response.xpath(self._JSON_LINK_XPATH).get()
        if href is None:
            # urljoin(None) would return the page's own URL, so the pipeline
            # would end up downloading the HTML page instead of a JSON file.
            self.logger.warning("No JSON download link found on %s",
                                response.url)
            return

        json_url = response.urljoin(href)
        item = DownfilesItem()
        # FilesPipeline expects a list of URLs, even for a single file.
        item['file_urls'] = [json_url]
        yield item
        # To check the crawl without downloading anything, yield
        # {"link": json_url} instead of the item.
轉載請註明出處,本文鏈接:https://www.uj5u.com/net/388006.html
