我有這個腳本:
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
if os.path.exists('jfs_hombre.csv'):
os.remove('jfs_hombre.csv')
print("The file has been deleted successfully")
else:
print("The file does not exist!")
class JfsSpider_hombre(scrapy.Spider):
name = 'jfs_hombre'
#start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}
def start_requests(self):
yield scrapy.Request(
url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables={}&extensions={"persistedQuery":{"version":1,"sha256Hash":"6869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5","sender":"[email protected]","provider":"[email protected]"},"variables":"eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9"}',
callback=self.parse,
method="GET"
)
def parse(self, response):
resp = response.json()
#print(resp)
for item in range(0,576,32):
resp['recordsFiltered']=item
for result in resp['data']['productSearch']['products']:
yield {
'Casa':'Just_For_Sports',
'Sku' :result['productReference'],
'Name': result['productName'],
'precio': result['priceRange']['sellingPrice']['highPrice'],
'Link': 'https://www.justforsport.com.ar' result['link'],
'Date':datetime.today().strftime('%Y-%m-%d')
}
if __name__ == "__main__":
process =CrawlerProcess()
process.crawl(JfsSpider_hombre)
process.start()
它作業正常并獲得 576 行,但問題是它們是重復的。當我洗掉重復的資料時,我只得到 32 個唯一值,我認為我只從一頁(每頁 32 個產品)中獲取值我如何迭代所有我認為與該行有關的元素:
for item in range(0,576,32):
提前致謝
uj5u.com熱心網友回復:
您使用'Casa':'Just_For_Sports',的是不正確的,result['Just_For_Sports']但最重要的是您從哪里獲得"Just_For_Sports". 我沒有在產品串列中找到它。實際上,您不能將不存在的密鑰包含在products. 'Date':datetime.today().strftime('%Y-%m-%d')您也不會在產品串列中找到關鍵。現在您可以嘗試是否存在重復值。
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
if os.path.exists('jfs_hombre.csv'):
os.remove('jfs_hombre.csv')
print("The file has been deleted successfully")
else:
print("The file does not exist!")
class JfsSpider_hombre(scrapy.Spider):
name = 'jfs_hombre'
#start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}
def start_requests(self):
headers = {"content-type": "application/json"}
yield scrapy.Request(
url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables={}&extensions={"persistedQuery":{"version":1,"sha256Hash":"6869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5","sender":"[email protected]","provider":"[email protected]"},"variables":"eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9"}',
callback=self.parse,
method="GET",
headers=headers,
dont_filter=True
)
def parse(self, response):
resp = response.json()
#print(resp)
for item in range(0,576,32):
resp['data']['productSearch']['recordsFiltered']=item
for result in resp['data']['productSearch']['products']:
yield {
#'Casa':'Just_For_Sports',
'Sku' :result['productReference'],
'Name': result['productName'],
'precio': result['priceRange']['sellingPrice']['highPrice'],
'Link': 'https://www.justforsport.com.ar' result['link'],
# 'Date':datetime.today().strftime('%Y-%m-%d')
}
if __name__ == "__main__":
process =CrawlerProcess()
process.crawl(JfsSpider_hombre)
process.start()
由 set() 證明
import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
if os.path.exists('jfs_hombre.csv'):
os.remove('jfs_hombre.csv')
print("The file has been deleted successfully")
else:
print("The file does not exist!")
class JfsSpider_hombre(scrapy.Spider):
name = 'jfs_hombre'
unique_data = set()
#start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]
custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}}
def start_requests(self):
headers = {"content-type": "application/json"}
yield scrapy.Request(
url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables={}&extensions={"persistedQuery":{"version":1,"sha256Hash":"6869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5","sender":"[email protected]","provider":"[email protected]"},"variables":"eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9"}',
callback=self.parse,
method="GET",
headers=headers,
dont_filter=True
)
def parse(self, response):
resp = response.json()
#print(resp)
for item in range(0,576,32):
resp['data']['productSearch']['recordsFiltered']=item
for result in resp['data']['productSearch']['products']:
s=result['productReference']
self.unique_data.add(s)
yield {
#'Casa':'Just_For_Sports',
'Sku' :s,
'Name': result['productName'],
'precio': result['priceRange']['sellingPrice']['highPrice'],
'Link': 'https://www.justforsport.com.ar' result['link'],
# 'Date':datetime.today().strftime('%Y-%m-%d')
}
if __name__ == "__main__":
process =CrawlerProcess()
process.crawl(JfsSpider_hombre)
process.start()
輸出:
'item_scraped_count': 576,
轉載請註明出處,本文鏈接:https://www.uj5u.com/qiye/471075.html
上一篇:Python如何抓取使用JavaScript函式的CSV下載按鈕?
下一篇:使用scrapy抓取JSON時出現此錯誤:Spidermustreturnrequest,item,orNone,got'str'
