我想從網站中擷取價格等資訊,並將其作為值儲存在字典中。不過,我正在學習 Scrapy,所以想知道如何用它來實現這個目標。
以下是使用 requests 和 BeautifulSoup 的寫法:
from collections import defaultdict

import numpy as np
import pandas as pd
import requests as r
from bs4 import BeautifulSoup
# The five result pages of the eBay listing; only the `_pgn` page number differs.
html = [
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16',
]

# Accumulates two parallel lists: data["card"] (titles) and data["price"].
data = defaultdict(list)
for page_url in html:
    # `requests` is imported under the alias `r`, so the response needs a
    # name of its own instead of rebinding `r`.
    response = r.get(page_url)
    soup = BeautifulSoup(response.content, 'lxml')
    names = soup.select(".s-item__title")
    values = soup.select(".ITALIC")
    # NOTE(review): zip() pairs titles and prices purely by position and
    # stops at the shorter list, so a listing without a price element would
    # shift every later pair - confirm against the live markup.
    for n, v in zip(names, values):
        data["card"].append(n.text.strip())
        data["price"].append(v.text.strip())
以下是我用 Scrapy 嘗試的寫法,但查看 JSON 輸出後發現沒有得到任何值,只得到了連結。我要如何得到和上面程式碼一樣的輸出?
import scrapy
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
# Object-dtype array holding the five listing pages (they differ only in `_pgn`).
html = np.asarray(
    [
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16',
    ],
    dtype=object,
)

# Single-column frame; the spider reads the URLs back through `url.data.values`.
url = pd.DataFrame({'data': html})
class StatisticsItem(scrapy.Item):
    """Item with two fields: the scraped value and the page it came from."""

    # Both fields use TakeFirst() as their output processor, so the list an
    # ItemLoader collects is reduced to a single value when the item loads.
    statistics_div = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())
class StatisticsSpider(scrapy.Spider):
    """Request every listing page and emit one ``StatisticsItem`` per page."""

    name = 'statistics'
    # Module-level DataFrame column holding the page URLs.
    start_urls = url.data.values

    def start_requests(self):
        """Yield one plain request per entry in ``start_urls``."""
        for page_url in self.start_urls:
            yield scrapy.Request(
                page_url
            )

    def parse(self, response):
        """Load the first matched price node and the page URL into an item."""
        # NOTE(review): this selects a <div>; the working selectors elsewhere
        # on this page target `span.s-item__price`, so this query presumably
        # matches nothing and only `url` ends up populated - confirm against
        # the live markup. `.get()` also returns just the first match, not
        # one value per listing.
        table = response.xpath("//div[@class='s-item__price']").get()
        loader = ItemLoader(StatisticsItem())
        # The item declares `statistics_div`; 'values' is not one of its fields.
        loader.add_value('statistics_div', table)
        loader.add_value('url', response.url)
        yield loader.load_item()
# Run the spider from a plain script and export the items as JSON Lines.
process = CrawlerProcess(
    settings={
        # FEEDS replaces the deprecated FEED_URI / FEED_FORMAT pair:
        # output path -> per-feed options.
        'FEEDS': {
            'ebay_data.json': {'format': 'jsonlines'},
        },
    }
)
process.crawl(StatisticsSpider)
process.start()
uj5u.com熱心網友回復:
以下是一個最小可運作的解決方案。
代碼:
import numpy as np
import requests
import pandas as pd
from bs4 import BeautifulSoup
# The five result pages of the eBay listing; only the `_pgn` page number differs.
urls = [
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16',
]

# One [title, price] row per <li> in the results list.
data = []
for url in urls:
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')
    cards = soup.select("ul.b-list__items_nofooter.srp-results.srp-grid li")
    for card in cards:
        # Title and price are looked up inside the same card, so they cannot
        # get out of step when one of them is missing.
        title_tag = card.select_one('h3.s-item__title')
        price_tag = card.select_one('span.s-item__price span')
        # Store the text, not the Tag object; a missing element becomes None.
        title = title_tag.text.strip() if title_tag is not None else None
        price = price_tag.text if price_tag is not None else None
        data.append([title, price])

cols = ['Title', 'Price']
df = pd.DataFrame(data, columns=cols)
print(df)
# Optional: persist the table to disk.
#df.to_csv('info.csv',index = False)
輸出:
Title Price
0 [THE ULTIMATE COMPLETE 1ST EDITION POKEMON BOO... £4,466,944.61
1 [Japanese Old Back Pokemon Trophy Card NO.2 Ne... £1,315,157.83
2 [Non-Ultimate Pokemon WOTC Booster Box Collect... £1,095,964.86
3 [First Digital Charizard Lv.80 from Pokemon Re... £850,245.68
4 [First Digital Blastoise Lv.100 from Pokemon R... £850,245.68
.. ... ...
187 [PSA 9 BLASTOISE 1999 Pokemon 1st Edition THIN... £29,779.62
188 [Pokemon Charizard No Rarity Symbol ?? BGS 8 Ho... None
189 [Yu GI OH UPPER DECK Pharaoh Tour 2004 Final W... £29,757.75
190 [?? Pokemon PSA 10 1st edition Shadowless Base ... £29,779.59
191 [POKEMON EX DEOXYS LATIAS HOLO GOLD STAR #105 ... £29,835.43
[192 rows x 2 columns]
uj5u.com熱心網友回復:
我將 custom_settings 設定為使用 json 格式寫入“cards_info.json”。
在 parse 方法內部,我走訪頁面上的每張卡片(請參閱 XPath),取得卡片的標題和價格,然後用 yield 將它們產出。Scrapy 會將它們寫入“cards_info.json”。
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
class StatisticsItem(scrapy.Item):
    """Item with two fields: the scraped value and the page it came from."""

    # Both fields use TakeFirst() as their output processor, so a list
    # collected by an ItemLoader is reduced to a single value on load.
    statistics_div = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())
class StatisticsSpider(scrapy.Spider):
    """Scrape card titles and prices from the eBay listing pages.

    Every listing found on a page is yielded as a ``{'card', 'price'}`` dict,
    and the feed configured in ``custom_settings`` writes those dicts to
    ``cards_info.json``.
    """

    name = 'statistics'
    start_urls = [
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16',
    ]

    custom_settings = {
        # FEEDS replaces the deprecated FEED_URI / FEED_FORMAT pair:
        # output path -> per-feed options.
        'FEEDS': {
            'cards_info.json': {'format': 'json'},
        },
    }

    def start_requests(self):
        """Yield one plain request per entry in ``start_urls``."""
        for url in self.start_urls:
            yield scrapy.Request(
                url
            )

    def parse(self, response):
        """Yield a ``{'card': title, 'price': price}`` dict for each listing."""
        # NOTE(review): the attribute predicates were missing from the
        # original XPath (`//div[@]` is not valid XPath). The card wrapper
        # class below is a reconstruction - confirm against the live markup.
        # The price class matches the `s-item__price` selector used by the
        # other snippets on this page.
        all_cards = response.xpath('//div[contains(@class, "s-item__wrapper")]')
        for card in all_cards:
            name = card.xpath('.//h3/text()').get()
            price = card.xpath('.//span[@class="s-item__price"]//text()').get()
            # From here the values can be used freely: appended to a
            # dictionary, loaded into an Item, or - as here - yielded.
            yield {
                'card': name,
                'price': price
            }
輸出:
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16>
{'card': 'Pokemon 1st Edition Shadowless Base Set 11 Blister Booster Pack Lot - DM To Buy!', 'price': '£93,805.84'}
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16>
{'card': 'Pokemon Team Rocket Complete Complete 83/82, German, 1. Edition', 'price': '£102,026.04'}
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16>
{'card': 'Yugioh E Hero Pit Boss 2013 World Championship Prize Card BGS 9.5 Gem Mint', 'price': '£100,000.00'}
...
...
cards_info.json:
[
{"card": "1999 Pokemon Base Set Booster Box GREEN WING", "price": "\u00a340,000.00"},
{"card": "1996 MEDIA FACTORY POKEMON NO RARITY BASE SET CHARIZARD 006 BECKETT BGS MINT 9 ", "price": "\u00a339,999.99"},
{"card": "Yugioh - BGS8.5 Jump Festa Blue Eyes White Dragon -1999 - Limited - PSA", "price": "\u00a340,000.00"},
{"card": "PSA 8 CHARIZARD 1999 POKEMON 1ST EDITION THICK STAMP SHADOWLESS #4 HOLO NM-MINT", "price": "\u00a337,224.53"},
{"card": "PSA 9 MINT Pok\u00e9mon Play Promo 50000 PTS Gold Star Japanese Pokemon", "price": "\u00a338,261.06"},
...
...
]
轉載請註明出處,本文鏈接:https://www.uj5u.com/ruanti/357467.html
