請求各位大佬幫忙看一下,我的問題所在。最近在學習Scrapy的爬蟲框架,對照北理嵩天老師的代碼,重新選取網站做的,使用scrapy crawl命令運行時,在cmd命令列中無錯誤,但是在txt檔案內沒有內容,下面是我的代碼,請各位大佬幫忙debug下。
爬取的網站為股城網:
股票串列鏈接:https://hq.gucheng.com/gpdmylb.html
個股資訊鏈接:https://hq.gucheng.com/'+ stock
1.Spider中爬蟲檔案(stocks.py)
import scrapy
import re
class StocksSpider(scrapy.Spider):
name = 'stocks'
#allowed_domains = ['hq.gecheng.com']
start_urls = ['https://hq.gucheng.com/gpdmylb.html']
def parse(self, response):
# 對a標簽中的鏈接進行提取
kv = {'user-agent': 'Mozilla/5.0'} # 模擬瀏覽器發送請求
for href in response.css('a::attr(href)').extract():
try:
stock = re.findall(r"[S][HZ]\d{6}",href)[0]#通過正則運算式獲取正確的股票代碼
url = 'https://hq.gucheng.com/' + stock
yield scrapy.Request(url,callback=self.parse_stock,headers=kv)
#第二個引數callback給出了處理當前url給出的新的函式即parse_stock
#return item
except:
continue
def parse_stock(self, response):
infoDict = {} # 對每一個頁面生成空字典
stockInfo = response.css('.stock_top clearfix')
name = stockInfo.css('.stock_title').extract()[0]
keyList = stockInfo.css('dt').extract()
valueList = stockInfo.css('dd').extract()
for i in range(len(keyList)):
key = re.findall(r'<dt>.*</dt>', keyList[i])[0][1:-5]
# key = key.replace('\u2003','')
# key = key.replace('\xa0', '')
try:
val = re.findall(r'<dd>\d+\.?.*</dd>', valueList[i])[0][0:-5]
except:
val = '--'
infoDict[key] = val
infoDict.update(
{'股票名稱': re.findall('\s.*\(', name)[0].split()[0] + re.findall('\>.*\<', name)[0][1:-1]})
yield infoDict
2.pipelines.py
class GuchengstocksInfoPipeline(object):
#openspider指的是當一個爬蟲被呼叫時對應的pipline啟動的方法
def open_spider(self, spider):
self.f = open('GuchengStockInfo.txt', 'w')
#close_spider指的是當一個爬蟲關閉時對應的pipline啟動的方法
def close_spider(self, spider):
self.f.close()
#對每一個item項進行處理時對應的方法,也是最主體的函式
def process_item(self, item, spider):
try:
line = str(dict(item)) + '\n'
self.f.write(line)
except:
pass
return item
3.組態檔settings
ITEM_PIPELINES = {
'GuchengStocks.pipelines.GuchengstocksInfoPipeline': 300,
}
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/200352.html
