之前看了網上一個爬取騰訊招聘的栗子
import re
import json
from scrapy.selector import Selector
try:
from scrapy.spider import Spider
except:
from scrapy.spider import BaseSpider as Spider
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
from itzhaopin.items import *
from itzhaopin.misc.log import *
class TencentSpider(CrawlSpider):
name = "tencent"
allowed_domains = ["tencent.com"]
start_urls = [
"http://hr.tencent.com/position.php"
]
rules = [ # 定義爬取URL的規則
Rule(sle(allow=("/position.php\?&start=\d{,4}#a")), follow=True, callback='parse_item')
]
def parse_item(self, response): # 提取資料到Items里面,主要用到XPath和CSS選擇器提取網頁資料
items = []
sel = Selector(response)
base_url = get_base_url(response)
sites_even = sel.css('table.tablelist tr.even')
for site in sites_even:
item = TencentItem()
item['name'] = site.css('.l.square a').xpath('text()').extract()
relative_url = site.css('.l.square a').xpath('@href').extract()[0]
item['detailLink'] = urljoin_rfc(base_url, relative_url)
item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
items.append(item)
#print repr(item).decode("unicode-escape") + '\n'
sites_odd = sel.css('table.tablelist tr.odd')
for site in sites_odd:
item = TencentItem()
item['name'] = site.css('.l.square a').xpath('text()').extract()
relative_url = site.css('.l.square a').xpath('@href').extract()[0]
item['detailLink'] = urljoin_rfc(base_url, relative_url)
item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
items.append(item)
#print repr(item).decode("unicode-escape") + '\n'
info('parsed ' + str(response))
return items
def _process_request(self, request):
info('process ' + str(request))
return request
自己根據這個寫了一個,從python的百科詞條,提取頁面內鏈接,爬取標題,簡介,url
import re
from scrapy.selector import Selector
from scrapy.utils.response import get_base_url
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor as sle
from baikeSpider.items import *
class BaikeSpider(CrawlSpider):
name = "baike"
allowed_domains = ["baike.baidu.com"]
start_urls = [
"http://baike.baidu.com/view/21087.htm"
]
rule = [
Rule(sle(allow=("/view/\d+\.htm")), follow=True, callback='parse_item')
]
def parse_item(self,response):
sel = Selector(response)
base_url = get_base_url(response)
item = BaikespiderItem()
item['title'] = sel.xpath('//dd[contains(@class, "lemmaWgt-lemmaTitle-title")]/h1/text()').extract()[0]
item['url'] = base_url
item['desc'] = sel.xpath('//div[contains(@class, "lemma-summary")]/div/text()').extract()
return item
def _process_request(self, request):
print('process ' + str(request))
return request
這個無法爬取,測驗了一下,parse_item沒有執行過,我自己把parse_item改成parse,卻是能爬取start_url,很疑惑,為什么原來的不用,可以那樣寫,另外請教下我這個該怎么改
uj5u.com熱心網友回復:
沒說清,代碼也沒貼全uj5u.com熱心網友回復:
唔你覺得哪里沒說清,然后少了哪部分代碼能說下嗎uj5u.com熱心網友回復:
你還挺逗的, 我認為你沒說清,我還說的清? 既然你說一個能運行成功,一個不能 那你給出這兩份代碼來uj5u.com熱心網友回復:
你要是說不清我哪里說不清,那你是怎么認為我哪里說不清,spider的代碼都貼全了,另外items和pipelines根本不影響爬取,除此之外你還要我貼什么???uj5u.com熱心網友回復:
scrappy 預設呼叫 parse,你要用parse——item,說明你的這句話沒起作用:rule = [Rule(sle(allow=("/view/\d+\.htm")), follow=True, callback='parse_item')
]
正則運算式沒有匹配view/21087.htm, 自己找正則運算式的知識補全他就可以了。
uj5u.com熱心網友回復:
樓主,你好,我也遇到了一樣的問題,想問下你最后有找到解決方法嗎uj5u.com熱心網友回復:
在前面我們的爬蟲程式繼承了CrawlSpider,還定義了parse(self, response)作為抓取到內容之后的回呼函式,實際上這是錯誤的用法!!!
官方提示:當撰寫爬蟲規則時,請避免使用 parse 作為回呼函式。 由于 CrawlSpider 使用 parse 方法來實作其邏輯,如果 您覆寫了 parse 方法,crawl spider 將會運行失敗。
所以解決方法就是洗掉自定義的parse,然后callback=parse_item 決議操作放到parse_item中就可以了
uj5u.com熱心網友回復:
樓上說跟沒說一樣轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/81753.html
