# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import KejianItem
import re
from lxml import etree
import requests
from scrapy import Request
class A53kejianSpider(CrawlSpider):
    """Crawl category listings and scrape courseware detail pages.

    Listing pages are only followed. Each detail page is handled by
    ``parse_item``, which fills in title, content and code and then requests
    ``/plug/down.asp`` so that ``down_url`` can attach the download address
    before the item is yielded.
    """

    name = ''
    allowed_domains = ['com']
    start_urls = ['http://www.com/']
    rules = (
        # Category listing pages: follow links only, nothing is scraped here.
        Rule(LinkExtractor(allow=r'/\w{2,9}/$'), follow=True),
        # Detail pages. The dot is escaped so only a literal ".html" matches.
        Rule(LinkExtractor(allow=r'/\w{2,9}/\d+\.html'), callback='parse_item', follow=False),
    )

    @staticmethod
    def _parse_cookie_header(raw):
        """Convert a ``k=v; k2=v2`` cookie header string into a dict.

        Each pair is split on the first ``=`` only, so values that themselves
        contain ``=`` are kept whole. Pairs without ``=`` are skipped.
        """
        jar = {}
        for pair in raw.split('; '):
            key, sep, value = pair.partition('=')
            if sep:
                jar[key] = value
        return jar

    # Override start_requests so the first request carries the session cookies.
    def start_requests(self):
        """Yield the start request with the session cookies attached."""
        cookies = 'ASPSESSIONIDAQSSQBCQ=ANELFNGBHNAFGJGBPCOOHACN; Hm_lvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587718373,1587728873,1587741214,1587773867; Hm_lpvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587776002'
        # No explicit callback: Scrapy then uses the spider's default callback,
        # which for a CrawlSpider is the one that applies ``rules``.
        # NOTE(review): on newer Scrapy releases ``self.parse`` is no longer
        # that entry point - confirm against the installed version.
        yield scrapy.Request(
            url=self.start_urls[0],
            cookies=self._parse_cookie_header(cookies),
            dont_filter=True,
        )

    def parse_item(self, response):
        """Scrape one detail page and request its download address.

        Yields a request for ``/plug/down.asp``; the partially filled item
        travels in ``meta['item']`` and is completed by ``down_url``.
        """
        item = KejianItem()
        # Title
        item['title'] = response.xpath('//div[@class="b downinfo"]/h1/text()').get()
        # Content
        content = response.xpath('//div[@class ="p20"]/p//text()').getall()
        item['content'] = ''.join(content)
        # Verification code: built from text nodes 1, 2 and 9 of the info box.
        codes = response.xpath('//*[@id="container"]/div[1]/div[4]//text()').getall()
        if len(codes) > 9:
            item['code'] = codes[1] + codes[2] + codes[9]
        else:
            # A page with a different layout must not abort the callback.
            self.logger.warning('Unexpected code layout on %s', response.url)
            item['code'] = None
        # Download address: take the numeric id from the ".html" file name
        # only, so digits elsewhere in the URL (host, category) do not leak
        # into the id.
        origin = response.url
        found = re.search(r'/(\d+)\.html', origin)
        if found:
            number = found.group(1)
        else:
            # Fall back to every digit in the URL.
            number = ''.join(re.findall(r'\d', origin))
        url = 'http://www.com/plug/down.asp?id=' + number + '&order=0'
        # Fetch the download link
        yield scrapy.http.Request(url, meta={'item': item}, callback=self.down_url, dont_filter=True)

    def down_url(self, response):
        """Attach the download address to the item and yield it."""
        item = response.meta['item']
        body_text = ''.join(response.xpath('//text()').getall())
        # The first character is dropped before storing the address -
        # presumably a leading separator; verify against a live response.
        item['downurl'] = body_text[1:]
        yield item
重寫了start_requests,讓parse請求中帶上了cookies資訊,但是下載鏈接還是不能獲取。
不在scrapy框架中,使用requests模塊則可以獲取下載鏈接:
from lxml import etree
import requests
# Stand-alone check outside Scrapy: fetch the download endpoint with requests.
# NOTE(review): `number` is not defined in this snippet; it must already hold
# the numeric resource id (as a string) before this runs.
url = 'http://www.com/plug/down.asp?id=' + number + '&order=0'
cookies ='ASPSESSIONIDAQSSQBCQ=OIEKFNGBBMBKAOACCCJJCJCK; Hm_lvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587685941,1587718373,1587728873,1587741214; 1Hl5Yp=content%5F9353=6123%3A381; Hm_lpvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587742953'
# Split each pair on the first '=' only, so a value that itself contains '='
# (such as the `1Hl5Yp` cookie) is sent whole instead of truncated.
cookies = dict(pair.split('=', 1) for pair in cookies.split('; '))
# A timeout keeps the script from hanging forever on an unresponsive server.
resp = requests.get(url, cookies=cookies, timeout=10)  # fetch the page source
data = etree.HTML(resp.text)
downl = data.xpath('//text()')
print(downl)
**大神,這是為什麼?在scrapy中,我也攜帶了cookie,但是為什麼不能獲取下載地址?**
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/54172.html
