scrapy 反爬:POST 請求在瀏覽器可獲取資料,scrapy 獲取的資料為空-有解無憂

目標地址:http://www.ccgp-gansu.gov.cn/web/article/128/0/index.htm
此網站為post提交,回傳html文本,詳細的可以看我的代碼
想爬取的內容:串列中的專案
問題:scrapy 獲取的 body 中沒有列表的資料,ul 中沒有 li
曾嘗試用 cookiejar: True 解決,但還是沒有資料
希望有能力的小伙伴,能給予一點提示,不勝感激
spider源檔案



# -*- coding: utf-8 -*-

import re



import scrapy

import scrapy_splash

from demo.items import DemoItem

from datetime import datetime





class GgzyfwSpider(scrapy.Spider):
    """Spider for the article search results on www.ccgp-gansu.gov.cn.

    The first results page is requested with a form POST; every parsed page
    then schedules the following page until a page has no list entries or an
    entry is older than the ``CURRENT_DATA`` setting (a number of days).
    """

    name = 'gsccgp'
    allowed_domains = ['www.ccgp-gansu.gov.cn']
    start_urls = ['http://www.ccgp-gansu.gov.cn/web/doSearchmxarticle.action']
    url = 'http://www.ccgp-gansu.gov.cn/web/doSearchmxarticle.action'

    def get_form_data(self, page):
        """Return the form fields for the search POST.

        Args:
            page: Page number; sent as a string in the ``current`` field.

        Returns:
            dict mapping form field names to string values.
        """
        # 'total' and 'sjm' are hard-coded; their meaning is not evident from
        # this file -- presumably captured from a browser session (TODO confirm).
        return {
            'articleSearchInfoVo.releasestarttime': '',
            'articleSearchInfoVo.releaseendtime': '',
            'articleSearchInfoVo.tflag': '1',
            'articleSearchInfoVo.classname': '128',
            'articleSearchInfoVo.dtype': '0',
            'articleSearchInfoVo.days': '',
            'articleSearchInfoVo.releasestarttimeold': '',
            'articleSearchInfoVo.releaseendtimeold': '',
            'articleSearchInfoVo.title': '',
            'articleSearchInfoVo.agentname': '',
            'articleSearchInfoVo.bidcode': '',
            'articleSearchInfoVo.proj_name': '',
            'articleSearchInfoVo.buyername': '',
            'total': '5402',
            'limit': '20',
            'current': str(page),
            'sjm': '7466',
        }

    def start_requests(self):
        """Yield the initial form POST for page 1 of the search results."""
        yield scrapy_splash.SplashFormRequest(
            method='post',
            formdata=self.get_form_data(1),
            url=self.url,
            callback=self.parse,
        )

    @staticmethod
    def _split_padded(text, count):
        """Split *text* on '|', strip each part and pad with '' up to *count* parts."""
        parts = [part.strip() for part in text.split('|')]
        parts.extend([''] * (count - len(parts)))
        return parts

    @staticmethod
    def _label_value(segment):
        """Return the stripped text after the first full-width colon in *segment*.

        Returns '' when the segment has no full-width colon.
        """
        return segment.partition('：')[2].strip()

    def parse(self, response):
        """Parse one results page, print its items and schedule the next page.

        Stops without scheduling another page when the page has no list
        entries, or as soon as an entry is older than ``CURRENT_DATA`` days.
        Entries with a missing or unparseable publish time are skipped.

        Args:
            response: Response for a search results page.

        Yields:
            scrapy.Request for the following results page.
        """
        entries = response.xpath("//ul[@class='Expand_SearchSLisi']/li")
        if not entries:
            return

        # Maximum entry age in days. When the setting is absent no age limit
        # is applied (comparing an int against None would raise TypeError).
        max_age_days = self.settings.get('CURRENT_DATA')
        if max_age_days is not None:
            max_age_days = int(max_age_days)

        domain = 'http://www.ccgp-gansu.gov.cn'

        for li in entries:
            # Expected shape, '|'-separated with full-width colons after labels:
            #   <bid opening time>： | <publish time>：2020-03-12 20:41:01
            #   | <buyer>：... | <agency>：...
            meta_text = (li.xpath("string(.//span[1]//text())").get() or '').strip()
            segments = self._split_padded(meta_text, 4)
            date = self._label_value(segments[1])
            buy_person = self._label_value(segments[2])
            middle_name = self._label_value(segments[3])
            if not date:
                continue

            try:
                published_at = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
            except ValueError:
                self.logger.warning(
                    "Skipping entry with unparseable publish time: %r", date)
                continue

            age_days = (datetime.now() - published_at).days
            if max_age_days is not None and age_days > max_age_days:
                # Returning (rather than closing the spider) lets requests
                # that are already scheduled, e.g. image downloads, finish.
                return

            item = DemoItem()
            item['publish_date'] = date
            item['source_url'] = self.start_urls[0]
            item['project_name'] = li.xpath(".//a//text()").get()
            href = li.xpath('.//a/@href').get()
            # urljoin handles absolute and relative hrefs alike; a missing
            # href is stored as None instead of raising TypeError.
            item['url'] = response.urljoin(href) if href else None

            # Expected shape: <status> | <buyer area> | <project type>
            other_text = (li.xpath("string(.//span/strong//text())").get() or '').strip()
            status, buy_area, project_type_name = self._split_padded(other_text, 3)[:3]
            item['status'] = status
            item['buy_area'] = buy_area
            item['project_type_name'] = project_type_name
            item['buy_person'] = buy_person
            item['middle_name'] = middle_name
            # Items are only printed for debugging; replace with `yield item`
            # to send them through the item pipelines.
            print(item)

        # The request depth doubles as a page counter: depth 0 -> start=20.
        # NOTE(review): this follow-up is a plain GET with limit/start query
        # parameters, unlike the initial form POST -- confirm the server
        # returns the same listing for it.
        depth = response.meta.get('depth', 0)
        page = depth + 1
        url = domain + '/web/doSearchmxarticle.action?limit=20&start=' + str(page * 20)
        yield scrapy.Request(url=url, callback=self.parse)

uj5u.com熱心網友回復：

截取你的部分代碼，改成只用 requests 發送請求後再解析的版本，取資料是成功的



# -*- coding: utf-8 -*-

import re



from datetime import datetime

class GgzyfwSpider():
    """Stand-alone check of the search POST using requests and lxml.

    Posts the page-1 search form and prints the text extracted from every
    entry of the result list, without involving Scrapy.
    """

    name = 'gsccgp'
    allowed_domains = ['www.ccgp-gansu.gov.cn']
    start_urls = ['http://www.ccgp-gansu.gov.cn/web/doSearchmxarticle.action']
    url = 'http://www.ccgp-gansu.gov.cn/web/doSearchmxarticle.action'

    def get_form_data(self, page):
        """Return the form fields for the search POST.

        Args:
            page: Page number; sent as a string in the ``current`` field.

        Returns:
            dict mapping form field names to string values.
        """
        # 'total' and 'sjm' are hard-coded; their meaning is not evident from
        # this file -- presumably captured from a browser session (TODO confirm).
        return {
            'articleSearchInfoVo.releasestarttime': '',
            'articleSearchInfoVo.releaseendtime': '',
            'articleSearchInfoVo.tflag': '1',
            'articleSearchInfoVo.classname': '128',
            'articleSearchInfoVo.dtype': '0',
            'articleSearchInfoVo.days': '',
            'articleSearchInfoVo.releasestarttimeold': '',
            'articleSearchInfoVo.releaseendtimeold': '',
            'articleSearchInfoVo.title': '',
            'articleSearchInfoVo.agentname': '',
            'articleSearchInfoVo.bidcode': '',
            'articleSearchInfoVo.proj_name': '',
            'articleSearchInfoVo.buyername': '',
            'total': '5402',
            'limit': '20',
            'current': str(page),
            'sjm': '7466',
        }

    def start_requests(self):
        """POST the page-1 search form and hand the response to ``parse``."""
        import requests

        # A timeout keeps the script from hanging forever on a stalled server.
        resp = requests.post(url=self.url, data=self.get_form_data(1), timeout=30)
        self.parse(resp)

    def parse(self, response):
        """Print the text taken from the first ``span`` of each list entry.

        Args:
            response: Object whose ``text`` attribute holds the HTML page.
        """
        from lxml import etree

        doc = etree.HTML(response.text)
        entries = doc.xpath("//ul[@class='Expand_SearchSLisi']/li")
        if not entries:
            return

        domain = 'http://www.ccgp-gansu.gov.cn'
        for li in entries:
            date_str = li.xpath("string(.//span[1]//text())").strip()
            print(date_str)



# Run the check once: build the spider, then fetch and print the first page.
_demo_spider = GgzyfwSpider()
_demo_spider.start_requests()

轉載請註明出處，本文鏈接：https://www.uj5u.com/qita/75139.html

標籤：腳本語言(Perl/Python)

上一篇：新新人求教

下一篇：flask的視圖函式無法呼叫外部方法