目標地址:http://www.ccgp-gansu.gov.cn/web/article/128/0/index.htm
此網站為post提交,回傳html文本,詳細的可以看我的代碼
想爬取的內容:列表中的項目
問題:scrapy 獲取的 body 中沒有列表的資料,ul 中沒有 li
曾嘗試解決,用cookiejar:True,還是沒有資料
希望有能力的小伙伴,能給予一點提示,不勝感激
spider源檔案
# -*- coding: utf-8 -*-
import re
import scrapy
import scrapy_splash
from demo.items import DemoItem
from datetime import datetime
class GgzyfwSpider(scrapy.Spider):
    """Spider for the announcement list served by www.ccgp-gansu.gov.cn.

    The first page is fetched by POSTing a search form to
    ``doSearchmxarticle.action``; the response is HTML containing a
    ``<ul class="Expand_SearchSLisi">`` whose ``<li>`` children are the
    announcements.
    """

    name = 'gsccgp'
    allowed_domains = ['www.ccgp-gansu.gov.cn']
    start_urls = ['http://www.ccgp-gansu.gov.cn/web/doSearchmxarticle.action']
    url = 'http://www.ccgp-gansu.gov.cn/web/doSearchmxarticle.action'

    # Separator between a field label and its value. Both the ASCII colon
    # and the full-width colon (U+FF1A) are accepted.
    _LABEL_SEPARATOR = re.compile(r'[:\uff1a]')

    def get_form_data(self, page):
        """Return the search-form payload for result page ``page``.

        Args:
            page: Page number; it is sent as the string form field
                ``current``.

        Returns:
            A dict of form field names to string values.
        """
        return {
            'articleSearchInfoVo.releasestarttime': '',
            'articleSearchInfoVo.releaseendtime': '',
            'articleSearchInfoVo.tflag': '1',
            'articleSearchInfoVo.classname': '128',
            'articleSearchInfoVo.dtype': '0',
            'articleSearchInfoVo.days': '',
            'articleSearchInfoVo.releasestarttimeold': '',
            'articleSearchInfoVo.releaseendtimeold': '',
            'articleSearchInfoVo.title': '',
            'articleSearchInfoVo.agentname': '',
            'articleSearchInfoVo.bidcode': '',
            'articleSearchInfoVo.proj_name': '',
            'articleSearchInfoVo.buyername': '',
            'total': '5402',
            'limit': '20',
            'current': str(page),
            'sjm': '7466',
        }

    def start_requests(self):
        """Yield the initial POST request for page 1 of the search form."""
        yield scrapy_splash.SplashFormRequest(
            method='post',
            formdata=self.get_form_data(1),
            url=self.url,
            callback=self.parse,
        )

    @classmethod
    def _field_value(cls, segment):
        """Return the value part of a ``label:value`` segment, stripped.

        Only the first colon separates label from value, so a timestamp
        such as ``2020-03-12 20:41:01`` is returned intact. Returns ``''``
        when the segment contains no colon.
        """
        parts = cls._LABEL_SEPARATOR.split(segment, maxsplit=1)
        return parts[1].strip() if len(parts) == 2 else ''

    def parse(self, response):
        """Extract announcements from one result page and request the next.

        Processing stops (without requesting another page) when the list is
        empty or when an announcement is older than the ``CURRENT_DATA``
        setting, which is compared against the announcement age in days.
        """
        rows = response.xpath("//ul[@class='Expand_SearchSLisi']/li")
        if not rows:
            return

        current = self.settings.get('CURRENT_DATA')
        domain = 'http://www.ccgp-gansu.gov.cn'

        for li in rows:
            # Expected shape of the first span's text:
            #   opening time: | publish time:2020-03-12 20:41:01
            #   | purchaser:... | agency:...
            date_str = li.xpath("string(.//span[1]//text())").get().strip()
            date_arr = date_str.split('|')
            # Expected shape of the <strong> text:
            #   announcement status | purchasing area | project category
            other_str = li.xpath("string(.//span/strong//text())").get().strip()
            other_arr = other_str.split('|')
            if len(date_arr) < 4 or len(other_arr) < 3:
                # Skip a malformed row instead of aborting the whole page.
                self.logger.warning(
                    'Skipping row with unexpected layout: %r / %r',
                    date_str, other_str)
                continue

            date = self._field_value(date_arr[1])
            buy_person = self._field_value(date_arr[2])
            middle_name = self._field_value(date_arr[3])

            if date:
                date_time = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
                diff_day = (datetime.now() - date_time).days
                # A missing CURRENT_DATA setting means "no age limit".
                if current is not None and diff_day > current:
                    # Return rather than closing the spider, so that
                    # already-scheduled work (e.g. image handling) can
                    # still finish.
                    return

            item = DemoItem()
            item['publish_date'] = date
            item['source_url'] = self.start_urls[0]
            item['project_name'] = li.xpath(".//a//text()").get()
            href = li.xpath('.//a/@href').get()
            item['url'] = domain + href
            item['status'] = other_arr[0].strip()
            item['buy_area'] = other_arr[1].strip()
            item['project_type_name'] = other_arr[2].strip()
            item['buy_person'] = buy_person
            item['middle_name'] = middle_name
            # NOTE: the item is only printed; yielding it is left disabled,
            # exactly as in the original snippet.
            print(item)

        # NOTE(review): follow-up pages are requested with GET and a
        # ``start`` offset, whereas the first page is a POST using
        # ``current`` -- confirm the endpoint honours both forms.
        depth = response.meta.get('depth', 0)
        page = depth + 1
        url = domain + '/web/doSearchmxarticle.action?limit=20&start=' + str(page * 20)
        yield scrapy.Request(url=url, callback=self.parse)
uj5u.com熱心網友回復:
截取你的部分代碼,跑了一個只用 requests 請求後再解析的版本,取資料是成功的
# -*- coding: utf-8 -*-
import re
from datetime import datetime
class GgzyfwSpider():
    """Stand-alone reproduction of the spider using ``requests`` and ``lxml``.

    It POSTs the search form once and prints the text of the first
    ``<span>`` of every ``<li>`` in ``<ul class="Expand_SearchSLisi">``.
    """

    name = 'gsccgp'
    allowed_domains = ['www.ccgp-gansu.gov.cn']
    start_urls = ['http://www.ccgp-gansu.gov.cn/web/doSearchmxarticle.action']
    url = 'http://www.ccgp-gansu.gov.cn/web/doSearchmxarticle.action'

    def get_form_data(self, page):
        """Return the search-form payload for result page ``page``.

        Args:
            page: Page number; it is sent as the string form field
                ``current``.

        Returns:
            A dict of form field names to string values.
        """
        return {
            'articleSearchInfoVo.releasestarttime': '',
            'articleSearchInfoVo.releaseendtime': '',
            'articleSearchInfoVo.tflag': '1',
            'articleSearchInfoVo.classname': '128',
            'articleSearchInfoVo.dtype': '0',
            'articleSearchInfoVo.days': '',
            'articleSearchInfoVo.releasestarttimeold': '',
            'articleSearchInfoVo.releaseendtimeold': '',
            'articleSearchInfoVo.title': '',
            'articleSearchInfoVo.agentname': '',
            'articleSearchInfoVo.bidcode': '',
            'articleSearchInfoVo.proj_name': '',
            'articleSearchInfoVo.buyername': '',
            'total': '5402',
            'limit': '20',
            'current': str(page),
            'sjm': '7466',
        }

    def start_requests(self):
        """POST the form for page 1 and hand the response to ``parse``."""
        # Imported here so the class can be defined without the dependency.
        import requests

        resp = requests.post(url=self.url, data=self.get_form_data(1))
        self.parse(resp)

    def parse(self, response):
        """Print the first-span text of every list entry in ``response``.

        Args:
            response: An object exposing the HTML body as ``.text``.
        """
        from lxml import etree

        doc = etree.HTML(response.text)
        rows = doc.xpath("//ul[@class='Expand_SearchSLisi']/li")
        if not rows:
            return
        for li in rows:
            # With lxml, a string() XPath expression returns a str directly.
            date_str = li.xpath("string(.//span[1]//text())").strip()
            print(date_str)
# Run the reproduction only when executed as a script, so that importing
# this module does not trigger a network request.
if __name__ == "__main__":
    GgzyfwSpider().start_requests()
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/75139.html
上一篇:新新人求教
