主程式
import scrapy
from scrapyDemo.qsbk.qsbk.items import QsbkItem
class QsbkSpiderSpider(scrapy.Spider):
    """Crawl the qiushibaike.com text section page by page.

    Every post found on a listing page is emitted as a ``QsbkItem`` with
    its ``author`` and ``content``; the spider then follows the last link
    of the pagination bar until no such link is found.
    """

    name = 'qsbk_spider'
    # Must cover the host used in start_urls. With a non-matching domain the
    # offsite filter silently drops every follow-up (pagination) request and
    # the crawl ends after the first page.
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']
    # Kept as a public attribute for backward compatibility; link resolution
    # below is done relative to the response URL instead.
    base_domain = "https://www.qiushibaike.com"

    def parse(self, response):
        """Yield one item per post on the page, then a request for the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            ``QsbkItem`` objects, followed by at most one ``scrapy.Request``
            pointing at the next listing page.
        """
        posts = response.xpath("//div[@class='col1 old-style-col1']/div")
        for post in posts:
            author = post.xpath(
                ".//div[@class='author clearfix']/a[2]/h2/text()"
            ).get()
            content = post.xpath(".//div[@class='content']/span/text()").get()
            if author is None or content is None:
                # A block without the expected markup must not abort the whole
                # page (and with it the pagination request), so skip it.
                self.logger.debug(
                    "Skipping post without author/content on %s", response.url
                )
                continue
            yield QsbkItem(
                author=author.strip(),
                content=content.replace("\n", ""),
            )

        # URL of the next page: href of the last entry in the pagination bar.
        next_url = response.xpath(
            "//ul[@class='pagination']/li[last()]/a/@href"
        ).get()
        if next_url:
            # urljoin handles both relative and absolute hrefs.
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
settings.py
# Do not consult robots.txt before issuing requests.
ROBOTSTXT_OBEY = False

# Delay between consecutive requests (Scrapy interprets this in seconds).
DOWNLOAD_DELAY = 1

# Headers attached to every request unless the request overrides them.
# The User-Agent is a desktop Chrome string.
DEFAULT_REQUEST_HEADERS = {
    "Accept": (
        "text/html,application/xhtml+xml,"
        "application/xml;q=0.9,*/*;q=0.8"
    ),
    "Accept-Language": "en",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/83.0.4103.61 Safari/537.36"
    ),
}

# Enabled item pipelines, mapped to their order value.
ITEM_PIPELINES = {
    "qsbk.pipelines.QsbkPipeline": 300,
}
import scrapy
class QsbkItem(scrapy.Item):
    """Container for a single scraped post.

    Declares the two fields the spider fills in: ``author`` and ``content``.
    """

    # Name of the post's author.
    author = scrapy.Field()
    # Text of the post.
    content = scrapy.Field()
from scrapy.exporters import JsonLinesItemExporter
class QsbkPipeline:
    """Item pipeline that writes every scraped item to ``duanzi.json``.

    Items are serialized with ``JsonLinesItemExporter`` (one JSON object per
    line, UTF-8, non-ASCII characters written as-is). The output file is
    opened when the spider starts and closed when it finishes.
    """

    def __init__(self):
        # The file is opened in open_spider rather than here, so that merely
        # constructing the pipeline neither truncates nor leaks the file.
        self.fp = None
        self.exporter = None

    def open_spider(self, spider):
        """Open the output file and start the exporter."""
        # The exporter writes bytes, so the file must be opened in "wb" mode.
        self.fp = open("duanzi.json", "wb")
        self.exporter = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding="utf-8"
        )
        self.exporter.start_exporting()
        print("爬蟲開始")

    def process_item(self, item, spider):
        """Export ``item`` and pass it on unchanged to later pipelines."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the output file."""
        if self.fp is not None:
            try:
                self.exporter.finish_exporting()
            finally:
                # Close the file even if finishing the export fails.
                self.fp.close()
                self.fp = None
        print("爬蟲結束")
運行結果如下:
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/36764.html
上一篇:excel 熱力圖
