next_page 變數無論是在 shell 中使用,還是在控制臺上列印,都會給出正確的鏈接,但 Scrapy 仍然會一直抓取相同的頁面(第一頁)。
下面的代碼:
class QuotesSpider(scrapy.Spider):
    """Spider that follows the pager links of a Flipkart listing page."""

    name = "Bider"

    def start_requests(self):
        """Yield one initial request per seed URL, handled by ``parse``."""
        urls = [
            "https://www.flipkart.com/clothing-and-accessories/bottomwear/pr?sid=clo,vua&p[]=facets.ideal_for%5B%5D=Men&p[]=facets.ideal_for%5B%5D=men&otracker=categorytree&fm=neo/merchandising&iid=M_1064313a-7a8d-48f3-8199-daaf60d62ef6_2_372UD5BXDFYS_MC.8HARX8UX7IX5&otracker=hp_rich_navigation_2_2.navigationCard.RICH_NAVIGATION_Fashion~Men%27s+Bottom+Wear_8HARX8UX7IX5&otracker1=hp_rich_navigation_PINNED_neo/merchandising_NA_NAV_EXPANDABLE_navigationCard_cc_2_L1_view-all&cid=8HARX8UX7IX5"
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Print the next-page URL and schedule a request for it.

        The next page is taken from the last ``a._1LKTO3`` link in the
        response. When the response contains no such link, nothing is
        yielded and pagination stops.
        """
        # NOTE(review): the page loaded in Selenium is never read; every
        # selector below runs against the Scrapy response. The browser is
        # still opened as before, but is now always quit so that one Chrome
        # process is not leaked per crawled page.
        browser = webdriver.Chrome()
        try:
            browser.get(response.request.url)
        finally:
            browser.quit()

        pager_links = response.css("a._1LKTO3::attr(href)").getall()
        if not pager_links:
            # The response body is static: sleeping and re-running the same
            # selector cannot produce new links, and indexing the empty list
            # would only raise IndexError inside the callback.
            return

        # NOTE(review): presumably the last pager link is "Next"; on the final
        # page it may be "Previous" instead -- confirm against the site markup.
        next_page = pager_links[-1]
        print("\n\n\n NEXT PAGE\n\n\n")
        print("\n" + next_page + "\n")
        next_page = response.urljoin(next_page)
        print(next_page)
        yield scrapy.Request(next_page, callback=self.parse)
uj5u.com熱心網友回復:
你的代碼對我有用,所以我不確定為什麼它對你不起作用。無論如何,下面這種分頁寫法同樣有效,而且更乾淨。
import scrapy
from selenium import webdriver
class QuotesSpider(scrapy.Spider):
    """Spider that follows the "Next" link of a Flipkart listing page."""

    name = "Bider"

    def start_requests(self):
        """Yield one initial request per seed URL, handled by ``parse``."""
        urls = [
            "https://www.flipkart.com/clothing-and-accessories/bottomwear/pr?sid=clo,vua&p[]=facets.ideal_for%5B%5D=Men&p[]=facets.ideal_for%5B%5D=men&otracker=categorytree&fm=neo/merchandising&iid=M_1064313a-7a8d-48f3-8199-daaf60d62ef6_2_372UD5BXDFYS_MC.8HARX8UX7IX5&otracker=hp_rich_navigation_2_2.navigationCard.RICH_NAVIGATION_Fashion~Men%27s+Bottom+Wear_8HARX8UX7IX5&otracker1=hp_rich_navigation_PINNED_neo/merchandising_NA_NAV_EXPANDABLE_navigationCard_cc_2_L1_view-all&cid=8HARX8UX7IX5"
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Print the next-page URL and schedule a request for it.

        The next page is the ``href`` of the first anchor that has a child
        ``span`` whose text is exactly ``Next``. When no such anchor exists,
        nothing is yielded and pagination stops.
        """
        # NOTE(review): the page loaded in Selenium is never read; the XPath
        # below runs against the Scrapy response. The browser is still opened
        # as before, but is now always quit so that one Chrome process is not
        # leaked per crawled page.
        browser = webdriver.Chrome()
        try:
            browser.get(response.request.url)
        finally:
            browser.quit()

        next_page = response.xpath('//a[span[text()="Next"]]/@href').get()
        if not next_page:
            return

        print("\n\n\n NEXT PAGE\n\n\n")
        print("\n" + next_page + "\n")
        next_page = response.urljoin(next_page)
        print(next_page)
        yield scrapy.Request(next_page, callback=self.parse)

轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/412071.html
標籤:
