我正在嘗試抓取以下頁面上的產品列表:https://www.beermerchants.com/browse/brewery/cantillon,但我只想列印有庫存的產品。我已經能夠使用以下代碼抓取完整的產品列表,但是該如何修改它,才能只抓取有庫存的產品?
import ssl
import requests
import sys
import time
import smtplib
from email.message import EmailMessage
import hashlib
from urllib.request import urlopen
from datetime import datetime
import json
import random
import requests
from itertools import cycle
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib3.exceptions import InsecureRequestWarning
from requests_html import HTMLSession
# Script: fetch the Cantillon brewery listing page and print the URL of every
# product link found on it (no stock filtering is applied here).

# NOTE: this session is created but never used below; the page is fetched
# with plain `requests`.
session = HTMLSession()

# Suppress only the single warning from urllib3 needed.
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

# Pool of desktop browser User-Agent strings to choose from.
user_agent_list = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
]

# Pick a random user agent. A single request is made, so one pick is enough;
# re-picking in a loop would only ever keep the last choice.
user_agent = random.choice(user_agent_list)
# Set the headers
headers = {'User-Agent': user_agent}

url = 'https://www.beermerchants.com/browse/brewery/cantillon'
# The timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, features="html.parser")

# Collect the href of every anchor carrying the "product-item-link" class.
link = [
    product['href']
    for product in soup.find_all('a', href=True, class_="product-item-link")
]
print(link)
提前致謝!!!
uj5u.com熱心網友回復:
我已將您的代碼改為使用 XPath,以便加入更複雜的篩選邏輯。
我篩選的是可以加入購物車的產品(這表示它們有庫存)。
import ssl
import requests
import sys
import time
import smtplib
from email.message import EmailMessage
import hashlib
from urllib.request import urlopen
from datetime import datetime
import json
import random
import requests
from itertools import cycle
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from urllib3.exceptions import InsecureRequestWarning
from requests_html import HTMLSession
# Script: fetch the Cantillon brewery listing page and print the product URLs
# of the items that can be added to the cart (used as the "in stock" signal).

# NOTE: this session is created but never used below; the page is fetched
# with plain `requests`.
session = HTMLSession()

# Suppress only the single warning from urllib3 needed.
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

# Pool of desktop browser User-Agent strings to choose from.
user_agent_list = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
]

# Pick a random user agent. A single request is made, so one pick is enough;
# re-picking in a loop would only ever keep the last choice.
user_agent = random.choice(user_agent_list)
# Set the headers
headers = {'User-Agent': user_agent}

url = 'https://www.beermerchants.com/browse/brewery/cantillon'
# The timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, features="html.parser")

# Re-parse the BeautifulSoup output with lxml so XPath can be used.
dom = etree.HTML(str(soup))

# Select product cards that contain an add-to-cart form, then take the href of
# the product link inside each such card.
# NOTE(review): the attribute test on <a> was lost in the source (`//a[@]`);
# "product-item-link" is the class the page's product anchors are matched by
# elsewhere in this script's history -- confirm against the live markup.
link = dom.xpath(
    '//div[contains(@class, "product-item-info") '
    'and .//form[@data-role="tocart-form"]]'
    '//a[contains(@class, "product-item-link")]/@href'
)
for href in link:
    print(href)
uj5u.com熱心網友回復:
您必須檢查產品的 div 是否包含顯示「缺貨」(Out of stock)的 <span> 元素。簡單的例子:
# Append to `link` only the products whose card shows no "Out of stock" label.
# Expects `soup` (the parsed listing page) and `link` (a list) to exist already.
for product in soup.find_all('a', href=True, class_="product-item-link"):
    # The element two levels above the link is treated as the product's card.
    # NOTE(review): assumes that grandparent wraps the stock label -- confirm
    # against the live markup.
    container = product.parent.parent
    # A product is out of stock if any <span> in its card says so.
    out_of_stock = any(
        "Out of stock" in span.text for span in container.find_all('span')
    )
    if not out_of_stock:
        link.append(product['href'])
uj5u.com熱心網友回復:
以下代碼只會抓取有庫存產品的鏈接,也就是頁面上帶有「Add to Cart」(加入購物車)按鈕的產品。
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Script: print the product URLs of the Cantillon items that show an
# "Add to Cart" button (used as the "in stock" signal).
url_link = 'https://www.beermerchants.com/browse/brewery/cantillon'

# The timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url_link, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

lst = []
# NOTE(review): the container part of the selector was lost in the source
# (`div[]`, which is not valid CSS). "product-item-info" is assumed to be the
# class of each product card and "product-item-link" the class of its product
# anchor -- confirm against the live markup.
for card in soup.select('div.product-item-info:has(:-soup-contains("Add to Cart"))'):
    anchor = card.select_one('a.product-item-link[href]')
    # Skip cards without a product link rather than failing on None.
    if anchor is not None:
        lst.append(anchor['href'])
print(lst)
輸出:
https://www.beermerchants.com/cantillon-gueuze-75cl-bottle
https://www.beermerchants.com/cantillon-rose-de-gambrinus-75cl-bottle
https://www.beermerchants.com/cantillon-kriek-75cl-bottle
https://www.beermerchants.com/cantillon-grand-cru-bruocsella-75cl-bottle
https://www.beermerchants.com/cantillon-kriek-37-5cl-bottle
https://www.beermerchants.com/cantillon-rose-de-gambrinus-37-5cl-bottle
https://www.beermerchants.com/cantillon-c-est-bon-aluminium-sign
https://www.beermerchants.com/cantillon-gueuze-new-aluminium-sign
https://www.beermerchants.com/cantillon-traditionnal-gueuze-glas-33-cl
https://www.beermerchants.com/cantillon-super-tasting-glass-magnifica
轉載請註明出處,本文鏈接:https://www.uj5u.com/gongcheng/527420.html
上一篇:使用“get”函式讀取URL時出現python網路抓取問題
下一篇:通過更改Xpaths抓取資料
