我正在嘗試使用 BeautifulSoup 從網站上抓取房源列表(listings)中每筆物件的 MLS 編號、價格和地址。
import requests
from bs4 import BeautifulSoup
# string url
str_url = 'https://www.utahrealestate.com/search/map.search'
# get response
response = requests.get(str_url)
# get html
soup = BeautifulSoup(response.text, 'html.parser')
# get the element holding the number of listings (I cant get this to work; it returns NoneType)
tag_view_results = soup.find('li', {'class': 'view-results'})
# soup.find() returns None when no matching <li> exists in the fetched HTML,
# so stop here with an explicit message instead of failing on the next line
if tag_view_results is None:
    raise ValueError("no <li class='view-results'> element found in the response HTML")
# find() returns a Tag, not a str, so split the tag's text rather than the tag itself;
# assumes the third whitespace-separated token is the page count -- TODO confirm against the live page
int_n_pages = int(tag_view_results.get_text().split()[2])
接下來,我的計劃是遍歷所有頁面,並從每筆房源列表中提取資訊。
大致像這樣:
# pandas is needed for the DataFrame / CSV export at the end of this snippet
import pandas as pd

# empty list
list_dict_cards = []
# iterate through pages (range's stop is exclusive, so "+ 1" includes the last page)
for int_page in range(1, int_n_pages + 1):
    # get url
    str_url = f'https://www.utahrealestate.com/search/map.search/page/{int_page}/vtype/map'
    # get response
    response = requests.get(str_url)
    # get html
    soup = BeautifulSoup(response.text, 'html.parser')
    # get property cards
    property_cards = soup.find_all(class_='property___card')
    # iterate through property cards
    for card in property_cards:
        # empty dict
        dict_card = {}
        # get mls number (second space-separated token of the element's text)
        int_mls = card.find(class_='mls___number').text.split(' ')[1]
        # put into dict_card
        dict_card['mls'] = int_mls
        # I would get other info here as well and put into dict_card
        # append dict_card to list_dict_cards
        list_dict_cards.append(dict_card)
# make df (one row per collected card)
df_cards = pd.DataFrame(list_dict_cards)
# save
df_cards.to_csv('./output/df_dict_cards.csv', index=False)
我很確定該網站正試圖阻止以編程方式訪問它顯示的大部分資訊。
有什麼方法可以繞過這個限制嗎?
uj5u.com熱心網友回復:
如果您先訪問主頁(可能是為了讓會話中帶有正確的 cookie),再使用正確的標頭向該端點發出 POST 請求,就可以取得資料。下面的示例似乎可以有效地抓取該端點。速度慢的是這個網站,而不是腳本。
import requests
# Use a single Session so that any cookies received on the first request
# are sent automatically with the later POST requests.
s = requests.Session()
# Headers for the initial visit to the search page.
headers = {
    'Accept':'application/json, text/javascript, */*; q=0.01',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
}
home = 'https://www.utahrealestate.com/search/map.search'
# Visit the home page first; the response itself is not used afterwards.
step = s.get(home,headers=headers)
# Headers for the results endpoint: same Accept / User-Agent, plus the
# form content type, Host / Origin / Referer and the XMLHttpRequest marker.
headers = {
    'Accept':'application/json, text/javascript, */*; q=0.01',
    'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
    'Host':'www.utahrealestate.com',
    'Origin':'https://www.utahrealestate.com',
    'Referer':'https://www.utahrealestate.com/search/map.search',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'X-Requested-With':'XMLHttpRequest'
}
# Request pages 1 through 4 (range's stop value is exclusive).
for page in range(1,5):
    url = f'https://www.utahrealestate.com/search/map.inline.results/pg/{page}/sort/entry_date_desc/paging/0/dh/862'
    # POST through the same session and decode the JSON body.
    data = s.post(url,headers=headers).json()
    # Number of entries under the 'listing_data' key of the response.
    results = len(data['listing_data'])
    print(f'Scraped {results} results from page {page}')
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/412063.html
標籤:
