# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import json
class zufangSpider:
    """Scrape paginated listing pages from nj.lianjia.com and save them to disk.

    For every page the spider downloads the HTML, extracts the listings
    (title, price, room description, image URL, category tags) and writes
    them, JSON-encoded, to one file per page in the current directory.
    """

    def __init__(self, anjuke_name, max_pages=1000):
        """Set up the URL template and request headers.

        Args:
            anjuke_name: Path segment of the site section to crawl
                (for example ``'zufang'``); also used as the output file
                name prefix.
            max_pages: Number of pages to request, starting at page 1.
        """
        self.anjuke_name = anjuke_name
        self.max_pages = max_pages
        self.url_temp = 'https://nj.lianjia.com/' + anjuke_name + '/pg{}/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
        }

    def get_url_list(self):
        """Build the list of page URLs.

        Returns:
            A list of ``max_pages`` URLs for pages ``1..max_pages``.
        """
        # Start at 1 so the page number in the URL matches the page number
        # that run() puts into the output file name.
        return [
            self.url_temp.format(page)
            for page in range(1, self.max_pages + 1)
        ]

    def parse_url(self, url):
        """Download one page and extract its listings.

        Args:
            url: Page URL to request.

        Returns:
            A JSON string holding a list with one object per listing found
            on the page (``'[]'`` when nothing matched).

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        print(url)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        wb_data = requests.get(url, headers=self.headers, timeout=10)
        soup = BeautifulSoup(wb_data.text, 'lxml')

        prices = soup.select('div > span')
        titles = soup.select('p.content__list--item--title.twoline > a')
        rooms = soup.select('p.content__list--item--des')
        imgs = soup.select('a > img')
        cates = soup.select('p.content__list--item--bottom.oneline')

        # NOTE(review): the five selections are made independently and paired
        # by position; zip() stops at the shortest one. 'div > span' and
        # 'a > img' are generic selectors, so verify that they line up with
        # the listing entries on the real page.
        listings = []
        for title, price, room, img, cate in zip(titles, prices, rooms, imgs, cates):
            data = {
                'title': title.get_text(),
                'price': price.get_text(),
                'room': list(room.stripped_strings),
                'img': img.get('src'),
                'cate': list(cate.stripped_strings),
            }
            print(data)
            listings.append(data)

        # Serialize every listing; keeping only one would lose the rest of
        # the page, and an empty page must not leave the result unbound.
        return json.dumps(listings, ensure_ascii=False)

    def save_html(self, html_str, page_num):
        """Write the content for one page to a UTF-8 file.

        Args:
            html_str: Text to write (run() passes the JSON from parse_url).
            page_num: Page number used in the file name.
        """
        file_path = '{}--第{}頁.html'.format(self.anjuke_name, page_num)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_str)

    def run(self):
        """Crawl every page: build the URLs, fetch and parse each, save the result."""
        # 1. Build the URL list.
        url_list = self.get_url_list()
        # 2. Request and parse each page; enumerate supplies the page number
        #    directly instead of a linear list search per iteration.
        for page_num, url in enumerate(url_list, start=1):
            html_str = self.parse_url(url)
            # 3. Save the result.
            self.save_html(html_str, page_num)
if __name__ == '__main__':
    # Script entry point: crawl the 'zufang' section and write one file per page.
    zufangSpider('zufang').run()
# 轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/126200.html
# 標籤:其他技術專區
