本文重點
- 系統分析網頁性質
- 結構化的資料解析
- csv資料保存
環境介紹
- python 3.8
- pycharm 專業版 >>> 激活碼
模塊使用
- requests >>> pip install requests
- parsel >>> pip install parsel
- csv
【付費VIP完整版】只要看了就能學會的教程,80集Python基礎入門視頻教學點擊免費觀看
對于本篇文章有疑問,或者想要資料集的同學也可以點這里加群:1039649593
爬蟲代碼實作步驟: 發送請求 >>> 獲取資料 >>> 解析資料 >>> 保存資料
匯入模塊
import csv
import re

import parsel  # HTML parsing; third-party: pip install parsel
import requests  # HTTP requests; third-party: pip install requests
發送請求, 對于房源列表頁發送請求
# First page of the listing index that is scraped.
url = 'https://bj.lianjia.com/ershoufang/pg1/'
# Request headers: the User-Agent carries basic browser information, so the
# Python script presents itself to the server as a browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}
# A timeout keeps a stalled connection from hanging the script forever.
response = requests.get(url=url, headers=headers, timeout=10)
# Stop on 4xx/5xx responses instead of parsing an error page later on.
response.raise_for_status()
獲取資料
# Write the body of the listing-page response (its text) to stdout.
print(response.text)
解析資料
# Wrap the listing-page text in a Selector so it can be queried with CSS.
selector_1 = parsel.Selector(response.text)
# href attribute of every listing title link on the page (the detail pages).
href = selector_1.css('div.leftContent li div.title a::attr(href)').getall()

# Compiled once, outside the loop.  Both the traditional and the simplified
# form of the "floor" character are accepted, so the pattern matches whichever
# script the page is served in.
floor_pattern = re.compile(r'共(\d+)[層层]')

# NOTE: csv_writer is created in the "save data" step of this script; that
# step has to run before this loop.
for link in href:
    html_data = requests.get(url=link, headers=headers, timeout=10).text
    selector = parsel.Selector(html_data)

    title = selector.css('.title h1::text').get()  # title
    area = selector.css('.areaName .info a:nth-child(1)::text').get()  # district
    community_name = selector.css('.communityName .info::text').get()  # community
    room = selector.css('.room .mainInfo::text').get()  # layout
    room_type = selector.css('.type .mainInfo::text').get()  # orientation

    # Floor text such as '中樓層/共5層': keep the part after the last '/',
    # then extract the number of storeys ('5').  A missing node or an
    # unexpected format yields '' instead of raising.
    floor_text = (selector.css('.room .subInfo::text').get() or '').split('/')[-1]
    floor_match = floor_pattern.search(floor_text)
    height = floor_match.group(1) if floor_match else ''

    # Decoration: the part after the last '/' of the sub-info text.
    sub_info = (selector.css('.type .subInfo::text').get() or '').split('/')[-1]
    # Elevator.  NOTE: the value may be None or '暫無資料電梯'; it is stored
    # as-is and can be normalised later if needed.
    elevator = selector.css('.content li:nth-child(12)::text').get()
    # Floor area with the unit sign stripped.
    house_area = (selector.css('.content li:nth-child(3)::text').get() or '').replace('㎡', '')
    price = selector.css('.price .total::text').get()  # price (10k CNY)
    # Construction year with the '年建' suffix stripped.
    date = (selector.css('.area .subInfo::text').get() or '').replace('年建', '')

    dit = {
        '標題': title,
        '市區': area,
        '小區': community_name,
        '戶型': room,
        '朝向': room_type,
        '樓層': height,
        '裝修情況': sub_info,
        '電梯': elevator,
        '面積(㎡)': house_area,
        '價格(萬元)': price,
        '年份': date,
    }
    csv_writer.writerow(dit)
    print(title, area, community_name, room, room_type, height, sub_info,
          elevator, house_area, price, date, sep='|')
保存資料
# Append mode keeps the rows written by earlier runs.  The handle stays open
# so the scraping loop can write through csv_writer; call f.close() once all
# rows have been written.
f = open('二手房資料.csv', mode='a', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    '標題',
    '市區',
    '小區',
    '戶型',
    '朝向',
    '樓層',
    '裝修情況',
    '電梯',
    '面積(㎡)',
    '價格(萬元)',
    '年份',
])
# Only a new (empty) file needs the header row.  Writing it unconditionally in
# append mode would insert another header in the middle of the data on every
# re-run.
if f.tell() == 0:
    csv_writer.writeheader()

資料可視化
匯入所需模塊
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.charts import Grid
from pyecharts.charts import Line
from pyecharts.charts import Map
from pyecharts.charts import Pie
from pyecharts.charts import Scatter
讀取資料
# Load the listings dataset into a DataFrame.
# NOTE(review): the scraper section writes '二手房資料.csv' while this reads
# '鏈家.csv' — confirm which file is intended.
df = pd.read_csv('鏈家.csv', encoding='utf-8')
# Preview the first rows of the frame.
df.head()

各城區二手房數量北京市地圖
# Append the '區' suffix to every entry of `region`
# (presumably so the names match the map's region names — TODO confirm).
new = [name + '區' for name in region]
# One [district, count] pair per map region.
district_counts = [[district, total] for district, total in zip(new, count)]

m = Map()
m.add('', district_counts, '北京')
m.set_global_opts(
    title_opts=opts.TitleOpts(title='北京市二手房各區分布'),
    visualmap_opts=opts.VisualMapOpts(max_=3000),
)
m.render_notebook()

各城區二手房數量-平均價格柱狀圖
# Values of df_price rounded to two decimals; plotted as the line series.
# (The original also evaluated df_price.values.tolist() as a bare statement
# and discarded the result; that dead statement is dropped.)
price = [round(x, 2) for x in df_price.values.tolist()]

# Bar series: `count` per entry of `region`, plus a second y-axis for price.
bar = (
    Bar()
    .add_xaxis(region)
    .add_yaxis('數量', count, label_opts=opts.LabelOpts(is_show=True))
    .extend_axis(
        yaxis=opts.AxisOpts(
            name="價格(萬元)",
            type_="value",
            min_=200,
            max_=900,
            interval=100,
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
        )
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='各城區二手房數量-平均價格柱狀圖'),
        tooltip_opts=opts.TooltipOpts(
            is_show=True, trigger="axis", axis_pointer_type="cross"
        ),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
        ),
        yaxis_opts=opts.AxisOpts(
            name='數量',
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=False),
        ),
    )
)

# Line series drawn against the second y-axis (yaxis_index=1) added above.
line2 = (
    Line()
    .add_xaxis(xaxis_data=region)
    .add_yaxis(
        series_name="價格",
        yaxis_index=1,
        y_axis=price,
        label_opts=opts.LabelOpts(is_show=True),
        z=10,
    )
)

# Draw the line on top of the bars, then place the combined chart in a grid
# with explicit left/right margins.
bar.overlap(line2)
grid = Grid()
grid.add(bar, opts.GridOpts(pos_left="5%", pos_right="20%"), is_control_axis_index=True)
grid.render_notebook()

# Community names (x-axis) and their prices (bar heights) from top_price.
area0 = top_price['小區'].values.tolist()
# NOTE: despite its name, `count` holds prices here; the name is kept because
# the rest of the script reuses it.
count = top_price['價格(萬元)'].values.tolist()

# The original labelled the series and the x-axis '數量' (quantity), although
# the bars show prices and the x-axis lists communities; the labels now match
# the data.
bar = (
    Bar()
    .add_xaxis(area0)
    .add_yaxis('價格(萬元)', count, category_gap='50%')
    .set_global_opts(
        yaxis_opts=opts.AxisOpts(name='價格(萬元)'),
        xaxis_opts=opts.AxisOpts(name='小區'),
    )
)
bar.render_notebook()

散點圖
# Scatter plot of price against floor area, one point per row of df.
scatter_x = df['面積(㎡)'].values.tolist()
scatter_y = df['價格(萬元)'].values.tolist()

s = Scatter()
s.add_xaxis(scatter_x)
s.add_yaxis('', scatter_y)
# The x-axis is set to type 'value' (numeric) rather than the default.
s.set_global_opts(xaxis_opts=opts.AxisOpts(type_='value'))
s.render_notebook()

房屋朝向占比
# Index labels and values of df_direction: one pie slice per label.
directions = df_direction.index.tolist()
count = df_direction.values.tolist()

c1 = (
    Pie(init_opts=opts.InitOpts(width='800px', height='600px'))
    .add(
        '',
        [list(z) for z in zip(directions, count)],
        radius=['20%', '60%'],  # inner / outer radius -> ring-shaped pie
        center=['40%', '50%'],
        # rosetype="radius" is intentionally left disabled here.
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='房屋朝向占比', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_left="80%", pos_top="25%", orient="vertical"
        ),
    )
    # `position` is a label option, so it is set on LabelOpts.  The original
    # passed it to set_series_opts itself, where it is not a label setting.
    .set_series_opts(
        label_opts=opts.LabelOpts(position="outside", formatter='{b}:{c} ({d}%)')
    )
)
c1.render_notebook()

裝修情況/有無電梯玫瑰圖(組合圖)
# Bar data: index labels and values of df_fitment.
fitment = df_fitment.index.tolist()
count1 = df_fitment.values.tolist()
# Pie data.
# NOTE(review): the pie is titled '有/無電梯' (elevator) but is fed from
# df_direction — confirm whether an elevator series was intended here.
directions = df_direction.index.tolist()
count2 = df_direction.values.tolist()

# Horizontal bar chart (axes swapped by reversal_axis) with labels to the
# right of each bar.
bar = (
    Bar()
    .add_xaxis(fitment)
    .add_yaxis('', count1, category_gap='50%')
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position='right'))
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(name='數量'),
        title_opts=opts.TitleOpts(
            title='裝修情況/有無電梯玫瑰圖(組合圖)', pos_left='33%', pos_top="5%"
        ),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_left="90%", pos_top="58%", orient="vertical"
        ),
    )
)

# Small rose-type pie centred at 75% / 65% of the canvas.
c2 = (
    Pie(init_opts=opts.InitOpts(width='800px', height='600px'))
    .add(
        '',
        [list(z) for z in zip(directions, count2)],
        radius=['10%', '30%'],
        center=['75%', '65%'],
        rosetype="radius",
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='有/無電梯', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_left="90%", pos_top="15%", orient="vertical"
        ),
    )
    # `position` is a label option, so it is set on LabelOpts.  The original
    # passed it to set_series_opts itself, where it is not a label setting.
    .set_series_opts(
        label_opts=opts.LabelOpts(position="outside", formatter='{b}:{c} \n ({d}%)')
    )
)

# Draw the pie on top of the bar chart and render the combination.
bar.overlap(c2)
bar.render_notebook()
二手房樓層分布柱狀縮放圖
# Index labels and values of df_floor: one bar per label.
floor = df_floor.index.tolist()
count = df_floor.values.tolist()

bar = Bar()
bar.add_xaxis(floor)
bar.add_yaxis('數量', count)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='二手房樓層分布柱狀縮放圖'),
    yaxis_opts=opts.AxisOpts(name='數量'),
    xaxis_opts=opts.AxisOpts(name='樓層'),
    # Slider-type data zoom control.
    datazoom_opts=opts.DataZoomOpts(type_='slider'),
)
bar.render_notebook()

房屋面積分布縱向柱狀圖
# Index labels and values of df_area: one bar per label.
area = df_area.index.tolist()
count = df_area.values.tolist()

bar = Bar()
bar.add_xaxis(area)
bar.add_yaxis('數量', count)
# Swap the axes so the bars run horizontally, with value labels on the right.
bar.reversal_axis()
bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='房屋面積分布縱向柱狀圖'),
    yaxis_opts=opts.AxisOpts(name='面積(㎡)'),
    xaxis_opts=opts.AxisOpts(name='數量'),
)
bar.render_notebook()

對于本篇文章有疑問,或者想要資料集的同學也可以點這里
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/304156.html
標籤:Python
