因為找不太到途牛的 URL 規律,只能一頁一頁慢慢爬取資料。由於頁面載入很慢,用一臺電腦爬取 4000 多筆資料可能需要數小時。這裡只簡單實作了每個城市第一頁資料的爬取,可以在這個基礎上再實作多個頁面同時爬取,並強化翻頁的功能。
爬取全國酒店資料+可視化
- 爬取資料
- 可視化Flask+Echarts
爬取資料

一個py檔案和一個文本檔案就可以爬取了
首先是py檔案
import json
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
from bs4 import BeautifulSoup
import re
import pymysql
# Start a Chrome session through the chromedriver executable in the working directory.
# `driver` is a module-level global that getData() drives directly.
driver = webdriver.Chrome("chromedriver.exe")
# Open the Tuniu hotel list page; the cityName query value is the URL-encoded
# form of "广州" (Guangzhou), which serves as the starting page.
driver.get("https://hotel.tuniu.com/list/602p0s0b0?cityName=%E5%B9%BF%E5%B7%9E")
# Maximise the browser window.
# NOTE(review): getData() clicks at a fixed pixel offset, so the window size
# presumably has to stay the same for that click to land — confirm.
driver.maximize_window()
# Sleep 3 seconds to give the page time to load.
time.sleep(3)
def judgeLen(temp):
    """Return the first element of ``temp``, or the string ``"null"`` if it is empty.

    Used on ``re.findall`` results so that a field missing from the page is
    stored as the placeholder text ``"null"`` instead of raising IndexError.

    Args:
        temp: A sequence of matches (typically a list of strings). ``None`` is
            treated the same as an empty sequence.

    Returns:
        ``temp[0]`` when ``temp`` has at least one element, otherwise ``"null"``.
    """
    return temp[0] if temp else "null"
# The patterns are compiled once at import time instead of once per hotel card.
# They are tied to the page's generated markup (note the ``data-v-74d0f10f``
# attribute), so they stop matching whenever the site's front end is rebuilt.
_NAME_PATTERN = re.compile(r'span.*?hotel-name f-m.*?>(.*?)</span>')
# Diamond icons are counted the same way as star icons.
_DIAMOND_PATTERN = re.compile(r'(icon icon-diamond)')
_STAR_PATTERN = re.compile(r'(icon icon-star)')
_RATING_PATTERN = re.compile(
    r'"hotel-score f-b f-DINA" data-v-74d0f10f="" style="background: rgb.*?;">(.*?)</div')
# NOTE(review): the suffix literal below is kept exactly as found; confirm it
# matches the characters the live page actually renders, otherwise the comment
# count always falls back to "null".
_COMMENT_PATTERN = re.compile(r'</span><span class="comment-amount f-r" data-v-74d0f10f="">(.*?)條評論')
_PRICE_PATTERN = re.compile(
    r'<span class="amount f-b f-DINA" data-v-74d0f10f="">(.*?)</span><span class="qi')

# CSS selector of the city search box on the hotel list page.
_CITY_INPUT_SELECTOR = ".city-div > input:nth-child(1)"

# Placeholders are filled in by the driver, so quotes inside scraped text
# (e.g. a hotel name containing ') can neither break nor alter the statement.
_INSERT_SQL = (
    "insert into `TC_hotel` "
    "(hname,hbrand,province,city,starlevel,rating,comment_count,price) "
    "values (%s,%s,%s,%s,%s,%s,%s,%s)"
)


def _parse_hotel(card_html):
    """Extract (hname, hbrand, star, rating, comm, price) from one hotel card's HTML.

    Every field that cannot be found is returned as the string "null"
    (see judgeLen); the star level is always "<count>星".
    """
    name = judgeLen(_NAME_PATTERN.findall(card_html))
    if name.find("(") > 0:
        # "<brand>(<name>)": the text before the first "(" is the brand, the
        # text after it minus its final character is the hotel name.
        pieces = name.split("(")
        hbrand = pieces[0]
        hname = pieces[1][:-1]
    else:
        hname = name
        hbrand = "其他"

    # The star level is the number of diamond icons, or of star icons when the
    # card has no diamonds.
    icon_count = len(_DIAMOND_PATTERN.findall(card_html))
    if icon_count == 0:
        icon_count = len(_STAR_PATTERN.findall(card_html))
    star = str(icon_count) + "星"

    rating = judgeLen(_RATING_PATTERN.findall(card_html))
    comm = judgeLen(_COMMENT_PATTERN.findall(card_html))
    price = judgeLen(_PRICE_PATTERN.findall(card_html))
    return hname, hbrand, star, rating, comm, price


def getData():
    """Scrape the first result page of every city in AllCity.txt into MySQL.

    For each "code|name" entry of each province in AllCity.txt the city name
    is typed into the search box of the page already open in the module-level
    ``driver``, the first suggestion is clicked, and every ``div.hotel-item``
    on the resulting page is parsed and inserted into the ``TC_hotel`` table.
    The table must already exist. Each row is committed as soon as it is
    inserted, so rows written before a failure are kept.

    Raises:
        Whatever ``pymysql``, ``selenium``, ``open`` or ``json`` raise; nothing
        is caught here. The database connection is closed in every case.
    """
    connect = pymysql.connect(host="xxxxx", port=12345, user="xxx", passwd="xxxx",
                              database="mydata", charset="utf8")
    try:
        with connect.cursor() as cursor:
            # AllCity.txt maps a province name to a list of "code|name" strings.
            with open("AllCity.txt", mode="r", encoding="utf-8") as file:
                jsondata = json.load(file)

            for pro, cities in jsondata.items():
                for city in cities:
                    # The part after "|" is the city name typed into the search box.
                    place = str(city).split("|")[1]

                    # ---- switch the page to this city ----
                    # find_element("css selector", ...) works on Selenium 3 and 4;
                    # the find_element_by_* helpers were removed in Selenium 4.3.
                    city_input = driver.find_element("css selector", _CITY_INPUT_SELECTOR)
                    city_input.clear()
                    city_input.send_keys(place)
                    time.sleep(2)
                    # Click the first entry of the suggestion box (fixed pixel
                    # offset), which navigates to that city ...
                    ActionChains(driver).move_by_offset(226, 263).click().perform()
                    # ... then move the pointer back so the next offset starts
                    # from the same origin.
                    ActionChains(driver).move_by_offset(-226, -263).perform()
                    time.sleep(5)

                    # ---- parse the hotel cards of the loaded page ----
                    bs = BeautifulSoup(driver.page_source, "html.parser")
                    for div in bs.find_all("div", class_="hotel-item"):
                        hname, hbrand, star, rating, comm, price = _parse_hotel(str(div))
                        cursor.execute(
                            _INSERT_SQL,
                            (str(hname), str(hbrand), str(pro), str(place),
                             str(star), str(rating), str(comm), str(price)),
                        )
                        connect.commit()
                        print("插入資料 "+str(pro), str(place), str(hname), str(hbrand), str(star), str(rating), str(comm), str(price))
    finally:
        connect.close()


if __name__ == '__main__':
    getData()
還有一個文本檔案
copy過去即可
{
"北京": ["bj|北京"],
"天津": ["tj|天津"],
"上海": ["sh|上海"],
"臺灣": ["tw|臺灣"],
"香港": ["hk|香港"],
"澳門": ["am|澳門"],
"河北": ["bd|保定", "cangzhou|滄州", "chengde|承德", "dingzhou|定州", "gt|館陶", "hd|邯鄲", "hs|衡水", "lf|廊坊", "qhd|秦皇島", "sjz|石家莊", "ts|唐山", "xt|邢臺", "zjk|張家口", "zd|正定", "zx|趙縣", "zhangbei|張北"],
"河南": ["ay|安陽", "changge|長葛", "hb|鶴壁", "jiaozuo|焦作", "jiyuan|濟源", "kaifeng|開封", "luoyang|洛陽", "luohe|漯河", "mg|明港", "ny|南陽", "pds|平頂山", "puyang|濮陽", "sq|商丘", "smx|三門峽", "xx|新鄉", "xc|許昌", "xy|信陽", "yuzhou|禹州", "yanling|鄢陵", "zz|鄭州", "zk|周口", "zmd|駐馬店"],
"黑龍江": ["dq|大慶","dxal|大興安嶺", "hrb|哈爾濱", "hegang|鶴崗", "heihe|黑河", "jms|佳木斯", "jixi|雞西", "mdj|牡丹江", "qqhr|齊齊哈爾", "qth|七臺河", "suihua|綏化", "sys|雙鴨山", "yich|伊春"],
"吉林": ["bc|白城", "baishan|白山", "cc|長春", "jl|吉林", "liaoyuan|遼源", "songyuan|松原", "sp|四平", "th|通化", "yanbian|延邊"],
"遼寧" : ["as|鞍山", "benxi|本溪", "cy|朝陽", "dl|大連", "dandong|丹東", "fushun|撫順", "fx|阜新", "hld|葫蘆島", "jinzhou|錦州", "liaoyang|遼陽", "pj|盤錦", "sy|沈陽", "tl|鐵嶺", "wfd|瓦房店", "yk|營口", "pld|莊河"],
"山東": ["bz|濱州", "dz|德州", "dy|東營", "heze|菏澤", "jn|濟南", "jining|濟寧", "kl|墾利", "linyi|臨沂", "lc|聊城", "lw|萊蕪", "qd|青島", "rizhao|日照", "shouguang|壽光", "longkou|龍口", "ta|泰安", "wf|濰坊", "weihai|威海", "yt|煙臺", "zb|淄博", "zaozhuang|棗莊", "zhangqiu|章丘", "zc|諸城"],
"內蒙古": ["alsm|阿拉善盟", "bt|包頭", "bycem|巴彥淖爾", "chifeng|赤峰", "erds|鄂爾多斯", "hu|呼和浩特", "hlbe|呼倫貝爾", "hlr|海拉爾", "tongliao|通遼", "wuhai|烏海", "wlcb|烏蘭察布", "xl|錫林郭勒", "xam|興安盟"],
"江蘇": ["cz|常州", "dafeng|大豐", "danyang|丹陽", "dongtai|東臺", "donghai|東海", "ha|淮安", "haimen|海門", "haian|海安", "jingjiang|靖江", "jianhu|建湖", "liyang|溧陽", "lyg|連云港", "nj|南京", "nt|南通", "pizhou|邳州", "qidong|啟東", "rugao|如皋", "rudong|如東", "su|蘇州", "shuyang|沭陽", "suqian|宿遷", "taizhou|泰州", "taixing|泰興", "wx|無錫", "xinghuashi|興化", "xinyishi|新沂", "xz|徐州", "xzpeixian|沛縣", "yangzhong|揚中", "yz|揚州", "yancheng|鹽城", "zj|鎮江"],
"安徽": ["anqing|安慶", "bengbu|蚌埠", "bozhou|亳州", "ch|巢湖", "chizhou|池州", "chuzhou|滁州", "fy|阜陽", "hf|合肥", "hn|淮南", "huaibei|淮北", "huangshan|黃山", "hexian|和縣", "hq|霍邱", "la|六安", "mas|馬鞍山", "ningguo|寧國", "suzhou|宿州", "tianchang|天長", "tongling|銅陵", "tongcheng|桐城", "wuhu|蕪湖", "xuancheng|宣城"],
"山西": ["changzhi|長治", "dt|大同", "jincheng|晉城", "jz|晉中", "lvliang|呂梁", "linfen|臨汾", "linyixian|臨猗", "qingxu|清徐", "shuozhou|朔州", "ty|太原", "xinzhou|忻州", "yuncheng|運城", "yq|陽泉"],
"陜西": ["ankang|安康", "baoji|寶雞", "hanzhong|漢中", "sl|商洛", "tc|銅川", "wn|渭南", "xa|西安", "xianyang|咸陽", "yanan|延安", "yl|榆林"],
"甘肅": ["by|白銀", "dx|定西", "gn|甘南", "jinchang|金昌", "jyg|嘉峪關", "jq|酒泉", "lz|蘭州", "linxia|臨夏", "ln|隴南", "pl|平涼", "qingyang|慶陽", "tianshui|天水", "wuwei|武威", "zhangye|張掖"],
"浙江": ["hz|杭州", "cixi|慈溪", "changxing|長興", "deqing|德清", "dongyang|東陽", "haining|海寧", "huzhou|湖州", "jiashanx|嘉善", "jx|嘉興", "jh|金華", "lishui|麗水", "nb|寧波", "quzhou|衢州", "ruiancity|瑞安", "sx|紹興", "tongxiang|桐鄉", "tz|臺州", "wenling|溫嶺", "wz|溫州", "xiangshanxian|象山", "yiwu|義烏", "yueqingcity|樂清", "yuyao|余姚", "zhoushan|舟山", "zhuji|諸暨"],
"江西": ["fuzhou|撫州", "ganzhou|贛州", "jj|九江", "ja|吉安", "jdz|景德鎮", "nc|南昌", "px|萍鄉", "sr|上饒", "xinyu|新余", "yingtan|鷹潭", "yichun|宜春", "yxx|永新"],
"湖北": ["es|恩施", "ez|鄂州", "hshi|黃石", "hg|黃岡", "jingzhou|荊州", "jingmen|荊門", "qianjiang|潛江", "shiyan|十堰", "snj|神農架", "suizhou|隨州", "tm|天門", "wh|武漢", "xf|襄陽", "xiaogan|孝感", "xiantao|仙桃", "xianning|咸寧", "yc|宜昌", "yidou|宜都"],
"湖南": ["cs|長沙", "changde|常德", "chenzhou|郴州", "hy|衡陽", "hh|懷化", "ld|婁底", "shaoyang|邵陽", "xiangtan|湘潭", "xiangxi|湘西", "yy|岳陽", "yongzhou|永州", "yiyang|益陽", "zhuzhou|株洲", "zjj|張家界"],
"貴州": ["anshun|安順", "bijie|畢節", "gy|貴陽", "lps|六盤水", "qdn|黔東南", "qn|黔南", "qxn|黔西南", "tr|銅仁", "zunyi|遵義"],
"四川": ["ab|阿壩", "bazhong|巴中", "cd|成都", "deyang|德陽", "dazhou|達州", "ga|廣安", "guangyuan|廣元", "ganzi|甘孜", "ls|樂山", "luzhou|瀘州", "liangshan|涼山", "mianyang|綿陽", "ms|眉山", "scnj|內江", "nanchong|南充", "panzhihua|攀枝花", "suining|遂寧", "yb|宜賓", "ya|雅安", "zg|自貢", "zy|資陽"],
"云南": ["bs|保山", "cx|楚雄", "dali|大理", "diqing|迪慶", "dh|德宏", "honghe|紅河", "km|昆明", "lj|麗江", "lincang|臨滄", "nujiang|怒江", "pe|普洱", "qj|曲靖", "ws|文山", "bn|西雙版納", "yx|玉溪", "zt|昭通"],
"新疆": ["aks|阿克蘇", "ale|阿拉爾", "bygl|巴音郭楞", "betl|博爾塔拉", "changji|昌吉", "hami|哈密", "ht|和田", "klmy|克拉瑪依", "kel|庫爾勒", "ks|喀什", "kzls|克孜勒蘇", "shz|石河子", "tlf|吐魯番", "tmsk|圖木舒克", "xj|烏魯木齊", "wjq|五家渠", "yili|伊犁", "alt|阿勒泰", "tac|塔城"],
"寧夏": ["guyuan|固原", "szs|石嘴山", "wuzhong|吳忠", "yinchuan|銀川", "zw|中衛"],
"青海": ["guoluo|果洛", "huangnan|黃南", "hx|海西", "haidong|海東", "haibei|海北", "hainan|海南", "xn|西寧", "ys|玉樹"],
"西藏": ["al|阿里", "changdu|昌都", "lasa|拉薩", "linzhi|林芝", "nq|那曲", "rkz|日喀則", "sn|山南", "rituxian|日土", "gaizexian|改則"],
"廣西": ["baise|百色", "bh|北海", "chongzuo|崇左", "fcg|防城港", "gl|桂林", "gg|貴港", "hc|河池", "hezhou|賀州", "liuzhou|柳州", "lb|來賓", "nn|南寧", "qinzhou|欽州", "wuzhou|梧州", "yulin|玉林"],
"廣東": ["chaozhou|潮州", "dg|東莞", "fs|佛山", "gz|廣州", "huidong|惠東", "huizhou|惠州", "heyuan|河源", "jm|江門", "jy|揭陽", "mm|茂名", "mz|梅州", "qingyuan|清遠", "sd|順德", "sz|深圳", "st|汕頭", "sg|韶關", "sw|汕尾", "taishan|臺山", "yj|陽江", "yangchun|陽春", "yf|云浮", "zh|珠海", "zs|中山", "zhanjiang|湛江", "zq|肇慶", "boluo|博羅"],
"福建": ["fz|福州", "jinjiangshi|晉江", "ly|龍巖", "nd|寧德", "np|南平", "nananshi|南安", "pt|莆田", "qz|泉州", "sm|三明", "shishi|石獅", "wuyishan|武夷山", "xm|廈門", "zhangzhou|漳州"],
"海南": ["haikou|海口", "sansha|三沙", "sanya|三亞", "wzs|五指山", "qh|瓊海", "wenchang|文昌", "wanning|萬寧", "tunchang|屯昌", "qiongzhong|瓊中", "lingshui|陵水", "df|東方", "da|定安", "cm|澄邁", "baoting|保亭", "baish|白沙", "tanzhou|儋州"]
}
爬出來的資料表(星鉆可以不作區分)

設計表(方便插入資料就全部varchar,見諒)

可視化Flask+Echarts

圈起來的就是用到的

首先是app.py檔案
from flask import Flask, render_template
from flask_sqlalchemy import SQLAlchemy
# Create the Flask application object.
app = Flask(__name__)
# SQLAlchemy connection URL (MySQL through the PyMySQL driver).
# NOTE(review): the user, password, host, port and database parts of this
# literal look like placeholders that must be replaced before running — confirm.
app.config['SQLALCHEMY_DATABASE_URI'] = 'mysql+pymysql://用戶名:密碼@域名:埠/資料庫?charset=utf8'
# setdefault only stores True when the key is not already present in app.config.
app.config.setdefault('SQLALCHEMY_TRACK_MODIFICATIONS', True)
# Bind Flask-SQLAlchemy to the app; `db` supplies the model base class and the session.
db = SQLAlchemy(app)
"""
1) 撰寫程式,計算每個酒店的綜合得分
先對星級(starlevel)、評價(rating)、評論數(comment_count)3個欄位做以下轉換:
對評價(rating)和評論數(comment_count)兩個欄位做歸一化,調整到[0, 1]區間,得到評價得分和評論得分;
星級得分為: 星數 * 0.2 ,
綜合得分為: 星級得分(30%)、評價得分(50%)、評論得分(20%)的加權平均
2) 統計每個省份酒店的平均總得分
3) 主標題為“全國各省酒店綜合得分”(紅色,加粗)
4) 輸出全國各省綜合得分情況地圖
"""
# SQLAlchemy mapping of the scraped hotel table.
class yang_Table(db.Model):
    """ORM model for one scraped hotel row.

    Every column is declared as a 50-character string, including the numeric
    looking ones, so readers have to convert starlevel / rating /
    comment_count / price themselves.
    """

    # NOTE(review): the scraper inserts into `TC_hotel` while this model maps
    # 'tc_hotel'; table names can be case-sensitive in MySQL depending on the
    # server configuration — confirm both refer to the same table.
    __tablename__ = 'tc_hotel'

    # SQLAlchemy needs a primary key to map the class; hname is used for that.
    hname = db.Column(db.String(50), primary_key=True)
    hbrand = db.Column(db.String(50))
    province = db.Column(db.String(50))
    city = db.Column(db.String(50))
    # Stored as "<count>星", e.g. "4星".
    starlevel = db.Column(db.String(50))
    rating = db.Column(db.String(50))
    comment_count = db.Column(db.String(50))
    price = db.Column(db.String(50))
def _scaled(value, low, span):
    """Min-max scale ``value`` into [0, 1], rounded to 2 decimals.

    Returns 0.0 when ``span`` is 0 (every hotel has the same value), which
    would otherwise be a division by zero.
    """
    if span == 0:
        return 0.0
    return round((value - low) / span, 2)


def _province_averages(rows):
    """Return the average composite hotel score per province.

    Args:
        rows: Iterable of ``(province, starlevel, rating, comment_count)``
            where ``starlevel`` looks like ``"4星"`` and the other two are
            numeric strings.

    Returns:
        A list of ``{"name": province, "value": average_score}`` dicts in
        order of first appearance. Scores lie in [0, 100]:
        ``star_score * 30 + rating_score * 50 + comment_score * 20`` with
        ``star_score = stars * 0.2`` and the other two min-max scaled over
        all usable rows. Rows whose star level, rating or comment count is
        not numeric (for example the "null" placeholder) are left out.
        An empty list is returned when no row is usable.
    """
    hotels = []
    for province, starlevel, rating, comment_count in rows:
        try:
            stars = int(str(starlevel)[:-1])  # drop the trailing "星"
            rating_value = float(rating)
            comment_value = float(comment_count)
        except (TypeError, ValueError):
            continue
        hotels.append((province, stars, rating_value, comment_value))

    if not hotels:
        return []

    rating_values = [hotel[2] for hotel in hotels]
    comment_values = [hotel[3] for hotel in hotels]
    rating_low = min(rating_values)
    rating_span = max(rating_values) - rating_low
    comment_low = min(comment_values)
    comment_span = max(comment_values) - comment_low

    # Group by province name instead of by position so that no row is lost at
    # a province boundary and the input does not have to be sorted.
    scores_by_province = {}
    for province, stars, rating_value, comment_value in hotels:
        # The intermediate values are rounded to 2 decimals as before, so the
        # scores stay comparable with earlier output.
        star_score = round(stars * 0.2, 2)
        rating_score = _scaled(rating_value, rating_low, rating_span)
        comment_score = _scaled(comment_value, comment_low, comment_span)
        total = round(star_score * 30 + rating_score * 50 + comment_score * 20, 2)
        scores_by_province.setdefault(province, []).append(total)

    # Shape expected by the ECharts map series: [{"name": ..., "value": ...}].
    return [
        {"name": province, "value": round(sum(scores) / len(scores), 2)}
        for province, scores in scores_by_province.items()
    ]


@app.route("/")
@app.route("/china")
def china():
    """Render the national map of average composite hotel scores per province.

    Reads province, star level, rating and comment count of every row of the
    hotel table, computes the per-province averages and passes them to
    ``china.html`` as ``data``, together with the chart ``title`` and the
    series name ``tips``.
    """
    rows = db.session.query(
        yang_Table.province,
        yang_Table.starlevel,
        yang_Table.rating,
        yang_Table.comment_count,
    ).all()
    result = _province_averages(rows)
    title = "全國各省酒店綜合得分"
    tips = '綜合得分'
    return render_template("china.html", data=result, title=title, tips=tips)


if __name__ == "__main__":
    app.run(host='127.0.0.1', port=5222, debug=True)
然后就是html檔案
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>資料可視化</title>
    <!--
        Template variables supplied by the Flask view:
          title - chart title text
          tips  - series name shown in the tooltip
          data  - list of {"name": province, "value": score} objects
    -->
    <style>
        #map {
            width: 1000px;
            height: 600px;
            margin: 50px auto;
        }
    </style>
</head>
<body>
    <div id="map">
    </div>
    <!-- ECharts core and the China map definition; scripts belong inside <body> -->
    <script src="{{ url_for('static', filename='js/echarts.js') }}" charset="utf-8"></script>
    <script src="{{ url_for('static', filename='js/china.js') }}" charset="utf-8"></script>
    <script>
        var myChart = echarts.init(document.getElementById('map'));
        var option = {
            title: {
                // tojson emits a correctly quoted and escaped JavaScript string
                text: {{ title|tojson }},
                textStyle: {
                    color: 'red',
                    fontSize: 16,
                    fontWeight: 'bolder'
                },
                left: '40%'
            },
            tooltip: {
                formatter: function (params, ticket, callback) {
                    return params.seriesName + '<br />' + params.name + ':' + params.value
                }
            },
            visualMap: {
                // the composite score ranges from 0 to 100
                min: 0,
                max: 100,
                left: 'left',
                top: 'bottom',
                text: ['高', '低'],
                inRange: {
                    color: ['#00FF00', '#FFFF00', '#FF0000']
                },
                show: true
            },
            geo: {
                map: 'china',
                roam: false,
                zoom: 1.23,
                label: {
                    normal: {
                        show: true,
                        fontSize: '10',
                        color: 'rgba(0,0,0,0.7)'
                    }
                },
                itemStyle: {
                    normal: {
                        borderColor: 'rgba(0, 0, 0, 0.2)'
                    },
                    emphasis: {
                        areaColor: '#F3B329',
                        shadowOffsetX: 0,
                        shadowOffsetY: 0,
                        shadowBlur: 20,
                        borderWidth: 0,
                        shadowColor: 'rgba(0, 0, 0, 0.5)'
                    }
                }
            },
            series: [
                {
                    name: {{ tips|tojson }},
                    type: 'map',
                    // draw the series on the geo component declared above
                    geoIndex: 0,
                    // serialised as real JSON rather than a Python repr
                    data: {{ data|tojson }}
                }
            ]
        };
        myChart.setOption(option);
    </script>
</body>
</html>
js檔案如果沒有的話可以私信我

原創不易,請給博主一個小小的贊吧~
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/256764.html
標籤:其他
