目的:爬取攜程網火車票查詢中的單程與中轉資訊
單程
url=“https://trains.ctrip.com/trainbooking/search?tocn=%25e5%258d%2583%25e5%25b2%259b%25e6%25b9%2596&fromcn=%25e6%259d%25ad%25e5%25b7%259e&day=2020-12-31”
中轉
url=“https://trains.ctrip.com/pages/booking/hubSingleTrip?ticketType=2&fromCn=%25E6%259D%25AD%25E5%25B7%259E&toCn=%25E5%258D%2583%25E5%25B2%259B%25E6%25B9%2596&departDate=2020-12-31”
采用parse.quote()進行url轉碼
采用csv進行資料保存
random.choice進行選擇一個User Agent 自認為這是個不錯的習慣
攜程的單程車次資訊直接包含在原網頁的源代碼中
攜程的中轉車次資訊不在網頁源代碼中,而是由另一個回傳 JSON 的介面(js_url)提供
LET’S GO
url="https://trains.ctrip.com/pages/booking/hubSingleTrip?ticketType=5&fromCn=%25E6%259D%25AD%25E5%25B7%259E&toCn=%25E6%2596%25B0%25E4%25B9%25A1&departDate=2020-12-30" # Sample Ctrip train page URL; query parameters: fromCn = departure station, toCn = destination station, departDate = travel date
# NOTE(review): the original comment labelled this the one-way page, but the path is hubSingleTrip - confirm which page this sample belongs to.
# Note 1: the station names in the page URL's query string are URL-encoded twice.
# The one-way train information is contained in the HTML source of the page itself.
'''
url="https://trains.ctrip.com/pages/booking/hubSingleTrip?ticketType=2&fromCn=%25E6%259D%25AD%25E5%25B7%259E&toCn=%25E5%258D%2583%25E5%25B2%259B%25E6%25B9%2596&departDate=2020-12-31"
js_url="https://trains.ctrip.com/pages/booking/getTransferList?departureStation=%2525E6%25259D%2525AD%2525E5%2525B7%25259E&arrivalStation=%2525E6%252596%2525B0%2525E4%2525B9%2525A1&departDateStr=2020-12-30"
攜程中轉頁面的火車中轉資訊由回傳 JSON 的介面(js_url)提供,查詢引數為 departureStation、arrivalStation、departDateStr
把上面的 url 與 js_url 放在一起稍加比較,即可發現兩者引數的對應關係
js_url 的查詢引數需要進行三次 url 編碼(注意點2)
'''
from urllib import parse
import random
from bs4 import BeautifulSoup
import csv
import os
import requests
# print(parse.unquote((parse.unquote("%25E6%259D%25AD%25E5%25B7%259E"))))
# Query parameters, read once at start-up. Both scraper classes copy these
# module-level names onto the instance in their __init__.
fromArea = input("出發站")
toArea = input("目的站")
date = input("年-月-日 :")
# Create the folder the CSV files are written to. exist_ok=True replaces the
# separate os.path.exists() check, which is race-prone (check-then-create).
os.makedirs("D:/攜程查找練習", exist_ok=True)
class NewsByTransfer():
    """Scrape Ctrip transfer (connecting-train) itineraries and save them as CSV.

    The itineraries are fetched from the ``getTransferList`` JSON endpoint.
    The departure station, destination station and date are taken from the
    module-level ``fromArea``, ``toArea`` and ``date`` variables.
    """

    # Folder the CSV file is written to.
    _OUTPUT_DIR = "D:/攜程查找練習"
    # Seconds to wait for the server; without a timeout requests.get can
    # block indefinitely.
    _REQUEST_TIMEOUT = 10
    # Pool of User-Agent strings; one is picked at random for each request.
    _USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    ]

    def __init__(self):
        """Copy the module-level query parameters onto the instance."""
        self.fromArea = fromArea
        self.toArea = toArea
        self.date = date

    def getOneJsUrl(self, fromArea, toArea, date):
        """Build, print and return the URL of the transfer-list JSON endpoint.

        Args:
            fromArea: Departure station name (unencoded).
            toArea: Destination station name (unencoded).
            date: Departure date, inserted into the URL as given.

        Returns:
            The ``getTransferList`` URL with both station names
            percent-encoded three times.
        """
        # The JSON endpoint's station parameters are URL-encoded three times.
        departureStation = parse.quote(parse.quote(parse.quote(fromArea)))
        arrivalStation = parse.quote(parse.quote(parse.quote(toArea)))
        js_url = (
            "https://trains.ctrip.com/pages/booking/getTransferList"
            f"?departureStation={departureStation}"
            f"&arrivalStation={arrivalStation}"
            f"&departDateStr={date}"
        )
        print(js_url)
        return js_url

    def getOneNews(self, js_url):
        """Download the transfer list and flatten it into CSV-ready rows.

        Args:
            js_url: URL produced by :meth:`getOneJsUrl`.

        Returns:
            A list with one dict per itinerary. Each dict holds the overall
            departure/destination stations, a summary string, and three
            entries per leg whose keys are suffixed with the leg's
            ``sequence`` value. The list is empty when the response carries
            no transfer list.

        Raises:
            requests.RequestException: On a network failure, a timeout or an
                HTTP error status.
        """
        response = requests.get(
            js_url,
            headers={"User-Agent": random.choice(self._USER_AGENTS)},
            timeout=self._REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        payload = response.json()
        # "data" or "transferList" may be missing or null when nothing is
        # found; treat that as "no itineraries" instead of crashing.
        transferList = (payload.get("data") or {}).get("transferList") or []

        csvList = []
        for oneTransfer in transferList:
            tranDict = {
                "總出發站": oneTransfer["departStation"],
                "總目的站": oneTransfer["arriveStation"],
                "總資訊": (
                    f"{oneTransfer['transferStation']}換乘 停留{oneTransfer['transferTakeTime']}"
                    f" 全程{oneTransfer['totalRuntime']} 價格{oneTransfer['showPriceText']}"
                ),
            }
            for leg in oneTransfer["trainTransferInfos"]:
                seq = leg["sequence"]
                tranDict[f"班次列車號{seq}"] = leg["trainNo"]
                tranDict[f"發車時間-到站時間{seq}"] = (
                    f"{leg['departDate']} {leg['departTime']}"
                    f"---{leg['arriveDate']} {leg['arriveTime']}"
                )
                tranDict[f"發車站-目的站{seq}"] = (
                    f"{leg['departStation']}---{leg['arriveStation']}"
                )
            csvList.append(tranDict)
        return csvList

    def mkcsv(self, csvlist):
        """Write the itinerary rows to a CSV file in the output folder.

        The file name is built from the overall departure and destination
        stations of the first row. Nothing is written when ``csvlist`` is
        empty.

        Args:
            csvlist: Rows as returned by :meth:`getOneNews`.
        """
        if not csvlist:
            print("沒有查到中轉資訊,未建立csv檔案")
            return
        # Rows can have different keys (the per-leg keys depend on each
        # itinerary's legs), so the header is the union of all keys in
        # first-seen order. DictWriter raises ValueError on a key that is not
        # in the header.
        fieldnames = list(dict.fromkeys(key for row in csvlist for key in row))
        os.makedirs(self._OUTPUT_DIR, exist_ok=True)
        path = f"{self._OUTPUT_DIR}/{csvlist[0]['總出發站']}到{csvlist[0]['總目的站']}轉站查找.csv"
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames)
            writer.writeheader()
            writer.writerows(csvlist)

    def main(self):
        """Build the URL, fetch the itineraries, save them and print them."""
        js_url = self.getOneJsUrl(self.fromArea, self.toArea, self.date)
        csvList = self.getOneNews(js_url)
        self.mkcsv(csvList)
        print(csvList)
class NewsBySingle():
    """Scrape Ctrip one-way train listings and save them as CSV.

    The listings are parsed out of the HTML of the train search page.
    The departure station, destination station and date are taken from the
    module-level ``fromArea``, ``toArea`` and ``date`` variables.
    """

    # Folder the CSV file is written to.
    _OUTPUT_DIR = "D:/攜程查找練習"
    # Seconds to wait for the server; without a timeout requests.get can
    # block indefinitely.
    _REQUEST_TIMEOUT = 10
    # Pool of User-Agent strings; one is picked at random for each request.
    _USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    ]

    def __init__(self):
        """Copy the module-level query parameters onto the instance."""
        self.fromArea = fromArea
        self.toArea = toArea
        self.date = date

    def getOneUrl(self, fromArea, toArea, date):
        """Build, print and return the URL of the one-way search page.

        Args:
            fromArea: Departure station name (unencoded).
            toArea: Destination station name (unencoded).
            date: Travel date, inserted into the URL as given.

        Returns:
            The search page URL with both station names percent-encoded
            twice.
        """
        # The page URL's station parameters are URL-encoded twice.
        fromCn = parse.quote(parse.quote(fromArea))
        toCn = parse.quote(parse.quote(toArea))
        # Use the ``date`` argument; the previous code ignored it and always
        # read self.date.
        url = (
            "https://trains.ctrip.com/trainBooking/search?ticketType=0"
            f"&fromCn={fromCn}&toCn={toCn}&day={date}&mkt_header=&orderSource="
        )
        print(url)
        return url

    def getOneNews(self, url):
        """Download the search page and extract one row per listed train.

        Args:
            url: URL produced by :meth:`getOneUrl`.

        Returns:
            A list of dicts, one per ``div.railway_list`` element, holding
            the train number, departure station and time, travel duration,
            destination station and arrival time. Elements that lack the
            expected tags are skipped.

        Raises:
            requests.RequestException: On a network failure, a timeout or an
                HTTP error status.
        """
        response = requests.get(
            url,
            headers={"User-Agent": random.choice(self._USER_AGENTS)},
            timeout=self._REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        text = response.content.decode("utf-8")
        soup = BeautifulSoup(text, "lxml")

        oneTripNewList = []
        for oneTrip in soup.select("div.railway_list"):
            strongs = oneTrip.select("strong")
            spans = oneTrip.select("span")
            durations = oneTrip.select("div.haoshi")
            # First non-blank text inside the duration cell, if there is one.
            duration = next(durations[0].stripped_strings, None) if durations else None
            if len(strongs) < 3 or len(spans) < 2 or duration is None:
                # A row without the expected tags would otherwise raise
                # IndexError and abort the whole scrape.
                print("略過一筆結構不符的車次資料")
                continue
            oneTripNewList.append({
                "班次列車號": strongs[0].string,
                "出發站名稱": spans[0].string,
                "出發站時間": strongs[1].string,
                "中途時間": duration,
                "目的站名稱": spans[1].string,
                "到站時間": strongs[2].string,
            })
        print(oneTripNewList)
        return oneTripNewList

    def mkcsv(self, oneTripNewList):
        """Write the train rows to a CSV file in the output folder.

        The file name is built from the departure and destination station
        names of the first row. Nothing is written when ``oneTripNewList``
        is empty.

        Args:
            oneTripNewList: Rows as returned by :meth:`getOneNews`.
        """
        if not oneTripNewList:
            print("沒有查到單程資訊,未建立csv檔案")
            return
        first = oneTripNewList[0]
        os.makedirs(self._OUTPUT_DIR, exist_ok=True)
        path = f"{self._OUTPUT_DIR}/{first['出發站名稱']}到{first['目的站名稱']}單程查找.csv"
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, list(first.keys()))
            writer.writeheader()
            writer.writerows(oneTripNewList)

    def main(self):
        """Build the URL, fetch the train listings and save them."""
        url = self.getOneUrl(self.fromArea, self.toArea, self.date)
        oneTripNewList = self.getOneNews(url)
        self.mkcsv(oneTripNewList)
# Run both scrapers in turn: transfer itineraries first, then one-way trains.
for scraper_cls in (NewsByTransfer, NewsBySingle):
    scraper_cls().main()
總結一下:其實並不難。只要基礎打牢,會運用 Python 的類別與函式、用 bs4 解析網頁(感覺比 xpath 直觀),然後到瀏覽器開發者工具的 Network 面板裡簡單看一下 XHR 請求,照著寫就行。給個點讚好不好?創作不易,無需關注,給讚就行。
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/243319.html
標籤:python
上一篇:全網最細海龜 (turtle) 畫圖講解 (五):輸入/輸出文字及滑鼠與鍵盤互動設計
下一篇:猜我能不能搶到茅臺?
