資料爬取

將實習吧刺猬實習里面有關python職位的招聘資訊爬取出來，保存在一個CSV檔案中，查看招聘網站可以發現，一共有109條資料，其中包含已下架的招聘資訊，雖然這部分下架的招聘資訊看起來是毫無意義，但其實也能通過其中獲取到很多有用的資訊，

爬取后的是保存在CSV檔案中，檔案部分內容如下圖所示，爬取的內容主要包含了作業名稱、公司名稱、公司所在城市、招聘需求學歷、實習工資、實習時間、職位描述、公司其他職位以及公司的一下說明：
在這里插入圖片描述
就這些資料我們就可以對其進行分析可視化，獲得資料背后隱藏的秘密

資料爬取部分的代碼如下所示：

import requests
from bs4 import BeautifulSoup
import os.path
import json
import os
import csv

import io
import sys

import jieba
import wordcloud

import traceback
import cgitb

#要加上這一句和import 否則后續一直會報'gbk'的錯誤
sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding='utf-8')#'gb18030'

class shixi():

	#發爬蟲機制，加Headers,例外處理URLError類，用try-except陳述句來包圍并捕獲相應的例外
	def get_html(self,url):
		try:
			headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}#偽裝爬蟲
			resp = requests.get(url, headers = headers)
			return resp.text
		except:
			print ('url is wrong') #例外后輸出陳述句
			

	#將爬取資訊存入CSV操作
	def dict2cvs(self,dic,filename):

		file_exists=os.path.isfile(filename)
		#a訪問方式為追加，newline=''是為了防止寫入的檔案總有空行的問題
		with open(filename,'a',encoding='utf-8',newline='') as f: 
			
			headers=dic.keys()
			w =csv.DictWriter(f,delimiter=',', lineterminator='\n', fieldnames=headers)
			
			if not file_exists :
				w.writeheader()

			#writerow()逐行寫入，writerrow()是多行寫入
			w.writerow(dic)
		print('當前行寫入csv成功！')	

	#爬取職位資訊頁面
	def draw_base_list(self,url):
		html = self.get_html(url)

		##json.loads 實作json字串轉化為python的資料型別
		data = json.loads(html)
		try:
		#json的語法 "[]"迭代器標示（可以在里邊做簡單的迭代操作，如陣列下標，根據內容選值等）這樣就將lists選擇出來
			news = data['data']['lists']
			for n in news:

				jobid=n['jobid']
				#二級頁面的URL,這個網址就用來爬取職位描述，因為這一部分是靜態的
				url2='https://www.ciwei.net/internship/job/'+str(jobid)

				#這個網址是用來爬其他職位的網址，這是一個動態的，所以雖然和上面都在一個網頁，但是網址不一樣
				company_id=n['company_id']
				url3='https://www.ciwei.net/api/Shixi_V2_Job/getJobByCid?page=1&pageSize=10&company_id='+str(company_id)

				title = n['title']    
				comfullname = n['comfullname'] 
				comname = n['comname']
				cityName = n['cityName']
				education = n['education']  
				salary = n['salary']  
				fulltime_type_string = n['fulltime_type_string'] 
				label= n['label']
				dat2={
					'city':cityName
				}
				self.dict2cvs(dat2,'城市資訊.csv')
				self.draw_detail_list(url2,url3,jobid,title,comname,cityName,education,salary,fulltime_type_string,label)	
		except Exception as e:
		# except Exception,e:#這樣會報錯，SyntaxError: invalid syntax，因為except Exception, e: 這個語法需要用python2版本去運行，我們使用的是python3，所以要換成except Exception as e:
			print(repr(e))
			print('draw_base_list is wrong')
		pass


	#爬取二級詳情頁
	def draw_detail_list(self,url,url1,jobid,title,comname,cityName,education,salary,fulltime_type_string,label):
	# def draw_detail_list(self,url):
		#職位描述部分
		html = self.get_html(url)
		soup=BeautifulSoup(html,'lxml')
		#爬取職位描述的部分
		try:
			lilist=soup.find('ul',{'class':'job-desc___3cBCa'})
			l=lilist.find_all('span')
			l2=lilist.find_all('h5')#是ResultSet型別不能寫'.text'# print(type(l[0]))可迭代 型別Tag 就可以寫'.text'
			s=''
			s4=''
			for i in range(len(lilist)):
				s=l2[i].text.strip()+l[i].text.strip()+'\n'+'\n'
				s4=s4+s
			#爬取其他職位部分，因為是動態網頁，所以要重新用json決議一下
			html1 = self.get_html(url1)
			data1 = json.loads(html1)
			news1 = data1['data']['jobList']
			t=''
			for n in news1:
				tit=n['title']+'\n'
				t=tit+t
			print(t)
			dat={
				'jobid':jobid,
				'作業名稱':title,
				'公司名稱':comname,
				'城市':cityName,
				'要求學歷':education,
				'工資':salary,
				'實習時間':fulltime_type_string,
				'職位描述':s4,
				'其他職位':t,
				'說明':label
				}
			dat1={
				'職位描述':s4,
			}
			#將資訊存入CSV檔案
			self.dict2cvs(dat1,'職位描述.csv')
			self.dict2cvs(dat,'所有資訊.csv')

		#捕捉例外并輸出錯誤資訊
		except Exception as e:
			print(repr(e))
			print('draw_detail_list is wrong')
		pass

s=shixi()
if __name__ == '__main__':
	print('爬取實習吧')
	for page in range(1,12):
		url='https://www.ciwei.net/api/Shixi_Pc/search?city=0&getCount=1&key=Python&s_c=1&page='+str(page)+'&source=pc'
		print(url)
		s.draw_base_list(url)
	pass

資料分析

轉載請註明出處，本文鏈接：https://www.uj5u.com/houduan/241020.html

標籤：python

上一篇：python3網路爬蟲--使用Ip代理爬取新浪微博上小姐姐照片（附原始碼）

下一篇：Python爬蟲以及資料可視化分析

爬取實習吧與python相關的招聘資訊及資料可視化(含代碼)

目錄

資料爬取

資料分析