資料集:大學畢業生收入
下載地址(連結未隨文保留)。本文以繪製長條圖(bar chart)為主。
1. 欄位描述
| 欄位名稱 | 欄位型別 | 欄位說明 |
|---|---|---|
| Major_code | 整型 | 專業代碼 |
| Major | 字符型 | 專業名稱 |
| Major_category | 字符型 | 專業所屬目錄 |
| Total | 整型 | 總人數 |
| Employed | 整型 | 就業人數 |
| Employed_full_time_year_round | 整型 | 全年全職在崗人數 |
| Unemployed | 整型 | 失業人數 |
| Unemployment_rate | 浮點型 | 失業率 |
| Median | 整型 | 收入的中位數 |
| P25th | 整型 | 收入的25百分位數 |
| P75th | 浮點型 | 收入的75百分位數 |
2. 資料預處理
2.1 導包
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
2.2 讀取資料
# Load the graduate-income dataset from a CSV file given by a relative path.
df = pd.read_csv('大學畢業生收入資料集.csv')
3. 資料預覽
3.1 預覽資料
# Print the first rows of the table as a quick sanity check of the load.
print(df.head())
結果:
Major_code Major ... P25th P75th
0 1100 GENERAL AGRICULTURE ... 34000 80000.0
1 1101 AGRICULTURE PRODUCTION AND MANAGEMENT ... 36000 80000.0
2 1102 AGRICULTURAL ECONOMICS ... 40000 98000.0
3 1103 ANIMAL SCIENCES ... 30000 72000.0
4 1104 FOOD SCIENCE ... 38500 90000.0
3.2 查看基本資訊
# Print the index range, column names, non-null counts and dtypes.
df.info()
結果:
RangeIndex: 173 entries, 0 to 172
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Major_code 173 non-null int64
1 Major 173 non-null object
2 Major_category 173 non-null object
3 Total 173 non-null int64
4 Employed 173 non-null int64
5 Employed_full_time_year_round 173 non-null int64
6 Unemployed 173 non-null int64
7 Unemployment_rate 173 non-null float64
8 Median 173 non-null int64
9 P25th 173 non-null int64
10 P75th 173 non-null float64
dtypes: float64(2), int64(7), object(2)
3.3 查看重復值
# Print the number of rows that duplicate an earlier row.
print(df.duplicated().sum())
結果:
0
3.4 查看缺失值
# Print the number of missing values in each column.
print(df.isnull().sum())
結果:
Major_code 0
Major 0
Major_category 0
Total 0
Employed 0
Employed_full_time_year_round 0
Unemployed 0
Unemployment_rate 0
Median 0
P25th 0
P75th 0
dtype: int64
4. 資料集描述性資訊
# Compute summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns and keep them in `describe` for later inspection.
describe = df.describe()
print(describe)
結果:
Major_code Total ... P25th P75th
count 173.000000 1.730000e+02 ... 173.000000 173.000000
mean 3879.815029 2.302566e+05 ... 38697.109827 82506.358382
std 1687.753140 4.220685e+05 ... 9414.524761 20805.330126
min 1100.000000 2.396000e+03 ... 24900.000000 45800.000000
25% 2403.000000 2.428000e+04 ... 32000.000000 70000.000000
50% 3608.000000 7.579100e+04 ... 36000.000000 80000.000000
75% 5503.000000 2.057630e+05 ... 42000.000000 95000.000000
max 6403.000000 3.123510e+06 ... 78000.000000 210000.000000
[8 rows x 9 columns]
可在變數視圖中查看
describe
5. 資料分析
5.1 各專業種類(Major_category)的專業分支個數
# Count how many majors (rows) belong to each Major_category; value_counts()
# returns the categories ordered from most to fewest majors.
Major_category_counts = df['Major_category'].value_counts()
print(Major_category_counts)

# Take the tick labels from the index of the counts instead of typing them by
# hand, so every label is guaranteed to sit under its own bar even if tied
# categories come out in a different order.
interval = list(Major_category_counts.index)
positions = range(1, len(interval) + 1)

rects = plt.bar(positions, Major_category_counts)
for rect in rects:  # one rectangle per major category
    height = rect.get_height()
    # Write the count just above the centre of the bar.
    plt.text(rect.get_x() + rect.get_width() / 2, height, str(height),
             size=12, ha='center', va='bottom')
plt.xticks(positions, interval, rotation=90)
plt.title('Number of Branches by Major Category')
plt.ylabel('Counts')
plt.show()
結果:
Engineering 29
Education 16
Humanities & Liberal Arts 15
Biology & Life Science 14
Business 13
Health 12
Computers & Mathematics 11
Agriculture & Natural Resources 10
Physical Sciences 10
Social Science 9
Psychology & Social Work 9
Arts 8
Industrial Arts & Consumer Services 7
Law & Public Policy 5
Communications & Journalism 4
Interdisciplinary 1
Name: Major_category, dtype: int64
圖示:

結論:
由於工程類(Engineering)專業發展歷史悠久,故相對來說工程類專業的分支數比其他大類專業要多。
5.2 各大類專業收入
# Average, per major category, of the median income of its majors.
# Grouping by the category name removes the hard-coded row count and the
# position-based lookup into Major_category_counts, and no longer shadows
# the builtin sum().
median_by_category = df.groupby('Major_category')['Median'].mean()
# Keep a plain list ordered like `interval` so it lines up with the tick labels.
averageMoney = median_by_category.reindex(interval).tolist()

positions = range(1, len(interval) + 1)
plt.bar(positions, averageMoney)
plt.xticks(positions, interval, rotation=90)
plt.title('Average Annual salary by Major Category')
plt.ylabel('Moneys')
plt.show()
圖示:

結論:
由於工程類(Engineering)專業與人工智能、自動化等領域相關,故平均工資比較高;計算機與數學類專業發展前景很好,但是小公司工資普遍不高,大公司工資相對來說較高。
5.3 各大類專業失業率
# Average unemployment rate of the majors in each category.
# Grouping by the category name removes the hard-coded row count and the
# position-based lookup into Major_category_counts, and no longer shadows
# the builtin sum().
unemployment_by_category = df.groupby('Major_category')['Unemployment_rate'].mean()
# Keep a plain list ordered like `interval` so it lines up with the tick labels.
averageUnemployRate = unemployment_by_category.reindex(interval).tolist()

positions = range(1, len(interval) + 1)
plt.bar(positions, averageUnemployRate)
plt.xticks(positions, interval, rotation=90)
plt.title('Average Unemployment Rate by Major Category')
plt.ylabel('Rate')
plt.show()
圖示:

結論:
藝術類專業由於可變動性特別大,加上對人才的要求相對來說較為苛刻,故失業率較高。
5.4 各大類專業就業率
# Employment rate of every major (Employed / Total), averaged per category.
# Computing the ratio column-wise and grouping by the category name removes
# the hard-coded row count and the position-based lookup into
# Major_category_counts, and no longer shadows the builtin sum().
employ_rate = df['Employed'] / df['Total']
employ_rate_by_category = employ_rate.groupby(df['Major_category']).mean()
# Keep a plain list ordered like `interval` so it lines up with the tick labels.
averageEmployRate = employ_rate_by_category.reindex(interval).tolist()

positions = range(1, len(interval) + 1)
plt.bar(positions, averageEmployRate)
plt.xticks(positions, interval, rotation=90)
plt.title('Average Employment Rate by Major Category')
plt.ylabel('Rate')
plt.show()
圖示:

結論:
相對來說,由於計算機的發展前景,計算機與數學類的就業率較高。
5.5 各大類專業全年全職在崗率
# Share of employed graduates working full time all year
# (Employed_full_time_year_round / Employed) for every major, averaged per
# category. Computing the ratio column-wise and grouping by the category name
# removes the hard-coded row count and the position-based lookup into
# Major_category_counts, and no longer shadows the builtin sum().
full_time_rate = df['Employed_full_time_year_round'] / df['Employed']
full_time_rate_by_category = full_time_rate.groupby(df['Major_category']).mean()
# Keep a plain list ordered like `interval` so it lines up with the tick labels.
averageFullTimeRate = full_time_rate_by_category.reindex(interval).tolist()

positions = range(1, len(interval) + 1)
plt.bar(positions, averageFullTimeRate)
plt.xticks(positions, interval, rotation=90)
plt.title('Average Full-Time Rate by Major Category')
plt.ylabel('Rate')
plt.show()
圖示:

5.6 各大類專業總人數
# Average number of people (Total) per major within each category.
# Grouping by the category name removes the hard-coded row count and the
# position-based lookup into Major_category_counts, and no longer shadows
# the builtin sum().
total_by_category = df.groupby('Major_category')['Total'].mean()
# Keep a plain list ordered like `interval` so it lines up with the tick labels.
averageNum = total_by_category.reindex(interval).tolist()

positions = range(1, len(interval) + 1)
plt.bar(positions, averageNum)
plt.xticks(positions, interval, rotation=90)
plt.title('Average Total Numbers by Major Category')
plt.ylabel('Counts')
plt.show()
圖示:

5.7 就業失業比
# Ratio of the average employment rate to the average unemployment rate for
# each category, in the same order as `interval`.
# A category whose average unemployment rate is exactly zero is reported as
# infinity rather than letting plain-float division raise ZeroDivisionError.
EUratio = [
    employed / unemployed if unemployed != 0 else float('inf')
    for employed, unemployed in zip(averageEmployRate, averageUnemployRate)
]

positions = range(1, len(interval) + 1)
plt.bar(positions, EUratio)
plt.xticks(positions, interval, rotation=90)
plt.title('Employment-Unemployment Ratio by Major Category')
plt.ylabel('Ratio')
plt.show()
圖示:

結論:
相對來說,農業就業的門檻低,就業率高的同時失業率低。
6. 完整代碼
# 導包
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
def mean_by_category(values, groups, categories):
    """Return the per-category mean of ``values`` as a list.

    Args:
        values: pandas Series holding one number per row (per major).
        groups: pandas Series, aligned with ``values``, holding the category
            name of every row.
        categories: category names giving the order of the result.

    Returns:
        A list of floats, one per entry of ``categories``. A name that does
        not occur in ``groups`` yields NaN.
    """
    return values.groupby(groups).mean().reindex(categories).tolist()


def employment_unemployment_ratio(employ_rates, unemploy_rates):
    """Return the element-wise ratio employ_rate / unemploy_rate as a list.

    A zero unemployment rate gives ``float('inf')`` instead of raising
    ZeroDivisionError.
    """
    return [
        employed / unemployed if unemployed != 0 else float('inf')
        for employed, unemployed in zip(employ_rates, unemploy_rates)
    ]


def plot_category_bars(categories, heights, title, ylabel, annotate=False):
    """Draw one bar per category and show the figure.

    Args:
        categories: tick labels, one per bar.
        heights: bar heights, in the same order as ``categories``.
        title: figure title.
        ylabel: label of the y axis.
        annotate: when True, write each bar's height above it.
    """
    positions = range(1, len(categories) + 1)
    rects = plt.bar(positions, heights)
    if annotate:
        for rect in rects:  # one rectangle per category
            height = rect.get_height()
            # Write the value just above the centre of the bar.
            plt.text(rect.get_x() + rect.get_width() / 2, height, str(height),
                     size=12, ha='center', va='bottom')
    plt.xticks(positions, categories, rotation=90)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.show()


# The analysis itself stays at module level (not inside a function) so that
# df, describe and the result lists remain global names after a run.
if __name__ == '__main__':
    # Load the data
    df = pd.read_csv('大學畢業生收入資料集.csv')
    # Preview the data
    print(df.head())
    # Normalise the column names (already fairly clean in this dataset)
    # Show basic information
    df.info()
    # Count duplicated rows
    print(df.duplicated().sum())
    # Count missing values per column
    print(df.isnull().sum())
    # Descriptive statistics of the dataset
    describe = df.describe()
    print(describe)

    # Number of majors in each Major_category
    Major_category_counts = df['Major_category'].value_counts()
    print(Major_category_counts)
    # Labels come from the counts index so that they always match the bars.
    interval = list(Major_category_counts.index)
    plot_category_bars(interval, Major_category_counts,
                       'Number of Branches by Major Category', 'Counts',
                       annotate=True)

    # Average of the median income per category
    averageMoney = mean_by_category(df['Median'], df['Major_category'], interval)
    plot_category_bars(interval, averageMoney,
                       'Average Annual salary by Major Category', 'Moneys')

    # Average unemployment rate per category
    averageUnemployRate = mean_by_category(
        df['Unemployment_rate'], df['Major_category'], interval)
    plot_category_bars(interval, averageUnemployRate,
                       'Average Unemployment Rate by Major Category', 'Rate')

    # Average employment rate (Employed / Total) per category
    averageEmployRate = mean_by_category(
        df['Employed'] / df['Total'], df['Major_category'], interval)
    plot_category_bars(interval, averageEmployRate,
                       'Average Employment Rate by Major Category', 'Rate')

    # Average full-time, year-round rate among the employed, per category
    averageFullTimeRate = mean_by_category(
        df['Employed_full_time_year_round'] / df['Employed'],
        df['Major_category'], interval)
    plot_category_bars(interval, averageFullTimeRate,
                       'Average Full-Time Rate by Major Category', 'Rate')

    # Average head count (Total) per category
    averageNum = mean_by_category(df['Total'], df['Major_category'], interval)
    plot_category_bars(interval, averageNum,
                       'Average Total Numbers by Major Category', 'Counts')

    # Employment-to-unemployment ratio per category
    EUratio = employment_unemployment_ratio(averageEmployRate, averageUnemployRate)
    plot_category_bars(interval, EUratio,
                       'Employment-Unemployment Ratio by Major Category', 'Ratio')
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/303317.html
標籤:python
