目錄
一、import
二、匯入資料
三、借款人籍貫分布圖
四、性別分布
五、教育程度分布
六、借款人年齡分布
七、借款人職位分布
八、借款人行業分布
九、借款金額分布圖
十、借款人收入分布
十一、婚姻狀況分布
十二、車貸情況
十三、房貸情況
零、寫在前面
①28W條資料我會盡快傳到CSDN的資源里,大家有興趣的可以自己下載
②本文只列舉了最簡單的分布情況;還可以進一步分析,例如各年齡段的學歷組成等
③資料裡有一欄「借款理由」,可以用來畫出詞雲圖
④資料里有對各個借款人的信用進行評級,可以嘗試使用深度學習等方法訓練預測模型
⑤pandas、matplotlib都是較為基礎的用法,不做過多注釋
⑥爬蟲參考代碼:人人貸散標爬蟲實例進階-使用異步io_小zhan柯基-CSDN博客、人人貸散標爬蟲實例_小zhan柯基-CSDN博客
一、import
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.ticker as ticker
import mpl_toolkits.axisartist as AA
from mpl_toolkits.axisartist.axislines import SubplotZero
import pylab
import jieba
from wordcloud import WordCloud
# Global matplotlib text settings applied to every chart.
plt.rcParams.update({
    # Use the SimHei font so the Chinese titles and labels are displayed.
    "font.sans-serif": ["SimHei"],
    # Work around minus signs failing to display.
    "axes.unicode_minus": False,
})
二、匯入資料
①使用read_csv匯入資料
②設定列名
③花式索引
④將“id”設定為索引index
⑤去除所有都是nan的資料
# Labels for the header-less CSV, in file order.
_all_columns = [
    "id", "借款時間(月)", "剩余還款時間(月)", "借款金額", "notPayInterest", "productRepayType",
    "貸款型別", "利率", "性別", "籍貫", "出生日期", "教育程度", "作業單位", "行業", "公司規模", "職位", "收入",
    "車貸", "汽車數量", "婚姻狀況", "房貸", "房子數量", "信用等級", "none", "none", "none", "借款理由",
]
# Columns that are not carried into the working table.
_unused_columns = {"notPayInterest", "productRepayType", "none"}

data = pd.read_csv("all.csv", encoding="gbk", header=None, parse_dates=True)
data.columns = _all_columns

# Keep the analysed columns (original order), index the table by "id",
# and discard rows in which every remaining cell is NaN.
conciseData = (
    data[[name for name in _all_columns if name not in _unused_columns]]
    .set_index("id")
    .dropna(how="all")
)
三、借款人籍貫分布圖
# Region part of each non-missing origin value: the text before the first ":".
_origin = conciseData["籍貫"].dropna().apply(lambda x: x.split(":")[0])
# Shorten the region names by stripping administrative suffixes.
_short_name = _origin.apply(
    lambda x: x.replace("省", "").replace("市", "").replace("壯族自治區", "").replace("古", "")
)
# Percentage of borrowers per region, relative to all rows with a known origin.
_share = _short_name.value_counts() / len(_origin) * 100
# Remove placeholder/unwanted labels, then keep the first 31 entries.
reigon = _share.drop(index=["保密", "null", "請選擇", "深圳"])[:31]
# Re-order the bars into the ranking named in the chart title.
reigon = reigon[[
    "上海", "北京", "浙江", "天津", "江蘇", "廣東", "福建", "山東", "遼寧",
    "內蒙", "重慶", "湖南", "安徽", "江西", "海南", "湖北", "河北", "四川", "陜西",
    "吉林", "寧夏", "山西", "黑龍江", "河南", "廣西", "青海", "新疆", "云南", "貴州", "西藏", "甘肅",
]]

_bar_colors = [
    "grey", "gold", "darkviolet", "turquoise", "r", "g", "b", "c",
    "k", "darkorange", "lightgreen", "plum", "tan", "khaki", "pink", "skyblue", "lawngreen", "salmon",
]
plt.figure(figsize=(16, 8))
plt.title("借款人籍貫分布圖(按2020年各省人均可支配收入排序)", fontsize=20)
plt.ylabel("百分比/%", size=20)
plt.xticks(rotation=45, fontsize=15)
plt.yticks(fontsize=15)
plt.bar(reigon.index, reigon, color=_bar_colors)
plt.savefig("借款人籍貫分布圖.jpg", dpi=500, bbox_inches="tight")

四、性別分布
# Pie chart of the gender split among borrowers with a known gender.
# The figure and axes are created explicitly: without an `ax`, Series.plot
# draws on the axes of the current figure, which in a plain script is the
# previous chart.
fig, ax = plt.subplots(figsize=(5, 5))
conciseData["性別"].dropna().value_counts().plot.pie(
    ax=ax, autopct='%.2f%%', textprops={'fontsize': 17, 'color': 'black'})
# The y-label is used here as the chart caption.
ax.set_ylabel("性別分布", fontsize=20)
ax.legend(loc=2, bbox_to_anchor=(1.05, 1.0), fontsize=15)
fig.savefig("性別分布圖.jpg", dpi=500, bbox_inches="tight")

五、教育程度分布
# Clean the raw education strings: remove commas, spaces and the stray
# "短期周轉" fragment.  Non-string cells (e.g. NaN) become "".
conciseData["教育程度"] = conciseData["教育程度"].apply(
    lambda x: x.replace(",", "").replace(" ", "").replace("短期周轉", "")
    if isinstance(x, str) else "")

# Blank out (set to NaN) every value on the exclusion list so that
# value_counts() ignores it.  `where` is used instead of assigning a filtered
# copy back, so the result does not depend on index alignment.
_excluded = ["其他借款", "投資創業", "短期周轉", "裝修借款", "請選擇", "購車借款", "專科", "大專高中或以下", ""]
conciseData["教育程度"] = conciseData["教育程度"].where(
    ~conciseData["教育程度"].isin(_excluded))

# Fraction of borrowers per remaining education level.
_education_share = conciseData["教育程度"].value_counts(normalize=True)

# Draw on a dedicated figure: without an explicit `ax`, Series.plot reuses
# the axes of the current figure, i.e. the previous chart in a plain script.
fig, ax = plt.subplots(figsize=(5, 5))
_education_share.plot.pie(
    ax=ax, autopct='%.1f%%', textprops={'fontsize': 17, 'color': 'black'})
ax.set_title("教育程度分布圖", fontsize=20)
ax.set_ylabel("")
ax.legend(loc=2, bbox_to_anchor=(1.05, 1.0), fontsize=15)
fig.savefig("教育程度分布圖.jpg", dpi=500, bbox_inches="tight")

六、借款人年齡分布
# Birth year = text before the first "/" of the birth date.  Missing birth
# dates are skipped here; calling .split on a float NaN would raise.
_birth_year = conciseData["出生日期"].dropna().str.split("/").str[0]
# Percentage per birth year, relative to all rows (including rows without a
# birth date), as in the original denominator.
year = _birth_year.value_counts() / len(conciseData["出生日期"]) * 100
# Sort by year, then drop the first 10 and the last 5 entries (positional).
year = year.sort_index().iloc[10:-5]

plt.figure(figsize=(16, 8))
plt.title("借款人年齡分布圖", fontsize=20)
plt.ylabel("百分比/%", size=20)
plt.xticks(rotation=45, fontsize=15)
plt.yticks(fontsize=15)
plt.bar(year.index, year,
        color=["grey", "gold", "darkviolet", "turquoise", "r", "g", "b", "c",
               "k", "darkorange", "lightgreen", "plum", "tan", "khaki", "pink", "skyblue", "lawngreen", "salmon"])
plt.savefig("借款人年齡分布圖.jpg", dpi=500, bbox_inches="tight")

七、借款人職位分布
# Percentage of all rows per job position; only the 25 most frequent are kept.
_row_count = len(conciseData["職位"])
position = conciseData["職位"].value_counts().div(_row_count).mul(100).head(25)

_bar_colors = [
    "grey", "gold", "darkviolet", "turquoise", "r", "g", "b", "c",
    "k", "darkorange", "lightgreen", "plum", "tan", "khaki", "pink", "skyblue", "lawngreen", "salmon",
]
plt.figure(figsize=(16, 8))
plt.title("借款人職位分布圖", fontsize=20)
plt.ylabel("百分比/%", size=20)
plt.xticks(rotation=60, fontsize=14)
plt.yticks(fontsize=15)
plt.bar(position.index, position, color=_bar_colors)
plt.savefig("借款人職位分布圖.jpg", dpi=500, bbox_inches="tight")
八、借款人行業分布
# Percentage of all rows per industry; only the 15 most frequent are kept.
# The denominator is taken from the industry column itself rather than from
# the unrelated position column.
ind = (conciseData["行業"].value_counts() / len(conciseData["行業"]) * 100).iloc[:15]

plt.figure(figsize=(16, 8))
plt.title("借款人行業分布圖", fontsize=20)
plt.ylabel("百分比/%", size=20)
plt.xticks(rotation=60, fontsize=20)
plt.yticks(fontsize=20)
plt.bar(ind.index, ind,
        color=["grey", "gold", "darkviolet", "turquoise", "r", "g", "b", "c",
               "k", "darkorange", "lightgreen", "plum", "tan", "khaki", "pink", "skyblue", "lawngreen", "salmon"])
plt.savefig("借款人行業分布圖.jpg", dpi=500, bbox_inches="tight")

九、借款金額分布圖
# Turn each loan amount into a category label "<integer>元".
# Missing amounts are left untouched (int(NaN) would raise), and values that
# are already strings are kept so that re-running this step does not fail.
conciseData["借款金額"] = conciseData["借款金額"].apply(
    lambda x: x if isinstance(x, str) or pd.isna(x) else str(int(x)) + "元")

# Share (in percent) of each of the 10 most frequent amounts among those 10.
_top_amounts = conciseData["借款金額"].value_counts().iloc[:10]
loanAmount = _top_amounts / _top_amounts.sum() * 100

# Start a new default-sized figure; otherwise the pyplot calls below draw
# on whatever figure is current, i.e. the previous chart in a plain script.
plt.figure()
plt.title("借款金額分布圖", fontsize=20)
plt.ylabel("百分比/%", size=20)
plt.xticks(rotation=60, fontsize=20)
plt.yticks(fontsize=15)
plt.bar(loanAmount.index, loanAmount,
        color=["grey", "gold", "darkviolet", "turquoise", "r", "g", "b", "c",
               "k", "darkorange", "lightgreen", "plum", "tan", "khaki", "pink", "skyblue", "lawngreen", "salmon"])
plt.savefig("借款人金額分布圖.jpg", dpi=500, bbox_inches="tight")

十、借款人收入分布
# Share (in percent) of each of the 7 most frequent income values among those 7.
_top_income = conciseData["收入"].value_counts().iloc[:7]
salary = _top_income / _top_income.sum() * 100
# Arrange the bars from the lowest to the highest income bracket.
# NOTE(review): this raises KeyError if any listed bracket is not among the
# 7 most frequent values — confirm against the data.
salary = salary[["1000元以下", "1001-2000元", "2000-5000元", "5000-10000元",
                 "10000-20000元", "20000-50000元", "50000元以上"]]

# Start a new default-sized figure; otherwise the pyplot calls below draw
# on whatever figure is current, i.e. the previous chart in a plain script.
plt.figure()
plt.title("借款人收入分布圖", fontsize=20)
plt.ylabel("百分比/%", size=20)
plt.xticks(rotation=60, fontsize=20)
plt.yticks(fontsize=15)
plt.bar(salary.index, salary,
        color=["grey", "gold", "darkviolet", "turquoise", "r", "g", "b", "c",
               "k", "darkorange", "lightgreen", "plum", "tan", "khaki", "pink", "skyblue", "lawngreen", "salmon"])
plt.savefig("借款人收入分布圖.jpg", dpi=500, bbox_inches="tight")

十一、婚姻狀況分布
# Pie chart of marital status among borrowers with a known status.
# A dedicated figure/axes pair is used: without an explicit `ax`, Series.plot
# draws on the axes of the current figure, i.e. the previous chart in a
# plain script.
fig, ax = plt.subplots(figsize=(5, 5))
conciseData["婚姻狀況"].dropna().value_counts().plot.pie(
    ax=ax, autopct='%.1f%%', textprops={'fontsize': 17, 'color': 'black'})
ax.set_title("婚姻狀況分布圖", fontsize=20)
ax.set_ylabel("")
ax.legend(loc=2, bbox_to_anchor=(1.05, 1.0), fontsize=15)
fig.savefig("婚姻狀況分布圖.jpg", dpi=500, bbox_inches="tight")

十二、車貸情況
# Pie chart of the car-loan column, counting only non-missing values.
# A dedicated figure/axes pair is used: without an explicit `ax`, Series.plot
# draws on the axes of the current figure, i.e. the previous chart in a
# plain script.
fig, ax = plt.subplots(figsize=(5, 5))
conciseData["車貸"].dropna().value_counts().plot.pie(
    ax=ax, autopct='%.1f%%', textprops={'fontsize': 17, 'color': 'black'})
ax.set_title("車貸情況分布圖", fontsize=20)
ax.set_ylabel("")
ax.legend(loc=2, bbox_to_anchor=(1.05, 1.0), fontsize=15)
fig.savefig("車貸情況分布圖.jpg", dpi=500, bbox_inches="tight")

十三、房貸情況
# Pie chart of the mortgage column, counting only non-missing values.
# A dedicated figure/axes pair is used: without an explicit `ax`, Series.plot
# draws on the axes of the current figure, i.e. the previous chart in a
# plain script.
fig, ax = plt.subplots(figsize=(5, 5))
conciseData["房貸"].dropna().value_counts().plot.pie(
    ax=ax, autopct='%.1f%%', textprops={'fontsize': 17, 'color': 'black'})
ax.set_title("房貸情況分布圖", fontsize=20)
ax.set_ylabel("")
ax.legend(loc=2, bbox_to_anchor=(1.05, 1.0), fontsize=15)
fig.savefig("房貸情況分布圖.jpg", dpi=500, bbox_inches="tight")

轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/352232.html
標籤:python
上一篇:學習NumPy全套代碼【超詳細】基本操作、資料型別、陣列運算、復制和試圖、索引、切片和迭代、形狀操作、通用函式、線性代數
