文本分析
1.整體思路:
- 呼叫的庫:jieba,matplotlib,networkx,wordcloud
- 分析的文本:三聯版連城訣
- 需要的工具:Python,小說文本,中文停用詞檔案
2.具體實作:
1.讀取文本:
def read_txt():
    """Return the full text of the novel.

    Reads ``連城訣【三聯版】.txt`` from the current working directory and
    decodes it as GBK.

    Returns:
        str: The entire content of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file contains bytes that are not valid GBK.
    """
    # Read-only mode is sufficient (the file is never written), and the
    # context manager closes the handle even if decoding fails part-way.
    with open('連城訣【三聯版】.txt', 'r', encoding='gbk') as novel:
        return novel.read()
2.詞性統計(寫入檔案):
def sda():
    """Tag the novel with parts of speech and append the result to a file.

    Reads ``連城訣【三聯版】.txt`` as GBK (undecodable bytes are dropped),
    runs ``jieba.posseg.cut`` over the text and appends every tagged item to
    ``詞性.txt``.
    """
    import jieba.posseg as psg

    # Context managers make sure both handles are closed, and the output is
    # flushed to disk, even if tagging raises midway.
    with open("連城訣【三聯版】.txt", encoding='gbk', errors='ignore') as novel:
        text = novel.read()

    with open("詞性.txt", 'a+') as out:
        for pair in psg.cut(text):
            # writelines() iterates the tagged item and writes each string it
            # yields back to back; no separator or newline is inserted.
            out.writelines(pair)
3.匯入停用詞檔案:
def stopwordslist(filepath):
    """Load stop words from a UTF-8 text file, one entry per line.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        list[str]: Every line of the file with surrounding whitespace
        removed, in file order. A blank line yields an empty string.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle; previously it was left open.
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]
4.分詞生成人物(寫入檔案)
def write_txt():
    """Count word frequencies in the novel and write them to ``詞頻統計.txt``.

    The text returned by ``read_txt()`` is segmented with ``jieba.lcut``.
    Single-character tokens and tokens listed in ``stop.txt`` are skipped.
    The remaining words are written one per line, formatted as
    ``"{0:<5}{1:>5}"`` (word, count), most frequent first.
    """
    words = jieba.lcut(read_txt())

    # A set gives O(1) membership tests; the list returned by
    # stopwordslist() would be scanned in full for every token.
    stopwords = set(stopwordslist('stop.txt'))

    counts = {}
    for word in words:
        if len(word) == 1 or word in stopwords:
            continue
        counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, so words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    with open("詞頻統計.txt", "w") as out:
        out.writelines(
            "{0:<5}{1:>5}\n".format(word, count) for word, count in ranked
        )
5.生成詞雲:
def creat_wordcloud():
    """Render a word cloud from ``詞頻統計.txt`` and save it to ``連城訣cloud.jpg``.

    Each usable line of the frequency file must consist of a word followed by
    whitespace and an integer count; other lines are ignored. ``張國榮.jpg``
    is used as the mask image. The finished cloud is written to disk and also
    displayed with matplotlib.
    """
    frequencies = {}
    with open("詞頻統計.txt", 'r') as freq_file:
        for line in freq_file:
            parts = line.rsplit(None, 1)
            if len(parts) != 2:
                continue  # blank line, or no whitespace between word and count
            try:
                frequencies[parts[0].strip()] = int(parts[1])
            except ValueError:
                continue  # last field is not a count

    bg_pic = plt.imread('張國榮.jpg')
    # Feed the parsed counts directly. generate() would re-tokenise the file
    # text, in which every word appears exactly once, so the counts would
    # have no influence on the rendered sizes.
    wcloud = wordcloud.WordCloud(
        font_path=r"C:\Windows\Fonts\simhei.ttf",
        background_color="white",
        width=1000,
        max_words=500,
        mask=bg_pic,
        height=860,
        margin=2,
    ).generate_from_frequencies(frequencies)
    wcloud.to_file("連城訣cloud.jpg")
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
6.生成人物關係圖:
def creat_relationship():
    """Draw a co-occurrence graph of the novel's main characters.

    Two characters are counted as related once for every paragraph (line of
    the text returned by ``read_txt()``) in which both names appear. Counts
    are divided by the largest count to give edge weights, and edges are
    drawn in three styles by weight: above 0.6, above 0.3 up to 0.6, and
    0.3 or below.
    """
    # Names are matched as substrings of each paragraph, so the literals must
    # not carry padding spaces or they would never match.
    names = ['狄云', '水笙', '萬震山', '丁典', '戚芳', '萬圭', '花鐵干',
             '血刀老祖', '戚長發', '言達平', '寶象', '汪嘯風', '水岱']

    relations = {}
    for paragraph in read_txt().split('\n'):
        present = [name for name in names if name in paragraph]
        # Visit each unordered pair once, keyed in the order of `names`.
        for index, first in enumerate(present):
            for second in present[index + 1:]:
                pair = (first, second)
                relations[pair] = relations.get(pair, 0) + 1

    # default=1 avoids a ValueError from max() when no pair co-occurs.
    max_rela = max(relations.values(), default=1)
    relations = {pair: count / max_rela for pair, count in relations.items()}

    plt.figure(figsize=(15, 15))
    G = nx.Graph()
    for (first, second), weight in relations.items():
        G.add_edge(first, second, weight=weight)

    # Partition the edges into three weight bands.
    elarge = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] > 0.6]
    emidle = [(u, v) for u, v, d in G.edges(data=True)
              if 0.3 < d['weight'] <= 0.6]
    esmall = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] <= 0.3]

    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, alpha=0.8, node_size=1200)
    # Heavier bands get wider, more opaque lines; the lightest is dashed.
    nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2.5, alpha=0.9,
                           edge_color='g')
    nx.draw_networkx_edges(G, pos, edgelist=emidle, width=1.5, alpha=0.6,
                           edge_color='y')
    nx.draw_networkx_edges(G, pos, edgelist=esmall, width=1, alpha=0.4,
                           edge_color='b', style='dashed')
    nx.draw_networkx_labels(G, pos, font_size=12)
    plt.axis('off')
    plt.title("連城訣人物權重圖")
    plt.show()
完整代碼:
import jieba
import matplotlib.pyplot as plt
import wordcloud
import networkx as nx
import matplotlib
import jieba.posseg as psg
matplotlib.rcParams['font.sans-serif']=['SimHei']
#讀取文本
def read_txt():
    """Return the full text of the novel.

    Reads ``連城訣【三聯版】.txt`` from the current working directory and
    decodes it as GBK.

    Returns:
        str: The entire content of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file contains bytes that are not valid GBK.
    """
    # Read-only mode is sufficient (the file is never written), and the
    # context manager closes the handle even if decoding fails part-way.
    with open('連城訣【三聯版】.txt', 'r', encoding='gbk') as novel:
        return novel.read()
#詞性統計(寫入檔案)
def sda():
    """Tag the novel with parts of speech and append the result to a file.

    Reads ``連城訣【三聯版】.txt`` as GBK (undecodable bytes are dropped),
    runs ``jieba.posseg.cut`` over the text and appends every tagged item to
    ``詞性.txt``.
    """
    import jieba.posseg as psg

    # Context managers make sure both handles are closed, and the output is
    # flushed to disk, even if tagging raises midway.
    with open("連城訣【三聯版】.txt", encoding='gbk', errors='ignore') as novel:
        text = novel.read()

    with open("詞性.txt", 'a+') as out:
        for pair in psg.cut(text):
            # writelines() iterates the tagged item and writes each string it
            # yields back to back; no separator or newline is inserted.
            out.writelines(pair)
#停詞檔案
def stopwordslist(filepath):
    """Load stop words from a UTF-8 text file, one entry per line.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        list[str]: Every line of the file with surrounding whitespace
        removed, in file order. A blank line yields an empty string.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle; previously it was left open.
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]
#分詞生成人物(寫入檔案)
def write_txt():
    """Count word frequencies in the novel and write them to ``詞頻統計.txt``.

    The text returned by ``read_txt()`` is segmented with ``jieba.lcut``.
    Single-character tokens and tokens listed in ``stop.txt`` are skipped.
    The remaining words are written one per line, formatted as
    ``"{0:<5}{1:>5}"`` (word, count), most frequent first.
    """
    words = jieba.lcut(read_txt())

    # A set gives O(1) membership tests; the list returned by
    # stopwordslist() would be scanned in full for every token.
    stopwords = set(stopwordslist('stop.txt'))

    counts = {}
    for word in words:
        if len(word) == 1 or word in stopwords:
            continue
        counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, so words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    with open("詞頻統計.txt", "w") as out:
        out.writelines(
            "{0:<5}{1:>5}\n".format(word, count) for word, count in ranked
        )
#生成詞云
def creat_wordcloud():
    """Render a word cloud from ``詞頻統計.txt`` and save it to ``連城訣cloud.jpg``.

    Each usable line of the frequency file must consist of a word followed by
    whitespace and an integer count; other lines are ignored. ``張國榮.jpg``
    is used as the mask image. The finished cloud is written to disk and also
    displayed with matplotlib.
    """
    frequencies = {}
    with open("詞頻統計.txt", 'r') as freq_file:
        for line in freq_file:
            parts = line.rsplit(None, 1)
            if len(parts) != 2:
                continue  # blank line, or no whitespace between word and count
            try:
                frequencies[parts[0].strip()] = int(parts[1])
            except ValueError:
                continue  # last field is not a count

    bg_pic = plt.imread('張國榮.jpg')
    # Feed the parsed counts directly. generate() would re-tokenise the file
    # text, in which every word appears exactly once, so the counts would
    # have no influence on the rendered sizes.
    wcloud = wordcloud.WordCloud(
        font_path=r"C:\Windows\Fonts\simhei.ttf",
        background_color="white",
        width=1000,
        max_words=500,
        mask=bg_pic,
        height=860,
        margin=2,
    ).generate_from_frequencies(frequencies)
    wcloud.to_file("連城訣cloud.jpg")
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
#生成人物關系圖(全按書上抄的)
def creat_relationship():
    """Draw a co-occurrence graph of the novel's main characters.

    Two characters are counted as related once for every paragraph (line of
    the text returned by ``read_txt()``) in which both names appear. Counts
    are divided by the largest count to give edge weights, and edges are
    drawn in three styles by weight: above 0.6, above 0.3 up to 0.6, and
    0.3 or below.
    """
    # Names are matched as substrings of each paragraph, so the literals must
    # not carry padding spaces or they would never match.
    names = ['狄云', '水笙', '萬震山', '丁典', '戚芳', '萬圭', '花鐵干',
             '血刀老祖', '戚長發', '言達平', '寶象', '汪嘯風', '水岱']

    relations = {}
    for paragraph in read_txt().split('\n'):
        present = [name for name in names if name in paragraph]
        # Visit each unordered pair once, keyed in the order of `names`.
        for index, first in enumerate(present):
            for second in present[index + 1:]:
                pair = (first, second)
                relations[pair] = relations.get(pair, 0) + 1

    # default=1 avoids a ValueError from max() when no pair co-occurs.
    max_rela = max(relations.values(), default=1)
    relations = {pair: count / max_rela for pair, count in relations.items()}

    plt.figure(figsize=(15, 15))
    G = nx.Graph()
    for (first, second), weight in relations.items():
        G.add_edge(first, second, weight=weight)

    # Partition the edges into three weight bands.
    elarge = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] > 0.6]
    emidle = [(u, v) for u, v, d in G.edges(data=True)
              if 0.3 < d['weight'] <= 0.6]
    esmall = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] <= 0.3]

    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, alpha=0.8, node_size=1200)
    # Heavier bands get wider, more opaque lines; the lightest is dashed.
    nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2.5, alpha=0.9,
                           edge_color='g')
    nx.draw_networkx_edges(G, pos, edgelist=emidle, width=1.5, alpha=0.6,
                           edge_color='y')
    nx.draw_networkx_edges(G, pos, edgelist=esmall, width=1, alpha=0.4,
                           edge_color='b', style='dashed')
    nx.draw_networkx_labels(G, pos, font_size=12)
    plt.axis('off')
    plt.title("連城訣人物權重圖")
    plt.show()
def main():
    """Run the pipeline: word counts, then the word cloud, then the graph."""
    for step in (write_txt, creat_wordcloud, creat_relationship):
        step()


if __name__ == "__main__":
    main()
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/259737.html
標籤:python
