文本分析

1.整體思路：

呼叫的庫：jieba,matplotlib,networkx,wordcloud
分析的文本：三聯版連城訣
需要的工具：Python、小說文本、中文停詞檔案。

2.具體實作：

1.讀取文本:

def read_txt():
    """Return the full text of the novel file.

    Reads ``連城訣【三聯版】.txt`` from the current working directory and
    decodes it as GBK.

    Returns:
        str: The entire decoded file content.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file contains bytes that are not valid GBK.
    """
    # Open read-only: the file is never written here, and 'r+' would fail on
    # a read-only file. The context manager closes the handle even when
    # read() raises.
    with open('連城訣【三聯版】.txt', 'r', encoding='gbk') as novel_file:
        return novel_file.read()

2.詞性統計（寫入檔案）：

def sda():
    """Run part-of-speech segmentation on the novel and append it to ``詞性.txt``.

    The novel ``連城訣【三聯版】.txt`` is read as GBK, segmented with
    ``jieba.posseg.cut`` and every resulting item is appended to ``詞性.txt``
    in the current working directory.

    Raises:
        OSError: If either file cannot be opened.
    """
    import jieba.posseg as psg

    # errors='ignore' silently drops any bytes that are not valid GBK.
    with open("連城訣【三聯版】.txt", encoding='gbk', errors='ignore') as novel_file:
        text = novel_file.read()

    # 'a+' appends, so repeated runs accumulate output in the same file.
    # The context manager guarantees the buffered output is flushed and the
    # handle closed, which the previous version never did.
    with open("詞性.txt", 'a+') as out_file:
        for ele in psg.cut(text):
            # writelines() iterates the item and writes its parts back to
            # back (presumably word then POS flag -- confirm against jieba);
            # no separator or newline is added.
            out_file.writelines(ele)

3.匯入停詞檔案

def stopwordslist(filepath):
    """Load stop words from a UTF-8 text file, one entry per line.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        list[str]: Every line of the file with surrounding whitespace
        stripped, in file order. Blank lines yield empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle; the previous one-liner left the
    # file open until garbage collection.
    with open(filepath, 'r', encoding='utf-8') as stop_file:
        return [line.strip() for line in stop_file]

4.分詞生成人物（寫入檔案）

def write_txt():
    """Count word frequencies in the novel and write them to ``詞頻統計.txt``.

    The text returned by ``read_txt()`` is segmented with ``jieba.lcut``.
    Single-character tokens and tokens listed in ``stop.txt`` are skipped.
    The remaining words are written most-frequent first, one per line, using
    the format ``"{0:<5}{1:>5}\\n"`` (word left-aligned, count right-aligned).

    Raises:
        OSError: If an input file cannot be read or the output file cannot
            be written.
    """
    words = jieba.lcut(read_txt())
    # A set gives O(1) membership tests; the stop words are probed once per
    # token, which was a linear scan of the list before.
    stopwords = set(stopwordslist('stop.txt'))

    # Map each kept word to its number of occurrences.
    counts = {}
    for word in words:
        if len(word) == 1 or word in stopwords:
            continue
        counts[word] = counts.get(word, 0) + 1

    # Stable descending sort: words with equal counts keep first-seen order.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    # The context manager closes the file even if a write fails.
    with open("詞頻統計.txt", "w") as f:
        f.writelines("{0:<5}{1:>5}\n".format(word, count) for word, count in items)

5.生成詞雲：

def creat_wordcloud():
    """Render a word cloud from ``詞頻統計.txt`` and save it as ``連城訣cloud.jpg``.

    The frequency file is expected to hold one ``<word><padding><count>``
    entry per line (the format ``write_txt`` produces). The image
    ``張國榮.jpg`` is used as the mask, the cloud is written to
    ``連城訣cloud.jpg`` and then displayed with matplotlib.

    Raises:
        OSError: If the frequency file or the mask image cannot be read.
    """
    import re

    # Read and close the frequency file before any other step can fail.
    with open("詞頻統計.txt", 'r') as freq_file:
        text = freq_file.read()
    bg_pic = plt.imread('張國榮.jpg')

    # Every word occurs exactly once in the frequency file, so feeding the raw
    # text to generate() cannot recover the counts. Parse them and pass them
    # on as explicit weights instead.
    line_pattern = re.compile(r'(.*?)\s*(\d+)\s*')
    frequencies = {}
    for line in text.split('\n'):
        parsed = line_pattern.fullmatch(line)
        if parsed and parsed.group(1):
            frequencies[parsed.group(1)] = int(parsed.group(2))

    wcloud = wordcloud.WordCloud(
        font_path=r"C:\Windows\Fonts\simhei.ttf",
        background_color="white",
        width=1000,
        max_words=500,
        mask=bg_pic,
        height=860,
        margin=2,
    )
    if frequencies:
        wcloud.generate_from_frequencies(frequencies)
    else:
        # File is not in the expected format: keep the old raw-text behaviour.
        wcloud.generate(text)

    wcloud.to_file("連城訣cloud.jpg")
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()

6.生成人物關係圖：

def creat_relationship():
    """Draw a co-occurrence graph of the novel's main characters.

    The text from ``read_txt()`` is split into paragraphs on newlines. Each
    pair of listed names that appears in the same paragraph gets its counter
    increased by one for that paragraph. Counts are divided by the largest
    count to give weights in (0, 1], and the graph is drawn with three edge
    styles: weight > 0.6, 0.3 < weight <= 0.6, and weight <= 0.3.

    Raises:
        ValueError: If no two listed characters ever share a paragraph.
    """
    # Names are matched as substrings, so they must be spelled exactly as in
    # the text. Several entries used to carry stray leading/trailing spaces
    # and therefore only matched where the text happened to contain them.
    Names = ['狄云', '水笙', '萬震山', '丁典', '戚芳', '萬圭', '花鐵干',
             '血刀老祖', '戚長發', '言達平', '寶象', '汪嘯風', '水岱']

    relations = {}
    for text in read_txt().split('\n'):  # one iteration per paragraph
        present = [name for name in Names if name in text]
        # Count each unordered pair once per paragraph, keyed in list order.
        for idx, name_0 in enumerate(present):
            for name_1 in present[idx + 1:]:
                pair = (name_0, name_1)
                relations[pair] = relations.get(pair, 0) + 1

    if not relations:
        # max() on an empty sequence raises an opaque ValueError; keep the
        # exception type but say what actually went wrong.
        raise ValueError("no character co-occurrences found in the text")

    # Normalise so the strongest relationship has weight 1.0.
    maxRela = max(relations.values())
    relations = {pair: count / maxRela for pair, count in relations.items()}

    plt.figure(figsize=(15, 15))
    G = nx.Graph()
    for (name_0, name_1), weight in relations.items():
        G.add_edge(name_0, name_1, weight=weight)

    # Bucket the edges by weight so each bucket gets its own style.
    elarge = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] > 0.6]
    emidle = [(u, v) for u, v, d in G.edges(data=True) if 0.3 < d['weight'] <= 0.6]
    esmall = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] <= 0.3]

    # Layout and node style.
    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, alpha=0.8, node_size=1200)
    # Edges with weight > 0.6.
    nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2.5, alpha=0.9, edge_color='g')
    # Edges with 0.3 < weight <= 0.6.
    nx.draw_networkx_edges(G, pos, edgelist=emidle, width=1.5, alpha=0.6, edge_color='y')
    # Edges with weight <= 0.3.
    nx.draw_networkx_edges(G, pos, edgelist=esmall, width=1, alpha=0.4, edge_color='b', style='dashed')
    nx.draw_networkx_labels(G, pos, font_size=12)

    plt.axis('off')
    plt.title("連城訣人物權重圖")
    plt.show()

完整代碼：

import jieba
import matplotlib.pyplot as plt
import wordcloud
import networkx as nx
import matplotlib
import jieba.posseg as psg
matplotlib.rcParams['font.sans-serif']=['SimHei']
#讀取文本
def read_txt():
    """Return the full text of the novel file.

    Reads ``連城訣【三聯版】.txt`` from the current working directory and
    decodes it as GBK.

    Returns:
        str: The entire decoded file content.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file contains bytes that are not valid GBK.
    """
    # Open read-only: the file is never written here, and 'r+' would fail on
    # a read-only file. The context manager closes the handle even when
    # read() raises.
    with open('連城訣【三聯版】.txt', 'r', encoding='gbk') as novel_file:
        return novel_file.read()

#詞性統計（寫入檔案）
def sda():
    """Run part-of-speech segmentation on the novel and append it to ``詞性.txt``.

    The novel ``連城訣【三聯版】.txt`` is read as GBK, segmented with
    ``jieba.posseg.cut`` and every resulting item is appended to ``詞性.txt``
    in the current working directory.

    Raises:
        OSError: If either file cannot be opened.
    """
    import jieba.posseg as psg

    # errors='ignore' silently drops any bytes that are not valid GBK.
    with open("連城訣【三聯版】.txt", encoding='gbk', errors='ignore') as novel_file:
        text = novel_file.read()

    # 'a+' appends, so repeated runs accumulate output in the same file.
    # The context manager guarantees the buffered output is flushed and the
    # handle closed, which the previous version never did.
    with open("詞性.txt", 'a+') as out_file:
        for ele in psg.cut(text):
            # writelines() iterates the item and writes its parts back to
            # back (presumably word then POS flag -- confirm against jieba);
            # no separator or newline is added.
            out_file.writelines(ele)
  
#停詞檔案
def stopwordslist(filepath):
    """Load stop words from a UTF-8 text file, one entry per line.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        list[str]: Every line of the file with surrounding whitespace
        stripped, in file order. Blank lines yield empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle; the previous one-liner left the
    # file open until garbage collection.
    with open(filepath, 'r', encoding='utf-8') as stop_file:
        return [line.strip() for line in stop_file]

#分詞生成人物（寫入檔案）
def write_txt():
    """Count word frequencies in the novel and write them to ``詞頻統計.txt``.

    The text returned by ``read_txt()`` is segmented with ``jieba.lcut``.
    Single-character tokens and tokens listed in ``stop.txt`` are skipped.
    The remaining words are written most-frequent first, one per line, using
    the format ``"{0:<5}{1:>5}\\n"`` (word left-aligned, count right-aligned).

    Raises:
        OSError: If an input file cannot be read or the output file cannot
            be written.
    """
    words = jieba.lcut(read_txt())
    # A set gives O(1) membership tests; the stop words are probed once per
    # token, which was a linear scan of the list before.
    stopwords = set(stopwordslist('stop.txt'))

    # Map each kept word to its number of occurrences.
    counts = {}
    for word in words:
        if len(word) == 1 or word in stopwords:
            continue
        counts[word] = counts.get(word, 0) + 1

    # Stable descending sort: words with equal counts keep first-seen order.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    # The context manager closes the file even if a write fails.
    with open("詞頻統計.txt", "w") as f:
        f.writelines("{0:<5}{1:>5}\n".format(word, count) for word, count in items)

#生成詞云
def creat_wordcloud():
    """Render a word cloud from ``詞頻統計.txt`` and save it as ``連城訣cloud.jpg``.

    The frequency file is expected to hold one ``<word><padding><count>``
    entry per line (the format ``write_txt`` produces). The image
    ``張國榮.jpg`` is used as the mask, the cloud is written to
    ``連城訣cloud.jpg`` and then displayed with matplotlib.

    Raises:
        OSError: If the frequency file or the mask image cannot be read.
    """
    import re

    # Read and close the frequency file before any other step can fail.
    with open("詞頻統計.txt", 'r') as freq_file:
        text = freq_file.read()
    bg_pic = plt.imread('張國榮.jpg')

    # Every word occurs exactly once in the frequency file, so feeding the raw
    # text to generate() cannot recover the counts. Parse them and pass them
    # on as explicit weights instead.
    line_pattern = re.compile(r'(.*?)\s*(\d+)\s*')
    frequencies = {}
    for line in text.split('\n'):
        parsed = line_pattern.fullmatch(line)
        if parsed and parsed.group(1):
            frequencies[parsed.group(1)] = int(parsed.group(2))

    wcloud = wordcloud.WordCloud(
        font_path=r"C:\Windows\Fonts\simhei.ttf",
        background_color="white",
        width=1000,
        max_words=500,
        mask=bg_pic,
        height=860,
        margin=2,
    )
    if frequencies:
        wcloud.generate_from_frequencies(frequencies)
    else:
        # File is not in the expected format: keep the old raw-text behaviour.
        wcloud.generate(text)

    wcloud.to_file("連城訣cloud.jpg")
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()


#生成人物關系圖（全按書上抄的）
def creat_relationship():
    """Draw a co-occurrence graph of the novel's main characters.

    The text from ``read_txt()`` is split into paragraphs on newlines. Each
    pair of listed names that appears in the same paragraph gets its counter
    increased by one for that paragraph. Counts are divided by the largest
    count to give weights in (0, 1], and the graph is drawn with three edge
    styles: weight > 0.6, 0.3 < weight <= 0.6, and weight <= 0.3.

    Raises:
        ValueError: If no two listed characters ever share a paragraph.
    """
    # Names are matched as substrings, so they must be spelled exactly as in
    # the text. Several entries used to carry stray leading/trailing spaces
    # and therefore only matched where the text happened to contain them.
    Names = ['狄云', '水笙', '萬震山', '丁典', '戚芳', '萬圭', '花鐵干',
             '血刀老祖', '戚長發', '言達平', '寶象', '汪嘯風', '水岱']

    relations = {}
    for text in read_txt().split('\n'):  # one iteration per paragraph
        present = [name for name in Names if name in text]
        # Count each unordered pair once per paragraph, keyed in list order.
        for idx, name_0 in enumerate(present):
            for name_1 in present[idx + 1:]:
                pair = (name_0, name_1)
                relations[pair] = relations.get(pair, 0) + 1

    if not relations:
        # max() on an empty sequence raises an opaque ValueError; keep the
        # exception type but say what actually went wrong.
        raise ValueError("no character co-occurrences found in the text")

    # Normalise so the strongest relationship has weight 1.0.
    maxRela = max(relations.values())
    relations = {pair: count / maxRela for pair, count in relations.items()}

    plt.figure(figsize=(15, 15))
    G = nx.Graph()
    for (name_0, name_1), weight in relations.items():
        G.add_edge(name_0, name_1, weight=weight)

    # Bucket the edges by weight so each bucket gets its own style.
    elarge = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] > 0.6]
    emidle = [(u, v) for u, v, d in G.edges(data=True) if 0.3 < d['weight'] <= 0.6]
    esmall = [(u, v) for u, v, d in G.edges(data=True) if d['weight'] <= 0.3]

    # Layout and node style.
    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, alpha=0.8, node_size=1200)
    # Edges with weight > 0.6.
    nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2.5, alpha=0.9, edge_color='g')
    # Edges with 0.3 < weight <= 0.6.
    nx.draw_networkx_edges(G, pos, edgelist=emidle, width=1.5, alpha=0.6, edge_color='y')
    # Edges with weight <= 0.3.
    nx.draw_networkx_edges(G, pos, edgelist=esmall, width=1, alpha=0.4, edge_color='b', style='dashed')
    nx.draw_networkx_labels(G, pos, font_size=12)

    plt.axis('off')
    plt.title("連城訣人物權重圖")
    plt.show()

def main():
    """Run the pipeline: frequency file, then word cloud, then relationship graph."""
    # The word cloud reads the file the frequency step writes, so order matters.
    write_txt()
    for render_step in (creat_wordcloud, creat_relationship):
        render_step()


# Only run the pipeline when executed as a script, not when imported.
if __name__ == '__main__':
    main()

轉載請註明出處，本文鏈接：https://www.uj5u.com/houduan/259737.html

標籤：python

上一篇：Python_資料型別轉換和運算子

下一篇：Python學習筆記（十三）：例外處理機制

Python 實作對小說的文本分析（人物關係圖、人物詞雲等）

文本分析

1.整體思路：

2.具體實作：

1.讀取文本:

2.詞性統計（寫入檔案）：

3.匯入停詞檔案

4.分詞生成人物（寫入檔案）

5.生成詞雲：

6.生成人物關係圖：

完整代碼：