在python中用pyTorch搭建CNN神經網路實作數字（0~9）語音識別-有解無憂

在python中用pyTorch搭建CNN神經網路實作數字（0~9）語音識別

1.收集訓練資料
speech_commands_v0.01.tar.gz
http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
用任何下載工具下載都可以（推薦迅雷）

2.準備環境
①pycharm軟體
②cuda和cudnn（我的是11.3）
③python（我的是3.9）
④支持cuda的pytorch

對於環境的準備，老樣子，自己到 CSDN 上查，教程一大把。
推薦不要用 conda，直接全部用 pip 安裝，一步到位。

3.直接上代碼
用pycharm創建CNN_project
(1)提取資料并保存
①先把所需資料集（里面包括0-9語音集）保存到dataset檔案夾
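（原文此處為資料夾結構截圖：dataset 檔案夾下有 zero、one、two、three、four、five、six、seven、eight、nine 十個子檔案夾，每個子檔案夾內存放對應數字的 .wav 語音檔。）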

②對資料集提取語譜圖（spectrogram）并保存資料集和標簽集為data.npy,label.npy

#Process_data.py
import os 
import librosa
import numpy as np
import matplotlib.pyplot as plt
import cv2

def get_spectrogram(path):
    """Load an audio file and return its short-time Fourier transform.

    The file is read at its native sampling rate (``sr=None``) and without
    down-mixing (``mono=False``), then transformed with a 1024-point FFT, a
    1024-sample window and a hop of 320 samples.

    Args:
        path: Location of the audio file to read.

    Returns:
        Whatever ``librosa.stft`` produces for the loaded signal.
        NOTE(review): librosa documents this as a complex-valued matrix, so
        callers needing a real-valued image should take its magnitude --
        confirm against the installed librosa version.
    """
    signal, _native_rate = librosa.load(path, sr=None, mono=False)
    stft_options = {"n_fft": 1024, "hop_length": 320, "win_length": 1024}
    # To inspect the result visually, draw it with plt.matshow(...) and label
    # the axes 'Frequency' / 'Time(s)' with the title 'Spectrogram'.
    return librosa.stft(signal, **stft_options)


def extract_features(data_path="G:\\CNN_project\\dataset"):
    """Build the spectrogram dataset and save it as ``data.npy`` / ``label.npy``.

    Every ``.wav`` file found in ``<data_path>/<label>`` is converted with
    ``get_spectrogram``, reduced to a 28x28 magnitude image and stored
    together with the index of its label in the ``labels`` list below.

    Args:
        data_path: Directory holding one sub-directory per spoken digit
            (``zero`` ... ``nine``). Defaults to the original hard-coded
            location.

    Side effects:
        Prints progress information and writes ``data.npy`` and
        ``label.npy`` into the current working directory.
    """
    labels=['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
    print("標簽名：",labels)

    total_data = []
    total_label = []

    for label_index, label in enumerate(labels):
        label_path = os.path.join(data_path, label)
        # Sort so the sample order (and therefore any seeded train/test
        # split made later) does not depend on the file system.
        for wav_name in sorted(os.listdir(label_path)):
            if not wav_name.endswith(".wav"):
                continue
            wav_path = os.path.join(label_path, wav_name)
            print(wav_path)
            spect = get_spectrogram(wav_path)
            # The STFT is complex-valued; neither cv2.resize nor torch.Tensor
            # accepts complex input, so keep only the magnitude.
            spect = np.abs(spect).astype(np.float32)
            spect = cv2.resize(spect, (28, 28))
            total_data.append(spect)
            total_label.append(label_index)

    total_data = np.array(total_data)
    total_label = np.array(total_label)
    print(total_data.shape)
    # The original printed ``total_data.total_label``, which is not an
    # ndarray attribute and raised AttributeError before anything was saved.
    print(total_label.shape)
    np.save("data.npy", total_data)
    np.save("label.npy", total_label)


# Only run the extraction when this file is executed as a script, so that
# importing it does not scan the dataset as a side effect.
if __name__ == '__main__':
    extract_features()

(2)創建模型

#Cnn.py
from torch import nn

class CNN(nn.Module):
    """Two-stage convolutional classifier for single-channel 28x28 inputs.

    Each stage is Conv2d(5x5, stride 1, padding 2) -> ReLU -> MaxPool2d(2).
    The padding keeps the spatial size through the convolution and each pool
    halves it, so a 28x28 input becomes 14x14 and then 7x7, which is the
    size the final linear layer is built for.

    Args:
        num_classes: Width of the output layer. Defaults to 100, the value
            the original code hard-coded; 10 is enough when the labels are
            the digits 0-9.
    """

    def __init__(self, num_classes=100):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,    # single-channel input image
                out_channels=16,  # produce 16 feature maps
                kernel_size=5,    # 5x5 convolution kernel
                stride=1,         # move the kernel one pixel at a time
                padding=2,        # zero-pad the border by 2 pixels
            ),
            nn.ReLU(),            # activation function
            nn.MaxPool2d(kernel_size=2),  # pooling halves height and width
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        self.out = nn.Linear(32 * 7 * 7, num_classes)

    def forward(self, x):
        """Return the class scores for a batch ``x``.

        Assumes ``x`` is shaped (batch, 1, 28, 28); other spatial sizes will
        not match the linear layer's input width.
        """
        x = self.conv1(x)
        x = self.conv2(x)
        # The original also called ``self.conv3``, which is never defined
        # and raised AttributeError; the linear layer above already matches
        # the output of two pooling stages.
        x = x.view(x.size(0), -1)  # flatten everything except the batch axis
        return self.out(x)


def save_model(net, path):
    """Serialize ``net`` to ``path`` using ``torch.save``.

    Args:
        net: Object to store (the whole object is pickled, not just its
            parameters).
        path: Destination file path.
    """
    # This module only does ``from torch import nn``, so ``torch`` itself
    # has to be brought into scope here.
    import torch

    torch.save(net, path)

def load_model(path):
    """Load and return the object stored at ``path`` by ``torch.save``.

    Args:
        path: File previously written with ``torch.save``.

    Returns:
        The deserialized object.

    SECURITY: ``torch.load`` unpickles the file, so only load files you
    created yourself or otherwise trust.
    NOTE(review): recent PyTorch releases default ``torch.load`` to
    ``weights_only=True``, which rejects whole pickled modules -- confirm
    against the installed version before relying on this for full models.
    """
    # This module only does ``from torch import nn``, so ``torch`` itself
    # has to be brought into scope here.
    import torch

    net = torch.load(path)
    return net

(3)訓練模型并保存

#Train_model.py
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.data as Data
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

from Cnn import CNN,save_model,load_model

def train_model(net, data, label, lr, batch_size, epoch):
    """Train ``net`` with SGD and cross-entropy on a 70/30 split of the data.

    Args:
        net: Model to optimise; it is printed once before training starts.
        data: Array-like of samples. A channel axis is inserted at dim 1
            before the samples are fed to ``net``.
        label: Array-like of integer class labels aligned with ``data``.
        lr: Learning rate handed to ``torch.optim.SGD``.
        batch_size: Mini-batch size for both the train and test loaders.
        epoch: Number of passes over the training split.

    Returns:
        The same ``net`` object after training.
    """
    print(net)
    # To train on a GPU, move the model and both tensors over with .cuda().
    data = torch.Tensor(data).unsqueeze(1)
    label = torch.Tensor(label).long()
    # Hold out 30% of the samples for evaluation; the fixed seed keeps the
    # split reproducible.
    train_data, test_data, train_label, test_label = train_test_split(
        data, label, test_size=0.3, random_state=0)

    optimizer = torch.optim.SGD(net.parameters(), lr=lr)

    train_loader = Data.DataLoader(
        dataset=Data.TensorDataset(train_data, train_label),
        batch_size=batch_size,
        shuffle=True,
    )
    # Order is irrelevant when only counting correct predictions.
    test_loader = Data.DataLoader(
        dataset=Data.TensorDataset(test_data, test_label),
        batch_size=batch_size,
        shuffle=False,
    )

    # A distinct loop name avoids shadowing the ``epoch`` parameter.
    for epoch_index in range(epoch):
        for step, (batch_data, batch_label) in enumerate(train_loader):
            print('Epoch:', epoch_index + 1, '/', epoch, 'Step:', step)
            prediction = net(batch_data)
            loss = F.cross_entropy(prediction, batch_label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % 50 == 0:
                correct = 0
                # No gradients are needed while scoring the held-out split.
                with torch.no_grad():
                    for test_x, test_y in test_loader:
                        pred_y = torch.max(net(test_x), 1)[1]
                        correct += torch.sum(pred_y == test_y).item()
                # Count over all samples so a short last batch is not
                # over-weighted, as a mean of per-batch ratios would do.
                accuracy = correct / len(test_label)

                print('Epoch', epoch_index + 1, '| train loss:%.4f' % loss.item(), '| test accuracy:%.4f' % accuracy)

    return net

def test_model(net, data, label, batch_size=500):
    """Print accuracy, precision, recall and F1 of ``net`` on the test split.

    The split uses ``test_size=0.3`` and ``random_state=0``, the same
    settings ``train_model`` uses, so given the same ``data`` and ``label``
    the evaluated samples are the ones held out during training.

    Args:
        net: Trained model to evaluate.
        data: Array-like of samples. A channel axis is inserted at dim 1
            before the samples are fed to ``net``.
        label: Array-like of integer class labels aligned with ``data``.
        batch_size: Mini-batch size used while predicting. The original
            referenced an undefined ``BATCH_SIZE`` global; 500 matches the
            value used by this script's entry point.
    """
    # Add the channel axis the same way train_model does.
    data = torch.Tensor(data).unsqueeze(1)
    label = torch.Tensor(label).long()
    # Re-create the 70/30 split and keep only the held-out part.
    _, test_data, _, test_label = train_test_split(
        data, label, test_size=0.3, random_state=0)

    # Order does not affect the aggregate metrics, so no shuffling is needed.
    test_loader = Data.DataLoader(
        dataset=Data.TensorDataset(test_data, test_label),
        batch_size=batch_size,
        shuffle=False,
    )

    y_true = []
    y_pred = []
    with torch.no_grad():
        for test_x, test_y in test_loader:
            pred_y = torch.max(net(test_x), 1)[1]
            # Hand the metric functions plain ints rather than 0-dim tensors.
            y_true.extend(test_y.tolist())
            y_pred.extend(pred_y.tolist())

    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision_score:", precision_score(y_true, y_pred, average='macro'))
    print("Recall_score:", recall_score(y_true, y_pred, average='macro'))
    print("F1_score", f1_score(y_true, y_pred, average='macro'))



if __name__ == '__main__':
    # Features and labels are read from the .npy files in the working
    # directory.
    data = np.load("data.npy")
    label = np.load("label.npy")

    cnn = CNN()
    cnn = train_model(cnn, data, label, lr=0.001, batch_size=500, epoch=10)
    save_model(cnn, "cnn.pkl")
    # Reload from disk so the evaluation exercises the saved file.
    cnn = load_model("cnn.pkl")
    # test_model needs the dataset as well as the network; calling it with
    # the model alone raised TypeError.
    test_model(cnn, data, label)

4.最終實作效果
識別率大概在0.9左右

轉載請註明出處，本文鏈接：https://www.uj5u.com/qita/294805.html

標籤：其他

上一篇：基于InceptionV3深度學習實作巖石影像智能識別與分類

下一篇：ROS中用cv_bridge和opencv時出現cv::xxx未定義的問題