在python中用pyTorch搭建CNN神經網路實作數字(0~9)語音識別
1.收集訓練資料
speech_commands_v0.01.tar.gz
http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
可用任意下載工具自行下載(推薦使用迅雷)
2.準備環境
①pycharm軟體
②cuda和cudnn(我的是11.3)
③python(我的是3.9)
④支持cuda的pytorch
環境的準備方法照舊,可自行在CSDN上查閱教程(相關教程很多)。
建議不要使用conda,全部直接用pip安裝,一步到位。
3.直接上代碼
用pycharm創建CNN_project
(1)提取資料并保存
①先把所需資料集(里面包括0-9語音集)保存到dataset檔案夾

②對資料集提取語譜圖(spectrogram)并保存資料集和標簽集為data.npy,label.npy
#Process_data.py
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import cv2
def get_spectrogram(path):
    """Load an audio file and return its short-time Fourier transform.

    Args:
        path: Path of the audio file to read.

    Returns:
        The matrix produced by ``librosa.stft`` for the loaded signal
        (window and FFT size 1024, hop length 320). The values are the raw
        STFT coefficients; no magnitude or log scaling is applied here.
    """
    # sr=None / mono=False: load the file without asking librosa to
    # resample it or to mix its channels down.
    waveform, _sample_rate = librosa.load(path, mono=False, sr=None)
    return librosa.stft(
        waveform,
        hop_length=320,
        win_length=1024,
        n_fft=1024,
    )
def extract_features(data_path="G:\\CNN_project\\dataset"):
    """Build the digit spectrogram dataset and save it as .npy files.

    For every ``.wav`` file found in the per-digit sub-folders of
    ``data_path`` the spectrogram is computed with ``get_spectrogram``,
    reduced to a 28x28 magnitude image and collected together with the
    index of its digit label. The results are written to ``data.npy`` and
    ``label.npy`` in the current working directory.

    Args:
        data_path: Folder containing one sub-folder per digit word
            ('zero' ... 'nine'). Defaults to the path used originally.
    """
    labels=['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
    print("標簽名:",labels)
    total_data = []
    total_label = []
    for label_index, label in enumerate(labels):
        label_path = os.path.join(data_path, label)
        for wav_name in os.listdir(label_path):
            if not wav_name.endswith(".wav"):
                continue
            wav_path = os.path.join(label_path, wav_name)
            print(wav_path)
            spect = get_spectrogram(wav_path)
            # The STFT is complex-valued; cv2.resize only accepts real
            # arrays, so keep the magnitude before resizing.
            spect = np.abs(spect)
            spect = cv2.resize(spect, (28, 28))
            total_data.append(spect)
            total_label.append(label_index)
    total_data = np.array(total_data)
    total_label = np.array(total_label)
    print(total_data.shape)
    # Was `total_data.total_label`, which raises AttributeError.
    print(total_label.shape)
    np.save("data.npy", total_data)
    np.save("label.npy", total_label)
# Script entry: build the dataset and write data.npy / label.npy.
extract_features()
(2)創建模型
#Cnn.py
from torch import nn
class CNN(nn.Module):
    """Two-stage convolutional classifier for single-channel 28x28 inputs.

    Each stage is Conv2d(5x5, stride 1, padding 2) -> ReLU -> MaxPool2d(2),
    so a 28x28 input becomes 14x14 after the first stage and 7x7 after the
    second; the 32 resulting feature maps are flattened and fed to one
    linear layer.

    Args:
        num_classes: Width of the output layer. The default of 100 keeps
            the original layer size; 10 is enough for the digits 0-9.
    """

    def __init__(self, num_classes=100):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,    # single-channel input image
                out_channels=16,  # produce 16 feature maps
                kernel_size=5,    # 5x5 convolution kernel
                stride=1,         # move the kernel one pixel at a time
                padding=2,        # zero-pad the border so the size is kept
            ),
            nn.ReLU(),                    # activation
            nn.MaxPool2d(kernel_size=2),  # halve height and width
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        self.out = nn.Linear(32 * 7 * 7, num_classes)

    def forward(self, x):
        """Return the class scores for a batch ``x`` of shape (N, 1, 28, 28)."""
        x = self.conv1(x)
        x = self.conv2(x)
        # The original also called self.conv3, which is never defined and
        # raised AttributeError; the network only has two conv stages.
        x = x.view(x.size(0), -1)  # flatten to (N, 32 * 7 * 7)
        output = self.out(x)
        return output
def save_model(net, path):
    """Serialize ``net`` to ``path`` with ``torch.save``.

    The whole object is saved (not only its state dict), matching what
    ``load_model`` expects to read back.

    Args:
        net: Object to serialize, normally the trained model.
        path: Destination file path.
    """
    # Imported here because this module only imports `nn` from torch.
    import torch

    torch.save(net, path)
def load_model(path):
    """Load and return the object stored at ``path`` with ``torch.load``.

    Args:
        path: File previously written by ``torch.save``.

    Returns:
        The deserialized object.
    """
    # Imported here because this module only imports `nn` from torch.
    import torch

    # NOTE(review): torch.load unpickles the file, so only load files you
    # trust. Recent PyTorch releases default to weights_only=True, which
    # rejects a fully pickled nn.Module; saving/loading a state_dict would
    # avoid both issues -- confirm against the PyTorch version in use.
    net = torch.load(path)
    return net
(3)訓練模型并保存
#Train_model.py
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.data as Data
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

from Cnn import CNN,save_model,load_model
def train_model(net, data, label, lr, batch_size, epoch):
    """Train ``net`` with SGD and cross-entropy loss, then return it.

    The samples are split 7:3 into training and held-out parts
    (``random_state=0``). Every 50 training steps the held-out accuracy is
    printed together with the current training loss.

    Args:
        net: Model to train; it is updated in place.
        data: Array of samples; a channel dimension is inserted at axis 1.
        label: Array of integer class labels, one per sample.
        lr: Learning rate for SGD.
        batch_size: Number of samples per mini-batch.
        epoch: Number of passes over the training part.

    Returns:
        The trained ``net``.
    """
    print(net)
    # NOTE: everything runs on the CPU. To use a GPU, move `net`, `data`
    # and `label` to the device (e.g. with .cuda()) before training.
    data = torch.Tensor(data).unsqueeze(1)
    label = torch.Tensor(label).long()

    # 7:3 split between training and held-out samples.
    train_data, test_data, train_label, test_label = train_test_split(
        data, label, test_size=0.3, random_state=0)

    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    train_loader = Data.DataLoader(
        dataset=Data.TensorDataset(train_data, train_label),
        batch_size=batch_size,
        shuffle=True,
    )
    # Order does not matter for evaluation, so no shuffling here.
    test_loader = Data.DataLoader(
        dataset=Data.TensorDataset(test_data, test_label),
        batch_size=batch_size,
        shuffle=False,
    )

    # A separate loop variable avoids shadowing the `epoch` parameter.
    for epoch_index in range(epoch):
        for step, (batch_data, batch_label) in enumerate(train_loader):
            print('Epoch:', epoch_index + 1, '/', epoch, 'Step:', step)
            prediction = net(batch_data)
            loss = F.cross_entropy(prediction, batch_label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % 50 == 0:
                # Evaluate without building autograd graphs, in eval mode,
                # and restore the previous mode afterwards.
                was_training = net.training
                net.eval()
                correct = 0
                total = 0
                with torch.no_grad():
                    for test_x, test_y in test_loader:
                        _, pred_y = torch.max(net(test_x), 1)
                        correct += torch.sum(pred_y == test_y).item()
                        total += len(test_y)
                net.train(was_training)
                # Count over all samples so a short last batch does not
                # skew the figure (a mean of per-batch ratios would).
                accuracy = correct / total if total else float('nan')
                print('Epoch', epoch_index + 1, '| train loss:%.4f' % loss.item(), '| test accuracy:%.4f' % accuracy)
    return net
def test_model(net, data, label, batch_size=500):
    """Print accuracy, precision, recall and F1 of ``net`` on held-out data.

    The same 7:3 split with ``random_state=0`` as in ``train_model`` is
    recomputed, so the evaluated samples are the ones not used for
    training.

    Args:
        net: Trained model to evaluate.
        data: Array of samples; a channel dimension is inserted at axis 1.
        label: Array of integer class labels, one per sample.
        batch_size: Number of samples per evaluation batch. The original
            referenced an undefined ``BATCH_SIZE``; the default matches the
            batch size used by the script's training call.
    """
    # The model expects (N, 1, H, W), exactly as prepared in train_model.
    data = torch.Tensor(data).unsqueeze(1)
    label = torch.Tensor(label).long()

    # 7:3 split; only the held-out part is needed here.
    _, test_data, _, test_label = train_test_split(
        data, label, test_size=0.3, random_state=0)
    test_loader = Data.DataLoader(
        dataset=Data.TensorDataset(test_data, test_label),
        batch_size=batch_size,
        shuffle=False,
    )

    y_true = []
    y_pred = []
    was_training = net.training
    net.eval()
    with torch.no_grad():
        for test_x, test_y in test_loader:
            _, pred_y = torch.max(net(test_x), 1)
            # Plain Python ints, not 0-dim tensors, for the sklearn metrics.
            y_true.extend(test_y.tolist())
            y_pred.extend(pred_y.tolist())
    net.train(was_training)

    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision_score:", precision_score(y_true, y_pred, average='macro'))
    print("Recall_score:", recall_score(y_true, y_pred, average='macro'))
    print("F1_score", f1_score(y_true, y_pred, average='macro'))
if __name__ == '__main__':
    # Load the arrays produced by the feature-extraction script.
    data = np.load("data.npy")
    label = np.load("label.npy")

    cnn = CNN()
    cnn = train_model(cnn, data, label, lr=0.001, batch_size=500, epoch=10)

    # Round-trip through disk so the saved file is what gets evaluated.
    save_model(cnn, "cnn.pkl")
    cnn = load_model("cnn.pkl")

    # test_model requires the data and labels; calling it with only the
    # model raised TypeError.
    test_model(cnn, data, label)
4.最終實作效果
識別率大概在0.9左右
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/294805.html
標籤:其他
