From fb89aabbad9c9786c889998d3ed0f738c711070f Mon Sep 17 00:00:00 2001
From: nl8590687 <3210346136@qq.com>
Date: Fri, 11 May 2018 16:56:59 +0800
Subject: [PATCH] fix bugs and improve asrserver

---
 .gitignore                   |  1 +
 SpeechModel22.py             | 49 +++++++++++++++++++++++++++++-----
 asrserver.py                 | 20 ++++++--------
 dict.txt                     | 24 ++++++++---------
 general_function/file_wav.py | 51 +++++++++++++++++++++++++++++++-----
 test.py                      |  2 +-
 6 files changed, 109 insertions(+), 38 deletions(-)

diff --git a/.gitignore b/.gitignore
index a61d32b..cb5eb65 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@
 __pycache__
 
 *.wav
+*.model_yaml
 
 Test_Report_*
 dataset
diff --git a/SpeechModel22.py b/SpeechModel22.py
index 01f79b3..a3f905c 100644
--- a/SpeechModel22.py
+++ b/SpeechModel22.py
@@ -16,6 +16,7 @@
 import numpy as np
 import random
 
+from keras.models import model_from_yaml
 from keras.models import Sequential, Model
 from keras.layers import Dense, Dropout, Input, Reshape # , Flatten,LSTM,Convolution1D,MaxPooling1D,Merge
 from keras.layers import Conv1D,LSTM,MaxPooling1D, Lambda, TimeDistributed, Activation,Conv2D, MaxPooling2D #, Merge,Conv1D
@@ -40,7 +41,14 @@ def __init__(self, datapath):
 		self.label_max_string_length = 64
 		self.AUDIO_LENGTH = 1600
 		self.AUDIO_FEATURE_LENGTH = 200
+		
+		self.model_name = 'm22'
+		
+		#if(not os.path.exists(self.model_name + '.model_yaml')): # check whether a saved model-structure file already exists
 		self._model, self.base_model = self.CreateModel()
+		#else:
+		#	self._model, self.base_model = self.load_model_yaml(self.model_name)
+		
 		self.datapath = datapath
 		self.slash = ''
@@ -112,7 +120,7 @@ def CreateModel(self):
 		model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
 		
-		model.summary()
+		#model.summary()
 		
 		# clipnorm seems to speed up convergence
 		#sgd = SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
@@ -125,6 +133,9 @@ def CreateModel(self):
 		# captures output of softmax so we can decode the output during visualization
 		test_func = K.function([input_data], [y_pred])
 		
+		#kr.utils.plot_model(model, to_file='model.png', show_shapes=False, show_layer_names=True) # visualize the model structure
+		self.save_model_yaml(model, model_data)
+		
 		print('[*提示] 创建模型成功,模型编译成功')
 		return model, model_data
@@ -135,7 +146,32 @@ def ctc_lambda_func(self, args):
 		#y_pred = y_pred[:, 2:, :]
 		return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
 	
+	def save_model_yaml(self,model,model_data):
+		'''
+		Save the model structure (YAML) to file
+		'''
+		str_yaml_model = model.to_yaml()
+		str_yaml_model_data = model_data.to_yaml()
+		f = open(self.model_name + '.model_yaml','w')
+		f.write(str_yaml_model)
+		f.close()
+		f = open(self.model_name + '_base.model_yaml','w')
+		f.write(str_yaml_model_data)
+		f.close()
+	
+	def load_model_yaml(self, model_name):
+		'''
+		Load the model structure (YAML) from file
+		'''
+		f = open(model_name + '.model_yaml','r')
+		str_yaml_model = f.read()
+		f.close()
+		f = open(model_name + '_base.model_yaml','r')
+		str_yaml_model_data = f.read()
+		f.close()
+		model = model_from_yaml(str_yaml_model)
+		model_data = model_from_yaml(str_yaml_model_data)
+		return model, model_data
 	
 	def TrainModel(self, datapath, epoch = 2, save_step = 1000, batch_size = 32, filename = 'model_speech/speech_model2'):
 		'''
@@ -230,7 +266,7 @@ def TestModel(self, datapath='', str_dataset='dev', data_count = 32, out_report
 					txt += 'True:\t' + str(data_labels) + '\n'
 					txt += 'Pred:\t' + str(pre) + '\n'
 					txt += '\n'
-					txt_obj.write(txt)
+				txt_obj.write(txt)
 			
 			print('*[测试结果] 语音识别 ' + str_dataset + ' 集语音单字错误率:', word_error_num / words_num * 100, '%')
 			if(out_report == True):
@@ -356,7 +392,7 @@ def model(self):
 	import tensorflow as tf
 	from keras.backend.tensorflow_backend import set_session
-	os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+	os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 	# configure the session to use 70% of the GPU memory
 	config = tf.ConfigProto()
 	config.gpu_options.per_process_gpu_memory_fraction = 0.7
@@ -385,9 +421,10 @@ def model(self):
 	
 	ms = ModelSpeech(datapath)
 	
-	#ms.LoadModel(modelpath + 'm22_2\\1\\speech_model22_e_0_step_159000.model')
-	ms.TrainModel(datapath, epoch = 50, batch_size = 4, save_step = 500)
-	#ms.TestModel(datapath, str_dataset='test', data_count = 128, out_report = True)
+	#ms.LoadModel(modelpath + 'm22_2\\1\\speech_model22_e_0_step_327500.model')
+	ms.LoadModel(modelpath + 'm22_2/1/speech_model22_e_0_step_327500.model')
+	#ms.TrainModel(datapath, epoch = 50, batch_size = 4, save_step = 500)
+	#ms.TestModel(datapath, str_dataset='train', data_count = 128, out_report = True)
 	#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00241I0053.wav')
 	#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00020I0087.wav')
 	#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\wav\\train\\A11\\A11_167.WAV')
diff --git a/asrserver.py b/asrserver.py
index d047ff4..bfa7250 100644
--- a/asrserver.py
+++ b/asrserver.py
@@ -11,6 +11,13 @@
 from SpeechModel22 import ModelSpeech
 from LanguageModel import ModelLanguage
 
+datapath = 'data/'
+modelpath = 'model_speech/'
+ms = ModelSpeech(datapath)
+ms.LoadModel(modelpath + 'speech_model22_e_0_step_216500.model')
+
+ml = ModelLanguage('model_language')
+ml.LoadModel()
 
 class TestHTTPHandle(http.server.BaseHTTPRequestHandler):
@@ -66,7 +73,7 @@ def do_POST(self):
 		
 		if(token == 'qwertasd'):
 			#buf = '成功\n'+'wavs:\n'+str(wavs)+'\nfs:\n'+str(fs)
-			buf = r[0]
+			buf = r
 		else:
 			buf = '403'
 		
@@ -74,24 +81,13 @@ def do_POST(self):
 		self._set_response()
 		
-		
-		
-		
-		
-		#buf = ' \n \n\nPost page\n \nPost Data:%sPath:%s\n \n'%(datas,self.path)
 		buf = bytes(buf,encoding="utf-8")
 		self.wfile.write(buf)
 	
 	def recognize(self, wavs, fs):
-		datapath = 'data/'
-		modelpath = 'model_speech/'
-		ms = ModelSpeech(datapath)
-		ms.LoadModel(modelpath + 'speech_model22_e_0_step_6500.model')
-		
 		r_speech = ms.RecognizeSpeech(wavs, fs)
-		
-		ml = ModelLanguage('model_language')
-		ml.LoadModel()
 		str_pinyin = r_speech
 		r = ml.SpeechToText(str_pinyin)
 		return r
diff --git a/dict.txt b/dict.txt
index 160efb0..0a692c7 100644
--- a/dict.txt
+++ b/dict.txt
@@ -1,13 +1,13 @@
-a1 阿啊呵腌吖锕雅
+a1 阿啊呵腌吖锕
 a2 啊呵嗄
 a3 啊呵
 a4 啊呵
-a5 阿啊呵娃
+a5 阿啊呵
 ai1 哀挨埃唉哎捱锿诶
 ai2 呆挨癌皑捱矮
 ai3 矮哎蔼霭嗳
-ai4 爱碍艾唉哎隘暧嗳瑷嗌嫒砹愛以
-an1 安谙鞍氨庵桉鹌广厂
+ai4 爱碍艾唉哎隘暧嗳瑷嗌嫒砹愛
+an1 安谙鞍氨庵桉鹌
 an3 俺铵揞埯
 an4 案按暗岸黯胺犴
 ang1 肮
@@ -19,7 +19,7 @@ ao3 袄拗媪
 ao4 奥澳傲懊坳拗骜岙鏊
 ba1 八吧巴叭芭扒疤笆粑岜捌
 ba2 八拔跋茇菝魃
-ba3 把靶钯靶星
+ba3 把靶钯靶
 ba4 把爸罢霸坝耙灞鲅壩
 ba5 吧罢巴叭
 bai1 掰
@@ -34,11 +34,11 @@ bang3 膀榜绑
 bang4 棒膀傍磅谤镑蚌蒡
 bao1 包胞炮剥褒苞孢煲龅
 bao2 薄雹保
-bao3 保宝饱堡葆褓鸨乖
-bao4 报暴抱爆鲍曝刨瀑豹趵在
+bao3 保宝饱堡葆褓鸨
+bao4 报暴抱爆鲍曝刨瀑豹趵
 bei1 背悲杯碑卑陂埤萆鹎
 bei3 北
-bei4 被备背辈倍贝蓓惫悖狈焙邶钡孛碚褙鐾鞴宝
+bei4 被备背辈倍贝蓓惫悖狈焙邶钡孛碚褙鐾鞴
 bei5 臂呗备
 ben1 奔贲锛
 ben3 本苯畚
@@ -82,7 +82,7 @@ cai1 猜
 cai2 才财材裁采
 cai3 采彩踩睬
 cai4 采菜蔡
-can1 参餐骖食
+can1 参餐骖
 can2 残惭蚕
 can3 惨黪
 can4 惨灿掺璨孱粲
@@ -95,7 +95,7 @@ ce4 策测侧厕册恻
 cen1 参
 cen2 岑涔
 ceng1 噌
-ceng2 曾层太
+ceng2 曾层
 ceng4 蹭
 cha1 差插叉碴喳嚓杈馇锸
 cha2 查察茶叉茬碴楂猹搽槎檫
@@ -484,8 +484,8 @@ kao3 考烤拷栲
 kao4 靠铐犒
 ke1 科颗柯呵棵苛磕坷嗑瞌轲稞疴蝌钶窠颏珂髁
 ke2 咳壳颏可
-ke3 可渴坷轲岢以
-ke4 可克客刻课恪嗑溘骒缂氪锞蚵科谎
+ke3 可渴坷轲岢
+ke4 克客刻课恪嗑溘骒缂氪锞蚵科可
 ken3 肯恳啃垦龈
 ken4 裉
 keng1 坑吭铿
diff --git a/general_function/file_wav.py b/general_function/file_wav.py
index d64b36d..b1c292d 100644
--- a/general_function/file_wav.py
+++ b/general_function/file_wav.py
@@ -102,6 +102,37 @@ def GetFrequencyFeature2(wavsignal, fs):
 	#print(data_input.shape)
 	return data_input
 
+def GetFrequencyFeature3(wavsignal, fs):
+	# frame the wav waveform with a time window sliding in 10 ms steps
+	time_window = 25 # unit: ms
+	window_length = fs / 1000 * time_window # window length in samples; currently always the fixed value 400
+	
+	wav_arr = np.array(wavsignal)
+	#wav_length = len(wavsignal[0])
+	wav_length = wav_arr.shape[1]
+	
+	range0_end = int(len(wavsignal[0])/fs*1000 - time_window) // 10 # loop end position, i.e. the number of windows generated
+	data_input = np.zeros((range0_end, 200), dtype = np.float) # holds the final frequency-feature data
+	data_line = np.zeros((1, 400), dtype = np.float)
+	for i in range(0, range0_end):
+		p_start = i * 160
+		p_end = p_start + 400
+		
+		data_line = wav_arr[0, p_start:p_end]
+		
+		x=np.linspace(0, 400 - 1, 400, dtype = np.int64)
+		w = 0.54 - 0.46 * np.cos(2 * np.pi * (x) / (400 - 1) ) # Hamming window
+		data_line = data_line * w # apply the window
+		
+		data_line = np.abs(fft(data_line)) / wav_length
+		
+		data_input[i]=data_line[0:200] # keep 400/2 = 200 values, i.e. half the data, since the spectrum is symmetric
+		
+	#print(data_input.shape)
+	data_input = np.log(data_input + 1)
+	return data_input
+
 def wav_scale(energy):
 	'''
 	Normalize the energy of the speech signal
 	'''
@@ -178,13 +209,19 @@ def get_wav_symbol(filename):
 	return dic_symbol_list,list_symbolmark
 
 if(__name__=='__main__'):
-	#dic=get_wav_symbol('E:\\语音数据集\\doc\\doc\\trans\\train.syllable.txt')
-	#print(dic)
-	#dic=get_wav_list('E:\\语音数据集\\doc\\doc\\list\\train.wav.lst')
-	#for i in dic:
-		#print(i,dic[i])
+	
 	wave_data, fs = read_wav_data("A2_0.wav")
-	#wave_data[0]=wav_scale(wave_data[0])
-	#print(fs)
+	
+	wav_show(wave_data[0],fs)
+	#t0=time.time()
+	freimg = GetFrequencyFeature3(wave_data,fs)
+	#t1=time.time()
+	#print('time cost:',t1-t0)
+	
+	freimg = freimg.T
+	plt.subplot(111)
+	plt.imshow(freimg)
+	plt.colorbar(cax=None,ax=None,shrink=0.5)
+	
+	plt.show()
diff --git a/test.py b/test.py
index 1618847..a7edd95 100644
--- a/test.py
+++ b/test.py
@@ -32,7 +32,7 @@
 
 ms.LoadModel(modelpath + 'm22_2/0/speech_model22_e_0_step_257000.model')
 #ms.TestModel(datapath, str_dataset='test', data_count = 64, out_report = True)
-r = ms.RecognizeSpeech_FromFile('E:\语音数据集\ST-CMDS-20170001_1-OS\\20170001P00241I0052.wav')
+r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00241I0052.wav')
 #r = ms.RecognizeSpeech_FromFile('E:\语音数据集\ST-CMDS-20170001_1-OS\\20170001P00241I0053.wav')
 #r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00020I0087.wav')
 #r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\wav\\train\\A11\\A11_167.WAV')
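
Note: this patch moves the speech- and language-model loading in asrserver.py to module scope, so both models are built once at server start instead of being rebuilt on every POST request. The new GetFrequencyFeature3 in file_wav.py frames the waveform with a 25 ms Hamming window slid in 10 ms hops (400/160 samples at the fixed 16 kHz rate it assumes), takes the magnitude FFT, keeps the first 200 of 400 bins (the spectrum of a real signal is symmetric), and applies log(1 + x) compression. Below is a minimal standalone sketch of that framing math, assuming a 16 kHz mono signal in a 1-D numpy array; the helper name log_spectrogram is illustrative and not part of this patch:

    import numpy as np

    def log_spectrogram(signal, fs=16000, win_ms=25, hop_ms=10):
        # illustrative sketch, not part of the patch
        win = fs // 1000 * win_ms   # 400 samples at 16 kHz
        hop = fs // 1000 * hop_ms   # 160 samples at 16 kHz
        # same frame count as GetFrequencyFeature3 computes
        n_frames = (len(signal) * 1000 // fs - win_ms) // hop_ms
        hamming = 0.54 - 0.46 * np.cos(2 * np.pi * np.arange(win) / (win - 1))
        feats = np.zeros((n_frames, win // 2))
        for i in range(n_frames):
            frame = signal[i * hop : i * hop + win] * hamming
            # keep half the magnitude spectrum; scaling mirrors the patch
            feats[i] = np.abs(np.fft.fft(frame))[: win // 2] / len(signal)
        return np.log(feats + 1)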