From fb89aabbad9c9786c889998d3ed0f738c711070f Mon Sep 17 00:00:00 2001
From: nl8590687 <3210346136@qq.com>
Date: Fri, 11 May 2018 16:56:59 +0800
Subject: [PATCH] fix bugs and improve asrserver

---
 .gitignore                   |  1 +
 SpeechModel22.py             | 49 +++++++++++++++++++++++++++++-----
 asrserver.py                 | 20 ++++++--------
 dict.txt                     | 24 ++++++++---------
 general_function/file_wav.py | 51 +++++++++++++++++++++++++++++++-----
 test.py                      |  2 +-
 6 files changed, 109 insertions(+), 38 deletions(-)

diff --git a/.gitignore b/.gitignore
index a61d32b..cb5eb65 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@
 __pycache__
 
 *.wav
+*.model_yaml
 
 Test_Report_*
 dataset
diff --git a/SpeechModel22.py b/SpeechModel22.py
index 01f79b3..a3f905c 100644
--- a/SpeechModel22.py
+++ b/SpeechModel22.py
@@ -16,6 +16,7 @@
 import numpy as np
 import random
 
+from keras.models import model_from_yaml
 from keras.models import Sequential, Model
 from keras.layers import Dense, Dropout, Input, Reshape # , Flatten,LSTM,Convolution1D,MaxPooling1D,Merge
 from keras.layers import Conv1D,LSTM,MaxPooling1D, Lambda, TimeDistributed, Activation,Conv2D, MaxPooling2D #, Merge,Conv1D
@@ -40,7 +41,14 @@ def __init__(self, datapath):
 		self.label_max_string_length = 64
 		self.AUDIO_LENGTH = 1600
 		self.AUDIO_FEATURE_LENGTH = 200
+		
+		self.model_name = 'm22'
+		
+		#if(not os.path.exists(self.model_name + '.model_yaml')): # check whether a saved model-structure file already exists
 		self._model, self.base_model = self.CreateModel()
+		#else:
+		#	self._model, self.base_model = self.load_model_yaml(self.model_name)
+		
 		self.datapath = datapath
 		self.slash = ''
@@ -112,7 +120,7 @@ def CreateModel(self):
 		model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
 		
-		model.summary()
+		#model.summary()
 		
 		# clipnorm seems to speed up convergence
 		#sgd = SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
@@ -125,6 +133,9 @@ def CreateModel(self):
 		# captures output of softmax so we can decode the output during visualization
 		test_func = K.function([input_data], [y_pred])
 		
+		#kr.utils.plot_model(model, to_file='model.png', show_shapes=False, show_layer_names=True) # visualize the model structure
+		self.save_model_yaml(model, model_data)
+		
 		print('[*提示] 创建模型成功,模型编译成功')
 		return model, model_data
@@ -135,7 +146,32 @@ def ctc_lambda_func(self, args):
 		#y_pred = y_pred[:, 2:, :]
 		return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
 	
+	def save_model_yaml(self,model,model_data):
+		'''
+		Save the model structure (YAML) to file
+		'''
+		str_yaml_model = model.to_yaml()
+		str_yaml_model_data = model_data.to_yaml()
+		f = open(self.model_name + '.model_yaml','w')
+		f.write(str_yaml_model)
+		f.close()
+		f = open(self.model_name + '_base.model_yaml','w')
+		f.write(str_yaml_model_data)
+		f.close()
+	
+	def load_model_yaml(self, model_name):
+		'''
+		Load the model structure (YAML) from file
+		'''
+		f = open(model_name + '.model_yaml','r')
+		str_yaml_model = f.read()
+		f.close()
+		f = open(model_name + '_base.model_yaml','r')
+		str_yaml_model_data = f.read()
+		f.close()
+		model = model_from_yaml(str_yaml_model)
+		model_data = model_from_yaml(str_yaml_model_data)
+		return model, model_data
 	
 	def TrainModel(self, datapath, epoch = 2, save_step = 1000, batch_size = 32, filename = 'model_speech/speech_model2'):
 		'''
@@ -230,7 +266,7 @@ def TestModel(self, datapath='', str_dataset='dev', data_count = 32, out_report
 					txt += 'True:\t' + str(data_labels) + '\n'
 					txt += 'Pred:\t' + str(pre) + '\n'
 					txt += '\n'
-					txt_obj.write(txt)
+				txt_obj.write(txt)
 			
 			print('*[测试结果] 语音识别 ' + str_dataset + ' 集语音单字错误率:', word_error_num / words_num * 100, '%')
 			if(out_report == True):
@@ -356,7 +392,7 @@ def model(self):
 	import tensorflow as tf
 	from keras.backend.tensorflow_backend import set_session
-	os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+	os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 	# configure the session to use 70% of the GPU memory
 	config = tf.ConfigProto()
 	config.gpu_options.per_process_gpu_memory_fraction = 0.7
@@ -385,9 +421,10 @@ def model(self):
 	
 	ms = ModelSpeech(datapath)
 	
-	#ms.LoadModel(modelpath + 'm22_2\\1\\speech_model22_e_0_step_159000.model')
-	ms.TrainModel(datapath, epoch = 50, batch_size = 4, save_step = 500)
-	#ms.TestModel(datapath, str_dataset='test', data_count = 128, out_report = True)
+	#ms.LoadModel(modelpath + 'm22_2\\1\\speech_model22_e_0_step_327500.model')
+	ms.LoadModel(modelpath + 'm22_2/1/speech_model22_e_0_step_327500.model')
+	#ms.TrainModel(datapath, epoch = 50, batch_size = 4, save_step = 500)
+	#ms.TestModel(datapath, str_dataset='train', data_count = 128, out_report = True)
 	#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00241I0053.wav')
 	#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00020I0087.wav')
 	#r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\wav\\train\\A11\\A11_167.WAV')
diff --git a/asrserver.py b/asrserver.py
index d047ff4..bfa7250 100644
--- a/asrserver.py
+++ b/asrserver.py
@@ -11,6 +11,13 @@
 from SpeechModel22 import ModelSpeech
 from LanguageModel import ModelLanguage
 
+datapath = 'data/'
+modelpath = 'model_speech/'
+ms = ModelSpeech(datapath)
+ms.LoadModel(modelpath + 'speech_model22_e_0_step_216500.model')
+
+ml = ModelLanguage('model_language')
+ml.LoadModel()
 
 class TestHTTPHandle(http.server.BaseHTTPRequestHandler):
@@ -66,7 +73,7 @@ def do_POST(self):
 		
 		if(token == 'qwertasd'):
 			#buf = '成功\n'+'wavs:\n'+str(wavs)+'\nfs:\n'+str(fs)
-			buf = r[0]
+			buf = r
 		else:
 			buf = '403'
 		
@@ -74,24 +81,13 @@ def do_POST(self):
 		self._set_response()
 		
-		
-		
-		
-		
-		#buf = ' \n \n\nPost page\n \nPost Data:%sPath:%s\n \n'%(datas,self.path)
 		buf = bytes(buf,encoding="utf-8")
 		self.wfile.write(buf)
 	
 	def recognize(self, wavs, fs):
-		datapath = 'data/'
-		modelpath = 'model_speech/'
-		ms = ModelSpeech(datapath)
-		ms.LoadModel(modelpath + 'speech_model22_e_0_step_6500.model')
-		
 		r_speech = ms.RecognizeSpeech(wavs, fs)
-		
-		ml = ModelLanguage('model_language')
-		ml.LoadModel()
 		str_pinyin = r_speech
 		r = ml.SpeechToText(str_pinyin)
 		return r
diff --git a/dict.txt b/dict.txt
index 160efb0..0a692c7 100644
--- a/dict.txt
+++ b/dict.txt
@@ -1,13 +1,13 @@
-a1 阿啊呵腌吖锕雅
+a1 阿啊呵腌吖锕
 a2 啊呵嗄
 a3 啊呵
 a4 啊呵
-a5 阿啊呵娃
+a5 阿啊呵
 ai1 哀挨埃唉哎捱锿诶
 ai2 呆挨癌皑捱矮
 ai3 矮哎蔼霭嗳
-ai4 爱碍艾唉哎隘暧嗳瑷嗌嫒砹愛以
-an1 安谙鞍氨庵桉鹌广厂
+ai4 爱碍艾唉哎隘暧嗳瑷嗌嫒砹愛
+an1 安谙鞍氨庵桉鹌
 an3 俺铵揞埯
 an4 案按暗岸黯胺犴
 ang1 肮
@@ -19,7 +19,7 @@ ao3 袄拗媪
 ao4 奥澳傲懊坳拗骜岙鏊
 ba1 八吧巴叭芭扒疤笆粑岜捌
 ba2 八拔跋茇菝魃
-ba3 把靶钯靶星
+ba3 把靶钯靶
 ba4 把爸罢霸坝耙灞鲅壩
 ba5 吧罢巴叭
 bai1 掰
@@ -34,11 +34,11 @@ bang3 膀榜绑
 bang4 棒膀傍磅谤镑蚌蒡
 bao1 包胞炮剥褒苞孢煲龅
 bao2 薄雹保
-bao3 保宝饱堡葆褓鸨乖
-bao4 报暴抱爆鲍曝刨瀑豹趵在
+bao3 保宝饱堡葆褓鸨
+bao4 报暴抱爆鲍曝刨瀑豹趵
 bei1 背悲杯碑卑陂埤萆鹎
 bei3 北
-bei4 被备背辈倍贝蓓惫悖狈焙邶钡孛碚褙鐾鞴宝
+bei4 被备背辈倍贝蓓惫悖狈焙邶钡孛碚褙鐾鞴
 bei5 臂呗备
 ben1 奔贲锛
 ben3 本苯畚
@@ -82,7 +82,7 @@ cai1 猜
 cai2 才财材裁采
 cai3 采彩踩睬
 cai4 采菜蔡
-can1 参餐骖食
+can1 参餐骖
 can2 残惭蚕
 can3 惨黪
 can4 惨灿掺璨孱粲
@@ -95,7 +95,7 @@ ce4 策测侧厕册恻
 cen1 参
 cen2 岑涔
 ceng1 噌
-ceng2 曾层太
+ceng2 曾层
 ceng4 蹭
 cha1 差插叉碴喳嚓杈馇锸
 cha2 查察茶叉茬碴楂猹搽槎檫
@@ -484,8 +484,8 @@ kao3 考烤拷栲
 kao4 靠铐犒
 ke1 科颗柯呵棵苛磕坷嗑瞌轲稞疴蝌钶窠颏珂髁
 ke2 咳壳颏可
-ke3 可渴坷轲岢以
-ke4 可克客刻课恪嗑溘骒缂氪锞蚵科谎
+ke3 可渴坷轲岢
+ke4 克客刻课恪嗑溘骒缂氪锞蚵科可
 ken3 肯恳啃垦龈
 ken4 裉
 keng1 坑吭铿
diff --git a/general_function/file_wav.py b/general_function/file_wav.py
index d64b36d..b1c292d 100644
--- a/general_function/file_wav.py
+++ b/general_function/file_wav.py
@@ -102,6 +102,37 @@ def GetFrequencyFeature2(wavsignal, fs):
 	#print(data_input.shape)
 	return data_input
 
+def GetFrequencyFeature3(wavsignal, fs):
+	# frame the wav waveform with a time window sliding in 10 ms steps
+	time_window = 25 # unit: ms
+	window_length = fs / 1000 * time_window # window length in samples; currently always the fixed value 400
+	
+	wav_arr = np.array(wavsignal)
+	#wav_length = len(wavsignal[0])
+	wav_length = wav_arr.shape[1]
+	
+	range0_end = int(len(wavsignal[0])/fs*1000 - time_window) // 10 # loop end position, i.e. the number of windows generated
+	data_input = np.zeros((range0_end, 200), dtype = np.float) # holds the final frequency-feature data
+	data_line = np.zeros((1, 400), dtype = np.float)
+	for i in range(0, range0_end):
+		p_start = i * 160
+		p_end = p_start + 400
+		
+		data_line = wav_arr[0, p_start:p_end]
+		
+		x=np.linspace(0, 400 - 1, 400, dtype = np.int64)
+		w = 0.54 - 0.46 * np.cos(2 * np.pi * (x) / (400 - 1) ) # Hamming window
+		data_line = data_line * w # apply the window
+		
+		data_line = np.abs(fft(data_line)) / wav_length
+		
+		data_input[i]=data_line[0:200] # keep 400/2 = 200 values, i.e. half the data, since the spectrum is symmetric
+		
+	#print(data_input.shape)
+	data_input = np.log(data_input + 1)
+	return data_input
+
 def wav_scale(energy):
 	'''
 	Normalize the energy of the speech signal
 	'''
@@ -178,13 +209,19 @@ def get_wav_symbol(filename):
 	return dic_symbol_list,list_symbolmark
 
 if(__name__=='__main__'):
-	#dic=get_wav_symbol('E:\\语音数据集\\doc\\doc\\trans\\train.syllable.txt')
-	#print(dic)
-	#dic=get_wav_list('E:\\语音数据集\\doc\\doc\\list\\train.wav.lst')
-	#for i in dic:
-		#print(i,dic[i])
+	
 	wave_data, fs = read_wav_data("A2_0.wav")
-	#wave_data[0]=wav_scale(wave_data[0])
-	#print(fs)
+	
+	wav_show(wave_data[0],fs)
+	#t0=time.time()
+	freimg = GetFrequencyFeature3(wave_data,fs)
+	#t1=time.time()
+	#print('time cost:',t1-t0)
+	
+	freimg = freimg.T
+	plt.subplot(111)
+	plt.imshow(freimg)
+	plt.colorbar(cax=None,ax=None,shrink=0.5)
+	
+	plt.show()
diff --git a/test.py b/test.py
index 1618847..a7edd95 100644
--- a/test.py
+++ b/test.py
@@ -32,7 +32,7 @@
 
 ms.LoadModel(modelpath + 'm22_2/0/speech_model22_e_0_step_257000.model')
 #ms.TestModel(datapath, str_dataset='test', data_count = 64, out_report = True)
-r = ms.RecognizeSpeech_FromFile('E:\语音数据集\ST-CMDS-20170001_1-OS\\20170001P00241I0052.wav')
+r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00241I0052.wav')
 #r = ms.RecognizeSpeech_FromFile('E:\语音数据集\ST-CMDS-20170001_1-OS\\20170001P00241I0053.wav')
 #r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\ST-CMDS-20170001_1-OS\\20170001P00020I0087.wav')
 #r = ms.RecognizeSpeech_FromFile('E:\\语音数据集\\wav\\train\\A11\\A11_167.WAV')
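
Note: this patch moves the speech- and language-model loading in asrserver.py to module scope, so both models are built once at server start instead of being rebuilt on every POST request. The new GetFrequencyFeature3 in file_wav.py frames the waveform with a 25 ms Hamming window slid in 10 ms hops (400/160 samples at the fixed 16 kHz rate it assumes), takes the magnitude FFT, keeps the first 200 of 400 bins (the spectrum of a real signal is symmetric), and applies log(1 + x) compression. Below is a minimal standalone sketch of that framing math, assuming a 16 kHz mono signal in a 1-D numpy array; the helper name log_spectrogram is illustrative and not part of this patch:

    import numpy as np

    def log_spectrogram(signal, fs=16000, win_ms=25, hop_ms=10):
        # illustrative sketch, not part of the patch
        win = fs // 1000 * win_ms   # 400 samples at 16 kHz
        hop = fs // 1000 * hop_ms   # 160 samples at 16 kHz
        # same frame count as GetFrequencyFeature3 computes
        n_frames = (len(signal) * 1000 // fs - win_ms) // hop_ms
        hamming = 0.54 - 0.46 * np.cos(2 * np.pi * np.arange(win) / (win - 1))
        feats = np.zeros((n_frames, win // 2))
        for i in range(n_frames):
            frame = signal[i * hop : i * hop + win] * hamming
            # keep half the magnitude spectrum; scaling mirrors the patch
            feats[i] = np.abs(np.fft.fft(frame))[: win // 2] / len(signal)
        return np.log(feats + 1)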