-
Notifications
You must be signed in to change notification settings - Fork 41
/
infer_audio.py
53 lines (39 loc) · 1.69 KB
/
infer_audio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import numpy as np
import tensorflow as tf
import glob
from audioset import vggish_embeddings
import keras
# Command-line flags for the inference script. `FLAGS` is read by
# predict_laugh() and by the __main__ section below.
flags = tf.app.flags

flags.DEFINE_string(
    'wav_file', None,
    'Path to a wav file. Should contain signed 16-bit PCM samples. '
    'If none is provided, a synthetic sound is used.')

# The two help fragments are adjacent string literals, so the first must end
# with a space or the words run together in the --help output. The text also
# describes what the script really does: it prints one score per file.
flags.DEFINE_string(
    'wav_directory', None,
    'Path to a directory of wav files for inference. '
    'A score for each file is printed to stdout.')

flags.DEFINE_string(
    'keras_model', 'Models/LSTM_trained_on_audioset_40epoch.h5',
    'Path to trained keras model that will be used to run inference.')

flags.DEFINE_string(
    'tfrecord_file', None,
    'Path to a TFRecord file where embeddings will be written.')

FLAGS = flags.FLAGS
def predict_laugh(processed_embedding):
    """Run the trained Keras model on a batch of audio embeddings.

    The model is loaded from the path in ``FLAGS.keras_model`` on every
    call, so each invocation pays the full model-loading cost.

    Args:
        processed_embedding: Input passed unchanged to ``model.predict``.
            The callers in this file pass a 3-D array of stacked
            per-clip embeddings (clip, frame, 128).

    Returns:
        Whatever ``model.predict`` returns for the given input.
    """
    model_path = FLAGS.keras_model
    classifier = keras.models.load_model(model_path)
    predictions = classifier.predict(processed_embedding)
    return predictions
if __name__ == '__main__':
    # Script entry point: turn the requested audio into embeddings and print
    # the model's score(s).
    audio_embedder = vggish_embeddings.VGGishEmbedder(FLAGS.tfrecord_file)

    if FLAGS.wav_directory:
        # Batch mode: score every *.wav file directly inside the directory.
        files = glob.glob(FLAGS.wav_directory+'/*.wav')
        if not files:
            # Without this guard the max() over an empty list below fails
            # with an opaque error instead of explaining what went wrong.
            raise SystemExit(
                'No .wav files found in directory: {}'.format(
                    FLAGS.wav_directory))

        embeddings = [audio_embedder.convert_audio_to_embedding(f) for f in files]

        # Clips can yield different numbers of embedding rows. Zero-pad each
        # one along axis 0 up to the longest so they stack into one array.
        max_len = max(e.shape[0] for e in embeddings)
        embeddings = np.array([
            np.append(
                e,
                np.zeros([max_len - e.shape[0], 128], np.float32),
                axis=0)
            for e in embeddings
        ])

        scores = predict_laugh(embeddings)
        # Only the first column of the prediction is reported for each file.
        for name, score in zip(files, scores[:, 0]):
            print('{:>12}: {:0.6f}'.format(name, score))
    else:
        # Single-file mode: FLAGS.wav_file may be None, in which case the
        # embedder decides what audio to use.
        processed_embedding = audio_embedder.convert_audio_to_embedding(FLAGS.wav_file)
        # Add a leading batch axis so the single clip matches the batched
        # input shape used in directory mode.
        p = predict_laugh(np.expand_dims(processed_embedding, axis=0))
        print('Laugh Score: {}'.format(p))