
Commit 7bbb4d2

- Default to 1 thread, as multithreading slows things down on CPU
- Use sapply instead of a for loop to iterate over the windowed samples inside a with_no_grad chunk
- Function silero no longer requires the sample_rate to be provided; it is now extracted using audio::load.wave
- Factor out the use of package av to the examples only; internally it is replaced with package audio
1 parent 93caa8c commit 7bbb4d2
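
For orientation, a minimal usage sketch of the resulting interface, assuming a 16-bit mono PCM wav file at a placeholder path ("speech.wav" is not part of this commit); sample_rate is no longer passed and threads defaults to 1:

library(audio.vadsilero)
## "speech.wav" is a placeholder path to a 16-bit mono PCM wav at 8 or 16 kHz
vad <- silero("speech.wav", milliseconds = 64, threshold = 0.5, threads = 1)
vad$vad_segments   # start/end of the detected voiced segments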

File tree: 5 files changed (+72, -43 lines)


NAMESPACE

Lines changed: 3 additions & 0 deletions
@@ -6,4 +6,7 @@ importFrom(audio,load.wave)
 importFrom(torch,autograd_set_grad_mode)
 importFrom(torch,jit_load)
 importFrom(torch,jit_scalar)
+importFrom(torch,torch_float)
+importFrom(torch,torch_set_num_threads)
 importFrom(torch,torch_tensor)
+importFrom(torch,with_no_grad)

NEWS.md

Lines changed: 6 additions & 2 deletions
@@ -1,7 +1,11 @@
 ## CHANGES IN audio.vadsilero VERSION 0.2
 
-- disable the gradient history recording & set model in evaluation (inference) mode
-- replace package wav with package audio for reading in the wav file
+- Disable the gradient history recording & set model in evaluation (inference) mode
+- Replace package wav with package audio for reading in the wav file
+- Default to 1 thread to use as multithreading slows down on CPU
+- Use sapply instead of for loop to loop over the windowed samples inside a with_no_grad chunk
+- Function silero no longer requires to provide the sample_rate, this is now extracted using audio::load.wave
+- Factor out the use of package av to only the examples - internally replaced with package audio
 
 ## CHANGES IN audio.vadsilero VERSION 0.1
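
For context on the sample_rate entry above, a small sketch of how the rate can be read from the wav header with package audio (the file path is a placeholder; it mirrors the attr(sound, "rate") call used in R/silero.R below):

library(audio)
sound <- load.wave("audio_pcm_16khz.wav")              # placeholder path, 16-bit mono PCM wav
sample_rate <- as.integer(attr(sound, which = "rate")) # e.g. 8000L or 16000L
is.matrix(sound)                                       # TRUE would mean stereo, which silero() now rejects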

R/pkg.R

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 #' @importFrom audio load.wave
-#' @importFrom torch torch_tensor jit_load jit_scalar autograd_set_grad_mode
+#' @importFrom torch torch_tensor jit_load jit_scalar autograd_set_grad_mode torch_float with_no_grad torch_set_num_threads
 NULL

R/silero.R

Lines changed: 53 additions & 37 deletions
@@ -3,8 +3,8 @@
 #' It works with .wav audio files with a sample rate of 8 or 16 Khz an can be applied over a window of eiher 32, 64 or 96 milliseconds.
 #' @param file the path to an audio file which should be a wav file in 16 bit with mono PCM samples (pcm_s16le codec) with a sampling rate of either 8Khz or 16KHz
 #' @param milliseconds integer with the number of milliseconds indicating to compute by this number of milliseconds the VAD signal. Can only be 32, 64 or 96 Defaults to 64.
-#' @param sample_rate integer with the sample rate of \code{file}. If not provided, will use package av to extract it.
 #' @param threshold numeric indicating if the probability is above this threshold, the segment is detected as voiced. Defaults to 0.5
+#' @param threads integer with the number of threads to use, which is passed on to \code{\link[torch]{torch_set_num_threads}}. Defaults to 1.
 #' @return an object of class \code{VAD} which is a list with elements
 #' \itemize{
 #' \item{file: the path to the file}
@@ -32,42 +32,42 @@
 #' plot(vad$vad$millisecond, vad$vad$probability, type = "l",
 #'      xlab = "Millisecond", ylab = "Probability voiced")
 #'
-#' \dontrun{
-#' library(av)
-#' x <- read_audio_bin(file)
+#' library(audio)
+#' x <- load.wave(file)
 #' plot(seq_along(x) / 16000, x, type = "l")
 #' abline(v = vad$vad_segments$start, col = "red", lwd = 2)
 #' abline(v = vad$vad_segments$end, col = "blue", lwd = 2)
 #'
+#' \dontrun{
 #' ##
 #' ## If you have audio which is not in mono or another sample rate
 #' ## consider using R package av to convert to the desired format
+#' library(av)
 #' av_media_info(file)
 #' av_audio_convert(file, output = "audio_pcm_16khz.wav",
 #'                  format = "wav", channels = 1, sample_rate = 16000)
 #' vad <- silero("audio_pcm_16khz.wav", milliseconds = 64)
 #' }
 silero <- function(file,
                    milliseconds = 64,
-                   sample_rate,
-                   threshold = 0.5){
+                   threshold = 0.5,
+                   threads = 1L){
   stopifnot(file.exists(file))
-  if(requireNamespace(package = "av", quietly = TRUE)){
-    info <- av::av_media_info(file)
-    if(info$audio$channels != 1){
-      stop(sprintf("%s does not contain audio in mono", file))
-    }
-    if(missing(sample_rate)){
-      sample_rate <- info$audio$sample_rate
-    }
-  }
   milliseconds <- as.integer(milliseconds)
   stopifnot(milliseconds %in% c(32L, 64L, 96L))
+  sound <- audio::load.wave(file)
+  sample_rate <- attr(sound, which = "rate")
+  sample_rate <- as.integer(sample_rate)
+  if(is.matrix(sound)){
+    stop(sprintf("%s does not contain audio in mono", file))
+  }
   if(!sample_rate %in% c(8000L, 16000L)){
     stop(sprintf("%s should be in 8000Hz or 16000Hz, not in %s Hz", file, sample_rate))
   }
+  torch_set_num_threads(threads)
+
   model <- SILERO()
-  msg <- predict.SILERO(model, file, sample_rate = sample_rate, milliseconds = milliseconds, threshold = threshold)
+  msg <- predict.SILERO(model, sound, file = file, sample_rate = sample_rate, milliseconds = milliseconds, threshold = threshold)
 
   ## Get groups of sequences of voice/non-voice
   grp <- rle(msg$vad$has_voice)
@@ -101,9 +101,7 @@ SILERO <- function(){
   out
 }
 
-predict.SILERO <- function(object, newdata, sample_rate, milliseconds, window = milliseconds * (sample_rate / 1000), threshold = 0.5){
-  sound <- audio::load.wave(newdata)
-  sample_rate <- attr(sound, which = "rate")
+predict.SILERO <- function(object, sound, file = "", sample_rate, milliseconds, window = milliseconds * (sample_rate / 1000), threshold = 0.5){
   n_samples <- length(sound)
   sample_rate <- torch::jit_scalar(as.integer(sample_rate))
 
@@ -113,31 +111,49 @@ predict.SILERO <- function(object, newdata, sample_rate, milliseconds, window =
   if(!window %in% c(256, 512, 768, 1024, 1536)){
     stop("Unknown combination of milliseconds and sample_rate")
   }
-
-  test <- torch::torch_tensor(sound)
-
   elements <- seq.int(from = 1, to = n_samples, by = window)
   out <- numeric(length = length(elements))
-  for(i in seq_along(elements)){
-    #cat(i, sep = "\n")
-    if((elements[i]+window-1) > n_samples){
-      samples <- sound[elements[i]:length(sound)]
-      samples <- c(samples, rep(as.numeric(0), times = window - length(samples)))
-      samples <- torch::torch_tensor(samples)
-      out[i] <- as.numeric(object$model$forward(samples, sr = sample_rate))
-    }else{
-      samples <- test[elements[i]:(elements[i]+window-1)]
-      #samples <- torch::torch_tensor(samples)
-      #print(str(samples))
-      out[i] <- as.numeric(object$model$forward(samples, sr = sample_rate))
-    }
-  }
+
+  # test <- torch::torch_tensor(sound)
+  # for(i in seq_along(elements)){
+  #   #cat(i, sep = "\n")
+  #   if((elements[i]+window-1) > n_samples){
+  #     samples <- sound[elements[i]:length(sound)]
+  #     samples <- c(samples, rep(as.numeric(0), times = window - length(samples)))
+  #     samples <- torch::torch_tensor(samples)
+  #     out[i] <- as.numeric(object$model$forward(samples, sr = sample_rate))
+  #   }else{
+  #     samples <- test[elements[i]:(elements[i]+window-1)]
+  #     #samples <- torch::torch_tensor(samples)
+  #     #print(str(samples))
+  #     out[i] <- as.numeric(object$model$forward(samples, sr = sample_rate))
+  #   }
+  # }
+
+  samples <- torch::torch_tensor(rep(0, times = window), dtype = torch::torch_float())
+  with_no_grad({
+    out <- sapply(seq_along(elements), FUN = function(i){
+      if((elements[i]+window-1) > n_samples){
+        samples <- sound[elements[i]:length(sound)]
+        samples <- c(samples, rep(as.numeric(0), times = window - length(samples)))
+        samples <- torch::torch_tensor(samples, dtype = torch::torch_float())
+        #samples[] <- samples
+        score <- object$model$forward(samples, sr = sample_rate)
+      }else{
+        samples[] <- sound[elements[i]:(elements[i]+window-1)]
+        score <- object$model$forward(samples, sr = sample_rate)
+      }
+      as.numeric(score)
+    }, USE.NAMES = FALSE)
+  })
+
+
   sample_rate <- as.integer(sample_rate)
   vad <- data.frame(millisecond = elements, probability = out, stringsAsFactors = FALSE)
   vad$has_voice <- ifelse(vad$probability > threshold, TRUE, FALSE)
   vad$millisecond <- as.integer(vad$millisecond / (sample_rate / 1000))
   msg <- list(
-    file= newdata,
+    file = file,
    sample_rate = sample_rate,
    channels = 1L,
    samples = n_samples,
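
To make the windowing refactor above easier to follow, here is a self-contained toy sketch of the same pattern: a preallocated float tensor refilled via samples[] <- ... inside with_no_grad() and sapply(). The score_fun below is a hypothetical stand-in for the Silero jit model (object$model$forward), not part of the package:

library(torch)
torch_set_num_threads(1L)                  # the commit's new default of a single thread
sound     <- runif(5000) - 0.5             # stand-in for the wav samples
window    <- 512L
n_samples <- length(sound)
elements  <- seq.int(from = 1, to = n_samples, by = window)
score_fun <- function(x) torch_mean(torch_abs(x))   # stand-in for object$model$forward()
samples   <- torch_tensor(rep(0, times = window), dtype = torch_float())
with_no_grad({
  out <- sapply(seq_along(elements), FUN = function(i){
    if((elements[i] + window - 1) > n_samples){
      chunk <- sound[elements[i]:n_samples]
      chunk <- c(chunk, rep(0, times = window - length(chunk)))  # zero-pad the last window
      score <- score_fun(torch_tensor(chunk, dtype = torch_float()))
    } else {
      samples[] <- sound[elements[i]:(elements[i] + window - 1)] # refill the preallocated tensor
      score <- score_fun(samples)
    }
    as.numeric(score)
  }, USE.NAMES = FALSE)
})
head(out)

Reusing one preallocated tensor avoids allocating a new tensor per window; only the final, zero-padded window still creates a fresh tensor, as in the package code above.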

man/silero.Rd

Lines changed: 9 additions & 3 deletions
Some generated files are not rendered by default.
