Skip to content

Commit

Permalink
feat: android model download support
Browse files Browse the repository at this point in the history
  • Loading branch information
jamsch committed Jul 8, 2024
1 parent b532bb8 commit da4f106
Show file tree
Hide file tree
Showing 11 changed files with 271 additions and 57 deletions.
1 change: 1 addition & 0 deletions .npmignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ __tests__
/android/src/test/
/android/build/
/example/
/images/
27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -222,3 +222,30 @@ console.log("Speech recognition services:", packages.join(", "));
const available = ExpoSpeechRecognitionModule.isOnDeviceRecognitionAvailable();
console.log("OnDevice recognition available:", available);
```

### On Device Speech Recognition (Android)

Users on Android devices will first need to download the offline model for the locale they want to use in order to use the on-device speech recognition.

You can see which locales are supported and installed on your device by running `getSupportedLocales` with the `onDevice` option set to `true`.

To download the offline model for a specific locale, use the `androidTriggerOfflineModelDownload` function.

```ts
import { ExpoSpeechRecognitionModule } from "@jamsch/expo-speech-recognition";

// Download the offline model for the specified locale
ExpoSpeechRecognitionModule.androidTriggerOfflineModelDownload({
locale: "en-US",
})
.then(() => {
console.log("Offline model downloaded successfully!");
})
.catch((err) => {
console.log("Failed to download offline model!", err.message);
});
```

The device will display a dialog to download the model. Once the model is downloaded, you can use the `getSupportedLocales` function to get the list of installed locales.

![On Device Recognition](./images/on-device-recognition.jpg)
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
package expo.modules.speechrecognition

import android.Manifest.permission.RECORD_AUDIO
import android.annotation.SuppressLint
import android.content.Context
import android.content.Intent
import android.os.Build
import android.os.Handler
import android.speech.ModelDownloadListener
import android.speech.RecognitionService
import android.speech.RecognitionSupport
import android.speech.RecognitionSupportCallback
Expand All @@ -22,14 +24,17 @@ import expo.modules.kotlin.records.Record
import java.util.concurrent.Executors

class SpeechRecognitionOptions : Record {
@Field val interimResults: Boolean = false
@Field
val interimResults: Boolean = false

@Field val lang: String = "en-US"
@Field
val lang: String = "en-US"

@Field
val continuous: Boolean = false

@Field val maxAlternatives: Int = 1
@Field
val maxAlternatives: Int = 1

@Field
var contextualStrings: List<String>? = null
Expand All @@ -55,6 +60,11 @@ class GetSupportedLocaleOptions : Record {
val onDevice: Boolean = false
}

/**
 * Options record accepted by the `androidTriggerOfflineModelDownload` module function.
 */
class TriggerOfflineModelDownloadOptions : Record {
    /**
     * Locale whose offline model should be downloaded; it is forwarded as
     * `RecognizerIntent.EXTRA_LANGUAGE`. Defaults to `"en-US"`.
     */
    @Field
    val locale: String = "en-US"
}

class ExpoSpeechRecognitionModule : Module() {
// Each module class must implement the definition function. The definition consists of components
// that describes the module's functionality and behavior.
Expand Down Expand Up @@ -136,7 +146,6 @@ class ExpoSpeechRecognitionModule : Module() {
val service =
ExpoSpeechService.getInstance(appContext.reactContext!!) { name, body ->
val nonNullBody = body ?: emptyMap()
// Log.d("ESR", "Send event: $name with body: $nonNullBody")
sendEvent(name, nonNullBody)
}
service.start(options)
Expand All @@ -162,10 +171,63 @@ class ExpoSpeechRecognitionModule : Module() {
false
}
}

// Guards against overlapping downloads. Set right before a download is triggered and cleared
// again once the listener reports success or an error, or when triggering the download fails.
var isDownloadingModel = false

// Asks the on-device speech recognizer to download the offline model for `options.locale`.
// Resolves with `true` when the listener reports success. Rejects when a download is already
// in progress, when the Android version is below UPSIDE_DOWN_CAKE, when no React context is
// available, when the on-device recognizer cannot be created, or when the download fails.
AsyncFunction("androidTriggerOfflineModelDownload") { options: TriggerOfflineModelDownloadOptions, promise: Promise ->
    if (isDownloadingModel) {
        promise.reject("download_in_progress", "An offline model download is already in progress.", Throwable())
        return@AsyncFunction
    }

    if (Build.VERSION.SDK_INT < Build.VERSION_CODES.UPSIDE_DOWN_CAKE) {
        promise.reject("not_supported", "Android version is too old to trigger offline model download.", Throwable())
        return@AsyncFunction
    }

    // Resolve the context once, up front: a `!!` inside the main-thread runnable below would
    // crash the app on failure instead of rejecting the promise.
    val context = appContext.reactContext
    if (context == null) {
        promise.reject("context_unavailable", "React context is not available.", Throwable())
        return@AsyncFunction
    }

    isDownloadingModel = true
    val intent = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH)
    intent.putExtra(RecognizerIntent.EXTRA_LANGUAGE, options.locale)

    // The recognizer is created and used on the main looper.
    Handler(context.mainLooper).post {
        // createOnDeviceSpeechRecognizer throws UnsupportedOperationException when on-device
        // recognition is not available. Uncaught, that would crash the app (we are on the main
        // looper) and leave the guard flag stuck at `true`.
        val recognizer =
            try {
                SpeechRecognizer.createOnDeviceSpeechRecognizer(context)
            } catch (e: UnsupportedOperationException) {
                isDownloadingModel = false
                promise.reject(
                    "not_supported",
                    "On-device speech recognition is not available on this device.",
                    e,
                )
                return@post
            }

        try {
            recognizer.triggerModelDownload(
                intent,
                Executors.newSingleThreadExecutor(),
                @SuppressLint("NewApi")
                object : ModelDownloadListener {
                    override fun onProgress(p0: Int) {
                        // Todo: let user know the progress
                    }

                    override fun onSuccess() {
                        isDownloadingModel = false
                        promise.resolve(true)
                        recognizer.destroy()
                    }

                    override fun onScheduled() {
                        // Intentionally a no-op; the promise is settled in onSuccess/onError.
                    }

                    override fun onError(error: Int) {
                        isDownloadingModel = false
                        promise.reject(
                            "error_$error",
                            "Failed to download offline model download with error: $error",
                            Throwable(),
                        )
                        recognizer.destroy()
                    }
                },
            )
        } catch (e: RuntimeException) {
            // Triggering the download failed synchronously: release the guard and the
            // recognizer, and surface the failure to JS rather than crashing the main thread.
            isDownloadingModel = false
            recognizer.destroy()
            promise.reject(
                "download_failed",
                "Failed to trigger offline model download: ${e.message}",
                e,
            )
        }
    }
}
}

/**
 * Reports whether the RECORD_AUDIO permission has not been granted.
 *
 * @return `true` when the permissions manager says RECORD_AUDIO is not granted; `false` when it
 * is granted or when no permissions manager is available on [appContext].
 */
private fun hasNotGrantedPermissions(): Boolean {
    val permissionsManager = appContext.permissions ?: return false
    val isGranted = permissionsManager.hasGrantedPermissions(RECORD_AUDIO)
    return !isGranted
}

// private fun getAvailableLocales(appContext: Context, promise: Promise) {

private fun getSupportedLocales(
options: GetSupportedLocaleOptions,
appContext: Context,
Expand Down Expand Up @@ -198,9 +260,11 @@ class ExpoSpeechRecognitionModule : Module() {
}

val recognizerIntent = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH)
val pkg = options.androidRecognitionServicePackage ?: "com.google.android.googlequicksearchbox"
recognizerIntent.setPackage(pkg)
Log.d("ESR", "Recognizer intent: $recognizerIntent with package: $pkg")
if (!options.onDevice) {
val pkg = options.androidRecognitionServicePackage ?: "com.google.android.googlequicksearchbox"
recognizerIntent.setPackage(pkg)
}
Log.d("ESR", "Recognizer intent: $recognizerIntent")

recognizer?.checkRecognitionSupport(
recognizerIntent,
Expand All @@ -213,8 +277,17 @@ class ExpoSpeechRecognitionModule : Module() {
return
}
didResolve = true
val supportedLocales = recognitionSupport.supportedOnDeviceLanguages
promise.resolve(supportedLocales)
// Locales whose on-device models are already installed and ready to use.
val installedLocales = recognitionSupport.installedOnDeviceLanguages

val locales = recognitionSupport.supportedOnDeviceLanguages.union(installedLocales)

promise.resolve(
mapOf(
"locales" to locales,
"installedLocales" to installedLocales,
),
)
recognizer.destroy()
}

Expand All @@ -228,6 +301,7 @@ class ExpoSpeechRecognitionModule : Module() {
Throwable(),
)
}
recognizer.destroy()
}
},
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class ExpoSpeechService

for (service in services) {
if (service.serviceInfo.packageName == packageName) {
Log.d("ESR", "Found service for package $packageName: ${service.serviceInfo.name}")
log("Found service for package $packageName: ${service.serviceInfo.name}")
return ComponentName(service.serviceInfo.packageName, service.serviceInfo.name)
}
}
Expand Down Expand Up @@ -128,10 +128,13 @@ class ExpoSpeechService
// The server may ignore a request for partial results in some or all cases.
intent.putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, options.interimResults)

intent.putExtra(
RecognizerIntent.EXTRA_LANGUAGE_MODEL,
RecognizerIntent.LANGUAGE_MODEL_FREE_FORM,
)
// Allow users to override the language model
if (options.androidIntentOptions?.containsKey("EXTRA_LANGUAGE_MODEL") != true) {
intent.putExtra(
RecognizerIntent.EXTRA_LANGUAGE_MODEL,
RecognizerIntent.LANGUAGE_MODEL_FREE_FORM,
)
}

val contextualStrings = options.contextualStrings
if (!contextualStrings.isNullOrEmpty() && Build.VERSION.SDK_INT >= Build.VERSION_CODES.TIRAMISU) {
Expand All @@ -147,9 +150,9 @@ class ExpoSpeechService

// Offline recognition
// to be used with ACTION_RECOGNIZE_SPEECH, ACTION_VOICE_SEARCH_HANDS_FREE, ACTION_WEB_SEARCH
// if (options.requiresOnDeviceRecognition) {
// intent.putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true)
// }
if (options.requiresOnDeviceRecognition) {
intent.putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true)
}

// Optional limit on the maximum number of results to return.
// If omitted the recognizer will choose how many results to return. Must be an integer.
Expand All @@ -158,7 +161,7 @@ class ExpoSpeechService
val language = options.lang.takeIf { it.isNotEmpty() } ?: Locale.getDefault().toString()
intent.putExtra(RecognizerIntent.EXTRA_LANGUAGE, language)

Log.d("ESR", "androidIntentOptions: ${options.androidIntentOptions}")
log("androidIntentOptions: ${options.androidIntentOptions}")

// Add any additional intent extras provided by the user
options.androidIntentOptions?.forEach { (key, value) ->
Expand All @@ -167,7 +170,7 @@ class ExpoSpeechService
val field = RecognizerIntent::class.java.getDeclaredField(key)
val fieldValue = field.get(null) as? String

Log.d("ESR", "Resolved key $key -> $fieldValue with value: $value (${value.javaClass.name})")
log("Resolved key $key -> $fieldValue with value: $value (${value.javaClass.name})")
when (value) {
is Boolean -> intent.putExtra(fieldValue, value)
is Int -> intent.putExtra(fieldValue, value)
Expand Down Expand Up @@ -213,7 +216,7 @@ class ExpoSpeechService
override fun onEndOfSpeech() {
// recognitionState = RecognitionState.INACTIVE
// sendEvent("end", null)
Log.d("ESR", "onEndOfSpeech()")
log("onEndOfSpeech()")
}

override fun onError(error: Int) {
Expand All @@ -227,7 +230,7 @@ class ExpoSpeechService

sendEvent("error", mapOf("code" to errorInfo.error, "message" to errorInfo.message))
sendEvent("end", null)
Log.d("ESR", "onError() - ${errorInfo.error}: ${errorInfo.message} - code: $error")
log("onError() - ${errorInfo.error}: ${errorInfo.message} - code: $error")
}

override fun onResults(results: Bundle?) {
Expand All @@ -241,7 +244,7 @@ class ExpoSpeechService
resultsList.add("")
}
sendEvent("result", mapOf("transcriptions" to resultsList, "isFinal" to true))
Log.d("ESR", "onResults()")
log("onResults(), transcriptions: ${resultsList.joinToString(", ")}")
sendEvent("end", null)
}

Expand All @@ -251,7 +254,7 @@ class ExpoSpeechService
partialResultsList.addAll(matches)
}
sendEvent("result", mapOf("transcriptions" to partialResultsList, "isFinal" to false))
Log.d("ESR", "onPartialResults()")
log("onPartialResults(), transcriptions: ${partialResultsList.joinToString(", ")}")
}

override fun onEvent(
Expand Down Expand Up @@ -293,7 +296,7 @@ class ExpoSpeechService
// Extra codes/messages
SpeechRecognizer.ERROR_RECOGNIZER_BUSY -> "RecognitionService busy."
SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> "No speech input."
SpeechRecognizer.ERROR_LANGUAGE_UNAVAILABLE -> "The selected language is not available."
SpeechRecognizer.ERROR_LANGUAGE_UNAVAILABLE -> "Requested language is supported, but not yet downloaded."
else -> "Unknown error"
}

Expand Down
Loading

0 comments on commit da4f106

Please sign in to comment.