feat: android model download support

sultanmyrza · Jul 8, 2024 · da4f106 · da4f106
1 parent b532bb8
commit da4f106
Show file tree

Hide file tree

Showing 11 changed files with 271 additions and 57 deletions.
diff --git a/.npmignore b/.npmignore
@@ -9,3 +9,4 @@ __tests__
 /android/src/test/
 /android/build/
 /example/
+/images/
diff --git a/README.md b/README.md
@@ -222,3 +222,30 @@ console.log("Speech recognition services:", packages.join(", "));
 const available = ExpoSpeechRecognitionModule.isOnDeviceRecognitionAvailable();
 console.log("OnDevice recognition available:", available);
 ```
+
+### On Device Speech Recognition (Android)
+
+To use on-device speech recognition on Android, users first need to download the offline model for the locale they want to use.
+
+You can see which locales are supported and installed on your device by running `getSupportedLocales` with the `onDevice` option set to `true`.
+
+To download the offline model for a specific locale, use the `androidTriggerOfflineModelDownload` function.
+
+```ts
+import { ExpoSpeechRecognitionModule } from "@jamsch/expo-speech-recognition";
+
+// Download the offline model for the specified locale
+ExpoSpeechRecognitionModule.androidTriggerOfflineModelDownload({
+  locale: "en-US",
+})
+  .then(() => {
+    console.log("Offline model downloaded successfully!");
+  })
+  .catch((err) => {
+    console.log("Failed to download offline model!", err.message);
+  });
+```
+
+The device will display a dialog to download the model. Once the model is downloaded, you can use the `getSupportedLocales` function to get the list of installed locales.
+
+![On Device Recognition](./images/on-device-recognition.jpg)
diff --git a/android/src/main/java/expo/modules/speechrecognition/ExpoSpeechRecognitionModule.kt b/android/src/main/java/expo/modules/speechrecognition/ExpoSpeechRecognitionModule.kt
@@ -1,10 +1,12 @@
 package expo.modules.speechrecognition
 
 import android.Manifest.permission.RECORD_AUDIO
+import android.annotation.SuppressLint
 import android.content.Context
 import android.content.Intent
 import android.os.Build
 import android.os.Handler
+import android.speech.ModelDownloadListener
 import android.speech.RecognitionService
 import android.speech.RecognitionSupport
 import android.speech.RecognitionSupportCallback
@@ -22,14 +24,17 @@ import expo.modules.kotlin.records.Record
 import java.util.concurrent.Executors
 
 class SpeechRecognitionOptions : Record {
-    @Field val interimResults: Boolean = false
+    @Field
+    val interimResults: Boolean = false
 
-    @Field val lang: String = "en-US"
+    @Field
+    val lang: String = "en-US"
 
     @Field
     val continuous: Boolean = false
 
-    @Field val maxAlternatives: Int = 1
+    @Field
+    val maxAlternatives: Int = 1
 
     @Field
     var contextualStrings: List<String>? = null
@@ -55,6 +60,11 @@ class GetSupportedLocaleOptions : Record {
     val onDevice: Boolean = false
 }
 
+class TriggerOfflineModelDownloadOptions : Record { // Options accepted by the `androidTriggerOfflineModelDownload` module function.
+    @Field
+    val locale: String = "en-US" // Passed to the recognizer as RecognizerIntent.EXTRA_LANGUAGE when the download is triggered.
+}
+
 class ExpoSpeechRecognitionModule : Module() {
     // Each module class must implement the definition function. The definition consists of components
     // that describes the module's functionality and behavior.
@@ -136,7 +146,6 @@ class ExpoSpeechRecognitionModule : Module() {
                 val service =
                     ExpoSpeechService.getInstance(appContext.reactContext!!) { name, body ->
                         val nonNullBody = body ?: emptyMap()
-                        // Log.d("ESR", "Send event: $name with body: $nonNullBody")
                         sendEvent(name, nonNullBody)
                     }
                 service.start(options)
@@ -162,10 +171,63 @@ class ExpoSpeechRecognitionModule : Module() {
                     false
                 }
             }
+
+            // True while an offline model download is pending. It is set on the thread that runs
+            // the function body and cleared on the listener's executor thread, so an atomic flag
+            // keeps the check-and-set race-free and the reset visible across threads.
+            val isDownloadingModel = java.util.concurrent.atomic.AtomicBoolean(false)
+
+            // Asks the on-device recognizer to download the offline model for `options.locale`.
+            // Resolves with `true` once the listener reports success. Rejects with:
+            //  - "not_supported" on Android versions below UPSIDE_DOWN_CAKE,
+            //  - "download_in_progress" while an earlier request is still pending,
+            //  - "download_failed" if the download could not be started,
+            //  - "error_<code>" when the listener reports an error code.
+            AsyncFunction("androidTriggerOfflineModelDownload") { options: TriggerOfflineModelDownloadOptions, promise: Promise ->
+                if (Build.VERSION.SDK_INT < Build.VERSION_CODES.UPSIDE_DOWN_CAKE) {
+                    promise.reject("not_supported", "Android version is too old to trigger offline model download.", Throwable())
+                    return@AsyncFunction
+                }
+
+                val context = appContext.reactContext
+                if (context == null) {
+                    promise.reject("download_failed", "React context is not available.", Throwable())
+                    return@AsyncFunction
+                }
+
+                // Claim the "in progress" slot atomically so two overlapping calls cannot both pass.
+                if (!isDownloadingModel.compareAndSet(false, true)) {
+                    promise.reject("download_in_progress", "An offline model download is already in progress.", Throwable())
+                    return@AsyncFunction
+                }
+
+                val intent =
+                    Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH)
+                        .putExtra(RecognizerIntent.EXTRA_LANGUAGE, options.locale)
+
+                // The recognizer is created on the main looper.
+                Handler(context.mainLooper).post {
+                    val executor = Executors.newSingleThreadExecutor()
+                    var recognizer: SpeechRecognizer? = null
+
+                    // Frees everything tied to this request so that a later request can start.
+                    fun release() {
+                        isDownloadingModel.set(false)
+                        recognizer?.destroy()
+                        // Without this, every call would leave an idle worker thread behind.
+                        executor.shutdown()
+                    }
+
+                    try {
+                        val onDeviceRecognizer = SpeechRecognizer.createOnDeviceSpeechRecognizer(context)
+                        recognizer = onDeviceRecognizer
+                        onDeviceRecognizer.triggerModelDownload(
+                            intent,
+                            executor,
+                            @SuppressLint("NewApi")
+                            object : ModelDownloadListener {
+                                override fun onProgress(completedPercent: Int) {
+                                    // Todo: let user know the progress
+                                }
+
+                                override fun onSuccess() {
+                                    release()
+                                    promise.resolve(true)
+                                }
+
+                                override fun onScheduled() {
+                                    // Nothing to report; the promise settles in onSuccess/onError.
+                                }
+
+                                override fun onError(error: Int) {
+                                    release()
+                                    promise.reject(
+                                        "error_$error",
+                                        "Failed to download offline model with error: $error",
+                                        Throwable(),
+                                    )
+                                }
+                            },
+                        )
+                    } catch (e: Exception) {
+                        // Creating the recognizer or starting the download can throw (for example
+                        // when on-device recognition is unavailable). Forward the failure to the
+                        // caller instead of leaving the promise pending and the flag stuck.
+                        release()
+                        promise.reject(
+                            "download_failed",
+                            "Failed to start the offline model download: ${e.message}",
+                            e,
+                        )
+                    }
+                }
+            }
         }
 
     private fun hasNotGrantedPermissions(): Boolean = appContext.permissions?.hasGrantedPermissions(RECORD_AUDIO)?.not() ?: false
 
+    // private fun getAvailableLocales(appContext: Context, promise: Promise) {
+
     private fun getSupportedLocales(
         options: GetSupportedLocaleOptions,
         appContext: Context,
@@ -198,9 +260,11 @@ class ExpoSpeechRecognitionModule : Module() {
                 }
 
             val recognizerIntent = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH)
-            val pkg = options.androidRecognitionServicePackage ?: "com.google.android.googlequicksearchbox"
-            recognizerIntent.setPackage(pkg)
-            Log.d("ESR", "Recognizer intent: $recognizerIntent with package: $pkg")
+            if (!options.onDevice) {
+                val pkg = options.androidRecognitionServicePackage ?: "com.google.android.googlequicksearchbox"
+                recognizerIntent.setPackage(pkg)
+            }
+            Log.d("ESR", "Recognizer intent: $recognizerIntent")
 
             recognizer?.checkRecognitionSupport(
                 recognizerIntent,
@@ -213,8 +277,17 @@ class ExpoSpeechRecognitionModule : Module() {
                             return
                         }
                         didResolve = true
-                        val supportedLocales = recognitionSupport.supportedOnDeviceLanguages
-                        promise.resolve(supportedLocales)
+                        // Locales whose on-device models are already installed.
+                        // `supportedOnDeviceLanguages` (merged in below) are supported but need to be downloaded before use.
+                        val installedLocales = recognitionSupport.installedOnDeviceLanguages
+
+                        val locales = recognitionSupport.supportedOnDeviceLanguages.union(installedLocales)
+
+                        promise.resolve(
+                            mapOf(
+                                "locales" to locales,
+                                "installedLocales" to installedLocales,
+                            ),
+                        )
                         recognizer.destroy()
                     }
 
@@ -228,6 +301,7 @@ class ExpoSpeechRecognitionModule : Module() {
                                 Throwable(),
                             )
                         }
+                        recognizer.destroy()
                     }
                 },
             )

diff --git a/android/src/main/java/expo/modules/speechrecognition/ExpoSpeechService.kt b/android/src/main/java/expo/modules/speechrecognition/ExpoSpeechService.kt
@@ -59,7 +59,7 @@ class ExpoSpeechService
 
             for (service in services) {
                 if (service.serviceInfo.packageName == packageName) {
-                    Log.d("ESR", "Found service for package $packageName: ${service.serviceInfo.name}")
+                    log("Found service for package $packageName: ${service.serviceInfo.name}")
                     return ComponentName(service.serviceInfo.packageName, service.serviceInfo.name)
                 }
             }
@@ -128,10 +128,13 @@ class ExpoSpeechService
             // The server may ignore a request for partial results in some or all cases.
             intent.putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, options.interimResults)
 
-            intent.putExtra(
-                RecognizerIntent.EXTRA_LANGUAGE_MODEL,
-                RecognizerIntent.LANGUAGE_MODEL_FREE_FORM,
-            )
+            // Allow users to override the language model
+            if (options.androidIntentOptions?.containsKey("EXTRA_LANGUAGE_MODEL") != true) {
+                intent.putExtra(
+                    RecognizerIntent.EXTRA_LANGUAGE_MODEL,
+                    RecognizerIntent.LANGUAGE_MODEL_FREE_FORM,
+                )
+            }
 
             val contextualStrings = options.contextualStrings
             if (!contextualStrings.isNullOrEmpty() && Build.VERSION.SDK_INT >= Build.VERSION_CODES.TIRAMISU) {
@@ -147,9 +150,9 @@ class ExpoSpeechService
 
             // Offline recognition
             // to be used with ACTION_RECOGNIZE_SPEECH, ACTION_VOICE_SEARCH_HANDS_FREE, ACTION_WEB_SEARCH
-            // if (options.requiresOnDeviceRecognition) {
-            //     intent.putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true)
-            // }
+            if (options.requiresOnDeviceRecognition) {
+                intent.putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true)
+            }
 
             // Optional limit on the maximum number of results to return.
             // If omitted the recognizer will choose how many results to return. Must be an integer.
@@ -158,7 +161,7 @@ class ExpoSpeechService
             val language = options.lang.takeIf { it.isNotEmpty() } ?: Locale.getDefault().toString()
             intent.putExtra(RecognizerIntent.EXTRA_LANGUAGE, language)
 
-            Log.d("ESR", "androidIntentOptions: ${options.androidIntentOptions}")
+            log("androidIntentOptions: ${options.androidIntentOptions}")
 
             // Add any additional intent extras provided by the user
             options.androidIntentOptions?.forEach { (key, value) ->
@@ -167,7 +170,7 @@ class ExpoSpeechService
                 val field = RecognizerIntent::class.java.getDeclaredField(key)
                 val fieldValue = field.get(null) as? String
 
-                Log.d("ESR", "Resolved key $key -> $fieldValue with value: $value (${value.javaClass.name})")
+                log("Resolved key $key -> $fieldValue with value: $value (${value.javaClass.name})")
                 when (value) {
                     is Boolean -> intent.putExtra(fieldValue, value)
                     is Int -> intent.putExtra(fieldValue, value)
@@ -213,7 +216,7 @@ class ExpoSpeechService
         override fun onEndOfSpeech() {
             // recognitionState = RecognitionState.INACTIVE
             // sendEvent("end", null)
-            Log.d("ESR", "onEndOfSpeech()")
+            log("onEndOfSpeech()")
         }
 
         override fun onError(error: Int) {
@@ -227,7 +230,7 @@ class ExpoSpeechService
 
             sendEvent("error", mapOf("code" to errorInfo.error, "message" to errorInfo.message))
             sendEvent("end", null)
-            Log.d("ESR", "onError() - ${errorInfo.error}: ${errorInfo.message} - code: $error")
+            log("onError() - ${errorInfo.error}: ${errorInfo.message} - code: $error")
         }
 
         override fun onResults(results: Bundle?) {
@@ -241,7 +244,7 @@ class ExpoSpeechService
                 resultsList.add("")
             }
             sendEvent("result", mapOf("transcriptions" to resultsList, "isFinal" to true))
-            Log.d("ESR", "onResults()")
+            log("onResults(), transcriptions: ${resultsList.joinToString(", ")}")
             sendEvent("end", null)
         }
 
@@ -251,7 +254,7 @@ class ExpoSpeechService
                 partialResultsList.addAll(matches)
             }
             sendEvent("result", mapOf("transcriptions" to partialResultsList, "isFinal" to false))
-            Log.d("ESR", "onPartialResults()")
+            log("onPartialResults(), transcriptions: ${partialResultsList.joinToString(", ")}")
         }
 
         override fun onEvent(
@@ -293,7 +296,7 @@ class ExpoSpeechService
                     // Extra codes/messages
                     SpeechRecognizer.ERROR_RECOGNIZER_BUSY -> "RecognitionService busy."
                     SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> "No speech input."
-                    SpeechRecognizer.ERROR_LANGUAGE_UNAVAILABLE -> "The selected language is not available."
+                    SpeechRecognizer.ERROR_LANGUAGE_UNAVAILABLE -> "Requested language is supported, but not yet downloaded."
                     else -> "Unknown error"
                 }