Fangjun Kuang
Committed by GitHub

Add JNI support for spoken language identification (#782)
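Adds a SpokenLanguageIdentification wrapper to the Kotlin API (config data classes, stream creation, and a compute() method that returns the detected language), implements the matching JNI bindings in sherpa-onnx/jni/spoken-language-identification.cc, registers the new source file with the sherpa-onnx-jni CMake target, and extends the Kotlin test script. Along the way, the CI scripts switch from wget to curl and the TTS test moves from vits-zh-aishell3 to vits-icefall-zh-aishell3.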

@@ -161,10 +161,12 @@ jobs:
       ./run-vits-vctk.sh
       rm -rf vits-vctk

-      echo "Test vits-zh-aishell3"
-      git clone https://huggingface.co/csukuangfj/vits-zh-aishell3
+      echo "Test vits-icefall-zh-aishell3"
+      curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
+      tar xvf vits-icefall-zh-aishell3.tar.bz2
+      rm vits-icefall-zh-aishell3.tar.bz2
       ./run-vits-zh-aishell3.sh
-      rm -rf vits-zh-aishell3
+      rm -rf vits-icefall-zh-aishell3*

       echo "Test vits-piper-en_US-lessac-medium"
       git clone https://huggingface.co/csukuangfj/vits-piper-en_US-lessac-medium
@@ -92,3 +92,4 @@ sr-data

 vits-icefall-*
 sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12
+spoken-language-identification-test-wavs
@@ -6,7 +6,7 @@ import android.util.Log
 private val TAG = "sherpa-onnx"

 data class OfflineZipformerAudioTaggingModelConfig(
-    val model: String,
+    var model: String,
 )

 data class AudioTaggingModelConfig(
@@ -134,4 +134,4 @@ fun getAudioTaggingConfig(type: Int, numThreads: Int=1): AudioTaggingConfig? {
     }

     return null
-}
\ No newline at end of file
+}
@@ -7,6 +7,7 @@ fun callback(samples: FloatArray): Unit {
 }

 fun main() {
+    testSpokenLanguageIdentification()
     testAudioTagging()
     testSpeakerRecognition()
     testTts()
@@ -14,6 +15,41 @@ fun main() {
     testAsr("zipformer2-ctc")
 }

+fun testSpokenLanguageIdentification() {
+    val config = SpokenLanguageIdentificationConfig(
+        whisper = SpokenLanguageIdentificationWhisperConfig(
+            encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx",
+            decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx",
+            tailPaddings = 33,
+        ),
+        numThreads=1,
+        debug=true,
+        provider="cpu",
+    )
+    val slid = SpokenLanguageIdentification(assetManager=null, config=config)
+
+    val testFiles = arrayOf(
+        "./spoken-language-identification-test-wavs/ar-arabic.wav",
+        "./spoken-language-identification-test-wavs/bg-bulgarian.wav",
+        "./spoken-language-identification-test-wavs/de-german.wav",
+    )
+
+    for (waveFilename in testFiles) {
+        val objArray = WaveReader.readWaveFromFile(
+            filename = waveFilename,
+        )
+        val samples: FloatArray = objArray[0] as FloatArray
+        val sampleRate: Int = objArray[1] as Int
+
+        val stream = slid.createStream()
+        stream.acceptWaveform(samples, sampleRate = sampleRate)
+        val lang = slid.compute(stream)
+        stream.release()
+        println(waveFilename)
+        println(lang)
+    }
+}
+
 fun testAudioTagging() {
     val config = AudioTaggingConfig(
         model=AudioTaggingModelConfig(
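Note that slid.compute(stream) returns the detected language directly as a string; the JNI layer shown later in this diff converts the result of the C++ Compute() call with NewStringUTF(), so the test can print it as-is.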
@@ -5,32 +5,22 @@ import android.util.Log

 private val TAG = "sherpa-onnx"

-data class OfflineZipformerAudioTaggingModelConfig (
-    val model: String,
+data class SpokenLanguageIdentificationWhisperConfig (
+    var encoder: String,
+    var decoder: String,
+    var tailPaddings: Int = -1,
 )

-data class AudioTaggingModelConfig (
-    var zipformer: OfflineZipformerAudioTaggingModelConfig,
+data class SpokenLanguageIdentificationConfig (
+    var whisper: SpokenLanguageIdentificationWhisperConfig,
     var numThreads: Int = 1,
     var debug: Boolean = false,
     var provider: String = "cpu",
 )

-data class AudioTaggingConfig (
-    var model: AudioTaggingModelConfig,
-    var labels: String,
-    var topK: Int = 5,
-)
-
-data class AudioEvent (
-    val name: String,
-    val index: Int,
-    val prob: Float,
-)
-
-class AudioTagging(
+class SpokenLanguageIdentification (
     assetManager: AssetManager? = null,
-    config: AudioTaggingConfig,
+    config: SpokenLanguageIdentificationConfig,
 ) {
     private var ptr: Long

@@ -43,10 +33,10 @@ class AudioTagging(
     }

     protected fun finalize() {
-        if(ptr != 0) {
-            delete(ptr)
-            ptr = 0
-        }
+        if (ptr != 0L) {
+            delete(ptr)
+            ptr = 0
+        }
     }

     fun release() = finalize()
@@ -56,25 +46,22 @@ class AudioTagging(
         return OfflineStream(p)
     }

-    // fun compute(stream: OfflineStream, topK: Int=-1): Array<AudioEvent> {
-    fun compute(stream: OfflineStream, topK: Int=-1): Array<Any> {
-        var events :Array<Any> = compute(ptr, stream.ptr, topK)
-    }
+    fun compute(stream: OfflineStream) = compute(ptr, stream.ptr)

     private external fun newFromAsset(
         assetManager: AssetManager,
-        config: AudioTaggingConfig,
+        config: SpokenLanguageIdentificationConfig,
     ): Long

     private external fun newFromFile(
-        config: AudioTaggingConfig,
+        config: SpokenLanguageIdentificationConfig,
     ): Long

     private external fun delete(ptr: Long)

     private external fun createStream(ptr: Long): Long

-    private external fun compute(ptr: Long, streamPtr: Long, topK: Int): Array<Any>
+    private external fun compute(ptr: Long, streamPtr: Long): String

     companion object {
         init {
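The hunk ends inside the companion object's init block; presumably it still loads the sherpa-onnx-jni native library (e.g., via System.loadLibrary), which is the CMake target extended later in this diff.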
@@ -30,19 +30,19 @@ cd ../kotlin-api-examples

 function testSpeakerEmbeddingExtractor() {
   if [ ! -f ./3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx ]; then
-    wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx
+    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx
   fi

   if [ ! -f ./speaker1_a_cn_16k.wav ]; then
-    wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_a_cn_16k.wav
+    curl -SL -O https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_a_cn_16k.wav
   fi

   if [ ! -f ./speaker1_b_cn_16k.wav ]; then
-    wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_b_cn_16k.wav
+    curl -SL -O https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_b_cn_16k.wav
   fi

   if [ ! -f ./speaker2_a_cn_16k.wav ]; then
-    wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker2_a_cn_16k.wav
+    curl -SL -O https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker2_a_cn_16k.wav
   fi
 }
@@ -53,7 +53,7 @@ function testAsr() {
   fi

   if [ ! -d ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 ]; then
-    wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
+    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
     tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
     rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
   fi
@@ -61,7 +61,7 @@ function testAsr() {

 function testTts() {
   if [ ! -f ./vits-piper-en_US-amy-low/en_US-amy-low.onnx ]; then
-    wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
+    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
     tar xf vits-piper-en_US-amy-low.tar.bz2
     rm vits-piper-en_US-amy-low.tar.bz2
   fi
@@ -75,7 +75,22 @@ function testAudioTagging() {
   fi
 }

+function testSpokenLanguageIdentification() {
+  if [ ! -f ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx ]; then
+    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
+    tar xvf sherpa-onnx-whisper-tiny.tar.bz2
+    rm sherpa-onnx-whisper-tiny.tar.bz2
+  fi
+
+  if [ ! -f ./spoken-language-identification-test-wavs/ar-arabic.wav ]; then
+    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/spoken-language-identification-test-wavs.tar.bz2
+    tar xvf spoken-language-identification-test-wavs.tar.bz2
+    rm spoken-language-identification-test-wavs.tar.bz2
+  fi
+}
+
 function test() {
+  testSpokenLanguageIdentification
   testAudioTagging
   testSpeakerEmbeddingExtractor
   testAsr
@@ -90,6 +105,7 @@ kotlinc-jvm -include-runtime -d main.jar \
   OfflineStream.kt \
   SherpaOnnx.kt \
   Speaker.kt \
+  SpokenLanguageIdentification.kt \
   Tts.kt \
   WaveReader.kt \
   faked-asset-manager.kt \
@@ -101,13 +117,13 @@ java -Djava.library.path=../build/lib -jar main.jar

 function testTwoPass() {
   if [ ! -f ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.int8.onnx ]; then
-    wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
+    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
     tar xvf sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
     rm sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
   fi

   if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx ]; then
-    wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
     tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
     rm sherpa-onnx-whisper-tiny.en.tar.bz2
   fi
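Throughout these scripts, wget is replaced with curl -SL -O: -L follows redirects (GitHub release downloads redirect to a CDN), -O saves each file under its remote name, and -S makes error messages visible if a download fails, so the behavior matches the previous wget usage.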
@@ -13,6 +13,7 @@ add_library(sherpa-onnx-jni
   audio-tagging.cc
   jni.cc
   offline-stream.cc
+  spoken-language-identification.cc
 )
 target_link_libraries(sherpa-onnx-jni sherpa-onnx-core)
 install(TARGETS sherpa-onnx-jni DESTINATION lib)
+// sherpa-onnx/jni/spoken-language-identification.cc
+//
+// Copyright (c) 2024 Xiaomi Corporation
+
+#include "sherpa-onnx/csrc/spoken-language-identification.h"
+
+#include "sherpa-onnx/csrc/macros.h"
+#include "sherpa-onnx/jni/common.h"
+
+namespace sherpa_onnx {
+
+static SpokenLanguageIdentificationConfig GetSpokenLanguageIdentificationConfig(
+    JNIEnv *env, jobject config) {
+  SpokenLanguageIdentificationConfig ans;
+
+  jclass cls = env->GetObjectClass(config);
+  jfieldID fid = env->GetFieldID(
+      cls, "whisper",
+      "Lcom/k2fsa/sherpa/onnx/SpokenLanguageIdentificationWhisperConfig;");
+
+  jobject whisper = env->GetObjectField(config, fid);
+  jclass whisper_cls = env->GetObjectClass(whisper);
+
+  fid = env->GetFieldID(whisper_cls, "encoder", "Ljava/lang/String;");
+
+  jstring s = (jstring)env->GetObjectField(whisper, fid);
+  const char *p = env->GetStringUTFChars(s, nullptr);
+  ans.whisper.encoder = p;
+  env->ReleaseStringUTFChars(s, p);
+
+  fid = env->GetFieldID(whisper_cls, "decoder", "Ljava/lang/String;");
+  s = (jstring)env->GetObjectField(whisper, fid);
+  p = env->GetStringUTFChars(s, nullptr);
+  ans.whisper.decoder = p;
+  env->ReleaseStringUTFChars(s, p);
+
+  fid = env->GetFieldID(whisper_cls, "tailPaddings", "I");
+  ans.whisper.tail_paddings = env->GetIntField(whisper, fid);
+
+  fid = env->GetFieldID(cls, "numThreads", "I");
+  ans.num_threads = env->GetIntField(config, fid);
+
+  fid = env->GetFieldID(cls, "debug", "Z");
+  ans.debug = env->GetBooleanField(config, fid);
+
+  fid = env->GetFieldID(cls, "provider", "Ljava/lang/String;");
+  s = (jstring)env->GetObjectField(config, fid);
+  p = env->GetStringUTFChars(s, nullptr);
+  ans.provider = p;
+  env->ReleaseStringUTFChars(s, p);
+
+  return ans;
+}
+
+}  // namespace sherpa_onnx
+
+SHERPA_ONNX_EXTERN_C
+JNIEXPORT jlong JNICALL
+Java_com_k2fsa_sherpa_onnx_SpokenLanguageIdentification_newFromFile(
+    JNIEnv *env, jobject /*obj*/, jobject _config) {
+  auto config =
+      sherpa_onnx::GetSpokenLanguageIdentificationConfig(env, _config);
+  SHERPA_ONNX_LOGE("SpokenLanguageIdentification newFromFile config:\n%s",
+                   config.ToString().c_str());
+
+  if (!config.Validate()) {
+    SHERPA_ONNX_LOGE("Errors found in config!");
+    return 0;
+  }
+
+  auto slid = new sherpa_onnx::SpokenLanguageIdentification(config);
+
+  return (jlong)slid;
+}
+
+SHERPA_ONNX_EXTERN_C
+JNIEXPORT jlong JNICALL
+Java_com_k2fsa_sherpa_onnx_SpokenLanguageIdentification_createStream(
+    JNIEnv *env, jobject /*obj*/, jlong ptr) {
+  auto slid =
+      reinterpret_cast<sherpa_onnx::SpokenLanguageIdentification *>(ptr);
+  std::unique_ptr<sherpa_onnx::OfflineStream> s = slid->CreateStream();
+
+  // The user is responsible for freeing the returned pointer.
+  //
+  // See Java_com_k2fsa_sherpa_onnx_OfflineStream_delete() from
+  // ./offline-stream.cc
+  sherpa_onnx::OfflineStream *p = s.release();
+  return (jlong)p;
+}
+
+SHERPA_ONNX_EXTERN_C
+JNIEXPORT jstring JNICALL
+Java_com_k2fsa_sherpa_onnx_SpokenLanguageIdentification_compute(
+    JNIEnv *env, jobject /*obj*/, jlong ptr, jlong s_ptr) {
+  sherpa_onnx::SpokenLanguageIdentification *slid =
+      reinterpret_cast<sherpa_onnx::SpokenLanguageIdentification *>(ptr);
+  sherpa_onnx::OfflineStream *s =
+      reinterpret_cast<sherpa_onnx::OfflineStream *>(s_ptr);
+  std::string lang = slid->Compute(s);
+  return env->NewStringUTF(lang.c_str());
+}
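The Kotlin class above also declares external newFromAsset() and delete() bindings whose implementations are not shown in this excerpt. A minimal sketch of the delete binding, following the pattern of the functions above (the name and signature are inferred from the Kotlin declaration private external fun delete(ptr: Long), not copied from the repository):

// Hypothetical sketch, not part of this diff: the native counterpart of
// Kotlin's `private external fun delete(ptr: Long)`. It frees the object
// allocated in newFromFile()/newFromAsset(); the Kotlin finalize()/release()
// path calls it when ptr != 0L.
SHERPA_ONNX_EXTERN_C
JNIEXPORT void JNICALL
Java_com_k2fsa_sherpa_onnx_SpokenLanguageIdentification_delete(
    JNIEnv * /*env*/, jobject /*obj*/, jlong ptr) {
  delete reinterpret_cast<sherpa_onnx::SpokenLanguageIdentification *>(ptr);
}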