Fangjun Kuang
Committed by GitHub

Add Java API for spoken language identification with whisper multilingual models (#817)

@@ -57,6 +57,7 @@ jobs: @@ -57,6 +57,7 @@ jobs:
57 ./build-android-arm64-v8a.sh 57 ./build-android-arm64-v8a.sh
58 mkdir -p jniLibs/arm64-v8a/ 58 mkdir -p jniLibs/arm64-v8a/
59 cp -v ./build-android-arm64-v8a/install/lib/*.so ./jniLibs/arm64-v8a/ 59 cp -v ./build-android-arm64-v8a/install/lib/*.so ./jniLibs/arm64-v8a/
  60 + rm -rf ./build-android-arm64-v8a/
60 61
61 - name: build android armv7-eabi 62 - name: build android armv7-eabi
62 shell: bash 63 shell: bash
@@ -65,6 +66,7 @@ jobs: @@ -65,6 +66,7 @@ jobs:
65 ./build-android-armv7-eabi.sh 66 ./build-android-armv7-eabi.sh
66 mkdir -p ./jniLibs/armeabi-v7a/ 67 mkdir -p ./jniLibs/armeabi-v7a/
67 cp -v ./build-android-armv7-eabi/install/lib/*.so ./jniLibs/armeabi-v7a/ 68 cp -v ./build-android-armv7-eabi/install/lib/*.so ./jniLibs/armeabi-v7a/
  69 + rm -rf ./build-android-armv7-eabi
68 70
69 - name: build android x86_64 71 - name: build android x86_64
70 shell: bash 72 shell: bash
@@ -73,6 +75,7 @@ jobs: @@ -73,6 +75,7 @@ jobs:
73 ./build-android-x86-64.sh 75 ./build-android-x86-64.sh
74 mkdir -p ./jniLibs/x86_64 76 mkdir -p ./jniLibs/x86_64
75 cp -v ./build-android-x86-64/install/lib/*.so ./jniLibs/x86_64 77 cp -v ./build-android-x86-64/install/lib/*.so ./jniLibs/x86_64
  78 + rm -rf ./build-android-x86-64
76 79
77 - name: build android x86 80 - name: build android x86
78 shell: bash 81 shell: bash
@@ -81,6 +84,7 @@ jobs: @@ -81,6 +84,7 @@ jobs:
81 ./build-android-x86.sh 84 ./build-android-x86.sh
82 mkdir -p ./jniLibs/x86 85 mkdir -p ./jniLibs/x86
83 cp -v ./build-android-x86/install/lib/*.so ./jniLibs/x86 86 cp -v ./build-android-x86/install/lib/*.so ./jniLibs/x86
  87 + rm -rf ./build-android-x86
84 88
85 - name: Copy files 89 - name: Copy files
86 shell: bash 90 shell: bash
@@ -112,6 +116,8 @@ jobs: @@ -112,6 +116,8 @@ jobs:
112 command: | 116 command: |
113 git config --global user.email "csukuangfj@gmail.com" 117 git config --global user.email "csukuangfj@gmail.com"
114 git config --global user.name "Fangjun Kuang" 118 git config --global user.name "Fangjun Kuang"
  119 + du -h -d1 .
  120 + ls -lh
115 121
116 rm -rf huggingface 122 rm -rf huggingface
117 GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface 123 GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
@@ -44,6 +44,23 @@ jobs: @@ -44,6 +44,23 @@ jobs:
44 echo "ANDROID_NDK_LATEST_HOME: ${ANDROID_NDK_LATEST_HOME}" 44 echo "ANDROID_NDK_LATEST_HOME: ${ANDROID_NDK_LATEST_HOME}"
45 ls -lh ${ANDROID_NDK_LATEST_HOME} 45 ls -lh ${ANDROID_NDK_LATEST_HOME}
46 46
  47 + - name: Setup build tool version variable
  48 + shell: bash
  49 + run: |
  50 + echo "---"
  51 + ls -lh /usr/local/lib/android/
  52 + echo "---"
  53 +
  54 + ls -lh /usr/local/lib/android/sdk
  55 + echo "---"
  56 +
  57 + ls -lh /usr/local/lib/android/sdk/build-tools
  58 + echo "---"
  59 +
  60 + BUILD_TOOL_VERSION=$(ls /usr/local/lib/android/sdk/build-tools/ | tail -n 1)
  61 + echo "BUILD_TOOL_VERSION=$BUILD_TOOL_VERSION" >> $GITHUB_ENV
  62 + echo "Last build tool version is: $BUILD_TOOL_VERSION"
  63 +
47 - name: build APK 64 - name: build APK
48 shell: bash 65 shell: bash
49 run: | 66 run: |
@@ -59,13 +76,77 @@ jobs: @@ -59,13 +76,77 @@ jobs:
59 run: | 76 run: |
60 ls -lh ./apks/ 77 ls -lh ./apks/
61 78
62 - - uses: actions/upload-artifact@v4 79 +
  80 + # https://github.com/marketplace/actions/sign-android-release
  81 + - uses: r0adkll/sign-android-release@v1
  82 + name: Sign app APK
63 with: 83 with:
64 - path: ./apks/*.apk 84 + releaseDirectory: ./apks
  85 + signingKeyBase64: ${{ secrets.ANDROID_SIGNING_KEY }}
  86 + alias: ${{ secrets.ANDROID_SIGNING_KEY_ALIAS }}
  87 + keyStorePassword: ${{ secrets.ANDROID_SIGNING_KEY_STORE_PASSWORD }}
  88 + env:
  89 + BUILD_TOOLS_VERSION: ${{ env.BUILD_TOOL_VERSION }}
65 90
66 - - name: Release APK  
67 - uses: svenstaro/upload-release-action@v2 91 + - name: Display APK after signing
  92 + shell: bash
  93 + run: |
  94 + ls -lh ./apks/
  95 + du -h -d1 .
  96 +
  97 + - name: Rename APK after signing
  98 + shell: bash
  99 + run: |
  100 + cd apks
  101 + rm -fv signingKey.jks
  102 + rm -fv *.apk.idsig
  103 + rm -fv *-aligned.apk
  104 +
  105 + all_apks=$(ls -1 *-signed.apk)
  106 + echo "----"
  107 + echo $all_apks
  108 + echo "----"
  109 + for apk in ${all_apks[@]}; do
  110 + n=$(echo $apk | sed -e s/-signed//)
  111 + mv -v $apk $n
  112 + done
  113 +
  114 + cd ..
  115 +
  116 + ls -lh ./apks/
  117 + du -h -d1 .
  118 +
  119 + - name: Display APK after rename
  120 + shell: bash
  121 + run: |
  122 + ls -lh ./apks/
  123 + du -h -d1 .
  124 +
  125 + - name: Publish to huggingface
  126 + env:
  127 + HF_TOKEN: ${{ secrets.HF_TOKEN }}
  128 + uses: nick-fields/retry@v3
68 with: 129 with:
69 - file_glob: true  
70 - file: apks/*.apk  
71 - overwrite: true 130 + max_attempts: 20
  131 + timeout_seconds: 200
  132 + shell: bash
  133 + command: |
  134 + git config --global user.email "csukuangfj@gmail.com"
  135 + git config --global user.name "Fangjun Kuang"
  136 +
  137 + rm -rf huggingface
  138 + export GIT_LFS_SKIP_SMUDGE=1
  139 +
  140 + git clone https://huggingface.co/csukuangfj/sherpa-onnx-apk huggingface
  141 + cd huggingface
  142 + git fetch
  143 + git pull
  144 + git merge -m "merge remote" --ff origin main
  145 +
  146 + mkdir -p kws
  147 + cp -v ../apks/*.apk ./kws/
  148 + git status
  149 + git lfs track "*.apk"
  150 + git add .
  151 + git commit -m "add more apks"
  152 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-apk main
@@ -106,6 +106,14 @@ jobs: @@ -106,6 +106,14 @@ jobs:
106 make -j4 106 make -j4
107 ls -lh lib 107 ls -lh lib
108 108
  109 + - name: Run java test (Spoken language identification)
  110 + shell: bash
  111 + run: |
  112 + cd ./java-api-examples
  113 + ./run-spoken-language-identification-whisper.sh
  114 + # Delete model files to save space
  115 + rm -rf sherpa-onnx-whisper-*
  116 +
109 - name: Run java test (Streaming ASR) 117 - name: Run java test (Streaming ASR)
110 shell: bash 118 shell: bash
111 run: | 119 run: |
@@ -200,7 +200,7 @@ class MainActivity : AppCompatActivity() { @@ -200,7 +200,7 @@ class MainActivity : AppCompatActivity() {
200 val config = OnlineRecognizerConfig( 200 val config = OnlineRecognizerConfig(
201 featConfig = getFeatureConfig(sampleRate = sampleRateInHz, featureDim = 80), 201 featConfig = getFeatureConfig(sampleRate = sampleRateInHz, featureDim = 80),
202 modelConfig = getModelConfig(type = type)!!, 202 modelConfig = getModelConfig(type = type)!!,
203 - lmConfig = getOnlineLMConfig(type = type), 203 + // lmConfig = getOnlineLMConfig(type = type),
204 endpointConfig = getEndpointConfig(), 204 endpointConfig = getEndpointConfig(),
205 enableEndpoint = true, 205 enableEndpoint = true,
206 ) 206 )
1 lib 1 lib
2 hs_err* 2 hs_err*
3 -!run-streaming*.sh  
4 -!run-non-streaming*.sh 3 +!run-*.sh
@@ -29,3 +29,9 @@ This directory contains examples for the JAVA API of sherpa-onnx. @@ -29,3 +29,9 @@ This directory contains examples for the JAVA API of sherpa-onnx.
29 ./run-non-streaming-tts-coqui-de.sh 29 ./run-non-streaming-tts-coqui-de.sh
30 ./run-non-streaming-tts-vits-zh.sh 30 ./run-non-streaming-tts-vits-zh.sh
31 ``` 31 ```
  32 +
  33 +## Spoken language identification
  34 +
  35 +```bash
  36 +./run-spoken-language-identification-whisper.sh
  37 +```
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +// This file shows how to use a multilingual whisper model for
  4 +// spoken language identification.
  5 +//
  6 +// Note that it needs a multilingual whisper model. For instance,
  7 +// tiny works, but tiny.en doesn't.
  8 +import com.k2fsa.sherpa.onnx.*;
  9 +
  10 +public class SpokenLanguageIdentificationWhisper {
  11 + public static void main(String[] args) {
  12 + // please download model and test files from
  13 + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  14 + String encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx";
  15 + String decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx";
  16 +
  17 + String[] testFiles =
  18 + new String[] {
  19 + "./spoken-language-identification-test-wavs/en-english.wav",
  20 + "./spoken-language-identification-test-wavs/de-german.wav",
  21 + "./spoken-language-identification-test-wavs/zh-chinese.wav",
  22 + "./spoken-language-identification-test-wavs/es-spanish.wav",
  23 + "./spoken-language-identification-test-wavs/fa-persian.wav",
  24 + "./spoken-language-identification-test-wavs/ko-korean.wav",
  25 + "./spoken-language-identification-test-wavs/ja-japanese.wav",
  26 + "./spoken-language-identification-test-wavs/ru-russian.wav",
  27 + "./spoken-language-identification-test-wavs/uk-ukrainian.wav",
  28 + };
  29 +
  30 + SpokenLanguageIdentificationWhisperConfig whisper =
  31 + SpokenLanguageIdentificationWhisperConfig.builder()
  32 + .setEncoder(encoder)
  33 + .setDecoder(decoder)
  34 + .build();
  35 +
  36 + SpokenLanguageIdentificationConfig config =
  37 + SpokenLanguageIdentificationConfig.builder()
  38 + .setWhisper(whisper)
  39 + .setNumThreads(1)
  40 + .setDebug(true)
  41 + .build();
  42 +
  43 + SpokenLanguageIdentification slid = new SpokenLanguageIdentification(config);
  44 + for (String filename : testFiles) {
  45 + WaveReader reader = new WaveReader(filename);
  46 +
  47 + OfflineStream stream = slid.createStream();
  48 + stream.acceptWaveform(reader.getSamples(), reader.getSampleRate());
  49 +
  50 + String lang = slid.compute(stream);
  51 + System.out.println("---");
  52 + System.out.printf("filename: %s\n", filename);
  53 + System.out.printf("lang: %s\n", lang);
  54 +
  55 + stream.release();
  56 + }
  57 + System.out.println("---");
  58 +
  59 + slid.release();
  60 + }
  61 +}
#!/usr/bin/env bash
#
# Builds whatever is missing (JNI library, Java API jar), downloads the
# whisper tiny model and the test waves if they are not present, and then
# runs SpokenLanguageIdentificationWhisper.java.
#
# All paths are relative to the current directory, so run this script from
# the directory that contains it.

set -ex

# Build the JNI shared library only when neither the macOS (.dylib) nor the
# Linux (.so) artifact exists. With `set -e` a failed build aborts the script,
# so a single build block is sufficient.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Build the Java API jar if it is missing.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# Note that a multilingual whisper model is required: for example, tiny works
# while tiny.en does not.
# https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
if [ ! -f ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
  tar xvf sherpa-onnx-whisper-tiny.tar.bz2
  rm sherpa-onnx-whisper-tiny.tar.bz2
fi

# Download the test waves used by the Java example.
if [ ! -f ./spoken-language-identification-test-wavs/en-english.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/spoken-language-identification-test-wavs.tar.bz2
  tar xvf spoken-language-identification-test-wavs.tar.bz2
  rm spoken-language-identification-test-wavs.tar.bz2
fi

# Quote the library path so a working directory containing spaces still works.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  ./SpokenLanguageIdentificationWhisper.java
@@ -36,6 +36,10 @@ java_files += OfflineTtsConfig.java @@ -36,6 +36,10 @@ java_files += OfflineTtsConfig.java
36 java_files += GeneratedAudio.java 36 java_files += GeneratedAudio.java
37 java_files += OfflineTts.java 37 java_files += OfflineTts.java
38 38
  39 +java_files += SpokenLanguageIdentificationWhisperConfig.java
  40 +java_files += SpokenLanguageIdentificationConfig.java
  41 +java_files += SpokenLanguageIdentification.java
  42 +
39 class_files := $(java_files:%.java=%.class) 43 class_files := $(java_files:%.java=%.class)
40 44
41 java_files := $(addprefix src/$(package_dir)/,$(java_files)) 45 java_files := $(addprefix src/$(package_dir)/,$(java_files))
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +package com.k2fsa.sherpa.onnx;
  4 +
  5 +import java.util.HashMap;
  6 +import java.util.Locale;
  7 +import java.util.Map;
  8 +
  9 +public class SpokenLanguageIdentification {
  10 + static {
  11 + System.loadLibrary("sherpa-onnx-jni");
  12 + }
  13 +
  14 + private final Map<String, String> localeMap;
  15 + private long ptr = 0; // this is the asr engine ptrss
  16 +
  17 + public SpokenLanguageIdentification(SpokenLanguageIdentificationConfig config) {
  18 + ptr = newFromFile(config);
  19 +
  20 + String[] languages = Locale.getISOLanguages();
  21 + localeMap = new HashMap<String, String>(languages.length);
  22 + for (String language : languages) {
  23 + Locale locale = new Locale(language);
  24 + localeMap.put(language, locale.getDisplayName());
  25 + }
  26 + }
  27 +
  28 + public String compute(OfflineStream stream) {
  29 + String lang = compute(ptr, stream.getPtr());
  30 + return localeMap.getOrDefault(lang, lang);
  31 + }
  32 +
  33 + public OfflineStream createStream() {
  34 + long p = createStream(ptr);
  35 + return new OfflineStream(p);
  36 + }
  37 +
  38 + @Override
  39 + protected void finalize() throws Throwable {
  40 + release();
  41 + }
  42 +
  43 + // You'd better call it manually if it is not used anymore
  44 + public void release() {
  45 + if (this.ptr == 0) {
  46 + return;
  47 + }
  48 + delete(this.ptr);
  49 + this.ptr = 0;
  50 + }
  51 +
  52 + private native void delete(long ptr);
  53 +
  54 + private native long newFromFile(SpokenLanguageIdentificationConfig config);
  55 +
  56 + private native long createStream(long ptr);
  57 +
  58 + private native String compute(long ptr, long streamPtr);
  59 +}
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +package com.k2fsa.sherpa.onnx;
  4 +
  5 +public class SpokenLanguageIdentificationConfig {
  6 + private final SpokenLanguageIdentificationWhisperConfig whisper;
  7 + private final int numThreads;
  8 + private final boolean debug;
  9 + private final String provider;
  10 +
  11 + private SpokenLanguageIdentificationConfig(Builder builder) {
  12 + this.whisper = builder.whisper;
  13 + this.numThreads = builder.numThreads;
  14 + this.debug = builder.debug;
  15 + this.provider = builder.provider;
  16 + }
  17 +
  18 + public static Builder builder() {
  19 + return new Builder();
  20 + }
  21 +
  22 + public SpokenLanguageIdentificationWhisperConfig getWhisper() {
  23 + return whisper;
  24 + }
  25 +
  26 + public static class Builder {
  27 + private SpokenLanguageIdentificationWhisperConfig whisper = SpokenLanguageIdentificationWhisperConfig.builder().build();
  28 + private int numThreads = 1;
  29 + private boolean debug = true;
  30 + private String provider = "cpu";
  31 +
  32 + public SpokenLanguageIdentificationConfig build() {
  33 + return new SpokenLanguageIdentificationConfig(this);
  34 + }
  35 +
  36 + public Builder setWhisper(SpokenLanguageIdentificationWhisperConfig whisper) {
  37 + this.whisper = whisper;
  38 + return this;
  39 + }
  40 +
  41 + public Builder setNumThreads(int numThreads) {
  42 + this.numThreads = numThreads;
  43 + return this;
  44 + }
  45 +
  46 + public Builder setDebug(boolean debug) {
  47 + this.debug = debug;
  48 + return this;
  49 + }
  50 +
  51 + public Builder setProvider(String provider) {
  52 + this.provider = provider;
  53 + return this;
  54 + }
  55 + }
  56 +}
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +package com.k2fsa.sherpa.onnx;
  4 +
/**
 * Immutable whisper model settings for spoken language identification.
 *
 * <p>Instances are obtained through {@link #builder()}.
 */
public class SpokenLanguageIdentificationWhisperConfig {
  // NOTE(review): native code presumably looks these fields up by name, so
  // their names are left untouched; confirm against the JNI sources.
  private final String encoder;
  private final String decoder;
  private final int tailPaddings;

  private SpokenLanguageIdentificationWhisperConfig(
      String encoder, String decoder, int tailPaddings) {
    this.encoder = encoder;
    this.decoder = decoder;
    this.tailPaddings = tailPaddings;
  }

  /** @return a builder whose values start at the defaults */
  public static Builder builder() {
    return new Builder();
  }

  /** @return the encoder model path */
  public String getEncoder() {
    return encoder;
  }

  /** @return the decoder model path */
  public String getDecoder() {
    return decoder;
  }

  /** @return the number of frames to pad */
  public int getTailPaddings() {
    return tailPaddings;
  }

  /** Fluent builder; every setter returns the same builder instance. */
  public static class Builder {
    private String encoderPath = "";
    private String decoderPath = "";
    // number of frames to pad
    private int padFrames = 1000;

    public Builder setEncoder(String encoder) {
      encoderPath = encoder;
      return this;
    }

    public Builder setDecoder(String decoder) {
      decoderPath = decoder;
      return this;
    }

    public Builder setTailPaddings(int tailPaddings) {
      padFrames = tailPaddings;
      return this;
    }

    /** @return a configuration holding the values collected so far */
    public SpokenLanguageIdentificationWhisperConfig build() {
      return new SpokenLanguageIdentificationWhisperConfig(encoderPath, decoderPath, padFrames);
    }
  }
}