Fangjun Kuang
Committed by GitHub

Add Kotlin and Java API for homophone replacer (#2166)

* Add Kotlin API for homophone replacer

* Add Java API for homophone replacer
... ... @@ -105,6 +105,14 @@ jobs:
make -j4
ls -lh lib
# Runs the Java homophone-replacer example from ./java-api-examples, then
# removes sherpa-onnx-sense-*, dict, lexicon.txt and replace.fst from that
# directory.
- name: Run java test (Non-streaming SenseVoice with homophone replacer)
shell: bash
run: |
cd ./java-api-examples
./run-non-streaming-decode-file-sense-voice-with-hr.sh
rm -rf sherpa-onnx-sense-*
rm -rf dict lexicon.txt replace.fst
- name: Run java test (VAD + Non-streaming Dolphin CTC)
shell: bash
run: |
... ...
../../../../../../../../../../sherpa-onnx/kotlin-api/HomophoneReplacerConfig.kt
\ No newline at end of file
... ...
../../../../../../../../../../sherpa-onnx/kotlin-api/HomophoneReplacerConfig.kt
\ No newline at end of file
... ...
../../../../../../../../../../sherpa-onnx/kotlin-api/HomophoneReplacerConfig.kt
\ No newline at end of file
... ...
../../../../../../../../../../sherpa-onnx/kotlin-api/HomophoneReplacerConfig.kt
\ No newline at end of file
... ...
../../../../../../../../../../sherpa-onnx/kotlin-api/HomophoneReplacerConfig.kt
\ No newline at end of file
... ...
// Copyright 2025 Xiaomi Corporation
// This file shows how to use an offline SenseVoice model,
// i.e., non-streaming SenseVoice model
// to decode files with homophone replacer.
import com.k2fsa.sherpa.onnx.*;
/**
 * Example: decode {@code ./test-hr.wav} with a non-streaming SenseVoice model
 * whose recognizer is configured with a homophone replacer, then print the
 * recognized text.
 */
public class NonStreamingDecodeFileSenseVoiceWithHr {
  /**
   * Entry point.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    // Model files can be obtained by following
    // https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html
    final String modelPath =
        "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
    final String tokensPath = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
    final String wavePath = "./test-hr.wav";

    // The wave file is opened before any configuration object is built.
    final WaveReader wave = new WaveReader(wavePath);

    final OfflineRecognizer asr = new OfflineRecognizer(buildConfig(modelPath, tokensPath));
    final OfflineStream audio = asr.createStream();

    audio.acceptWaveform(wave.getSamples(), wave.getSampleRate());
    asr.decode(audio);

    final String recognized = asr.getResult(audio).getText();
    System.out.printf("filename:%s\nresult:%s\n", wavePath, recognized);

    // Explicitly release the stream and then the recognizer once decoding is done.
    audio.release();
    asr.release();
  }

  /**
   * Assembles the recognizer configuration: SenseVoice model, greedy search
   * decoding, and a homophone replacer reading ./dict, ./lexicon.txt and
   * ./replace.fst.
   *
   * @param modelPath path to the SenseVoice ONNX model
   * @param tokensPath path to tokens.txt
   * @return the fully built recognizer configuration
   */
  private static OfflineRecognizerConfig buildConfig(String modelPath, String tokensPath) {
    final OfflineSenseVoiceModelConfig senseVoiceModel =
        OfflineSenseVoiceModelConfig.builder().setModel(modelPath).build();

    final OfflineModelConfig acousticModel =
        OfflineModelConfig.builder()
            .setSenseVoice(senseVoiceModel)
            .setTokens(tokensPath)
            .setNumThreads(1)
            .setDebug(true)
            .build();

    final HomophoneReplacerConfig homophoneReplacer =
        HomophoneReplacerConfig.builder()
            .setDictDir("./dict")
            .setLexicon("./lexicon.txt")
            .setRuleFsts("./replace.fst")
            .build();

    return OfflineRecognizerConfig.builder()
        .setOfflineModelConfig(acousticModel)
        .setDecodingMethod("greedy_search")
        .setHr(homophoneReplacer)
        .build();
  }
}
... ...
... ... @@ -31,6 +31,11 @@ This directory contains examples for the JAVA API of sherpa-onnx.
./run-non-streaming-decode-file-nemo.sh
```
## Non-Streaming Speech recognition with homophone replacer
```bash
./run-non-streaming-decode-file-sense-voice-with-hr.sh
```
## Non-Streaming text-to-speech
... ...
#!/usr/bin/env bash
#
# Runs NonStreamingDecodeFileSenseVoiceWithHr.java.
#
# Steps, each skipped when its output already exists:
#   1. build the sherpa-onnx JNI shared library in ../build
#   2. build ../sherpa-onnx/java-api/build/sherpa-onnx.jar
#   3. download the SenseVoice model
#   4. download the homophone-replacer files (dict/, replace.fst,
#      test-hr.wav, lexicon.txt)

set -ex

# Either the macOS (.dylib) or the Linux (.so) JNI library is sufficient.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
fi

if [ ! -d dict ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/dict.tar.bz2
  tar xf dict.tar.bz2
  rm dict.tar.bz2
fi

# Check each remaining file on its own instead of piggybacking on the dict/
# check: with `set -e`, a download that fails after dict/ was extracted would
# otherwise never be retried on the next run.
for hr_file in replace.fst test-hr.wav lexicon.txt; do
  if [ ! -f "$hr_file" ]; then
    curl -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/$hr_file"
  fi
done

# Quote $PWD so a checkout path containing spaces is passed as one argument.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingDecodeFileSenseVoiceWithHr.java
... ...
../sherpa-onnx/kotlin-api/HomophoneReplacerConfig.kt
\ No newline at end of file
... ...
... ... @@ -87,6 +87,7 @@ function testOnlineAsr() {
kotlinc-jvm -include-runtime -d $out_filename \
test_online_asr.kt \
FeatureConfig.kt \
HomophoneReplacerConfig.kt \
OnlineRecognizer.kt \
OnlineStream.kt \
WaveReader.kt \
... ... @@ -244,6 +245,7 @@ function testOfflineAsr() {
kotlinc-jvm -include-runtime -d $out_filename \
test_offline_asr.kt \
FeatureConfig.kt \
HomophoneReplacerConfig.kt \
OfflineRecognizer.kt \
OfflineStream.kt \
WaveReader.kt \
... ... @@ -272,6 +274,7 @@ function testInverseTextNormalizationOfflineAsr() {
kotlinc-jvm -include-runtime -d $out_filename \
test_itn_offline_asr.kt \
FeatureConfig.kt \
HomophoneReplacerConfig.kt \
OfflineRecognizer.kt \
OfflineStream.kt \
WaveReader.kt \
... ... @@ -300,6 +303,7 @@ function testInverseTextNormalizationOnlineAsr() {
kotlinc-jvm -include-runtime -d $out_filename \
test_itn_online_asr.kt \
FeatureConfig.kt \
HomophoneReplacerConfig.kt \
OnlineRecognizer.kt \
OnlineStream.kt \
WaveReader.kt \
... ... @@ -402,6 +406,38 @@ function testOfflineSpeechDenoiser() {
ls -lh *.wav
}
# Downloads the SenseVoice model and the homophone-replacer files when they
# are missing, compiles test_offline_sense_voice_with_hr.kt into a runnable
# jar, and runs it against the JNI library in ../build/lib.
function testOfflineSenseVoiceWithHr() {
  local hr_file

  if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
    tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
    rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  fi

  if [ ! -d dict ]; then
    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/dict.tar.bz2
    tar xf dict.tar.bz2
    rm dict.tar.bz2
  fi

  # Check every remaining file individually; tying them to the dict/ check
  # means an interrupted download is never retried once dict/ exists.
  for hr_file in replace.fst test-hr.wav lexicon.txt; do
    if [ ! -f "$hr_file" ]; then
      curl -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/$hr_file"
    fi
  done

  out_filename=test_offline_sense_voice_with_hr.jar
  kotlinc-jvm -include-runtime -d "$out_filename" \
    test_offline_sense_voice_with_hr.kt \
    FeatureConfig.kt \
    HomophoneReplacerConfig.kt \
    OfflineRecognizer.kt \
    OfflineStream.kt \
    WaveReader.kt \
    faked-asset-manager.kt

  ls -lh "$out_filename"

  java -Djava.library.path=../build/lib -jar "$out_filename"
}
testOfflineSenseVoiceWithHr
testOfflineSpeechDenoiser
testOfflineSpeakerDiarization
testSpeakerEmbeddingExtractor
... ...
package com.k2fsa.sherpa.onnx
/**
 * Decodes ./test-hr.wav with the recognizer returned by
 * [createOfflineRecognizer] and prints the recognition result.
 */
fun main() {
  val asr = createOfflineRecognizer()

  val wavePath = "./test-hr.wav"
  // readWaveFromFile returns an array: element 0 is cast to the samples and
  // element 1 to the sample rate.
  val wave = WaveReader.readWaveFromFile(filename = wavePath)
  val pcm = wave[0] as FloatArray
  val rate = wave[1] as Int

  val audio = asr.createStream()
  audio.acceptWaveform(pcm, sampleRate = rate)
  asr.decode(audio)

  println(asr.getResult(audio))

  // Release the stream first, then the recognizer that created it.
  audio.release()
  asr.release()
}
/**
 * Builds an [OfflineRecognizer] for model type 15 (presumably the SenseVoice
 * model used by this example — confirm against getOfflineModelConfig) with a
 * homophone replacer reading ./dict, ./lexicon.txt and ./replace.fst.
 */
fun createOfflineRecognizer(): OfflineRecognizer {
  val features = getFeatureConfig(sampleRate = 16000, featureDim = 80)

  // `!!` throws right here if no model config is returned for this type.
  val model = getOfflineModelConfig(type = 15)!!

  val homophoneReplacer = HomophoneReplacerConfig(
    dictDir = "./dict",
    lexicon = "./lexicon.txt",
    ruleFsts = "./replace.fst",
  )

  val recognizerConfig = OfflineRecognizerConfig(
    featConfig = features,
    modelConfig = model,
    hr = homophoneReplacer,
  )
  return OfflineRecognizer(config = recognizerConfig)
}
... ...
... ... @@ -11,6 +11,7 @@ java_files += WaveWriter.java
java_files += EndpointRule.java
java_files += EndpointConfig.java
java_files += FeatureConfig.java
java_files += HomophoneReplacerConfig.java
java_files += OnlineLMConfig.java
java_files += OnlineParaformerModelConfig.java
java_files += OnlineZipformer2CtcModelConfig.java
... ...
// Copyright 2025 Xiaomi Corporation
package com.k2fsa.sherpa.onnx;
/**
 * Immutable configuration for the homophone replacer, created through
 * {@link #builder()}.
 *
 * <p>The JNI layer looks up the private fields {@code dictDir},
 * {@code lexicon} and {@code ruleFsts} by name as {@code java.lang.String}
 * and converts them without a null check, so their names and types must stay
 * as they are and their values must never be {@code null}. The builder
 * enforces the latter by mapping {@code null} to the empty string, which is
 * also the default value of every field.
 */
public class HomophoneReplacerConfig {
  private final String dictDir;
  private final String lexicon;
  private final String ruleFsts;

  private HomophoneReplacerConfig(Builder builder) {
    this.dictDir = builder.dictDir;
    this.lexicon = builder.lexicon;
    this.ruleFsts = builder.ruleFsts;
  }

  /**
   * Creates a builder whose fields all start out as the empty string.
   *
   * @return a new builder
   */
  public static Builder builder() {
    return new Builder();
  }

  /** @return the configured dictionary directory; never {@code null} */
  public String getDictDir() {
    return dictDir;
  }

  /** @return the configured lexicon; never {@code null} */
  public String getLexicon() {
    return lexicon;
  }

  /** @return the configured rule FSTs; never {@code null} */
  public String getRuleFsts() {
    return ruleFsts;
  }

  /** Maps {@code null} to the empty string so native code never sees a null reference. */
  private static String emptyIfNull(String value) {
    return value == null ? "" : value;
  }

  /** Fluent builder for {@link HomophoneReplacerConfig}. */
  public static class Builder {
    private String dictDir = "";
    private String lexicon = "";
    private String ruleFsts = "";

    /**
     * Builds the configuration from the values set so far.
     *
     * @return a new configuration
     */
    public HomophoneReplacerConfig build() {
      return new HomophoneReplacerConfig(this);
    }

    /**
     * @param dictDir the dictionary directory; {@code null} is treated as ""
     * @return this builder
     */
    public Builder setDictDir(String dictDir) {
      this.dictDir = emptyIfNull(dictDir);
      return this;
    }

    /**
     * @param lexicon the lexicon; {@code null} is treated as ""
     * @return this builder
     */
    public Builder setLexicon(String lexicon) {
      this.lexicon = emptyIfNull(lexicon);
      return this;
    }

    /**
     * @param ruleFsts the rule FSTs; {@code null} is treated as ""
     * @return this builder
     */
    public Builder setRuleFsts(String ruleFsts) {
      this.ruleFsts = emptyIfNull(ruleFsts);
      return this;
    }
  }
}
... ...
... ... @@ -5,6 +5,7 @@ package com.k2fsa.sherpa.onnx;
public class OfflineRecognizerConfig {
private final FeatureConfig featConfig;
private final OfflineModelConfig modelConfig;
private final HomophoneReplacerConfig hr;
private final String decodingMethod;
private final int maxActivePaths;
private final String hotwordsFile;
... ... @@ -16,6 +17,7 @@ public class OfflineRecognizerConfig {
private OfflineRecognizerConfig(Builder builder) {
this.featConfig = builder.featConfig;
this.modelConfig = builder.modelConfig;
this.hr = builder.hr;
this.decodingMethod = builder.decodingMethod;
this.maxActivePaths = builder.maxActivePaths;
this.hotwordsFile = builder.hotwordsFile;
... ... @@ -36,6 +38,7 @@ public class OfflineRecognizerConfig {
public static class Builder {
private FeatureConfig featConfig = FeatureConfig.builder().build();
private OfflineModelConfig modelConfig = OfflineModelConfig.builder().build();
private HomophoneReplacerConfig hr = HomophoneReplacerConfig.builder().build();
private String decodingMethod = "greedy_search";
private int maxActivePaths = 4;
private String hotwordsFile = "";
... ... @@ -58,6 +61,11 @@ public class OfflineRecognizerConfig {
return this;
}
/**
 * Sets the homophone replacer configuration.
 *
 * @param hr the configuration to use; {@code null} falls back to the default
 *     configuration so the field is never {@code null}
 * @return this builder
 */
public Builder setHr(HomophoneReplacerConfig hr) {
  // The JNI layer calls GetObjectClass on this field without a null check.
  this.hr = (hr == null) ? HomophoneReplacerConfig.builder().build() : hr;
  return this;
}
public Builder setDecodingMethod(String decodingMethod) {
this.decodingMethod = decodingMethod;
return this;
... ...
... ... @@ -10,6 +10,7 @@ public class OnlineRecognizerConfig {
private final OnlineCtcFstDecoderConfig ctcFstDecoderConfig;
private final EndpointConfig endpointConfig;
private final HomophoneReplacerConfig hr;
private final boolean enableEndpoint;
private final String decodingMethod;
private final int maxActivePaths;
... ... @@ -25,6 +26,7 @@ public class OnlineRecognizerConfig {
this.lmConfig = builder.lmConfig;
this.ctcFstDecoderConfig = builder.ctcFstDecoderConfig;
this.endpointConfig = builder.endpointConfig;
this.hr = builder.hr;
this.enableEndpoint = builder.enableEndpoint;
this.decodingMethod = builder.decodingMethod;
this.maxActivePaths = builder.maxActivePaths;
... ... @@ -49,6 +51,7 @@ public class OnlineRecognizerConfig {
private OnlineLMConfig lmConfig = OnlineLMConfig.builder().build();
private OnlineCtcFstDecoderConfig ctcFstDecoderConfig = OnlineCtcFstDecoderConfig.builder().build();
private EndpointConfig endpointConfig = EndpointConfig.builder().build();
private HomophoneReplacerConfig hr = HomophoneReplacerConfig.builder().build();
private boolean enableEndpoint = true;
private String decodingMethod = "greedy_search";
private int maxActivePaths = 4;
... ... @@ -87,6 +90,11 @@ public class OnlineRecognizerConfig {
return this;
}
/**
 * Sets the homophone replacer configuration.
 *
 * @param hr the configuration to use; {@code null} falls back to the default
 *     configuration so the field is never {@code null}
 * @return this builder
 */
public Builder setHr(HomophoneReplacerConfig hr) {
  // The JNI layer calls GetObjectClass on this field without a null check.
  this.hr = (hr == null) ? HomophoneReplacerConfig.builder().build() : hr;
  return this;
}
public Builder setEnableEndpoint(boolean enableEndpoint) {
this.enableEndpoint = enableEndpoint;
return this;
... ...
... ... @@ -284,6 +284,30 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) {
ans.model_config.telespeech_ctc = p;
env->ReleaseStringUTFChars(s, p);
// homophone replacer config
fid = env->GetFieldID(cls, "hr",
"Lcom/k2fsa/sherpa/onnx/HomophoneReplacerConfig;");
jobject hr_config = env->GetObjectField(config, fid);
jclass hr_config_cls = env->GetObjectClass(hr_config);
fid = env->GetFieldID(hr_config_cls, "dictDir", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(hr_config, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.hr.dict_dir = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(hr_config_cls, "lexicon", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(hr_config, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.hr.lexicon = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(hr_config_cls, "ruleFsts", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(hr_config, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.hr.rule_fsts = p;
env->ReleaseStringUTFChars(s, p);
return ans;
}
... ...
... ... @@ -253,6 +253,30 @@ static OnlineRecognizerConfig GetConfig(JNIEnv *env, jobject config) {
ans.ctc_fst_decoder_config.max_active =
env->GetIntField(fst_decoder_config, fid);
// homophone replacer config
fid = env->GetFieldID(cls, "hr",
"Lcom/k2fsa/sherpa/onnx/HomophoneReplacerConfig;");
jobject hr_config = env->GetObjectField(config, fid);
jclass hr_config_cls = env->GetObjectClass(hr_config);
fid = env->GetFieldID(hr_config_cls, "dictDir", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(hr_config, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.hr.dict_dir = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(hr_config_cls, "lexicon", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(hr_config, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.hr.lexicon = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(hr_config_cls, "ruleFsts", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(hr_config, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.hr.rule_fsts = p;
env->ReleaseStringUTFChars(s, p);
return ans;
}
} // namespace sherpa_onnx
... ...
package com.k2fsa.sherpa.onnx
/**
 * Configuration for the homophone replacer.
 *
 * Every property defaults to the empty string.
 * NOTE(review): presumably an all-empty configuration leaves the replacer
 * disabled — confirm against the native implementation.
 *
 * @property dictDir dictionary directory (the example passes "./dict")
 * @property lexicon lexicon file (the example passes "./lexicon.txt")
 * @property ruleFsts rule FST file (the example passes "./replace.fst")
 */
data class HomophoneReplacerConfig(
var dictDir: String = "",
var lexicon: String = "",
var ruleFsts: String = "",
)
... ...
... ... @@ -78,6 +78,7 @@ data class OfflineRecognizerConfig(
var featConfig: FeatureConfig = FeatureConfig(),
var modelConfig: OfflineModelConfig = OfflineModelConfig(),
// var lmConfig: OfflineLMConfig(), // TODO(fangjun): enable it
var hr: HomophoneReplacerConfig = HomophoneReplacerConfig(),
var decodingMethod: String = "greedy_search",
var maxActivePaths: Int = 4,
var hotwordsFile: String = "",
... ...
... ... @@ -57,12 +57,12 @@ data class OnlineCtcFstDecoderConfig(
var maxActive: Int = 3000,
)
data class OnlineRecognizerConfig(
var featConfig: FeatureConfig = FeatureConfig(),
var modelConfig: OnlineModelConfig = OnlineModelConfig(),
var lmConfig: OnlineLMConfig = OnlineLMConfig(),
var ctcFstDecoderConfig: OnlineCtcFstDecoderConfig = OnlineCtcFstDecoderConfig(),
var hr: HomophoneReplacerConfig = HomophoneReplacerConfig(),
var endpointConfig: EndpointConfig = EndpointConfig(),
var enableEndpoint: Boolean = true,
var decodingMethod: String = "greedy_search",
... ...