Fangjun Kuang
Committed by GitHub

Add C API for streaming HLG decoding (#734)

正在显示 39 个修改的文件，包含 839 行增加、8 行删除
... ... @@ -2,7 +2,10 @@
cd dotnet-examples/
cd spoken-language-identification
cd streaming-hlg-decoding/
./run.sh
cd ../spoken-language-identification
./run.sh
cd ../online-decode-files
... ...
... ... @@ -58,6 +58,13 @@ rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
node ./test-online-zipformer2-ctc.js
rm -rf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
node ./test-online-zipformer2-ctc-hlg.js
rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
# offline tts
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
... ...
... ... @@ -7,6 +7,10 @@ echo "pwd: $PWD"
cd swift-api-examples
ls -lh
./run-streaming-hlg-decode-file.sh
rm ./streaming-hlg-decode-file
rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
./run-spoken-language-identification.sh
rm -rf sherpa-onnx-whisper*
... ... @@ -31,4 +35,5 @@ sed -i.bak '20d' ./decode-file.swift
./run-decode-file-non-streaming.sh
ls -lh
... ...
... ... @@ -178,6 +178,7 @@ jobs:
cp -v scripts/dotnet/examples/online-decode-files.csproj dotnet-examples/online-decode-files/
cp -v scripts/dotnet/examples/speech-recognition-from-microphone.csproj dotnet-examples/speech-recognition-from-microphone/
cp -v scripts/dotnet/examples/spoken-language-identification.csproj dotnet-examples/spoken-language-identification/
cp -v scripts/dotnet/examples/streaming-hlg-decoding.csproj dotnet-examples/streaming-hlg-decoding
ls -lh /tmp
... ...
... ... @@ -66,12 +66,77 @@ jobs:
run: |
gcc --version
- name: Test speaker identification
- name: Test streaming HLG decoding (Linux/macOS)
if: matrix.os != 'windows-latest'
shell: bash
run: |
cd go-api-examples/streaming-hlg-decoding/
./run.sh
- name: Test speaker identification (Linux/macOS)
if: matrix.os != 'windows-latest'
shell: bash
run: |
cd go-api-examples/speaker-identification
./run.sh
- name: Test speaker identification (Win64)
if: matrix.os == 'windows-latest' && matrix.arch == 'x64'
shell: bash
run: |
cd go-api-examples/speaker-identification
go mod tidy
cat go.mod
go build
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx
git clone https://github.com/csukuangfj/sr-data
ls -lh
echo $PWD
ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/
ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/*
cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/x86_64-pc-windows-gnu/*.dll .
ls -lh
go mod tidy
go build
go run ./main.go
- name: Test speaker identification (Win32)
if: matrix.os == 'windows-latest' && matrix.arch == 'x86'
shell: bash
run: |
cd go-api-examples/speaker-identification
go mod tidy
cat go.mod
ls -lh
go env GOARCH
go env
echo "------------------------------"
go env -w GOARCH=386
go env -w CGO_ENABLED=1
go env
go clean
go build
echo $PWD
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx
git clone https://github.com/csukuangfj/sr-data
ls -lh
echo $PWD
ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/
ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/*
cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/i686-pc-windows-gnu/*.dll .
ls -lh
go mod tidy
go build
go run ./main.go
rm -rf sr-data
rm -rf *.onnx
- name: Test non-streaming TTS (Linux/macOS)
if: matrix.os != 'windows-latest'
shell: bash
... ...
... ... @@ -74,6 +74,12 @@ jobs:
go mod tidy
go build
- name: Test streaming HLG decoding
shell: bash
run: |
cd scripts/go/_internal/streaming-hlg-decoding/
./run.sh
- name: Test speaker identification
shell: bash
run: |
... ...
... ... @@ -15,6 +15,9 @@ target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api)
add_executable(speaker-identification-c-api speaker-identification-c-api.c)
target_link_libraries(speaker-identification-c-api sherpa-onnx-c-api)
add_executable(streaming-hlg-decode-file-c-api streaming-hlg-decode-file-c-api.c)
target_link_libraries(streaming-hlg-decode-file-c-api sherpa-onnx-c-api)
if(SHERPA_ONNX_HAS_ALSA)
add_subdirectory(./asr-microphone-example)
elseif((UNIX AND NOT APPLE) OR LINUX)
... ...
// c-api-examples/streaming-hlg-decode-file-c-api.c
//
// Copyright (c) 2024 Xiaomi Corporation
/*
We use the following model as an example
// clang-format off
Download the model from
https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
build/bin/streaming-hlg-decode-file-c-api
(The above model is from https://github.com/k2-fsa/icefall/pull/1557)
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "sherpa-onnx/c-api/c-api.h"
int32_t main() {
  // clang-format off
  //
  // Please download the model from
  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  const char *model = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx";
  const char *tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt";
  const char *graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst";
  const char *wav_filename = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav";
  // clang-format on

  // Zero-initialize so every field we do not set falls back to the
  // C API defaults inside CreateOnlineRecognizer().
  SherpaOnnxOnlineRecognizerConfig config;
  memset(&config, 0, sizeof(config));

  config.feat_config.sample_rate = 16000;
  config.feat_config.feature_dim = 80;
  config.model_config.zipformer2_ctc.model = model;
  config.model_config.tokens = tokens;
  config.model_config.num_threads = 1;
  config.model_config.provider = "cpu";
  config.model_config.debug = 0;

  // A non-empty graph path enables HLG (CTC FST) decoding.
  config.ctc_fst_decoder_config.graph = graph;

  const SherpaOnnxOnlineRecognizer *recognizer =
      CreateOnlineRecognizer(&config);
  if (!recognizer) {
    fprintf(stderr, "Failed to create recognizer\n");
    exit(-1);
  }

  const SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer);

  const SherpaOnnxDisplay *display = CreateDisplay(50);
  int32_t segment_id = 0;

  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
  if (wave == NULL) {
    fprintf(stderr, "Failed to read %s\n", wav_filename);
    // Release everything created so far before bailing out.
    DestroyOnlineStream(stream);
    DestroyDisplay(display);
    DestroyOnlineRecognizer(recognizer);
    exit(-1);
  }

// simulate streaming. You can choose an arbitrary N
#define N 3200

  fprintf(stderr, "sample rate: %d, num samples: %d, duration: %.2f s\n",
          wave->sample_rate, wave->num_samples,
          (float)wave->num_samples / wave->sample_rate);

  int32_t k = 0;
  while (k < wave->num_samples) {
    // Feed at most N samples per iteration to mimic a live audio source.
    int32_t start = k;
    int32_t end =
        (start + N > wave->num_samples) ? wave->num_samples : (start + N);
    k += N;

    AcceptWaveform(stream, wave->sample_rate, wave->samples + start,
                   end - start);
    while (IsOnlineStreamReady(recognizer, stream)) {
      DecodeOnlineStream(recognizer, stream);
    }

    const SherpaOnnxOnlineRecognizerResult *r =
        GetOnlineStreamResult(recognizer, stream);

    if (strlen(r->text)) {
      SherpaOnnxPrint(display, segment_id, r->text);
    }

    if (IsEndpoint(recognizer, stream)) {
      if (strlen(r->text)) {
        ++segment_id;
      }
      Reset(recognizer, stream);
    }

    DestroyOnlineRecognizerResult(r);
  }

  // add some tail padding
  float tail_paddings[4800] = {0};  // 0.3 seconds at 16 kHz sample rate
  AcceptWaveform(stream, wave->sample_rate, tail_paddings, 4800);

  SherpaOnnxFreeWave(wave);

  InputFinished(stream);
  while (IsOnlineStreamReady(recognizer, stream)) {
    DecodeOnlineStream(recognizer, stream);
  }

  const SherpaOnnxOnlineRecognizerResult *r =
      GetOnlineStreamResult(recognizer, stream);

  if (strlen(r->text)) {
    SherpaOnnxPrint(display, segment_id, r->text);
  }

  DestroyOnlineRecognizerResult(r);
  DestroyDisplay(display);
  DestroyOnlineStream(stream);
  DestroyOnlineRecognizer(recognizer);

  fprintf(stderr, "\n");

  return 0;
}
... ...
... ... @@ -5,7 +5,7 @@ function(download_onnxruntime)
message(STATUS "CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}")
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if(SHERPA_ONNX_ENABLE_WASM)
include(onnxruntime-wasm-simd)
include(onnxruntime-wasm-simd)
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL riscv64)
if(BUILD_SHARED_LIBS)
include(onnxruntime-linux-riscv64)
... ...
... ... @@ -15,6 +15,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts-play", "offline
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "spoken-language-identification", "spoken-language-identification\spoken-language-identification.csproj", "{3D7CF3D6-AC45-4D50-9619-5687B1443E94}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "streaming-hlg-decoding", "streaming-hlg-decoding\streaming-hlg-decoding.csproj", "{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
... ... @@ -48,5 +50,9 @@ Global
{3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Debug|Any CPU.Build.0 = Debug|Any CPU
{3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.ActiveCfg = Release|Any CPU
{3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.Build.0 = Release|Any CPU
{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
EndGlobal
... ...
// Copyright (c) 2024 Xiaomi Corporation
//
// This file shows how to do streaming HLG decoding.
//
// 1. Download the model for testing
//
// curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
// tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
// rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
//
// 2. Now run it
//
// dotnet run
using SherpaOnnx;
using System.Collections.Generic;
using System;
// Demonstrates streaming HLG (FST-based) decoding with a zipformer2 CTC model.
class StreamingHlgDecodingDemo
{
    static void Main(string[] args)
    {
        var config = new OnlineRecognizerConfig();
        // 16 kHz / 80-dim fbank features.
        config.FeatConfig.SampleRate = 16000;
        config.FeatConfig.FeatureDim = 80;

        config.ModelConfig.Zipformer2Ctc.Model = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx";
        config.ModelConfig.Tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt";
        config.ModelConfig.Provider = "cpu";
        config.ModelConfig.NumThreads = 1;
        config.ModelConfig.Debug = 0;

        // A non-empty graph path enables HLG decoding.
        config.CtcFstDecoderConfig.Graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst";

        OnlineRecognizer recognizer = new OnlineRecognizer(config);

        var filename = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav";
        WaveReader waveReader = new WaveReader(filename);

        OnlineStream s = recognizer.CreateStream();
        s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);

        // 0.3 s of trailing silence so the final frames are decoded.
        float[] tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
        s.AcceptWaveform(waveReader.SampleRate, tailPadding);
        s.InputFinished();

        while (recognizer.IsReady(s))
        {
            recognizer.Decode(s);
        }

        OnlineRecognizerResult r = recognizer.GetResult(s);
        var text = r.Text;
        var tokens = r.Tokens;

        Console.WriteLine("--------------------");
        Console.WriteLine(filename);
        Console.WriteLine("text: {0}", text);
        Console.WriteLine("tokens: [{0}]", string.Join(", ", tokens));
        Console.Write("timestamps: [");
        r.Timestamps.ToList().ForEach(i => Console.Write(String.Format("{0:0.00}", i) + ", "));
        Console.WriteLine("]");
        Console.WriteLine("--------------------");
    }
}
... ...
../online-decode-files/WaveReader.cs
\ No newline at end of file
... ...
#!/usr/bin/env bash
# Runs the streaming HLG decoding .NET example.
#
# Downloads the pre-trained streaming zipformer2 CTC model (with HLG.fst)
# on first use, then runs the example with `dotnet run`.

set -ex

# Download and unpack the model only if HLG.fst is not present yet.
if [ ! -f ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
fi

dotnet run -c Release
... ...
<Project Sdk="Microsoft.NET.Sdk">

  <!-- Console example: streaming HLG decoding with sherpa-onnx. -->
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
    <RootNamespace>streaming_hlg_decoding</RootNamespace>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <!-- "*" picks the newest published sherpa-onnx package. -->
    <PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
  </ItemGroup>

</Project>
... ...
// Go module for the streaming HLG decoding example.
module streaming-hlg-decoding

go 1.12
... ...
package main
import (
"bytes"
"encoding/binary"
sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
"github.com/youpy/go-wav"
"log"
"os"
"strings"
)
// main decodes one wave file with a streaming zipformer2 CTC model plus an
// HLG graph (FST-based decoding) and prints the recognized text.
func main() {
	log.SetFlags(log.LstdFlags | log.Lmicroseconds)

	config := sherpa.OnlineRecognizerConfig{}
	// 16 kHz / 80-dim fbank features.
	config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80}

	// please download model files from
	// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
	config.ModelConfig.Zipformer2Ctc.Model = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx"
	config.ModelConfig.Tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt"
	config.ModelConfig.NumThreads = 1
	config.ModelConfig.Debug = 0
	config.ModelConfig.Provider = "cpu"

	// A non-empty graph path enables HLG decoding.
	config.CtcFstDecoderConfig.Graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst"

	wav_filename := "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav"
	samples, sampleRate := readWave(wav_filename)

	log.Println("Initializing recognizer (may take several seconds)")
	recognizer := sherpa.NewOnlineRecognizer(&config)
	log.Println("Recognizer created!")
	defer sherpa.DeleteOnlineRecognizer(recognizer)

	log.Println("Start decoding!")
	stream := sherpa.NewOnlineStream(recognizer)
	defer sherpa.DeleteOnlineStream(stream)

	stream.AcceptWaveform(sampleRate, samples)

	// 0.3 s of trailing silence so the last frames are flushed out.
	tailPadding := make([]float32, int(float32(sampleRate)*0.3))
	stream.AcceptWaveform(sampleRate, tailPadding)

	for recognizer.IsReady(stream) {
		recognizer.Decode(stream)
	}
	log.Println("Decoding done!")

	result := recognizer.GetResult(stream)
	log.Println(strings.ToLower(result.Text))
	log.Printf("Wave duration: %v seconds", float32(len(samples))/float32(sampleRate))
}
// readWave reads a 16-bit, single-channel PCM wave file and returns its
// samples (normalized to [-1, 1)) together with the sample rate.
//
// As this is example code it terminates the program on any error.
func readWave(filename string) (samples []float32, sampleRate int) {
	file, err := os.Open(filename)
	if err != nil {
		// Bug fix: the error used to be silently discarded (file, _ := ...),
		// which led to a confusing failure inside the wav reader when the
		// file was missing.
		log.Fatalf("Failed to open %v: %v", filename, err)
	}
	defer file.Close()

	reader := wav.NewReader(file)
	format, err := reader.Format()
	if err != nil {
		log.Fatalf("Failed to read wave format")
	}

	if format.AudioFormat != 1 {
		log.Fatalf("Support only PCM format. Given: %v\n", format.AudioFormat)
	}

	if format.NumChannels != 1 {
		log.Fatalf("Support only 1 channel wave file. Given: %v\n", format.NumChannels)
	}

	if format.BitsPerSample != 16 {
		log.Fatalf("Support only 16-bit per sample. Given: %v\n", format.BitsPerSample)
	}

	reader.Duration() // so that it initializes reader.Size

	buf := make([]byte, reader.Size)
	n, err := reader.Read(buf)
	if n != int(reader.Size) {
		log.Fatalf("Failed to read %v bytes. Returned %v bytes\n", reader.Size, n)
	}

	samples = samplesInt16ToFloat(buf)
	sampleRate = int(format.SampleRate)

	return
}
// samplesInt16ToFloat decodes little-endian 16-bit PCM bytes into float32
// samples scaled to the range [-1, 1). An odd trailing byte is ignored.
func samplesInt16ToFloat(inSamples []byte) []float32 {
	n := len(inSamples) / 2
	out := make([]float32, n)

	for i := 0; i < n; i++ {
		var sample int16
		r := bytes.NewReader(inSamples[2*i : 2*i+2])
		if err := binary.Read(r, binary.LittleEndian, &sample); err != nil {
			log.Fatal("Failed to parse 16-bit sample")
		}
		out[i] = float32(sample) / 32768
	}

	return out
}
... ...
#!/usr/bin/env bash
# Builds and runs the streaming HLG decoding Go example.
#
# Downloads the pre-trained streaming zipformer2 CTC model (with HLG.fst)
# on first use.

set -ex

# Download and unpack the model only if HLG.fst is not present yet.
if [ ! -f ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
fi

go mod tidy
go build
ls -lh

./streaming-hlg-decoding
... ...
... ... @@ -174,3 +174,16 @@ wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherp
tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
node ./test-online-zipformer2-ctc.js
```
## ./test-online-zipformer2-ctc-hlg.js
[./test-online-zipformer2-ctc-hlg.js](./test-online-zipformer2-ctc-hlg.js) demonstrates
how to decode a file using a streaming zipformer2 CTC model with HLG. In the code
we use [sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2).
You can use the following command to run it:
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
node ./test-online-zipformer2-ctc-hlg.js
```
... ...
... ... @@ -50,6 +50,10 @@ function createOnlineRecognizer() {
rule3MinUtteranceLength: 20,
hotwordsFile: '',
hotwordsScore: 1.5,
ctcFstDecoderConfig: {
graph: '',
maxActive: 3000,
}
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
... ...
... ... @@ -51,6 +51,10 @@ function createOnlineRecognizer() {
rule3MinUtteranceLength: 20,
hotwordsFile: '',
hotwordsScore: 1.5,
ctcFstDecoderConfig: {
graph: '',
maxActive: 3000,
}
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
... ...
... ... @@ -52,6 +52,10 @@ function createOnlineRecognizer() {
rule3MinUtteranceLength: 20,
hotwordsFile: '',
hotwordsScore: 1.5,
ctcFstDecoderConfig: {
graph: '',
maxActive: 3000,
}
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
... ...
... ... @@ -53,6 +53,10 @@ function createOnlineRecognizer() {
rule3MinUtteranceLength: 20,
hotwordsFile: '',
hotwordsScore: 1.5,
ctcFstDecoderConfig: {
graph: '',
maxActive: 3000,
}
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
... ...
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
// Builds an online recognizer configured for streaming zipformer2 CTC
// decoding with an HLG graph (FST-based decoding).
function createOnlineRecognizer() {
  // One nested literal instead of separately staged config objects; the
  // resulting structure passed to sherpa_onnx is identical.
  const recognizerConfig = {
    featConfig: {
      sampleRate: 16000,
      featureDim: 80,
    },
    modelConfig: {
      // Unused model types are left empty.
      transducer: {encoder: '', decoder: '', joiner: ''},
      paraformer: {encoder: '', decoder: ''},
      zipformer2Ctc: {
        model:
            './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx',
      },
      tokens:
          './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt',
      numThreads: 1,
      provider: 'cpu',
      debug: 0,
      modelType: '',
    },
    decodingMethod: 'greedy_search',
    maxActivePaths: 4,
    enableEndpoint: 1,
    rule1MinTrailingSilence: 2.4,
    rule2MinTrailingSilence: 1.2,
    rule3MinUtteranceLength: 20,
    hotwordsFile: '',
    hotwordsScore: 1.5,
    // A non-empty graph path enables HLG decoding.
    ctcFstDecoderConfig: {
      graph: './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst',
      maxActive: 3000,
    }
  };

  return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
}
const recognizer = createOnlineRecognizer();
const stream = recognizer.createStream();

const waveFilename =
    './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav';

const reader = new wav.Reader();
const readable = new Readable().wrap(reader);

// Feeds one chunk of float samples to the stream, decodes as much as
// possible, and prints the current (partial) result.
function decode(samples) {
  stream.acceptWaveform(gSampleRate, samples);

  while (recognizer.isReady(stream)) {
    recognizer.decode(stream);
  }

  const text = recognizer.getResult(stream);
  console.log(text);
}

// Updated from the wave header so decoding uses the file's real sample rate.
let gSampleRate = 16000;

reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  gSampleRate = sampleRate;
  if (audioFormat != 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }
  if (channels != 1) {
    // Bug fix: previously referenced an undefined variable `channel`,
    // which itself threw a ReferenceError for multi-channel input.
    throw new Error(`Only a single channel. Given ${channels}`);
  }
  if (bitDepth != 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});

fs.createReadStream(waveFilename, {'highWaterMark': 4096})
    .pipe(reader)
    .on('finish', function(err) {
      // Tail padding: 0.5 s of silence to flush out the final frames.
      // NOTE(review): this uses the configured featConfig sample rate, not
      // gSampleRate from the file header — confirm that is intended.
      const floatSamples =
          new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
      decode(floatSamples);
      stream.free();
      recognizer.free();
    });

readable.on('readable', function() {
  let chunk;
  while ((chunk = readable.read()) != null) {
    // Reinterpret the raw bytes as 16-bit PCM and normalize to [-1, 1).
    const int16Samples = new Int16Array(
        chunk.buffer, chunk.byteOffset,
        chunk.length / Int16Array.BYTES_PER_ELEMENT);

    const floatSamples = new Float32Array(int16Samples.length);
    for (let i = 0; i < floatSamples.length; i++) {
      floatSamples[i] = int16Samples[i] / 32768.0;
    }

    decode(floatSamples);
  }
});
... ...
... ... @@ -51,6 +51,10 @@ function createOnlineRecognizer() {
rule3MinUtteranceLength: 20,
hotwordsFile: '',
hotwordsScore: 1.5,
ctcFstDecoderConfig: {
graph: '',
maxActive: 3000,
}
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
... ...
<Project Sdk="Microsoft.NET.Sdk">

  <!-- Console example: streaming HLG decoding with sherpa-onnx. -->
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
    <RootNamespace>streaming_hlg_decoding</RootNamespace>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <!-- Prefer locally built packages in /tmp/packages (CI) before nuget.org. -->
  <PropertyGroup>
    <RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
  </ItemGroup>

</Project>
... ...
... ... @@ -117,6 +117,21 @@ namespace SherpaOnnx
}
[StructLayout(LayoutKind.Sequential)]
public struct OnlineCtcFstDecoderConfig
{
    /// <summary>
    /// Creates a config with the defaults: empty graph (FST decoding
    /// disabled) and MaxActive = 3000.
    /// </summary>
    public OnlineCtcFstDecoderConfig()
    {
        Graph = "";
        MaxActive = 3000;
    }

    // Path to the decoding graph, e.g., HLG.fst. An empty string leaves
    // CTC FST decoding disabled.
    [MarshalAs(UnmanagedType.LPStr)]
    public string Graph;

    // Upper bound on active states kept during FST decoding.
    public int MaxActive;
}
[StructLayout(LayoutKind.Sequential)]
public struct OnlineRecognizerConfig
{
public OnlineRecognizerConfig()
... ... @@ -131,6 +146,7 @@ namespace SherpaOnnx
Rule3MinUtteranceLength = 20.0F;
HotwordsFile = "";
HotwordsScore = 1.5F;
CtcFstDecoderConfig = new OnlineCtcFstDecoderConfig();
}
public FeatureConfig FeatConfig;
public OnlineModelConfig ModelConfig;
... ... @@ -167,6 +183,8 @@ namespace SherpaOnnx
/// Bonus score for each token in hotwords.
public float HotwordsScore;
public OnlineCtcFstDecoderConfig CtcFstDecoderConfig;
}
public class OnlineRecognizerResult
... ...
// Go module for the streaming HLG decoding example (in-repo test copy).
module streaming-hlg-decoding

go 1.12

// Use the local checkout of the Go bindings instead of the published module.
replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
... ...
../../../../go-api-examples/streaming-hlg-decoding/main.go
\ No newline at end of file
... ...
../../../../go-api-examples/streaming-hlg-decoding/run.sh
\ No newline at end of file
... ...
... ... @@ -99,6 +99,11 @@ type FeatureConfig struct {
FeatureDim int
}
// Configuration for CTC FST (e.g., HLG) decoding of online models.
type OnlineCtcFstDecoderConfig struct {
	// Path to the decoding graph, e.g., HLG.fst. Empty leaves FST
	// decoding disabled.
	Graph string
	// Upper bound on active states kept during FST decoding.
	MaxActive int
}
// Configuration for the online/streaming recognizer.
type OnlineRecognizerConfig struct {
FeatConfig FeatureConfig
... ... @@ -120,6 +125,7 @@ type OnlineRecognizerConfig struct {
Rule1MinTrailingSilence float32
Rule2MinTrailingSilence float32
Rule3MinUtteranceLength float32
CtcFstDecoderConfig OnlineCtcFstDecoderConfig
}
// It contains the recognition result for a online stream.
... ... @@ -190,6 +196,10 @@ func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer {
c.rule2_min_trailing_silence = C.float(config.Rule2MinTrailingSilence)
c.rule3_min_utterance_length = C.float(config.Rule3MinUtteranceLength)
c.ctc_fst_decoder_config.graph = C.CString(config.CtcFstDecoderConfig.Graph)
defer C.free(unsafe.Pointer(c.ctc_fst_decoder_config.graph))
c.ctc_fst_decoder_config.max_active = C.int(config.CtcFstDecoderConfig.MaxActive)
recognizer := &OnlineRecognizer{}
recognizer.impl = C.CreateOnlineRecognizer(&c)
... ...
... ... @@ -99,6 +99,11 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer(
recognizer_config.hotwords_score =
SHERPA_ONNX_OR(config->hotwords_score, 1.5);
recognizer_config.ctc_fst_decoder_config.graph =
SHERPA_ONNX_OR(config->ctc_fst_decoder_config.graph, "");
recognizer_config.ctc_fst_decoder_config.max_active =
SHERPA_ONNX_OR(config->ctc_fst_decoder_config.max_active, 3000);
if (config->model_config.debug) {
SHERPA_ONNX_LOGE("%s\n", recognizer_config.ToString().c_str());
}
... ...
... ... @@ -96,6 +96,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxFeatureConfig {
int32_t feature_dim;
} SherpaOnnxFeatureConfig;
/// Configuration for CTC FST (e.g., HLG) decoding of online models.
SHERPA_ONNX_API typedef struct SherpaOnnxOnlineCtcFstDecoderConfig {
  /// Path to the decoding graph, e.g., HLG.fst. An empty string leaves
  /// FST decoding disabled.
  const char *graph;

  /// Upper bound on active states kept during decoding. 0 means the
  /// default (3000) is used.
  int32_t max_active;
} SherpaOnnxOnlineCtcFstDecoderConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerConfig {
SherpaOnnxFeatureConfig feat_config;
SherpaOnnxOnlineModelConfig model_config;
... ... @@ -131,6 +136,8 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerConfig {
/// Bonus score for each token in hotwords.
float hotwords_score;
SherpaOnnxOnlineCtcFstDecoderConfig ctc_fst_decoder_config;
} SherpaOnnxOnlineRecognizerConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerResult {
... ...
... ... @@ -7,3 +7,4 @@ vits-vctk
sherpa-onnx-paraformer-zh-2023-09-14
!*.sh
*.bak
streaming-hlg-decode-file
... ...
... ... @@ -111,6 +111,15 @@ func sherpaOnnxFeatureConfig(
feature_dim: Int32(featureDim))
}
/// Builds a SherpaOnnxOnlineCtcFstDecoderConfig for HLG (FST-based) decoding.
///
/// - Parameters:
///   - graph: Path to the decoding graph, e.g., HLG.fst. Empty (the
///     default) leaves FST decoding disabled.
///   - maxActive: Upper bound on active decoder states (default 3000).
func sherpaOnnxOnlineCtcFstDecoderConfig(
  graph: String = "",
  maxActive: Int = 3000
) -> SherpaOnnxOnlineCtcFstDecoderConfig {
  return SherpaOnnxOnlineCtcFstDecoderConfig(
    graph: toCPointer(graph),
    max_active: Int32(maxActive))
}
func sherpaOnnxOnlineRecognizerConfig(
featConfig: SherpaOnnxFeatureConfig,
modelConfig: SherpaOnnxOnlineModelConfig,
... ... @@ -121,7 +130,8 @@ func sherpaOnnxOnlineRecognizerConfig(
decodingMethod: String = "greedy_search",
maxActivePaths: Int = 4,
hotwordsFile: String = "",
hotwordsScore: Float = 1.5
hotwordsScore: Float = 1.5,
ctcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig = sherpaOnnxOnlineCtcFstDecoderConfig()
) -> SherpaOnnxOnlineRecognizerConfig {
return SherpaOnnxOnlineRecognizerConfig(
feat_config: featConfig,
... ... @@ -133,7 +143,9 @@ func sherpaOnnxOnlineRecognizerConfig(
rule2_min_trailing_silence: rule2MinTrailingSilence,
rule3_min_utterance_length: rule3MinUtteranceLength,
hotwords_file: toCPointer(hotwordsFile),
hotwords_score: hotwordsScore)
hotwords_score: hotwordsScore,
ctc_fst_decoder_config: ctcFstDecoderConfig
)
}
/// Wrapper for recognition result.
... ...
#!/usr/bin/env bash
# Builds (if needed) and runs the Swift streaming HLG decoding example
# against the locally built sherpa-onnx libraries in ../build-swift-macos.

set -ex

if [ ! -d ../build-swift-macos ]; then
  echo "Please run ../build-swift-macos.sh first!"
  exit 1
fi

# Download and unpack the pre-trained model only if HLG.fst is missing.
if [ ! -f ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst ]; then
  echo "Downloading the pre-trained model for testing."

  wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
fi

# Compile only when the binary does not exist yet.
if [ ! -e ./streaming-hlg-decode-file ]; then
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./streaming-hlg-decode-file.swift ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o streaming-hlg-decode-file

  strip ./streaming-hlg-decode-file
else
  echo "./streaming-hlg-decode-file exists - skip building"
fi

# The shared libraries live in the local install dir, not a system path.
export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH

./streaming-hlg-decode-file
... ...
import AVFoundation
extension AudioBuffer {
  /// Copies the buffer's contents into a Swift [Float] array.
  /// Assumes the underlying data is float32 — callers check the file's
  /// common format before using this.
  func array() -> [Float] {
    return Array(UnsafeBufferPointer(self))
  }
}

extension AVAudioPCMBuffer {
  /// Returns the samples of the first (mono) buffer as [Float].
  /// NOTE(review): only mBuffers (the first buffer) is read, so this is
  /// correct only for single-channel audio — confirmed by the caller's
  /// channelCount == 1 assertion.
  func array() -> [Float] {
    return self.audioBufferList.pointee.mBuffers.array()
  }
}
/// Decodes one wave file with a streaming zipformer2 CTC model plus an
/// HLG graph (FST-based decoding) and prints the final result.
func run() {
  let filePath =
    "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav"
  let model =
    "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx"
  let tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt"

  let zipfomer2CtcModelConfig = sherpaOnnxOnlineZipformer2CtcModelConfig(
    model: model
  )

  let modelConfig = sherpaOnnxOnlineModelConfig(
    tokens: tokens,
    zipformer2Ctc: zipfomer2CtcModelConfig
  )

  // 16 kHz / 80-dim fbank features.
  let featConfig = sherpaOnnxFeatureConfig(
    sampleRate: 16000,
    featureDim: 80
  )

  // A non-empty graph path enables HLG decoding.
  let ctcFstDecoderConfig = sherpaOnnxOnlineCtcFstDecoderConfig(
    graph: "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst",
    maxActive: 3000
  )

  var config = sherpaOnnxOnlineRecognizerConfig(
    featConfig: featConfig,
    modelConfig: modelConfig,
    ctcFstDecoderConfig: ctcFstDecoderConfig
  )

  let recognizer = SherpaOnnxRecognizer(config: &config)

  let fileURL: NSURL = NSURL(fileURLWithPath: filePath)
  let audioFile = try! AVAudioFile(forReading: fileURL as URL)

  let audioFormat = audioFile.processingFormat
  // Only mono float32 input is supported by this example.
  assert(audioFormat.channelCount == 1)
  assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)

  let audioFrameCount = UInt32(audioFile.length)
  let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)

  try! audioFile.read(into: audioFileBuffer!)
  let array: [Float]! = audioFileBuffer?.array()
  recognizer.acceptWaveform(samples: array, sampleRate: Int(audioFormat.sampleRate))

  // Tail padding (silence) so the last frames are flushed out of the model.
  let tailPadding = [Float](repeating: 0.0, count: 3200)
  recognizer.acceptWaveform(samples: tailPadding, sampleRate: Int(audioFormat.sampleRate))

  recognizer.inputFinished()
  while recognizer.isReady() {
    recognizer.decode()
  }

  let result = recognizer.getResult()
  print("\nresult is:\n\(result.text)")
}
/// Program entry point: runs the streaming HLG decoding demo once.
@main
struct App {
  static func main() {
    run()
  }
}
... ...
... ... @@ -43,6 +43,10 @@ function freeConfig(config, Module) {
freeConfig(config.lm, Module)
}
if ('ctcFstDecoder' in config) {
freeConfig(config.ctcFstDecoder, Module)
}
Module._free(config.ptr);
}
... ... @@ -193,11 +197,26 @@ function initSherpaOnnxFeatureConfig(config, Module) {
return {ptr: ptr, len: len};
}
// Serializes an online CTC FST decoder config into wasm heap memory.
//
// The layout must match SherpaOnnxOnlineCtcFstDecoderConfig in the C API:
// a 4-byte char* (graph) followed by an int32 (max_active) on wasm32.
function initSherpaOnnxOnlineCtcFstDecoderConfig(config, Module) {
  const len = 2 * 4;  // two 4-byte fields
  const ptr = Module._malloc(len);

  // Copy the graph path into its own NUL-terminated heap buffer.
  const graphLen = Module.lengthBytesUTF8(config.graph) + 1;
  const buffer = Module._malloc(graphLen);
  Module.stringToUTF8(config.graph, buffer, graphLen);

  Module.setValue(ptr, buffer, 'i8*');
  Module.setValue(ptr + 4, config.maxActive, 'i32');

  // NOTE(review): `buffer` is returned alongside `ptr`; confirm the
  // caller's freeConfig releases both to avoid a heap leak.
  return {ptr: ptr, len: len, buffer: buffer};
}
function initSherpaOnnxOnlineRecognizerConfig(config, Module) {
const feat = initSherpaOnnxFeatureConfig(config.featConfig, Module);
const model = initSherpaOnnxOnlineModelConfig(config.modelConfig, Module);
const ctcFstDecoder = initSherpaOnnxOnlineCtcFstDecoderConfig(
config.ctcFstDecoderConfig, Module)
const len = feat.len + model.len + 8 * 4;
const len = feat.len + model.len + 8 * 4 + ctcFstDecoder.len;
const ptr = Module._malloc(len);
let offset = 0;
... ... @@ -243,8 +262,11 @@ function initSherpaOnnxOnlineRecognizerConfig(config, Module) {
Module.setValue(ptr + offset, config.hotwordsScore, 'float');
offset += 4;
Module._CopyHeap(ctcFstDecoder.ptr, ctcFstDecoder.len, ptr + offset);
return {
buffer: buffer, ptr: ptr, len: len, feat: feat, model: model
buffer: buffer, ptr: ptr, len: len, feat: feat, model: model,
ctcFstDecoder: ctcFstDecoder
}
}
... ... @@ -313,6 +335,10 @@ function createOnlineRecognizer(Module, myConfig) {
rule3MinUtteranceLength: 20,
hotwordsFile: '',
hotwordsScore: 1.5,
ctcFstDecoderConfig: {
graph: '',
maxActive: 3000,
}
};
if (myConfig) {
recognizerConfig = myConfig;
... ...
... ... @@ -22,9 +22,11 @@ static_assert(sizeof(SherpaOnnxOnlineModelConfig) ==
sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 5 * 4,
"");
static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
static_assert(sizeof(SherpaOnnxOnlineCtcFstDecoderConfig) == 2 * 4, "");
static_assert(sizeof(SherpaOnnxOnlineRecognizerConfig) ==
sizeof(SherpaOnnxFeatureConfig) +
sizeof(SherpaOnnxOnlineModelConfig) + 8 * 4,
sizeof(SherpaOnnxOnlineModelConfig) + 8 * 4 +
sizeof(SherpaOnnxOnlineCtcFstDecoderConfig),
"");
void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) {
... ... @@ -67,6 +69,11 @@ void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) {
config->rule3_min_utterance_length);
fprintf(stdout, "hotwords_file: %s\n", config->hotwords_file);
fprintf(stdout, "hotwords_score: %.2f\n", config->hotwords_score);
fprintf(stdout, "----------ctc fst decoder config----------\n");
fprintf(stdout, "graph: %s\n", config->ctc_fst_decoder_config.graph);
fprintf(stdout, "max_active: %d\n",
config->ctc_fst_decoder_config.max_active);
}
void CopyHeap(const char *src, int32_t num_bytes, char *dst) {
... ...