Fangjun Kuang
Committed by GitHub

Add C API for streaming HLG decoding (#734)

正在显示 39 个修改的文件 包含 839 行增加8 行删除
@@ -2,7 +2,10 @@ @@ -2,7 +2,10 @@
2 2
3 cd dotnet-examples/ 3 cd dotnet-examples/
4 4
5 -cd spoken-language-identification 5 +cd streaming-hlg-decoding/
  6 +./run.sh
  7 +
  8 +cd ../spoken-language-identification
6 ./run.sh 9 ./run.sh
7 10
8 cd ../online-decode-files 11 cd ../online-decode-files
@@ -58,6 +58,13 @@ rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2 @@ -58,6 +58,13 @@ rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
58 node ./test-online-zipformer2-ctc.js 58 node ./test-online-zipformer2-ctc.js
59 rm -rf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 59 rm -rf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
60 60
  61 +
  62 +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  63 +tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  64 +rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  65 +node ./test-online-zipformer2-ctc-hlg.js
  66 +rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
  67 +
61 # offline tts 68 # offline tts
62 69
63 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 70 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
@@ -7,6 +7,10 @@ echo "pwd: $PWD" @@ -7,6 +7,10 @@ echo "pwd: $PWD"
7 cd swift-api-examples 7 cd swift-api-examples
8 ls -lh 8 ls -lh
9 9
  10 +./run-streaming-hlg-decode-file.sh
  11 +rm ./streaming-hlg-decode-file
  12 +rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
  13 +
10 ./run-spoken-language-identification.sh 14 ./run-spoken-language-identification.sh
11 rm -rf sherpa-onnx-whisper* 15 rm -rf sherpa-onnx-whisper*
12 16
@@ -31,4 +35,5 @@ sed -i.bak '20d' ./decode-file.swift @@ -31,4 +35,5 @@ sed -i.bak '20d' ./decode-file.swift
31 35
32 ./run-decode-file-non-streaming.sh 36 ./run-decode-file-non-streaming.sh
33 37
  38 +
34 ls -lh 39 ls -lh
@@ -178,6 +178,7 @@ jobs: @@ -178,6 +178,7 @@ jobs:
178 cp -v scripts/dotnet/examples/online-decode-files.csproj dotnet-examples/online-decode-files/ 178 cp -v scripts/dotnet/examples/online-decode-files.csproj dotnet-examples/online-decode-files/
179 cp -v scripts/dotnet/examples/speech-recognition-from-microphone.csproj dotnet-examples/speech-recognition-from-microphone/ 179 cp -v scripts/dotnet/examples/speech-recognition-from-microphone.csproj dotnet-examples/speech-recognition-from-microphone/
180 cp -v scripts/dotnet/examples/spoken-language-identification.csproj dotnet-examples/spoken-language-identification/ 180 cp -v scripts/dotnet/examples/spoken-language-identification.csproj dotnet-examples/spoken-language-identification/
  181 + cp -v scripts/dotnet/examples/streaming-hlg-decoding.csproj dotnet-examples/streaming-hlg-decoding
181 182
182 ls -lh /tmp 183 ls -lh /tmp
183 184
@@ -66,12 +66,77 @@ jobs: @@ -66,12 +66,77 @@ jobs:
66 run: | 66 run: |
67 gcc --version 67 gcc --version
68 68
69 - - name: Test speaker identification 69 + - name: Test streaming HLG decoding (Linux/macOS)
  70 + if: matrix.os != 'windows-latest'
  71 + shell: bash
  72 + run: |
  73 + cd go-api-examples/streaming-hlg-decoding/
  74 + ./run.sh
  75 +
  76 + - name: Test speaker identification (Linux/macOS)
  77 + if: matrix.os != 'windows-latest'
70 shell: bash 78 shell: bash
71 run: | 79 run: |
72 cd go-api-examples/speaker-identification 80 cd go-api-examples/speaker-identification
73 ./run.sh 81 ./run.sh
74 82
  83 + - name: Test speaker identification (Win64)
  84 + if: matrix.os == 'windows-latest' && matrix.arch == 'x64'
  85 + shell: bash
  86 + run: |
  87 + cd go-api-examples/speaker-identification
  88 + go mod tidy
  89 + cat go.mod
  90 + go build
  91 +
  92 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx
  93 + git clone https://github.com/csukuangfj/sr-data
  94 + ls -lh
  95 + echo $PWD
  96 + ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/
  97 + ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/*
  98 + cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/x86_64-pc-windows-gnu/*.dll .
  99 + ls -lh
  100 + go mod tidy
  101 + go build
  102 + go run ./main.go
  103 +
  104 + - name: Test speaker identification (Win32)
  105 + if: matrix.os == 'windows-latest' && matrix.arch == 'x86'
  106 + shell: bash
  107 + run: |
  108 + cd go-api-examples/speaker-identification
  109 + go mod tidy
  110 + cat go.mod
  111 + ls -lh
  112 +
  113 + go env GOARCH
  114 + go env
  115 + echo "------------------------------"
  116 + go env -w GOARCH=386
  117 + go env -w CGO_ENABLED=1
  118 + go env
  119 +
  120 + go clean
  121 + go build
  122 +
  123 + echo $PWD
  124 +
  125 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx
  126 + git clone https://github.com/csukuangfj/sr-data
  127 + ls -lh
  128 + echo $PWD
  129 + ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/
  130 + ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/*
  131 + cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/i686-pc-windows-gnu/*.dll .
  132 + ls -lh
  133 + go mod tidy
  134 + go build
  135 + go run ./main.go
  136 +
  137 + rm -rf sr-data
  138 + rm -rf *.onnx
  139 +
75 - name: Test non-streaming TTS (Linux/macOS) 140 - name: Test non-streaming TTS (Linux/macOS)
76 if: matrix.os != 'windows-latest' 141 if: matrix.os != 'windows-latest'
77 shell: bash 142 shell: bash
@@ -74,6 +74,12 @@ jobs: @@ -74,6 +74,12 @@ jobs:
74 go mod tidy 74 go mod tidy
75 go build 75 go build
76 76
  77 + - name: Test streaming HLG decoding
  78 + shell: bash
  79 + run: |
  80 + cd scripts/go/_internal/streaming-hlg-decoding/
  81 + ./run.sh
  82 +
77 - name: Test speaker identification 83 - name: Test speaker identification
78 shell: bash 84 shell: bash
79 run: | 85 run: |
@@ -15,6 +15,9 @@ target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api) @@ -15,6 +15,9 @@ target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api)
15 add_executable(speaker-identification-c-api speaker-identification-c-api.c) 15 add_executable(speaker-identification-c-api speaker-identification-c-api.c)
16 target_link_libraries(speaker-identification-c-api sherpa-onnx-c-api) 16 target_link_libraries(speaker-identification-c-api sherpa-onnx-c-api)
17 17
  18 +add_executable(streaming-hlg-decode-file-c-api streaming-hlg-decode-file-c-api.c)
  19 +target_link_libraries(streaming-hlg-decode-file-c-api sherpa-onnx-c-api)
  20 +
18 if(SHERPA_ONNX_HAS_ALSA) 21 if(SHERPA_ONNX_HAS_ALSA)
19 add_subdirectory(./asr-microphone-example) 22 add_subdirectory(./asr-microphone-example)
20 elseif((UNIX AND NOT APPLE) OR LINUX) 23 elseif((UNIX AND NOT APPLE) OR LINUX)
  1 +// c-api-examples/streaming-hlg-decode-file-c-api.c
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +/*
  5 +We use the following model as an example
  6 +
  7 +// clang-format off
  8 +
  9 +Download the model from
  10 +https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  11 +
  12 +tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  13 +rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  14 +
  15 +build/bin/streaming-hlg-decode-file-c-api
  16 +
  17 +(The above model is from https://github.com/k2-fsa/icefall/pull/1557)
  18 +*/
  19 +#include <stdio.h>
  20 +#include <stdlib.h>
  21 +#include <string.h>
  22 +
  23 +#include "sherpa-onnx/c-api/c-api.h"
  24 +
  25 +int32_t main() {
  26 + // clang-format off
  27 + //
  28 + // Please download the model from
  29 + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  30 + const char *model = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx";
  31 + const char *tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt";
  32 + const char *graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst";
  33 + const char *wav_filename = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav";
  34 + // clang-format on
  35 +
  36 + SherpaOnnxOnlineRecognizerConfig config;
  37 +
  38 + memset(&config, 0, sizeof(config));
  39 + config.feat_config.sample_rate = 16000;
  40 + config.feat_config.feature_dim = 80;
  41 + config.model_config.zipformer2_ctc.model = model;
  42 + config.model_config.tokens = tokens;
  43 + config.model_config.num_threads = 1;
  44 + config.model_config.provider = "cpu";
  45 + config.model_config.debug = 0;
  46 + config.ctc_fst_decoder_config.graph = graph;
  47 + const SherpaOnnxOnlineRecognizer *recognizer =
  48 + CreateOnlineRecognizer(&config);
  49 + if (!recognizer) {
  50 + fprintf(stderr, "Failed to create recognizer");
  51 + exit(-1);
  52 + }
  53 +
  54 + const SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer);
  55 +
  56 + const SherpaOnnxDisplay *display = CreateDisplay(50);
  57 + int32_t segment_id = 0;
  58 +
  59 + const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
  60 + if (wave == NULL) {
  61 + fprintf(stderr, "Failed to read %s\n", wav_filename);
  62 + exit(-1);
  63 + }
  64 +
  65 +// simulate streaming. You can choose an arbitrary N
  66 +#define N 3200
  67 +
  68 + int16_t buffer[N];
  69 + float samples[N];
  70 + fprintf(stderr, "sample rate: %d, num samples: %d, duration: %.2f s\n",
  71 + wave->sample_rate, wave->num_samples,
  72 + (float)wave->num_samples / wave->sample_rate);
  73 +
  74 + int32_t k = 0;
  75 + while (k < wave->num_samples) {
  76 + int32_t start = k;
  77 + int32_t end =
  78 + (start + N > wave->num_samples) ? wave->num_samples : (start + N);
  79 + k += N;
  80 +
  81 + AcceptWaveform(stream, wave->sample_rate, wave->samples + start,
  82 + end - start);
  83 + while (IsOnlineStreamReady(recognizer, stream)) {
  84 + DecodeOnlineStream(recognizer, stream);
  85 + }
  86 +
  87 + const SherpaOnnxOnlineRecognizerResult *r =
  88 + GetOnlineStreamResult(recognizer, stream);
  89 +
  90 + if (strlen(r->text)) {
  91 + SherpaOnnxPrint(display, segment_id, r->text);
  92 + }
  93 +
  94 + if (IsEndpoint(recognizer, stream)) {
  95 + if (strlen(r->text)) {
  96 + ++segment_id;
  97 + }
  98 + Reset(recognizer, stream);
  99 + }
  100 +
  101 + DestroyOnlineRecognizerResult(r);
  102 + }
  103 +
  104 + // add some tail padding
  105 + float tail_paddings[4800] = {0}; // 0.3 seconds at 16 kHz sample rate
  106 + AcceptWaveform(stream, wave->sample_rate, tail_paddings, 4800);
  107 +
  108 + SherpaOnnxFreeWave(wave);
  109 +
  110 + InputFinished(stream);
  111 + while (IsOnlineStreamReady(recognizer, stream)) {
  112 + DecodeOnlineStream(recognizer, stream);
  113 + }
  114 +
  115 + const SherpaOnnxOnlineRecognizerResult *r =
  116 + GetOnlineStreamResult(recognizer, stream);
  117 +
  118 + if (strlen(r->text)) {
  119 + SherpaOnnxPrint(display, segment_id, r->text);
  120 + }
  121 +
  122 + DestroyOnlineRecognizerResult(r);
  123 +
  124 + DestroyDisplay(display);
  125 + DestroyOnlineStream(stream);
  126 + DestroyOnlineRecognizer(recognizer);
  127 + fprintf(stderr, "\n");
  128 +
  129 + return 0;
  130 +}
@@ -5,7 +5,7 @@ function(download_onnxruntime) @@ -5,7 +5,7 @@ function(download_onnxruntime)
5 message(STATUS "CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") 5 message(STATUS "CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}")
6 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") 6 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
7 if(SHERPA_ONNX_ENABLE_WASM) 7 if(SHERPA_ONNX_ENABLE_WASM)
8 - include(onnxruntime-wasm-simd) 8 + include(onnxruntime-wasm-simd)
9 elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL riscv64) 9 elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL riscv64)
10 if(BUILD_SHARED_LIBS) 10 if(BUILD_SHARED_LIBS)
11 include(onnxruntime-linux-riscv64) 11 include(onnxruntime-linux-riscv64)
@@ -15,6 +15,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts-play", "offline @@ -15,6 +15,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts-play", "offline
15 EndProject 15 EndProject
16 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "spoken-language-identification", "spoken-language-identification\spoken-language-identification.csproj", "{3D7CF3D6-AC45-4D50-9619-5687B1443E94}" 16 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "spoken-language-identification", "spoken-language-identification\spoken-language-identification.csproj", "{3D7CF3D6-AC45-4D50-9619-5687B1443E94}"
17 EndProject 17 EndProject
  18 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "streaming-hlg-decoding", "streaming-hlg-decoding\streaming-hlg-decoding.csproj", "{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}"
  19 +EndProject
18 Global 20 Global
19 GlobalSection(SolutionConfigurationPlatforms) = preSolution 21 GlobalSection(SolutionConfigurationPlatforms) = preSolution
20 Debug|Any CPU = Debug|Any CPU 22 Debug|Any CPU = Debug|Any CPU
@@ -48,5 +50,9 @@ Global @@ -48,5 +50,9 @@ Global
48 {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Debug|Any CPU.Build.0 = Debug|Any CPU 50 {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Debug|Any CPU.Build.0 = Debug|Any CPU
49 {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.ActiveCfg = Release|Any CPU 51 {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.ActiveCfg = Release|Any CPU
50 {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.Build.0 = Release|Any CPU 52 {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.Build.0 = Release|Any CPU
  53 + {C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
  54 + {C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Debug|Any CPU.Build.0 = Debug|Any CPU
  55 + {C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Release|Any CPU.ActiveCfg = Release|Any CPU
  56 + {C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Release|Any CPU.Build.0 = Release|Any CPU
51 EndGlobalSection 57 EndGlobalSection
52 EndGlobal 58 EndGlobal
  1 +// Copyright (c) 2024 Xiaomi Corporation
  2 +//
  3 +// This file shows how to do streaming HLG decoding.
  4 +//
  5 +// 1. Download the model for testing
  6 +//
  7 +// curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  8 +// tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  9 +// rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  10 +//
  11 +// 2. Now run it
  12 +//
  13 +// dotnet run
  14 +
  15 +using SherpaOnnx;
  16 +using System.Collections.Generic;
  17 +using System;
  18 +
// Demo program: decodes one wave file with a streaming zipformer2 CTC model
// and an HLG graph, then prints the text, tokens and timestamps.
class StreamingHlgDecodingDemo
{
  static void Main(string[] args)
  {
    // Recognizer configuration: model, tokens and HLG graph all come from
    // the downloaded model directory.
    var config = new OnlineRecognizerConfig();
    config.FeatConfig.SampleRate = 16000;
    config.FeatConfig.FeatureDim = 80;
    config.ModelConfig.Zipformer2Ctc.Model = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx";
    config.ModelConfig.Tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt";
    config.ModelConfig.Provider = "cpu";
    config.ModelConfig.NumThreads = 1;
    config.ModelConfig.Debug = 0;
    config.CtcFstDecoderConfig.Graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst";

    var recognizer = new OnlineRecognizer(config);

    var filename = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav";
    var waveReader = new WaveReader(filename);

    // Feed the whole file at once, followed by 0.3 seconds of silence.
    OnlineStream s = recognizer.CreateStream();
    s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);

    var tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
    s.AcceptWaveform(waveReader.SampleRate, tailPadding);
    s.InputFinished();

    // Decode until the recognizer reports nothing left to process.
    while (recognizer.IsReady(s))
    {
      recognizer.Decode(s);
    }

    OnlineRecognizerResult r = recognizer.GetResult(s);

    Console.WriteLine("--------------------");
    Console.WriteLine(filename);
    Console.WriteLine("text: {0}", r.Text);
    Console.WriteLine("tokens: [{0}]", string.Join(", ", r.Tokens));

    // Each timestamp is printed with two decimals followed by ", ".
    Console.Write("timestamps: [");
    foreach (var t in r.Timestamps)
    {
      Console.Write(String.Format("{0:0.00}", t) + ", ");
    }
    Console.WriteLine("]");
    Console.WriteLine("--------------------");
  }
}
  65 +
  66 +
  1 +../online-decode-files/WaveReader.cs
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -f ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst ]; then
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  7 + tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  8 + rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  9 +fi
  10 +
  11 +dotnet run -c Release
  1 +<Project Sdk="Microsoft.NET.Sdk">
  2 +
  3 + <PropertyGroup>
  4 + <OutputType>Exe</OutputType>
  5 + <TargetFramework>net6.0</TargetFramework>
  6 + <RootNamespace>streaming_hlg_decoding</RootNamespace>
  7 + <ImplicitUsings>enable</ImplicitUsings>
  8 + <Nullable>enable</Nullable>
  9 + </PropertyGroup>
  10 +
  11 + <ItemGroup>
  12 + <PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
  13 + </ItemGroup>
  14 +
  15 +</Project>
  1 +module streaming-hlg-decoding
  2 +
  3 +go 1.12
  1 +package main
  2 +
  3 +import (
  4 + "bytes"
  5 + "encoding/binary"
  6 + sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
  7 + "github.com/youpy/go-wav"
  8 + "log"
  9 + "os"
  10 + "strings"
  11 +)
  12 +
  13 +func main() {
  14 + log.SetFlags(log.LstdFlags | log.Lmicroseconds)
  15 +
  16 + config := sherpa.OnlineRecognizerConfig{}
  17 + config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80}
  18 +
  19 + // please download model files from
  20 + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  21 + config.ModelConfig.Zipformer2Ctc.Model = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx"
  22 + config.ModelConfig.Tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt"
  23 +
  24 + config.ModelConfig.NumThreads = 1
  25 + config.ModelConfig.Debug = 0
  26 + config.ModelConfig.Provider = "cpu"
  27 + config.CtcFstDecoderConfig.Graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst"
  28 +
  29 + wav_filename := "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav"
  30 +
  31 + samples, sampleRate := readWave(wav_filename)
  32 +
  33 + log.Println("Initializing recognizer (may take several seconds)")
  34 + recognizer := sherpa.NewOnlineRecognizer(&config)
  35 + log.Println("Recognizer created!")
  36 + defer sherpa.DeleteOnlineRecognizer(recognizer)
  37 +
  38 + log.Println("Start decoding!")
  39 + stream := sherpa.NewOnlineStream(recognizer)
  40 + defer sherpa.DeleteOnlineStream(stream)
  41 +
  42 + stream.AcceptWaveform(sampleRate, samples)
  43 +
  44 + tailPadding := make([]float32, int(float32(sampleRate)*0.3))
  45 + stream.AcceptWaveform(sampleRate, tailPadding)
  46 +
  47 + for recognizer.IsReady(stream) {
  48 + recognizer.Decode(stream)
  49 + }
  50 + log.Println("Decoding done!")
  51 + result := recognizer.GetResult(stream)
  52 + log.Println(strings.ToLower(result.Text))
  53 + log.Printf("Wave duration: %v seconds", float32(len(samples))/float32(sampleRate))
  54 +}
  55 +
  56 +func readWave(filename string) (samples []float32, sampleRate int) {
  57 + file, _ := os.Open(filename)
  58 + defer file.Close()
  59 +
  60 + reader := wav.NewReader(file)
  61 + format, err := reader.Format()
  62 + if err != nil {
  63 + log.Fatalf("Failed to read wave format")
  64 + }
  65 +
  66 + if format.AudioFormat != 1 {
  67 + log.Fatalf("Support only PCM format. Given: %v\n", format.AudioFormat)
  68 + }
  69 +
  70 + if format.NumChannels != 1 {
  71 + log.Fatalf("Support only 1 channel wave file. Given: %v\n", format.NumChannels)
  72 + }
  73 +
  74 + if format.BitsPerSample != 16 {
  75 + log.Fatalf("Support only 16-bit per sample. Given: %v\n", format.BitsPerSample)
  76 + }
  77 +
  78 + reader.Duration() // so that it initializes reader.Size
  79 +
  80 + buf := make([]byte, reader.Size)
  81 + n, err := reader.Read(buf)
  82 + if n != int(reader.Size) {
  83 + log.Fatalf("Failed to read %v bytes. Returned %v bytes\n", reader.Size, n)
  84 + }
  85 +
  86 + samples = samplesInt16ToFloat(buf)
  87 + sampleRate = int(format.SampleRate)
  88 +
  89 + return
  90 +}
  91 +
// samplesInt16ToFloat interprets inSamples as consecutive little-endian
// signed 16-bit samples and returns them as float32 values obtained by
// dividing each sample by 32768. A trailing odd byte, if any, is ignored.
func samplesInt16ToFloat(inSamples []byte) []float32 {
	numSamples := len(inSamples) / 2

	// Decode all samples with a single binary.Read call instead of creating
	// a new reader for every 2-byte sample.
	pcm := make([]int16, numSamples)
	err := binary.Read(bytes.NewReader(inSamples), binary.LittleEndian, pcm)
	if err != nil {
		log.Fatal("Failed to parse 16-bit sample")
	}

	outSamples := make([]float32, numSamples)
	for i, s16 := range pcm {
		outSamples[i] = float32(s16) / 32768
	}

	return outSamples
}
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -f ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst ]; then
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  7 + tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  8 + rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  9 +fi
  10 +
  11 +go mod tidy
  12 +go build
  13 +ls -lh
  14 +./streaming-hlg-decoding
@@ -174,3 +174,16 @@ wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherp @@ -174,3 +174,16 @@ wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherp
174 tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2 174 tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
175 node ./test-online-zipformer2-ctc.js 175 node ./test-online-zipformer2-ctc.js
176 ``` 176 ```
  177 +
  178 +## ./test-online-zipformer2-ctc-hlg.js
  179 +[./test-online-zipformer2-ctc-hlg.js](./test-online-zipformer2-ctc-hlg.js) demonstrates
  180 +how to decode a file using a streaming zipformer2 CTC model with HLG. In the code
  181 +we use [sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2).
  182 +
  183 +You can use the following command to run it:
  184 +
  185 +```bash
  186 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  187 +tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  188 +node ./test-online-zipformer2-ctc-hlg.js
  189 +```
@@ -50,6 +50,10 @@ function createOnlineRecognizer() { @@ -50,6 +50,10 @@ function createOnlineRecognizer() {
50 rule3MinUtteranceLength: 20, 50 rule3MinUtteranceLength: 20,
51 hotwordsFile: '', 51 hotwordsFile: '',
52 hotwordsScore: 1.5, 52 hotwordsScore: 1.5,
  53 + ctcFstDecoderConfig: {
  54 + graph: '',
  55 + maxActive: 3000,
  56 + }
53 }; 57 };
54 58
55 return sherpa_onnx.createOnlineRecognizer(recognizerConfig); 59 return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
@@ -51,6 +51,10 @@ function createOnlineRecognizer() { @@ -51,6 +51,10 @@ function createOnlineRecognizer() {
51 rule3MinUtteranceLength: 20, 51 rule3MinUtteranceLength: 20,
52 hotwordsFile: '', 52 hotwordsFile: '',
53 hotwordsScore: 1.5, 53 hotwordsScore: 1.5,
  54 + ctcFstDecoderConfig: {
  55 + graph: '',
  56 + maxActive: 3000,
  57 + }
54 }; 58 };
55 59
56 return sherpa_onnx.createOnlineRecognizer(recognizerConfig); 60 return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
@@ -52,6 +52,10 @@ function createOnlineRecognizer() { @@ -52,6 +52,10 @@ function createOnlineRecognizer() {
52 rule3MinUtteranceLength: 20, 52 rule3MinUtteranceLength: 20,
53 hotwordsFile: '', 53 hotwordsFile: '',
54 hotwordsScore: 1.5, 54 hotwordsScore: 1.5,
  55 + ctcFstDecoderConfig: {
  56 + graph: '',
  57 + maxActive: 3000,
  58 + }
55 }; 59 };
56 60
57 return sherpa_onnx.createOnlineRecognizer(recognizerConfig); 61 return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
@@ -53,6 +53,10 @@ function createOnlineRecognizer() { @@ -53,6 +53,10 @@ function createOnlineRecognizer() {
53 rule3MinUtteranceLength: 20, 53 rule3MinUtteranceLength: 20,
54 hotwordsFile: '', 54 hotwordsFile: '',
55 hotwordsScore: 1.5, 55 hotwordsScore: 1.5,
  56 + ctcFstDecoderConfig: {
  57 + graph: '',
  58 + maxActive: 3000,
  59 + }
56 }; 60 };
57 61
58 return sherpa_onnx.createOnlineRecognizer(recognizerConfig); 62 return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
  1 +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
  2 +//
  3 +const fs = require('fs');
  4 +const {Readable} = require('stream');
  5 +const wav = require('wav');
  6 +
  7 +const sherpa_onnx = require('sherpa-onnx');
  8 +
// Builds an online recognizer for the streaming zipformer2 CTC model,
// decoding with the HLG graph shipped in the same model directory.
// The transducer and paraformer entries are left empty because only the
// zipformer2 CTC model is used here.
function createOnlineRecognizer() {
  const recognizerConfig = {
    featConfig: {
      sampleRate: 16000,
      featureDim: 80,
    },
    modelConfig: {
      transducer: {
        encoder: '',
        decoder: '',
        joiner: '',
      },
      paraformer: {
        encoder: '',
        decoder: '',
      },
      zipformer2Ctc: {
        model:
            './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx',
      },
      tokens:
          './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt',
      numThreads: 1,
      provider: 'cpu',
      debug: 0,
      modelType: '',
    },
    decodingMethod: 'greedy_search',
    maxActivePaths: 4,
    enableEndpoint: 1,
    rule1MinTrailingSilence: 2.4,
    rule2MinTrailingSilence: 1.2,
    rule3MinUtteranceLength: 20,
    hotwordsFile: '',
    hotwordsScore: 1.5,
    ctcFstDecoderConfig: {
      graph: './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst',
      maxActive: 3000,
    },
  };

  return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
}
  61 +
  62 +const recognizer = createOnlineRecognizer();
  63 +const stream = recognizer.createStream();
  64 +
  65 +const waveFilename =
  66 + './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav';
  67 +
  68 +const reader = new wav.Reader();
  69 +const readable = new Readable().wrap(reader);
  70 +
// Feeds one chunk of float samples to the stream, decodes everything that
// is ready, and logs the current recognition result.
function decode(samples) {
  stream.acceptWaveform(gSampleRate, samples);

  for (; recognizer.isReady(stream);) {
    recognizer.decode(stream);
  }

  const currentResult = recognizer.getResult(stream);
  console.log(currentResult);
}
  80 +
  81 +let gSampleRate = 16000;
  82 +
// Runs once the wave header has been parsed. Records the file's sample rate
// for decode() and rejects anything that is not mono 16-bit PCM.
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
  gSampleRate = sampleRate;

  if (audioFormat != 1) {
    throw new Error(`Only support PCM format. Given ${audioFormat}`);
  }

  if (channels != 1) {
    // Report the destructured `channels` value; the previous `channel` was
    // an undefined name and raised a ReferenceError instead of this Error.
    throw new Error(`Only a single channel. Given ${channels}`);
  }

  if (bitDepth != 16) {
    throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
  }
});
  98 +
// Stream the wave file into the wav reader in 4096-byte chunks. Once the
// reader has finished, decode a block of zeros as tail padding and then
// release the stream and the recognizer.
const waveFileStream =
    fs.createReadStream(waveFilename, {'highWaterMark': 4096});

waveFileStream.pipe(reader).on('finish', () => {
  // tail padding
  const numPaddingSamples = recognizer.config.featConfig.sampleRate * 0.5;
  decode(new Float32Array(numPaddingSamples));

  stream.free();
  recognizer.free();
});
  109 +
// Whenever decoded PCM data is available, drain it chunk by chunk: view the
// bytes as 16-bit samples, scale them by 1/32768 into floats, and decode.
readable.on('readable', function() {
  for (let chunk = readable.read(); chunk != null; chunk = readable.read()) {
    const numSamples = chunk.length / Int16Array.BYTES_PER_ELEMENT;
    const int16Samples =
        new Int16Array(chunk.buffer, chunk.byteOffset, numSamples);

    const floatSamples =
        Float32Array.from(int16Samples, (sample) => sample / 32768.0);

    decode(floatSamples);
  }
});
@@ -51,6 +51,10 @@ function createOnlineRecognizer() { @@ -51,6 +51,10 @@ function createOnlineRecognizer() {
51 rule3MinUtteranceLength: 20, 51 rule3MinUtteranceLength: 20,
52 hotwordsFile: '', 52 hotwordsFile: '',
53 hotwordsScore: 1.5, 53 hotwordsScore: 1.5,
  54 + ctcFstDecoderConfig: {
  55 + graph: '',
  56 + maxActive: 3000,
  57 + }
54 }; 58 };
55 59
56 return sherpa_onnx.createOnlineRecognizer(recognizerConfig); 60 return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
  1 +<Project Sdk="Microsoft.NET.Sdk">
  2 +
  3 + <PropertyGroup>
  4 + <OutputType>Exe</OutputType>
  5 + <TargetFramework>net6.0</TargetFramework>
  6 + <RootNamespace>streaming_hlg_decoding</RootNamespace>
  7 + <ImplicitUsings>enable</ImplicitUsings>
  8 + <Nullable>enable</Nullable>
  9 + </PropertyGroup>
  10 +
  11 + <PropertyGroup>
  12 + <RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources>
  13 + </PropertyGroup>
  14 +
  15 + <ItemGroup>
  16 + <PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
  17 + </ItemGroup>
  18 +
  19 +</Project>
@@ -117,6 +117,21 @@ namespace SherpaOnnx @@ -117,6 +117,21 @@ namespace SherpaOnnx
117 } 117 }
118 118
119 [StructLayout(LayoutKind.Sequential)] 119 [StructLayout(LayoutKind.Sequential)]
  120 + public struct OnlineCtcFstDecoderConfig
  121 + {
  122 + public OnlineCtcFstDecoderConfig()
  123 + {
  124 + Graph = "";
  125 + MaxActive = 3000;
  126 + }
  127 +
  128 + [MarshalAs(UnmanagedType.LPStr)]
  129 + public string Graph;
  130 +
  131 + public int MaxActive;
  132 + }
  133 +
  134 + [StructLayout(LayoutKind.Sequential)]
120 public struct OnlineRecognizerConfig 135 public struct OnlineRecognizerConfig
121 { 136 {
122 public OnlineRecognizerConfig() 137 public OnlineRecognizerConfig()
@@ -131,6 +146,7 @@ namespace SherpaOnnx @@ -131,6 +146,7 @@ namespace SherpaOnnx
131 Rule3MinUtteranceLength = 20.0F; 146 Rule3MinUtteranceLength = 20.0F;
132 HotwordsFile = ""; 147 HotwordsFile = "";
133 HotwordsScore = 1.5F; 148 HotwordsScore = 1.5F;
  149 + CtcFstDecoderConfig = new OnlineCtcFstDecoderConfig();
134 } 150 }
135 public FeatureConfig FeatConfig; 151 public FeatureConfig FeatConfig;
136 public OnlineModelConfig ModelConfig; 152 public OnlineModelConfig ModelConfig;
@@ -167,6 +183,8 @@ namespace SherpaOnnx @@ -167,6 +183,8 @@ namespace SherpaOnnx
167 183
168 /// Bonus score for each token in hotwords. 184 /// Bonus score for each token in hotwords.
169 public float HotwordsScore; 185 public float HotwordsScore;
  186 +
  187 + public OnlineCtcFstDecoderConfig CtcFstDecoderConfig;
170 } 188 }
171 189
172 public class OnlineRecognizerResult 190 public class OnlineRecognizerResult
  1 +module streaming-hlg-decoding
  2 +
  3 +go 1.12
  4 +
  5 +replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
  1 +../../../../go-api-examples/streaming-hlg-decoding/main.go
  1 +../../../../go-api-examples/streaming-hlg-decoding/run.sh
@@ -99,6 +99,11 @@ type FeatureConfig struct { @@ -99,6 +99,11 @@ type FeatureConfig struct {
99 FeatureDim int 99 FeatureDim int
100 } 100 }
101 101
  102 +type OnlineCtcFstDecoderConfig struct {
  103 + Graph string
  104 + MaxActive int
  105 +}
  106 +
102 // Configuration for the online/streaming recognizer. 107 // Configuration for the online/streaming recognizer.
103 type OnlineRecognizerConfig struct { 108 type OnlineRecognizerConfig struct {
104 FeatConfig FeatureConfig 109 FeatConfig FeatureConfig
@@ -120,6 +125,7 @@ type OnlineRecognizerConfig struct { @@ -120,6 +125,7 @@ type OnlineRecognizerConfig struct {
120 Rule1MinTrailingSilence float32 125 Rule1MinTrailingSilence float32
121 Rule2MinTrailingSilence float32 126 Rule2MinTrailingSilence float32
122 Rule3MinUtteranceLength float32 127 Rule3MinUtteranceLength float32
  128 + CtcFstDecoderConfig OnlineCtcFstDecoderConfig
123 } 129 }
124 130
125 // It contains the recognition result for a online stream. 131 // It contains the recognition result for a online stream.
@@ -190,6 +196,10 @@ func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer { @@ -190,6 +196,10 @@ func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer {
190 c.rule2_min_trailing_silence = C.float(config.Rule2MinTrailingSilence) 196 c.rule2_min_trailing_silence = C.float(config.Rule2MinTrailingSilence)
191 c.rule3_min_utterance_length = C.float(config.Rule3MinUtteranceLength) 197 c.rule3_min_utterance_length = C.float(config.Rule3MinUtteranceLength)
192 198
  199 + c.ctc_fst_decoder_config.graph = C.CString(config.CtcFstDecoderConfig.Graph)
  200 + defer C.free(unsafe.Pointer(c.ctc_fst_decoder_config.graph))
  201 + c.ctc_fst_decoder_config.max_active = C.int(config.CtcFstDecoderConfig.MaxActive)
  202 +
193 recognizer := &OnlineRecognizer{} 203 recognizer := &OnlineRecognizer{}
194 recognizer.impl = C.CreateOnlineRecognizer(&c) 204 recognizer.impl = C.CreateOnlineRecognizer(&c)
195 205
@@ -99,6 +99,11 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer( @@ -99,6 +99,11 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer(
99 recognizer_config.hotwords_score = 99 recognizer_config.hotwords_score =
100 SHERPA_ONNX_OR(config->hotwords_score, 1.5); 100 SHERPA_ONNX_OR(config->hotwords_score, 1.5);
101 101
  102 + recognizer_config.ctc_fst_decoder_config.graph =
  103 + SHERPA_ONNX_OR(config->ctc_fst_decoder_config.graph, "");
  104 + recognizer_config.ctc_fst_decoder_config.max_active =
  105 + SHERPA_ONNX_OR(config->ctc_fst_decoder_config.max_active, 3000);
  106 +
102 if (config->model_config.debug) { 107 if (config->model_config.debug) {
103 SHERPA_ONNX_LOGE("%s\n", recognizer_config.ToString().c_str()); 108 SHERPA_ONNX_LOGE("%s\n", recognizer_config.ToString().c_str());
104 } 109 }
@@ -96,6 +96,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxFeatureConfig { @@ -96,6 +96,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxFeatureConfig {
96 int32_t feature_dim; 96 int32_t feature_dim;
97 } SherpaOnnxFeatureConfig; 97 } SherpaOnnxFeatureConfig;
98 98
  99 +SHERPA_ONNX_API typedef struct SherpaOnnxOnlineCtcFstDecoderConfig {
  100 + const char *graph;
  101 + int32_t max_active;
  102 +} SherpaOnnxOnlineCtcFstDecoderConfig;
  103 +
99 SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerConfig { 104 SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerConfig {
100 SherpaOnnxFeatureConfig feat_config; 105 SherpaOnnxFeatureConfig feat_config;
101 SherpaOnnxOnlineModelConfig model_config; 106 SherpaOnnxOnlineModelConfig model_config;
@@ -131,6 +136,8 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerConfig { @@ -131,6 +136,8 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerConfig {
131 136
132 /// Bonus score for each token in hotwords. 137 /// Bonus score for each token in hotwords.
133 float hotwords_score; 138 float hotwords_score;
  139 +
  140 + SherpaOnnxOnlineCtcFstDecoderConfig ctc_fst_decoder_config;
134 } SherpaOnnxOnlineRecognizerConfig; 141 } SherpaOnnxOnlineRecognizerConfig;
135 142
136 SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerResult { 143 SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerResult {
@@ -7,3 +7,4 @@ vits-vctk @@ -7,3 +7,4 @@ vits-vctk
7 sherpa-onnx-paraformer-zh-2023-09-14 7 sherpa-onnx-paraformer-zh-2023-09-14
8 !*.sh 8 !*.sh
9 *.bak 9 *.bak
  10 +streaming-hlg-decode-file
@@ -111,6 +111,15 @@ func sherpaOnnxFeatureConfig( @@ -111,6 +111,15 @@ func sherpaOnnxFeatureConfig(
111 feature_dim: Int32(featureDim)) 111 feature_dim: Int32(featureDim))
112 } 112 }
113 113
  114 +func sherpaOnnxOnlineCtcFstDecoderConfig(
  115 + graph: String = "",
  116 + maxActive: Int = 3000
  117 +) -> SherpaOnnxOnlineCtcFstDecoderConfig {
  118 + return SherpaOnnxOnlineCtcFstDecoderConfig(
  119 + graph: toCPointer(graph),
  120 + max_active: Int32(maxActive))
  121 +}
  122 +
114 func sherpaOnnxOnlineRecognizerConfig( 123 func sherpaOnnxOnlineRecognizerConfig(
115 featConfig: SherpaOnnxFeatureConfig, 124 featConfig: SherpaOnnxFeatureConfig,
116 modelConfig: SherpaOnnxOnlineModelConfig, 125 modelConfig: SherpaOnnxOnlineModelConfig,
@@ -121,7 +130,8 @@ func sherpaOnnxOnlineRecognizerConfig( @@ -121,7 +130,8 @@ func sherpaOnnxOnlineRecognizerConfig(
121 decodingMethod: String = "greedy_search", 130 decodingMethod: String = "greedy_search",
122 maxActivePaths: Int = 4, 131 maxActivePaths: Int = 4,
123 hotwordsFile: String = "", 132 hotwordsFile: String = "",
124 - hotwordsScore: Float = 1.5 133 + hotwordsScore: Float = 1.5,
  134 + ctcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig = sherpaOnnxOnlineCtcFstDecoderConfig()
125 ) -> SherpaOnnxOnlineRecognizerConfig { 135 ) -> SherpaOnnxOnlineRecognizerConfig {
126 return SherpaOnnxOnlineRecognizerConfig( 136 return SherpaOnnxOnlineRecognizerConfig(
127 feat_config: featConfig, 137 feat_config: featConfig,
@@ -133,7 +143,9 @@ func sherpaOnnxOnlineRecognizerConfig( @@ -133,7 +143,9 @@ func sherpaOnnxOnlineRecognizerConfig(
133 rule2_min_trailing_silence: rule2MinTrailingSilence, 143 rule2_min_trailing_silence: rule2MinTrailingSilence,
134 rule3_min_utterance_length: rule3MinUtteranceLength, 144 rule3_min_utterance_length: rule3MinUtteranceLength,
135 hotwords_file: toCPointer(hotwordsFile), 145 hotwords_file: toCPointer(hotwordsFile),
136 - hotwords_score: hotwordsScore) 146 + hotwords_score: hotwordsScore,
  147 + ctc_fst_decoder_config: ctcFstDecoderConfig
  148 + )
137 } 149 }
138 150
139 /// Wrapper for recognition result. 151 /// Wrapper for recognition result.
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d ../build-swift-macos ]; then
  6 + echo "Please run ../build-swift-macos.sh first!"
  7 + exit 1
  8 +fi
  9 +
  10 +if [ ! -f ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst ]; then
  11 + echo "Downloading the pre-trained model for testing."
  12 +
  13 + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  14 + tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  15 + rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
  16 +fi
  17 +
  18 +if [ ! -e ./streaming-hlg-decode-file ]; then
  19 + # Note: We use -lc++ to link against libc++ instead of libstdc++
  20 + swiftc \
  21 + -lc++ \
  22 + -I ../build-swift-macos/install/include \
  23 + -import-objc-header ./SherpaOnnx-Bridging-Header.h \
  24 + ./streaming-hlg-decode-file.swift ./SherpaOnnx.swift \
  25 + -L ../build-swift-macos/install/lib/ \
  26 + -l sherpa-onnx \
  27 + -l onnxruntime \
  28 + -o streaming-hlg-decode-file
  29 +
  30 + strip ./streaming-hlg-decode-file
  31 +else
  32 + echo "./streaming-hlg-decode-file exists - skip building"
  33 +fi
  34 +
  35 +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
  36 +./streaming-hlg-decode-file
  1 +import AVFoundation
  2 +
  3 +extension AudioBuffer {
  4 + func array() -> [Float] {
  5 + return Array(UnsafeBufferPointer(self))
  6 + }
  7 +}
  8 +
  9 +extension AVAudioPCMBuffer {
  10 + func array() -> [Float] {
  11 + return self.audioBufferList.pointee.mBuffers.array()
  12 + }
  13 +}
  14 +
  15 +func run() {
  16 + let filePath =
  17 + "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav"
  18 + let model =
  19 + "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx"
  20 + let tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt"
  21 + let zipfomer2CtcModelConfig = sherpaOnnxOnlineZipformer2CtcModelConfig(
  22 + model: model
  23 + )
  24 +
  25 + let modelConfig = sherpaOnnxOnlineModelConfig(
  26 + tokens: tokens,
  27 + zipformer2Ctc: zipfomer2CtcModelConfig
  28 + )
  29 +
  30 + let featConfig = sherpaOnnxFeatureConfig(
  31 + sampleRate: 16000,
  32 + featureDim: 80
  33 + )
  34 +
  35 + let ctcFstDecoderConfig = sherpaOnnxOnlineCtcFstDecoderConfig(
  36 + graph: "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst",
  37 + maxActive: 3000
  38 + )
  39 +
  40 + var config = sherpaOnnxOnlineRecognizerConfig(
  41 + featConfig: featConfig,
  42 + modelConfig: modelConfig,
  43 + ctcFstDecoderConfig: ctcFstDecoderConfig
  44 + )
  45 +
  46 + let recognizer = SherpaOnnxRecognizer(config: &config)
  47 +
  48 + let fileURL: NSURL = NSURL(fileURLWithPath: filePath)
  49 + let audioFile = try! AVAudioFile(forReading: fileURL as URL)
  50 +
  51 + let audioFormat = audioFile.processingFormat
  52 + assert(audioFormat.channelCount == 1)
  53 + assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)
  54 +
  55 + let audioFrameCount = UInt32(audioFile.length)
  56 + let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)
  57 +
  58 + try! audioFile.read(into: audioFileBuffer!)
  59 + let array: [Float]! = audioFileBuffer?.array()
  60 + recognizer.acceptWaveform(samples: array, sampleRate: Int(audioFormat.sampleRate))
  61 +
  62 + let tailPadding = [Float](repeating: 0.0, count: 3200)
  63 + recognizer.acceptWaveform(samples: tailPadding, sampleRate: Int(audioFormat.sampleRate))
  64 +
  65 + recognizer.inputFinished()
  66 + while recognizer.isReady() {
  67 + recognizer.decode()
  68 + }
  69 +
  70 + let result = recognizer.getResult()
  71 + print("\nresult is:\n\(result.text)")
  72 +}
  73 +
  74 +@main
  75 +struct App {
  76 + static func main() {
  77 + run()
  78 + }
  79 +}
@@ -43,6 +43,10 @@ function freeConfig(config, Module) { @@ -43,6 +43,10 @@ function freeConfig(config, Module) {
43 freeConfig(config.lm, Module) 43 freeConfig(config.lm, Module)
44 } 44 }
45 45
  46 + if ('ctcFstDecoder' in config) {
  47 + freeConfig(config.ctcFstDecoder, Module)
  48 + }
  49 +
46 Module._free(config.ptr); 50 Module._free(config.ptr);
47 } 51 }
48 52
@@ -193,11 +197,26 @@ function initSherpaOnnxFeatureConfig(config, Module) { @@ -193,11 +197,26 @@ function initSherpaOnnxFeatureConfig(config, Module) {
193 return {ptr: ptr, len: len}; 197 return {ptr: ptr, len: len};
194 } 198 }
195 199
  200 +function initSherpaOnnxOnlineCtcFstDecoderConfig(config, Module) {
  201 + const len = 2 * 4;
  202 + const ptr = Module._malloc(len);
  203 +
  204 + const graphLen = Module.lengthBytesUTF8(config.graph) + 1;
  205 + const buffer = Module._malloc(graphLen);
  206 + Module.stringToUTF8(config.graph, buffer, graphLen);
  207 +
  208 + Module.setValue(ptr, buffer, 'i8*');
  209 + Module.setValue(ptr + 4, config.maxActive, 'i32');
  210 + return {ptr: ptr, len: len, buffer: buffer};
  211 +}
  212 +
196 function initSherpaOnnxOnlineRecognizerConfig(config, Module) { 213 function initSherpaOnnxOnlineRecognizerConfig(config, Module) {
197 const feat = initSherpaOnnxFeatureConfig(config.featConfig, Module); 214 const feat = initSherpaOnnxFeatureConfig(config.featConfig, Module);
198 const model = initSherpaOnnxOnlineModelConfig(config.modelConfig, Module); 215 const model = initSherpaOnnxOnlineModelConfig(config.modelConfig, Module);
  216 + const ctcFstDecoder = initSherpaOnnxOnlineCtcFstDecoderConfig(
  217 + config.ctcFstDecoderConfig, Module)
199 218
200 - const len = feat.len + model.len + 8 * 4; 219 + const len = feat.len + model.len + 8 * 4 + ctcFstDecoder.len;
201 const ptr = Module._malloc(len); 220 const ptr = Module._malloc(len);
202 221
203 let offset = 0; 222 let offset = 0;
@@ -243,8 +262,11 @@ function initSherpaOnnxOnlineRecognizerConfig(config, Module) { @@ -243,8 +262,11 @@ function initSherpaOnnxOnlineRecognizerConfig(config, Module) {
243 Module.setValue(ptr + offset, config.hotwordsScore, 'float'); 262 Module.setValue(ptr + offset, config.hotwordsScore, 'float');
244 offset += 4; 263 offset += 4;
245 264
  265 + Module._CopyHeap(ctcFstDecoder.ptr, ctcFstDecoder.len, ptr + offset);
  266 +
246 return { 267 return {
247 - buffer: buffer, ptr: ptr, len: len, feat: feat, model: model 268 + buffer: buffer, ptr: ptr, len: len, feat: feat, model: model,
  269 + ctcFstDecoder: ctcFstDecoder
248 } 270 }
249 } 271 }
250 272
@@ -313,6 +335,10 @@ function createOnlineRecognizer(Module, myConfig) { @@ -313,6 +335,10 @@ function createOnlineRecognizer(Module, myConfig) {
313 rule3MinUtteranceLength: 20, 335 rule3MinUtteranceLength: 20,
314 hotwordsFile: '', 336 hotwordsFile: '',
315 hotwordsScore: 1.5, 337 hotwordsScore: 1.5,
  338 + ctcFstDecoderConfig: {
  339 + graph: '',
  340 + maxActive: 3000,
  341 + }
316 }; 342 };
317 if (myConfig) { 343 if (myConfig) {
318 recognizerConfig = myConfig; 344 recognizerConfig = myConfig;
@@ -22,9 +22,11 @@ static_assert(sizeof(SherpaOnnxOnlineModelConfig) == @@ -22,9 +22,11 @@ static_assert(sizeof(SherpaOnnxOnlineModelConfig) ==
22 sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 5 * 4, 22 sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 5 * 4,
23 ""); 23 "");
24 static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); 24 static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
  25 +static_assert(sizeof(SherpaOnnxOnlineCtcFstDecoderConfig) == 2 * 4, "");
25 static_assert(sizeof(SherpaOnnxOnlineRecognizerConfig) == 26 static_assert(sizeof(SherpaOnnxOnlineRecognizerConfig) ==
26 sizeof(SherpaOnnxFeatureConfig) + 27 sizeof(SherpaOnnxFeatureConfig) +
27 - sizeof(SherpaOnnxOnlineModelConfig) + 8 * 4, 28 + sizeof(SherpaOnnxOnlineModelConfig) + 8 * 4 +
  29 + sizeof(SherpaOnnxOnlineCtcFstDecoderConfig),
28 ""); 30 "");
29 31
30 void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) { 32 void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) {
@@ -67,6 +69,11 @@ void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) { @@ -67,6 +69,11 @@ void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) {
67 config->rule3_min_utterance_length); 69 config->rule3_min_utterance_length);
68 fprintf(stdout, "hotwords_file: %s\n", config->hotwords_file); 70 fprintf(stdout, "hotwords_file: %s\n", config->hotwords_file);
69 fprintf(stdout, "hotwords_score: %.2f\n", config->hotwords_score); 71 fprintf(stdout, "hotwords_score: %.2f\n", config->hotwords_score);
  72 +
  73 + fprintf(stdout, "----------ctc fst decoder config----------\n");
  74 + fprintf(stdout, "graph: %s\n", config->ctc_fst_decoder_config.graph);
  75 + fprintf(stdout, "max_active: %d\n",
  76 + config->ctc_fst_decoder_config.max_active);
70 } 77 }
71 78
72 void CopyHeap(const char *src, int32_t num_bytes, char *dst) { 79 void CopyHeap(const char *src, int32_t num_bytes, char *dst) {