Fangjun Kuang
Committed by GitHub

Real-time speech recognition from microphone for .Net (#192)

* Real-time speech recognition from microphone for .NET, supporting Linux, macOS, and Windows

* Use PortAudioSharp2.

We will maintain it ourselves. The project is at
https://github.com/csukuangfj/PortAudioSharp2

* minor fixes
1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
2 project(sherpa-onnx) 2 project(sherpa-onnx)
3 3
4 -set(SHERPA_ONNX_VERSION "1.4.6") 4 +set(SHERPA_ONNX_VERSION "1.4.7")
5 5
6 # Disable warning about 6 # Disable warning about
7 # 7 #
@@ -7,6 +7,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "online-decode-files", "onli @@ -7,6 +7,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "online-decode-files", "onli
7 EndProject 7 EndProject
8 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-decode-files", "offline-decode-files\offline-decode-files.csproj", "{2DAB152C-9E24-47A0-9DB0-781297ECE458}" 8 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-decode-files", "offline-decode-files\offline-decode-files.csproj", "{2DAB152C-9E24-47A0-9DB0-781297ECE458}"
9 EndProject 9 EndProject
  10 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "speech-recognition-from-microphone", "speech-recognition-from-microphone\speech-recognition-from-microphone.csproj", "{FE4EA1FF-062A-46B3-B78D-C828FED7B82E}"
  11 +EndProject
10 Global 12 Global
11 GlobalSection(SolutionConfigurationPlatforms) = preSolution 13 GlobalSection(SolutionConfigurationPlatforms) = preSolution
12 Debug|Any CPU = Debug|Any CPU 14 Debug|Any CPU = Debug|Any CPU
@@ -24,5 +26,9 @@ Global @@ -24,5 +26,9 @@ Global
24 {2DAB152C-9E24-47A0-9DB0-781297ECE458}.Debug|Any CPU.Build.0 = Debug|Any CPU 26 {2DAB152C-9E24-47A0-9DB0-781297ECE458}.Debug|Any CPU.Build.0 = Debug|Any CPU
25 {2DAB152C-9E24-47A0-9DB0-781297ECE458}.Release|Any CPU.ActiveCfg = Release|Any CPU 27 {2DAB152C-9E24-47A0-9DB0-781297ECE458}.Release|Any CPU.ActiveCfg = Release|Any CPU
26 {2DAB152C-9E24-47A0-9DB0-781297ECE458}.Release|Any CPU.Build.0 = Release|Any CPU 28 {2DAB152C-9E24-47A0-9DB0-781297ECE458}.Release|Any CPU.Build.0 = Release|Any CPU
  29 + {FE4EA1FF-062A-46B3-B78D-C828FED7B82E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
  30 + {FE4EA1FF-062A-46B3-B78D-C828FED7B82E}.Debug|Any CPU.Build.0 = Debug|Any CPU
  31 + {FE4EA1FF-062A-46B3-B78D-C828FED7B82E}.Release|Any CPU.ActiveCfg = Release|Any CPU
  32 + {FE4EA1FF-062A-46B3-B78D-C828FED7B82E}.Release|Any CPU.Build.0 = Release|Any CPU
27 EndGlobalSection 33 EndGlobalSection
28 EndGlobal 34 EndGlobal
  1 +// Copyright (c) 2023 Xiaomi Corporation
  2 +//
  3 +// This file shows how to use a streaming model for real-time speech
  4 +// recognition from a microphone.
  5 +// Please refer to
  6 +// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html
  7 +// to download streaming models
  8 +
  9 +using CommandLine.Text;
  10 +using CommandLine;
  11 +using PortAudioSharp;
  12 +using System.Threading;
  13 +using SherpaOnnx;
  14 +using System.Collections.Generic;
  15 +using System.Runtime.InteropServices;
  16 +using System;
  17 +
  18 +
// Entry point for real-time speech recognition from a microphone using a
// streaming (online) transducer model.
//
// NOTE(review): the class name was inherited from the online-decode-files
// example this program was derived from; it is kept unchanged for
// backward compatibility.
class OnlineDecodeFiles
{
    // Command-line options. All string options that are Required are
    // initialized to "" only to satisfy nullable-reference analysis
    // (CS8618); CommandLineParser always assigns them before Run() is
    // called, so behavior is unchanged.
    class Options
    {
        [Option(Required = true, HelpText = "Path to tokens.txt")]
        public string Tokens { get; set; } = "";

        [Option(Required = false, Default = "cpu", HelpText = "Provider, e.g., cpu, coreml")]
        public string Provider { get; set; } = "cpu";

        [Option(Required = true, HelpText = "Path to encoder.onnx")]
        public string Encoder { get; set; } = "";

        [Option(Required = true, HelpText = "Path to decoder.onnx")]
        public string Decoder { get; set; } = "";

        [Option(Required = true, HelpText = "Path to joiner.onnx")]
        public string Joiner { get; set; } = "";

        [Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")]
        public int NumThreads { get; set; }

        // Fixed help-text typo: the actual option is --decoding-method.
        [Option("decoding-method", Required = false, Default = "greedy_search",
            HelpText = "Valid decoding methods are: greedy_search, modified_beam_search")]
        public string DecodingMethod { get; set; } = "greedy_search";

        [Option(Required = false, Default = false, HelpText = "True to show model info during loading")]
        public bool Debug { get; set; }

        [Option("sample-rate", Required = false, Default = 16000, HelpText = "Sample rate of the data used to train the model")]
        public int SampleRate { get; set; }

        [Option("max-active-paths", Required = false, Default = 4,
            HelpText = @"Used only when --decoding-method is modified_beam_search.
It specifies number of active paths to keep during the search")]
        public int MaxActivePaths { get; set; }

        [Option("enable-endpoint", Required = false, Default = true,
            HelpText = "True to enable endpoint detection.")]
        public bool EnableEndpoint { get; set; }

        [Option("rule1-min-trailing-silence", Required = false, Default = 2.4F,
            HelpText = @"An endpoint is detected if trailing silence in seconds is
larger than this value even if nothing has been decoded. Used only when --enable-endpoint is true.")]
        public float Rule1MinTrailingSilence { get; set; }

        [Option("rule2-min-trailing-silence", Required = false, Default = 0.8F,
            HelpText = @"An endpoint is detected if trailing silence in seconds is
larger than this value after something that is not blank has been decoded. Used
only when --enable-endpoint is true.")]
        public float Rule2MinTrailingSilence { get; set; }

        [Option("rule3-min-utterance-length", Required = false, Default = 20.0F,
            HelpText = @"An endpoint is detected if the utterance in seconds is
larger than this value. Used only when --enable-endpoint is true.")]
        public float Rule3MinUtteranceLength { get; set; }
    }

    // Parses the command line and dispatches to Run(); prints help on error.
    static void Main(string[] args)
    {
        var parser = new CommandLine.Parser(with => with.HelpWriter = null);
        var parserResult = parser.ParseArguments<Options>(args);

        parserResult
          .WithParsed<Options>(options => Run(options))
          .WithNotParsed(errs => DisplayHelp(parserResult, errs));
    }

    // Prints the usage banner plus auto-generated option help.
    private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
    {
        string usage = @"
dotnet run -c Release \
  --tokens ./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt \
  --encoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx \
  --decoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx \
  --joiner ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx \

Please refer to
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
to download pre-trained streaming models.
";

        var helpText = HelpText.AutoBuild(result, h =>
        {
            h.AdditionalNewLineAfterOption = false;
            h.Heading = usage;
            h.Copyright = "Copyright (c) 2023 Xiaomi Corporation";
            return HelpText.DefaultParsingErrorsHandler(result, h);
        }, e => e);
        Console.WriteLine(helpText);
    }

    // Opens the default input device, feeds captured audio to the
    // recognizer, and prints incremental results until Ctrl-C is pressed.
    private static void Run(Options options)
    {
        OnlineRecognizerConfig config = new OnlineRecognizerConfig();
        config.FeatConfig.SampleRate = options.SampleRate;

        // All models from icefall using feature dim 80.
        // You can change it if your model has a different feature dim.
        config.FeatConfig.FeatureDim = 80;

        config.TransducerModelConfig.Encoder = options.Encoder;
        config.TransducerModelConfig.Decoder = options.Decoder;
        config.TransducerModelConfig.Joiner = options.Joiner;
        config.TransducerModelConfig.Tokens = options.Tokens;
        config.TransducerModelConfig.Provider = options.Provider;
        config.TransducerModelConfig.NumThreads = options.NumThreads;
        config.TransducerModelConfig.Debug = options.Debug ? 1 : 0;

        config.DecodingMethod = options.DecodingMethod;
        config.MaxActivePaths = options.MaxActivePaths;
        config.EnableEndpoint = options.EnableEndpoint ? 1 : 0;

        config.Rule1MinTrailingSilence = options.Rule1MinTrailingSilence;
        config.Rule2MinTrailingSilence = options.Rule2MinTrailingSilence;
        config.Rule3MinUtteranceLength = options.Rule3MinUtteranceLength;

        OnlineRecognizer recognizer = new OnlineRecognizer(config);

        OnlineStream s = recognizer.CreateStream();

        Console.WriteLine(PortAudio.VersionInfo.versionText);
        PortAudio.Initialize();

        // List all devices to help the user diagnose input problems.
        Console.WriteLine($"Number of devices: {PortAudio.DeviceCount}");
        for (int i = 0; i != PortAudio.DeviceCount; ++i)
        {
            Console.WriteLine($" Device {i}");
            DeviceInfo deviceInfo = PortAudio.GetDeviceInfo(i);
            Console.WriteLine($"   Name: {deviceInfo.name}");
            Console.WriteLine($"   Max input channels: {deviceInfo.maxInputChannels}");
            Console.WriteLine($"   Default sample rate: {deviceInfo.defaultSampleRate}");
        }
        int deviceIndex = PortAudio.DefaultInputDevice;
        if (deviceIndex == PortAudio.NoDevice)
        {
            Console.WriteLine("No default input device found");
            Environment.Exit(1);
        }

        DeviceInfo info = PortAudio.GetDeviceInfo(deviceIndex);

        Console.WriteLine();
        Console.WriteLine($"Use default device {deviceIndex} ({info.name})");

        StreamParameters param = new StreamParameters();
        param.device = deviceIndex;
        param.channelCount = 1;  // mono capture
        param.sampleFormat = SampleFormat.Float32;
        param.suggestedLatency = info.defaultLowInputLatency;
        param.hostApiSpecificStreamInfo = IntPtr.Zero;

        // Invoked on PortAudio's audio thread; copies the captured frames
        // out of native memory and hands them to the recognizer stream.
        PortAudioSharp.Stream.Callback callback = (IntPtr input, IntPtr output,
            UInt32 frameCount,
            ref StreamCallbackTimeInfo timeInfo,
            StreamCallbackFlags statusFlags,
            IntPtr userData
            ) =>
        {
            float[] samples = new float[frameCount];
            Marshal.Copy(input, samples, 0, (Int32)frameCount);

            s.AcceptWaveform(options.SampleRate, samples);

            return StreamCallbackResult.Continue;
        };

        PortAudioSharp.Stream stream = new PortAudioSharp.Stream(inParams: param, outParams: null, sampleRate: options.SampleRate,
            framesPerBuffer: 0,
            streamFlags: StreamFlags.ClipOff,
            callback: callback,
            userData: IntPtr.Zero
            );

        Console.WriteLine(param);

        stream.Start();

        // Fixed: the original declared an unused duplicate counter
        // (segment_index) and had unreachable cleanup after `while (true)`.
        // A Ctrl-C handler now ends the loop so the stream and PortAudio
        // are shut down properly.
        bool keepRunning = true;
        Console.CancelKeyPress += (object? sender, ConsoleCancelEventArgs e) =>
        {
            e.Cancel = true;  // suppress immediate process kill; exit loop instead
            keepRunning = false;
        };

        string lastText = "";
        int segmentIndex = 0;

        while (keepRunning)
        {
            // Drain all frames the recognizer is ready to decode.
            while (recognizer.IsReady(s))
            {
                recognizer.Decode(s);
            }

            var text = recognizer.GetResult(s).Text;
            bool isEndpoint = recognizer.IsEndpoint(s);
            if (!string.IsNullOrWhiteSpace(text) && lastText != text)
            {
                lastText = text;
                // \r rewrites the current line with the growing partial result.
                Console.Write($"\r{segmentIndex}: {lastText}");
            }

            if (isEndpoint)
            {
                if (!string.IsNullOrWhiteSpace(text))
                {
                    ++segmentIndex;
                    Console.WriteLine();
                }
                recognizer.Reset(s);
            }

            Thread.Sleep(200); // ms
        }

        // Reached when the user presses Ctrl-C.
        stream.Stop();
        stream.Dispose();
        PortAudio.Terminate();
    }
}
#!/usr/bin/env bash

# Downloads a streaming zipformer model (if not already present) and runs the
# real-time microphone recognition example against it.
#
# Please refer to
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english
# to download the model files

# Abort on the first failing command so a failed clone does not lead to
# `git lfs pull` running in the wrong directory (ShellCheck SC2164).
set -e

# Make the native sherpa-onnx libraries next to this script discoverable at
# runtime on Linux (LD_LIBRARY_PATH) and macOS (DYLD_LIBRARY_PATH).
export LD_LIBRARY_PATH=$PWD:$LD_LIBRARY_PATH
export DYLD_LIBRARY_PATH=$PWD:$DYLD_LIBRARY_PATH

if [ ! -d ./icefall-asr-zipformer-streaming-wenetspeech-20230615 ]; then
  # Skip the LFS smudge during clone, then fetch only the .onnx payloads.
  GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615
  cd icefall-asr-zipformer-streaming-wenetspeech-20230615 || exit 1
  git lfs pull --include "*.onnx"
  cd ..
fi

dotnet run -c Release \
  --tokens ./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt \
  --encoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx \
  --decoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx \
  --joiner ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx
<!-- Project file for the real-time microphone speech-recognition example. -->
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
    <RootNamespace>speech_recognition_from_microphone</RootNamespace>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="CommandLineParser" Version="2.9.1" />
    <!-- NOTE(review): wildcard versions always restore the latest published
         package, which makes builds non-reproducible; presumably intentional
         so the example tracks current sherpa-onnx releases - confirm. -->
    <PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
    <PackageReference Include="PortAudioSharp2" Version="*" />
  </ItemGroup>

</Project>