Real-time speech recognition from microphone for .NET (#192)

* Real-time speech recognition from a microphone for .NET, supporting Linux, macOS, and Windows
* Use PortAudioSharp2. We will maintain it ourselves; the project is at https://github.com/csukuangfj/PortAudioSharp2
* Minor fixes
Showing 5 changed files with 279 additions and 1 deletion.
The solution file gains the new project:

```diff
@@ -7,6 +7,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "online-decode-files", "onli
 EndProject
 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-decode-files", "offline-decode-files\offline-decode-files.csproj", "{2DAB152C-9E24-47A0-9DB0-781297ECE458}"
 EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "speech-recognition-from-microphone", "speech-recognition-from-microphone\speech-recognition-from-microphone.csproj", "{FE4EA1FF-062A-46B3-B78D-C828FED7B82E}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -24,5 +26,9 @@ Global
 		{2DAB152C-9E24-47A0-9DB0-781297ECE458}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{2DAB152C-9E24-47A0-9DB0-781297ECE458}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{2DAB152C-9E24-47A0-9DB0-781297ECE458}.Release|Any CPU.Build.0 = Release|Any CPU
+		{FE4EA1FF-062A-46B3-B78D-C828FED7B82E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{FE4EA1FF-062A-46B3-B78D-C828FED7B82E}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{FE4EA1FF-062A-46B3-B78D-C828FED7B82E}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{FE4EA1FF-062A-46B3-B78D-C828FED7B82E}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 EndGlobal
```
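These entries register the new project with the solution so it builds in both Debug and Release configurations. The same entries can be generated with `dotnet sln add speech-recognition-from-microphone/speech-recognition-from-microphone.csproj`, run from the directory containing the solution file.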
The new console program in speech-recognition-from-microphone (the class is named after the example here; minor typos in the option help text are fixed and a duplicated segment-index variable is removed):

```csharp
// Copyright (c) 2023 Xiaomi Corporation
//
// This file shows how to use a streaming model for real-time speech
// recognition from a microphone.
// Please refer to
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html
// to download streaming models.

using CommandLine;
using CommandLine.Text;
using PortAudioSharp;
using SherpaOnnx;
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Threading;

class SpeechRecognitionFromMicrophone
{
  class Options
  {
    [Option(Required = true, HelpText = "Path to tokens.txt")]
    public string Tokens { get; set; }

    [Option(Required = false, Default = "cpu", HelpText = "Provider, e.g., cpu, coreml")]
    public string Provider { get; set; }

    [Option(Required = true, HelpText = "Path to encoder.onnx")]
    public string Encoder { get; set; }

    [Option(Required = true, HelpText = "Path to decoder.onnx")]
    public string Decoder { get; set; }

    [Option(Required = true, HelpText = "Path to joiner.onnx")]
    public string Joiner { get; set; }

    [Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")]
    public int NumThreads { get; set; }

    [Option("decoding-method", Required = false, Default = "greedy_search",
        HelpText = "Valid decoding methods are: greedy_search, modified_beam_search")]
    public string DecodingMethod { get; set; }

    [Option(Required = false, Default = false, HelpText = "True to show model info during loading")]
    public bool Debug { get; set; }

    [Option("sample-rate", Required = false, Default = 16000, HelpText = "Sample rate of the data used to train the model")]
    public int SampleRate { get; set; }

    [Option("max-active-paths", Required = false, Default = 4,
        HelpText = @"Used only when --decoding-method is modified_beam_search.
It specifies the number of active paths to keep during the search")]
    public int MaxActivePaths { get; set; }

    [Option("enable-endpoint", Required = false, Default = true,
        HelpText = "True to enable endpoint detection.")]
    public bool EnableEndpoint { get; set; }

    [Option("rule1-min-trailing-silence", Required = false, Default = 2.4F,
        HelpText = @"An endpoint is detected if trailing silence in seconds is
larger than this value even if nothing has been decoded. Used only when --enable-endpoint is true.")]
    public float Rule1MinTrailingSilence { get; set; }

    [Option("rule2-min-trailing-silence", Required = false, Default = 0.8F,
        HelpText = @"An endpoint is detected if trailing silence in seconds is
larger than this value after something that is not blank has been decoded. Used
only when --enable-endpoint is true.")]
    public float Rule2MinTrailingSilence { get; set; }

    [Option("rule3-min-utterance-length", Required = false, Default = 20.0F,
        HelpText = @"An endpoint is detected if the utterance in seconds is
larger than this value. Used only when --enable-endpoint is true.")]
    public float Rule3MinUtteranceLength { get; set; }
  }

  static void Main(string[] args)
  {
    var parser = new CommandLine.Parser(with => with.HelpWriter = null);
    var parserResult = parser.ParseArguments<Options>(args);

    parserResult
      .WithParsed<Options>(options => Run(options))
      .WithNotParsed(errs => DisplayHelp(parserResult, errs));
  }

  private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
  {
    string usage = @"
dotnet run -c Release \
  --tokens ./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt \
  --encoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx \
  --decoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx \
  --joiner ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx

Please refer to
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
to download pre-trained streaming models.
";

    var helpText = HelpText.AutoBuild(result, h =>
    {
      h.AdditionalNewLineAfterOption = false;
      h.Heading = usage;
      h.Copyright = "Copyright (c) 2023 Xiaomi Corporation";
      return HelpText.DefaultParsingErrorsHandler(result, h);
    }, e => e);
    Console.WriteLine(helpText);
  }

  private static void Run(Options options)
  {
    var config = new OnlineRecognizerConfig();
    config.FeatConfig.SampleRate = options.SampleRate;

    // All models from icefall use a feature dim of 80.
    // Change it if your model has a different feature dim.
    config.FeatConfig.FeatureDim = 80;

    config.TransducerModelConfig.Encoder = options.Encoder;
    config.TransducerModelConfig.Decoder = options.Decoder;
    config.TransducerModelConfig.Joiner = options.Joiner;
    config.TransducerModelConfig.Tokens = options.Tokens;
    config.TransducerModelConfig.Provider = options.Provider;
    config.TransducerModelConfig.NumThreads = options.NumThreads;
    config.TransducerModelConfig.Debug = options.Debug ? 1 : 0;

    config.DecodingMethod = options.DecodingMethod;
    config.MaxActivePaths = options.MaxActivePaths;
    config.EnableEndpoint = options.EnableEndpoint ? 1 : 0;

    config.Rule1MinTrailingSilence = options.Rule1MinTrailingSilence;
    config.Rule2MinTrailingSilence = options.Rule2MinTrailingSilence;
    config.Rule3MinUtteranceLength = options.Rule3MinUtteranceLength;

    var recognizer = new OnlineRecognizer(config);
    OnlineStream s = recognizer.CreateStream();

    Console.WriteLine(PortAudio.VersionInfo.versionText);
    PortAudio.Initialize();

    Console.WriteLine($"Number of devices: {PortAudio.DeviceCount}");
    for (int i = 0; i != PortAudio.DeviceCount; ++i)
    {
      Console.WriteLine($" Device {i}");
      DeviceInfo deviceInfo = PortAudio.GetDeviceInfo(i);
      Console.WriteLine($"   Name: {deviceInfo.name}");
      Console.WriteLine($"   Max input channels: {deviceInfo.maxInputChannels}");
      Console.WriteLine($"   Default sample rate: {deviceInfo.defaultSampleRate}");
    }

    int deviceIndex = PortAudio.DefaultInputDevice;
    if (deviceIndex == PortAudio.NoDevice)
    {
      Console.WriteLine("No default input device found");
      Environment.Exit(1);
    }

    DeviceInfo info = PortAudio.GetDeviceInfo(deviceIndex);

    Console.WriteLine();
    Console.WriteLine($"Use default device {deviceIndex} ({info.name})");

    var param = new StreamParameters();
    param.device = deviceIndex;
    param.channelCount = 1;
    param.sampleFormat = SampleFormat.Float32;
    param.suggestedLatency = info.defaultLowInputLatency;
    param.hostApiSpecificStreamInfo = IntPtr.Zero;

    // The callback runs on PortAudio's audio thread. It copies the captured
    // samples out of unmanaged memory and feeds them to the online stream.
    PortAudioSharp.Stream.Callback callback = (IntPtr input, IntPtr output,
        UInt32 frameCount,
        ref StreamCallbackTimeInfo timeInfo,
        StreamCallbackFlags statusFlags,
        IntPtr userData
        ) =>
    {
      float[] samples = new float[frameCount];
      Marshal.Copy(input, samples, 0, (Int32)frameCount);

      s.AcceptWaveform(options.SampleRate, samples);

      return StreamCallbackResult.Continue;
    };

    var stream = new PortAudioSharp.Stream(inParams: param, outParams: null,
        sampleRate: options.SampleRate,
        framesPerBuffer: 0,
        streamFlags: StreamFlags.ClipOff,
        callback: callback,
        userData: IntPtr.Zero
        );

    Console.WriteLine(param);

    stream.Start();

    string lastText = "";
    int segmentIndex = 0;

    while (true)
    {
      while (recognizer.IsReady(s))
      {
        recognizer.Decode(s);
      }

      var text = recognizer.GetResult(s).Text;
      bool isEndpoint = recognizer.IsEndpoint(s);
      if (!string.IsNullOrWhiteSpace(text) && lastText != text)
      {
        lastText = text;
        // \r overwrites the current line so partial results update in place
        Console.Write($"\r{segmentIndex}: {lastText}");
      }

      if (isEndpoint)
      {
        if (!string.IsNullOrWhiteSpace(text))
        {
          ++segmentIndex;
          Console.WriteLine();
        }
        recognizer.Reset(s);
      }

      Thread.Sleep(200); // ms
    }

    // Not reached: the loop above runs until the process is killed.
    PortAudio.Terminate();
  }
}
```
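The audio callback and the main loop run on different threads: PortAudio invokes the callback on its own audio thread, which copies each buffer into managed memory and pushes it into the `OnlineStream`, while the main thread wakes every 200 ms, drains whatever is ready with `recognizer.Decode`, and repaints the current partial result. With the default endpoint rules, a segment ends after 0.8 s of trailing silence once something has been decoded (rule 2), after 2.4 s of silence when nothing has been decoded yet (rule 1), or when an utterance exceeds 20 s (rule 3).

If you would rather keep third-party calls out of the real-time audio callback, one common alternative is a producer-consumer queue. The sketch below is illustrative only and not part of this PR; `CaptureQueueSketch`, `OnSamples`, and `Drain` are made-up names, and it assumes you call `Drain` from the main loop right before decoding:

```csharp
using System;
using System.Collections.Concurrent;

// Sketch: buffer microphone samples on the audio thread, hand them to the
// recognizer on the main thread. Illustrative only; not part of this PR.
class CaptureQueueSketch
{
    private readonly BlockingCollection<float[]> _queue = new();

    // Call from the PortAudio callback after Marshal.Copy:
    public void OnSamples(float[] samples) => _queue.Add(samples);

    // Call from the main loop before recognizer.Decode(s):
    public void Drain(Action<float[]> acceptWaveform)
    {
        // Take everything currently queued without blocking.
        while (_queue.TryTake(out var chunk))
        {
            acceptWaveform(chunk); // e.g. chunk => s.AcceptWaveform(options.SampleRate, chunk)
        }
    }
}
```

Whether calling `AcceptWaveform` directly from the callback is safe depends on sherpa-onnx's thread-safety guarantees; the committed code above takes the direct route.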
The accompanying shell script downloads a pre-trained streaming model and runs the example:

```bash
#!/usr/bin/env bash

# Please refer to
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english
# to download the model files
#
export LD_LIBRARY_PATH=$PWD:$LD_LIBRARY_PATH
export DYLD_LIBRARY_PATH=$PWD:$DYLD_LIBRARY_PATH

if [ ! -d ./icefall-asr-zipformer-streaming-wenetspeech-20230615 ]; then
  GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615
  cd icefall-asr-zipformer-streaming-wenetspeech-20230615
  git lfs pull --include "*.onnx"
  cd ..
fi

dotnet run -c Release \
  --tokens ./icefall-asr-zipformer-streaming-wenetspeech-20230615/data/lang_char/tokens.txt \
  --encoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx \
  --decoder ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx \
  --joiner ./icefall-asr-zipformer-streaming-wenetspeech-20230615/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx
```
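`GIT_LFS_SKIP_SMUDGE=1` makes the initial clone skip the large LFS blobs; the subsequent `git lfs pull --include "*.onnx"` then downloads only the model files, which keeps the transfer small. The `LD_LIBRARY_PATH`/`DYLD_LIBRARY_PATH` exports are presumably there so the runtime can locate the native sherpa-onnx and PortAudio libraries from the current directory on Linux and macOS, respectively.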
New file dotnet-examples/speech-recognition-from-microphone/speech-recognition-from-microphone.csproj:
```xml
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
    <RootNamespace>speech_recognition_from_microphone</RootNamespace>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="CommandLineParser" Version="2.9.1" />
    <PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
    <PackageReference Include="PortAudioSharp2" Version="*" />
  </ItemGroup>

</Project>
```
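Note that `org.k2fsa.sherpa.onnx` and `PortAudioSharp2` use floating versions (`Version="*"`), so `dotnet restore` resolves them to the newest version available on the feed; pin exact versions if you need reproducible builds. With `<ImplicitUsings>enable</ImplicitUsings>`, several of the explicit `using` directives in the program (`System`, `System.Threading`, `System.Collections.Generic`) are already implied by the SDK, though keeping them is harmless.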