Committed by
GitHub
Add C# API for spoken language identification (#697)
正在显示
10 个修改的文件
包含
265 行增加
和
55 行删除
.github/scripts/test-dot-net.sh
0 → 100755
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +cd dotnet-examples/ | ||
| 4 | + | ||
| 5 | +cd spoken-language-identification | ||
| 6 | +./run.sh | ||
| 7 | + | ||
| 8 | +cd ../online-decode-files | ||
| 9 | +./run-zipformer2-ctc.sh | ||
| 10 | +./run-transducer.sh | ||
| 11 | +./run-paraformer.sh | ||
| 12 | + | ||
| 13 | +cd ../offline-decode-files | ||
| 14 | +./run-nemo-ctc.sh | ||
| 15 | +./run-paraformer.sh | ||
| 16 | +./run-zipformer.sh | ||
| 17 | +./run-hotwords.sh | ||
| 18 | +./run-whisper.sh | ||
| 19 | +./run-tdnn-yesno.sh | ||
| 20 | + | ||
| 21 | +cd ../offline-tts | ||
| 22 | +./run-aishell3.sh | ||
| 23 | +./run-piper.sh | ||
| 24 | +ls -lh | ||
| 25 | + | ||
| 26 | +cd ../.. | ||
| 27 | + | ||
| 28 | +mkdir tts | ||
| 29 | + | ||
| 30 | +cp dotnet-examples/offline-tts/*.wav ./tts |
| @@ -40,33 +40,10 @@ jobs: | @@ -40,33 +40,10 @@ jobs: | ||
| 40 | - name: Check dotnet | 40 | - name: Check dotnet |
| 41 | run: dotnet --info | 41 | run: dotnet --info |
| 42 | 42 | ||
| 43 | - - name: Decode a file | 43 | + - name: Run tests |
| 44 | shell: bash | 44 | shell: bash |
| 45 | run: | | 45 | run: | |
| 46 | - cd dotnet-examples/ | ||
| 47 | - | ||
| 48 | - cd online-decode-files | ||
| 49 | - ./run-transducer.sh | ||
| 50 | - ./run-paraformer.sh | ||
| 51 | - | ||
| 52 | - cd ../offline-decode-files | ||
| 53 | - ./run-nemo-ctc.sh | ||
| 54 | - ./run-paraformer.sh | ||
| 55 | - ./run-zipformer.sh | ||
| 56 | - ./run-hotwords.sh | ||
| 57 | - ./run-whisper.sh | ||
| 58 | - ./run-tdnn-yesno.sh | ||
| 59 | - | ||
| 60 | - cd ../offline-tts | ||
| 61 | - ./run-aishell3.sh | ||
| 62 | - ./run-piper.sh | ||
| 63 | - ls -lh | ||
| 64 | - | ||
| 65 | - cd ../.. | ||
| 66 | - | ||
| 67 | - mkdir tts | ||
| 68 | - | ||
| 69 | - cp dotnet-examples/offline-tts/*.wav ./tts | 46 | + .github/scripts/test-dot-net.sh |
| 70 | 47 | ||
| 71 | - uses: actions/upload-artifact@v4 | 48 | - uses: actions/upload-artifact@v4 |
| 72 | with: | 49 | with: |
| @@ -177,39 +177,16 @@ jobs: | @@ -177,39 +177,16 @@ jobs: | ||
| 177 | cp -v scripts/dotnet/examples/offline-decode-files.csproj dotnet-examples/offline-decode-files/ | 177 | cp -v scripts/dotnet/examples/offline-decode-files.csproj dotnet-examples/offline-decode-files/ |
| 178 | cp -v scripts/dotnet/examples/online-decode-files.csproj dotnet-examples/online-decode-files/ | 178 | cp -v scripts/dotnet/examples/online-decode-files.csproj dotnet-examples/online-decode-files/ |
| 179 | cp -v scripts/dotnet/examples/speech-recognition-from-microphone.csproj dotnet-examples/speech-recognition-from-microphone/ | 179 | cp -v scripts/dotnet/examples/speech-recognition-from-microphone.csproj dotnet-examples/speech-recognition-from-microphone/ |
| 180 | + cp -v scripts/dotnet/examples/spoken-language-identification.csproj dotnet-examples/spoken-language-identification/ | ||
| 180 | 181 | ||
| 181 | ls -lh /tmp | 182 | ls -lh /tmp |
| 182 | 183 | ||
| 183 | - - name: Decode a file | 184 | + - name: Run tests |
| 184 | shell: bash | 185 | shell: bash |
| 185 | run: | | 186 | run: | |
| 186 | - cd dotnet-examples/ | 187 | + .github/scripts/test-dot-net.sh |
| 187 | 188 | ||
| 188 | - cd online-decode-files | ||
| 189 | - ./run-zipformer2-ctc.sh | ||
| 190 | - ./run-transducer.sh | ||
| 191 | - ./run-paraformer.sh | ||
| 192 | - | ||
| 193 | - cd ../offline-decode-files | ||
| 194 | - ./run-nemo-ctc.sh | ||
| 195 | - ./run-paraformer.sh | ||
| 196 | - ./run-zipformer.sh | ||
| 197 | - ./run-hotwords.sh | ||
| 198 | - ./run-whisper.sh | ||
| 199 | - ./run-tdnn-yesno.sh | ||
| 200 | - | ||
| 201 | - cd ../offline-tts | ||
| 202 | - ./run-aishell3.sh | ||
| 203 | - ./run-piper.sh | ||
| 204 | - ls -lh | ||
| 205 | - | ||
| 206 | - cd ../.. | ||
| 207 | - | ||
| 208 | - mkdir tts | ||
| 209 | - | ||
| 210 | - cp dotnet-examples/offline-tts/*.wav ./tts | ||
| 211 | - | ||
| 212 | - - uses: actions/upload-artifact@v3 | 189 | + - uses: actions/upload-artifact@v4 |
| 213 | with: | 190 | with: |
| 214 | name: dot-net-tts-generated-test-files-${{ matrix.os }} | 191 | name: dot-net-tts-generated-test-files-${{ matrix.os }} |
| 215 | path: tts | 192 | path: tts |
| @@ -13,6 +13,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts", "offline-tts\ | @@ -13,6 +13,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts", "offline-tts\ | ||
| 13 | EndProject | 13 | EndProject |
| 14 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts-play", "offline-tts-play\offline-tts-play.csproj", "{40781464-5948-462B-BA4B-98932711513F}" | 14 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts-play", "offline-tts-play\offline-tts-play.csproj", "{40781464-5948-462B-BA4B-98932711513F}" |
| 15 | EndProject | 15 | EndProject |
| 16 | +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "spoken-language-identification", "spoken-language-identification\spoken-language-identification.csproj", "{3D7CF3D6-AC45-4D50-9619-5687B1443E94}" | ||
| 17 | +EndProject | ||
| 16 | Global | 18 | Global |
| 17 | GlobalSection(SolutionConfigurationPlatforms) = preSolution | 19 | GlobalSection(SolutionConfigurationPlatforms) = preSolution |
| 18 | Debug|Any CPU = Debug|Any CPU | 20 | Debug|Any CPU = Debug|Any CPU |
| @@ -42,5 +44,9 @@ Global | @@ -42,5 +44,9 @@ Global | ||
| 42 | {40781464-5948-462B-BA4B-98932711513F}.Debug|Any CPU.Build.0 = Debug|Any CPU | 44 | {40781464-5948-462B-BA4B-98932711513F}.Debug|Any CPU.Build.0 = Debug|Any CPU |
| 43 | {40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.ActiveCfg = Release|Any CPU | 45 | {40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.ActiveCfg = Release|Any CPU |
| 44 | {40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.Build.0 = Release|Any CPU | 46 | {40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.Build.0 = Release|Any CPU |
| 47 | + {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | ||
| 48 | + {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Debug|Any CPU.Build.0 = Debug|Any CPU | ||
| 49 | + {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.ActiveCfg = Release|Any CPU | ||
| 50 | + {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.Build.0 = Release|Any CPU | ||
| 45 | EndGlobalSection | 51 | EndGlobalSection |
| 46 | EndGlobal | 52 | EndGlobal |
| 1 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 2 | +// | ||
| 3 | +// This file shows how to do spoken language identification with whisper. | ||
| 4 | +// | ||
| 5 | +// 1. Download a whisper multilingual model. We use a tiny model below. | ||
| 6 | +// Please refer to https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
| 7 | +// to download more models. | ||
| 8 | +// | ||
| 9 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2 | ||
| 10 | +// tar xvf sherpa-onnx-whisper-tiny.tar.bz2 | ||
| 11 | +// rm sherpa-onnx-whisper-tiny.tar.bz2 | ||
| 12 | +// | ||
| 13 | +// 2. Now run it | ||
| 14 | +// | ||
| 15 | +// dotnet run | ||
| 16 | + | ||
| 17 | +using SherpaOnnx; | ||
| 18 | +using System.Collections.Generic; | ||
| 19 | +using System; | ||
| 20 | + | ||
| 21 | +class SpokenLanguageIdentificationDemo | ||
| 22 | +{ | ||
| 23 | + | ||
| 24 | + static void Main(string[] args) | ||
| 25 | + { | ||
| 26 | + var config = new SpokenLanguageIdentificationConfig(); | ||
| 27 | + config.Whisper.Encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx"; | ||
| 28 | + config.Whisper.Decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx"; | ||
| 29 | + | ||
| 30 | + var slid = new SpokenLanguageIdentification(config); | ||
| 31 | + var filename = "./sherpa-onnx-whisper-tiny/test_wavs/0.wav"; | ||
| 32 | + | ||
| 33 | + WaveReader waveReader = new WaveReader(filename); | ||
| 34 | + | ||
| 35 | + var s = slid.CreateStream(); | ||
| 36 | + s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples); | ||
| 37 | + var result = slid.Compute(s); | ||
| 38 | + Console.WriteLine($"Filename: {filename}"); | ||
| 39 | + Console.WriteLine($"Detected language: {result.Lang}"); | ||
| 40 | + } | ||
| 41 | +} | ||
| 42 | + |
| 1 | +../offline-decode-files/WaveReader.cs |
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +if [ ! -d ./sherpa-onnx-whisper-tiny ]; then | ||
| 6 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2 | ||
| 7 | + tar xvf sherpa-onnx-whisper-tiny.tar.bz2 | ||
| 8 | + rm sherpa-onnx-whisper-tiny.tar.bz2 | ||
| 9 | +fi | ||
| 10 | + | ||
| 11 | +dotnet run | ||
| 12 | + |
| 1 | +<Project Sdk="Microsoft.NET.Sdk"> | ||
| 2 | + | ||
| 3 | + <PropertyGroup> | ||
| 4 | + <OutputType>Exe</OutputType> | ||
| 5 | + <TargetFramework>net6.0</TargetFramework> | ||
| 6 | + <RootNamespace>spoken_language_identification</RootNamespace> | ||
| 7 | + <ImplicitUsings>enable</ImplicitUsings> | ||
| 8 | + <Nullable>enable</Nullable> | ||
| 9 | + </PropertyGroup> | ||
| 10 | + | ||
| 11 | + <ItemGroup> | ||
| 12 | + <PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" /> | ||
| 13 | + </ItemGroup> | ||
| 14 | + | ||
| 15 | +</Project> |
| 1 | +<Project Sdk="Microsoft.NET.Sdk"> | ||
| 2 | + | ||
| 3 | + <PropertyGroup> | ||
| 4 | + <OutputType>Exe</OutputType> | ||
| 5 | + <TargetFramework>net6.0</TargetFramework> | ||
| 6 | + <RootNamespace>spoken_language_identification</RootNamespace> | ||
| 7 | + <ImplicitUsings>enable</ImplicitUsings> | ||
| 8 | + <Nullable>enable</Nullable> | ||
| 9 | + </PropertyGroup> | ||
| 10 | + | ||
| 11 | + <PropertyGroup> | ||
| 12 | + <RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources> | ||
| 13 | + </PropertyGroup> | ||
| 14 | + | ||
| 15 | + <ItemGroup> | ||
| 16 | + <PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" /> | ||
| 17 | + </ItemGroup> | ||
| 18 | + | ||
| 19 | +</Project> |
| @@ -403,8 +403,8 @@ namespace SherpaOnnx | @@ -403,8 +403,8 @@ namespace SherpaOnnx | ||
| 403 | while (*buffer != 0) | 403 | while (*buffer != 0) |
| 404 | { | 404 | { |
| 405 | ++buffer; | 405 | ++buffer; |
| 406 | + length += 1; | ||
| 406 | } | 407 | } |
| 407 | - length = (int)(buffer - (byte*)impl.Text); | ||
| 408 | } | 408 | } |
| 409 | 409 | ||
| 410 | byte[] stringBuffer = new byte[length]; | 410 | byte[] stringBuffer = new byte[length]; |
| @@ -496,8 +496,6 @@ namespace SherpaOnnx | @@ -496,8 +496,6 @@ namespace SherpaOnnx | ||
| 496 | return new OfflineStream(p); | 496 | return new OfflineStream(p); |
| 497 | } | 497 | } |
| 498 | 498 | ||
| 499 | - /// You have to ensure that IsReady(stream) returns true before | ||
| 500 | - /// you call this method | ||
| 501 | public void Decode(OfflineStream stream) | 499 | public void Decode(OfflineStream stream) |
| 502 | { | 500 | { |
| 503 | Decode(_handle.Handle, stream.Handle); | 501 | Decode(_handle.Handle, stream.Handle); |
| @@ -549,4 +547,137 @@ namespace SherpaOnnx | @@ -549,4 +547,137 @@ namespace SherpaOnnx | ||
| 549 | private static extern void Decode(IntPtr handle, IntPtr[] streams, int n); | 547 | private static extern void Decode(IntPtr handle, IntPtr[] streams, int n); |
| 550 | } | 548 | } |
| 551 | 549 | ||
| 550 | + [StructLayout(LayoutKind.Sequential)] | ||
| 551 | + public struct SpokenLanguageIdentificationWhisperConfig | ||
| 552 | + { | ||
| 553 | + public SpokenLanguageIdentificationWhisperConfig() | ||
| 554 | + { | ||
| 555 | + Encoder = ""; | ||
| 556 | + Decoder = ""; | ||
| 557 | + TailPaddings = -1; | ||
| 558 | + } | ||
| 559 | + | ||
| 560 | + [MarshalAs(UnmanagedType.LPStr)] | ||
| 561 | + public string Encoder; | ||
| 562 | + | ||
| 563 | + [MarshalAs(UnmanagedType.LPStr)] | ||
| 564 | + public string Decoder; | ||
| 565 | + | ||
| 566 | + public int TailPaddings; | ||
| 567 | + } | ||
| 568 | + | ||
| 569 | + public struct SpokenLanguageIdentificationConfig | ||
| 570 | + { | ||
| 571 | + public SpokenLanguageIdentificationConfig() | ||
| 572 | + { | ||
| 573 | + Whisper = new SpokenLanguageIdentificationWhisperConfig(); | ||
| 574 | + NumThreads = 1; | ||
| 575 | + Debug = 0; | ||
| 576 | + Provider = "cpu"; | ||
| 577 | + } | ||
| 578 | + public SpokenLanguageIdentificationWhisperConfig Whisper; | ||
| 579 | + | ||
| 580 | + public int NumThreads; | ||
| 581 | + public int Debug; | ||
| 582 | + | ||
| 583 | + [MarshalAs(UnmanagedType.LPStr)] | ||
| 584 | + public string Provider; | ||
| 585 | + } | ||
| 586 | + | ||
| 587 | + public class SpokenLanguageIdentificationResult | ||
| 588 | + { | ||
| 589 | + public SpokenLanguageIdentificationResult(IntPtr handle) | ||
| 590 | + { | ||
| 591 | + Impl impl = (Impl)Marshal.PtrToStructure(handle, typeof(Impl)); | ||
| 592 | + | ||
| 593 | + // PtrToStringUTF8() requires .NET Standard 2.1, so the UTF-8 string is decoded manually below. | ||
| 594 | + // _lang = Marshal.PtrToStringUTF8(impl.Lang); | ||
| 595 | + | ||
| 596 | + int length = 0; | ||
| 597 | + | ||
| 598 | + unsafe | ||
| 599 | + { | ||
| 600 | + byte* buffer = (byte*)impl.Lang; | ||
| 601 | + while (*buffer != 0) | ||
| 602 | + { | ||
| 603 | + ++buffer; | ||
| 604 | + length += 1; | ||
| 605 | + } | ||
| 606 | + } | ||
| 607 | + | ||
| 608 | + byte[] stringBuffer = new byte[length]; | ||
| 609 | + Marshal.Copy(impl.Lang, stringBuffer, 0, length); | ||
| 610 | + _lang = Encoding.UTF8.GetString(stringBuffer); | ||
| 611 | + } | ||
| 612 | + | ||
| 613 | + [StructLayout(LayoutKind.Sequential)] | ||
| 614 | + struct Impl | ||
| 615 | + { | ||
| 616 | + public IntPtr Lang; | ||
| 617 | + } | ||
| 618 | + | ||
| 619 | + private String _lang; | ||
| 620 | + public String Lang => _lang; | ||
| 621 | + } | ||
| 622 | + | ||
| 623 | + public class SpokenLanguageIdentification : IDisposable | ||
| 624 | + { | ||
| 625 | + public SpokenLanguageIdentification(SpokenLanguageIdentificationConfig config) | ||
| 626 | + { | ||
| 627 | + IntPtr h = SherpaOnnxCreateSpokenLanguageIdentification(ref config); | ||
| 628 | + _handle = new HandleRef(this, h); | ||
| 629 | + } | ||
| 630 | + | ||
| 631 | + public OfflineStream CreateStream() | ||
| 632 | + { | ||
| 633 | + IntPtr p = SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(_handle.Handle); | ||
| 634 | + return new OfflineStream(p); | ||
| 635 | + } | ||
| 636 | + | ||
| 637 | + public SpokenLanguageIdentificationResult Compute(OfflineStream stream) | ||
| 638 | + { | ||
| 639 | + IntPtr h = SherpaOnnxSpokenLanguageIdentificationCompute(_handle.Handle, stream.Handle); | ||
| 640 | + SpokenLanguageIdentificationResult result = new SpokenLanguageIdentificationResult(h); | ||
| 641 | + SherpaOnnxDestroySpokenLanguageIdentificationResult(h); | ||
| 642 | + return result; | ||
| 643 | + } | ||
| 644 | + | ||
| 645 | + public void Dispose() | ||
| 646 | + { | ||
| 647 | + Cleanup(); | ||
| 648 | + // Prevent the object from being placed on the | ||
| 649 | + // finalization queue | ||
| 650 | + System.GC.SuppressFinalize(this); | ||
| 651 | + } | ||
| 652 | + | ||
| 653 | + ~SpokenLanguageIdentification() | ||
| 654 | + { | ||
| 655 | + Cleanup(); | ||
| 656 | + } | ||
| 657 | + | ||
| 658 | + private void Cleanup() | ||
| 659 | + { | ||
| 660 | + SherpaOnnxDestroySpokenLanguageIdentification(_handle.Handle); | ||
| 661 | + | ||
| 662 | + // Don't permit the handle to be used again. | ||
| 663 | + _handle = new HandleRef(this, IntPtr.Zero); | ||
| 664 | + } | ||
| 665 | + | ||
| 666 | + private HandleRef _handle; | ||
| 667 | + | ||
| 668 | + [DllImport(Dll.Filename)] | ||
| 669 | + private static extern IntPtr SherpaOnnxCreateSpokenLanguageIdentification(ref SpokenLanguageIdentificationConfig config); | ||
| 670 | + | ||
| 671 | + [DllImport(Dll.Filename)] | ||
| 672 | + private static extern void SherpaOnnxDestroySpokenLanguageIdentification(IntPtr handle); | ||
| 673 | + | ||
| 674 | + [DllImport(Dll.Filename)] | ||
| 675 | + private static extern IntPtr SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(IntPtr handle); | ||
| 676 | + | ||
| 677 | + [DllImport(Dll.Filename)] | ||
| 678 | + private static extern IntPtr SherpaOnnxSpokenLanguageIdentificationCompute(IntPtr handle, IntPtr stream); | ||
| 679 | + | ||
| 680 | + [DllImport(Dll.Filename)] | ||
| 681 | + private static extern void SherpaOnnxDestroySpokenLanguageIdentificationResult(IntPtr handle); | ||
| 682 | + } | ||
| 552 | } | 683 | } |
-
请 注册 或 登录 后发表评论