正在显示
13 个修改的文件
包含
632 行增加
和
3 行删除
| @@ -141,4 +141,15 @@ for wave in ${waves[@]}; do | @@ -141,4 +141,15 @@ for wave in ${waves[@]}; do | ||
| 141 | 4 | 141 | 4 |
| 142 | done | 142 | done |
| 143 | 143 | ||
| 144 | +# Decode a URL | ||
| 145 | +if [ $EXE == "sherpa-onnx-ffmpeg" ]; then | ||
| 146 | + time $EXE \ | ||
| 147 | + $repo/tokens.txt \ | ||
| 148 | + $repo/encoder-epoch-99-avg-1.onnx \ | ||
| 149 | + $repo/decoder-epoch-99-avg-1.onnx \ | ||
| 150 | + $repo/joiner-epoch-99-avg-1.onnx \ | ||
| 151 | + https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/resolve/main/test_wavs/4.wav \ | ||
| 152 | + 4 | ||
| 153 | +fi | ||
| 154 | + | ||
| 144 | rm -rf $repo | 155 | rm -rf $repo |
| @@ -10,6 +10,9 @@ on: | @@ -10,6 +10,9 @@ on: | ||
| 10 | - 'CMakeLists.txt' | 10 | - 'CMakeLists.txt' |
| 11 | - 'cmake/**' | 11 | - 'cmake/**' |
| 12 | - 'sherpa-onnx/csrc/*' | 12 | - 'sherpa-onnx/csrc/*' |
| 13 | + - 'sherpa-onnx/c-api/*' | ||
| 14 | + - 'ffmpeg-examples/**' | ||
| 15 | + - 'c-api-examples/**' | ||
| 13 | pull_request: | 16 | pull_request: |
| 14 | branches: | 17 | branches: |
| 15 | - master | 18 | - master |
| @@ -19,6 +22,8 @@ on: | @@ -19,6 +22,8 @@ on: | ||
| 19 | - 'CMakeLists.txt' | 22 | - 'CMakeLists.txt' |
| 20 | - 'cmake/**' | 23 | - 'cmake/**' |
| 21 | - 'sherpa-onnx/csrc/*' | 24 | - 'sherpa-onnx/csrc/*' |
| 25 | + - 'sherpa-onnx/c-api/*' | ||
| 26 | + - 'ffmpeg-examples/**' | ||
| 22 | 27 | ||
| 23 | concurrency: | 28 | concurrency: |
| 24 | group: linux-${{ github.ref }} | 29 | group: linux-${{ github.ref }} |
| @@ -40,6 +45,23 @@ jobs: | @@ -40,6 +45,23 @@ jobs: | ||
| 40 | with: | 45 | with: |
| 41 | fetch-depth: 0 | 46 | fetch-depth: 0 |
| 42 | 47 | ||
| 48 | + - name: Install ffmpeg | ||
| 49 | + shell: bash | ||
| 50 | + run: | | ||
| 51 | + sudo apt-get install -y software-properties-common | ||
| 52 | + sudo add-apt-repository ppa:savoury1/ffmpeg4 | ||
| 53 | + sudo add-apt-repository ppa:savoury1/ffmpeg5 | ||
| 54 | + | ||
| 55 | + sudo apt-get install -y libavdevice-dev libavutil-dev ffmpeg | ||
| 56 | + pkg-config --modversion libavutil | ||
| 57 | + ffmpeg -version | ||
| 58 | + | ||
| 59 | + - name: Show ffmpeg version | ||
| 60 | + shell: bash | ||
| 61 | + run: | | ||
| 62 | + pkg-config --modversion libavutil | ||
| 63 | + ffmpeg -version | ||
| 64 | + | ||
| 43 | - name: Configure CMake | 65 | - name: Configure CMake |
| 44 | shell: bash | 66 | shell: bash |
| 45 | run: | | 67 | run: | |
| @@ -56,12 +78,22 @@ jobs: | @@ -56,12 +78,22 @@ jobs: | ||
| 56 | ls -lh lib | 78 | ls -lh lib |
| 57 | ls -lh bin | 79 | ls -lh bin |
| 58 | 80 | ||
| 81 | + cd ../ffmpeg-examples | ||
| 82 | + make | ||
| 83 | + | ||
| 59 | - name: Display dependencies of sherpa-onnx for linux | 84 | - name: Display dependencies of sherpa-onnx for linux |
| 60 | shell: bash | 85 | shell: bash |
| 61 | run: | | 86 | run: | |
| 62 | file build/bin/sherpa-onnx | 87 | file build/bin/sherpa-onnx |
| 63 | readelf -d build/bin/sherpa-onnx | 88 | readelf -d build/bin/sherpa-onnx |
| 64 | 89 | ||
| 90 | + - name: Test sherpa-onnx-ffmpeg | ||
| 91 | + run: | | ||
| 92 | + export PATH=$PWD/ffmpeg-examples:$PATH | ||
| 93 | + export EXE=sherpa-onnx-ffmpeg | ||
| 94 | + | ||
| 95 | + .github/scripts/test-online-transducer.sh | ||
| 96 | + | ||
| 65 | - name: Test online transducer | 97 | - name: Test online transducer |
| 66 | shell: bash | 98 | shell: bash |
| 67 | run: | | 99 | run: | |
| @@ -39,6 +39,7 @@ cmake \ | @@ -39,6 +39,7 @@ cmake \ | ||
| 39 | -DBUILD_SHARED_LIBS=OFF \ | 39 | -DBUILD_SHARED_LIBS=OFF \ |
| 40 | -DSHERPA_ONNX_ENABLE_TESTS=OFF \ | 40 | -DSHERPA_ONNX_ENABLE_TESTS=OFF \ |
| 41 | -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ | 41 | -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ |
| 42 | + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ | ||
| 42 | -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake \ | 43 | -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake \ |
| 43 | .. | 44 | .. |
| 44 | 45 |
| 1 | 1 | ||
| 2 | +CUR_DIR :=$(shell pwd) | ||
| 3 | + | ||
| 2 | CFLAGS := -I ../ | 4 | CFLAGS := -I ../ |
| 3 | LDFLAGS := -L ../build/lib | 5 | LDFLAGS := -L ../build/lib |
| 4 | LDFLAGS += -L ../build/_deps/onnxruntime-src/lib | 6 | LDFLAGS += -L ../build/_deps/onnxruntime-src/lib |
| 5 | LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lonnxruntime -lkaldi-native-fbank-core | 7 | LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lonnxruntime -lkaldi-native-fbank-core |
| 6 | -LDFLAGS += -Wl,-rpath,../build/lib | ||
| 7 | -LDFLAGS += -Wl,-rpath,../build/_deps/onnxruntime-src/lib | 8 | +LDFLAGS += -Wl,-rpath,${CUR_DIR}/../build/lib |
| 9 | +LDFLAGS += -Wl,-rpath,${CUR_DIR}/../build/_deps/onnxruntime-src/lib | ||
| 8 | 10 | ||
| 9 | decode-file-c-api: decode-file-c-api.c | 11 | decode-file-c-api: decode-file-c-api.c |
| 10 | $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) | 12 | $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) |
| 1 | -// c-api-examples/decode-file-c-api.cc | 1 | +// c-api-examples/decode-file-c-api.c |
| 2 | // | 2 | // |
| 3 | // Copyright (c) 2023 Xiaomi Corporation | 3 | // Copyright (c) 2023 Xiaomi Corporation |
| 4 | 4 | ||
| 5 | +// This file shows how to use sherpa-onnx C API | ||
| 6 | +// to decode a file. | ||
| 7 | + | ||
| 5 | #include <stdio.h> | 8 | #include <stdio.h> |
| 6 | #include <stdlib.h> | 9 | #include <stdlib.h> |
| 7 | #include <string.h> | 10 | #include <string.h> |
c-api-examples/run.sh
0 → 100755
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +if [ ! -d ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 ]; then | ||
| 6 | + echo "Please download the pre-trained model for testing." | ||
| 7 | + echo "You can refer to" | ||
| 8 | + echo "" | ||
| 9 | + echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english" | ||
| 10 | + echo "for help" | ||
| 11 | + exit 1 | ||
| 12 | +fi | ||
| 13 | + | ||
| 14 | +if [[ ! -f ../build/lib/libsherpa-onnx-core.a && ! -f ../build/lib/libsherpa-onnx-core.dylib && ! -f ../build/lib/libsherpa-onnx-core.so ]]; then | ||
| 15 | + echo "Please build sherpa-onnx first. You can use" | ||
| 16 | + echo "" | ||
| 17 | + echo " cd /path/to/sherpa-onnx" | ||
| 18 | + echo " mkdir build" | ||
| 19 | + echo " cd build" | ||
| 20 | + echo " cmake .." | ||
| 21 | + echo " make -j4" | ||
| 22 | + exit 1 | ||
| 23 | +fi | ||
| 24 | + | ||
| 25 | +if [ ! -f ./decode-file-c-api ]; then | ||
| 26 | + make | ||
| 27 | +fi | ||
| 28 | + | ||
| 29 | +./decode-file-c-api \ | ||
| 30 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ | ||
| 31 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \ | ||
| 32 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \ | ||
| 33 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \ | ||
| 34 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav |
| @@ -12,6 +12,7 @@ function(download_portaudio) | @@ -12,6 +12,7 @@ function(download_portaudio) | ||
| 12 | ${PROJECT_SOURCE_DIR}/pa_stable_v190700_20210406.tgz | 12 | ${PROJECT_SOURCE_DIR}/pa_stable_v190700_20210406.tgz |
| 13 | ${PROJECT_BINARY_DIR}/pa_stable_v190700_20210406.tgz | 13 | ${PROJECT_BINARY_DIR}/pa_stable_v190700_20210406.tgz |
| 14 | /tmp/pa_stable_v190700_20210406.tgz | 14 | /tmp/pa_stable_v190700_20210406.tgz |
| 15 | + /star-fj/fangjun/download/github/pa_stable_v190700_20210406.tgz | ||
| 15 | ) | 16 | ) |
| 16 | 17 | ||
| 17 | foreach(f IN LISTS possible_file_locations) | 18 | foreach(f IN LISTS possible_file_locations) |
ffmpeg-examples/Makefile
0 → 100644
| 1 | +CC=g++ | ||
| 2 | +GDB ?= FALSE | ||
| 3 | + | ||
| 4 | +# use pkg-config for getting CFLAGS and LDLIBS | ||
| 5 | +SHARED_LIBS=libavdevice \ | ||
| 6 | + libavformat \ | ||
| 7 | + libavfilter \ | ||
| 8 | + libavcodec \ | ||
| 9 | + libswresample \ | ||
| 10 | + libswscale \ | ||
| 11 | + libavutil | ||
| 12 | + | ||
| 13 | +ifeq ($(GDB), TRUE) | ||
| 14 | + OPTFLAG += -g | ||
| 15 | +endif | ||
| 16 | + | ||
| 17 | +CFLAGS := $(shell pkg-config --cflags $(SHARED_LIBS)) -I.. -Wall -std=c++11 -fopenmp ${OPTFLAG} | ||
| 18 | +LDLIBS := $(shell pkg-config --libs $(SHARED_LIBS)) | ||
| 19 | + | ||
| 20 | +CUR_DIR :=$(shell pwd) | ||
| 21 | + | ||
| 22 | +LDLIBS += -L ../build/lib | ||
| 23 | +LDLIBS += -L ../build/_deps/onnxruntime-src/lib | ||
| 24 | +LDLIBS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lonnxruntime -lkaldi-native-fbank-core | ||
| 25 | +LDLIBS += -Wl,-rpath,${CUR_DIR}/../build/lib | ||
| 26 | +LDLIBS += -Wl,-rpath,${CUR_DIR}/../build/_deps/onnxruntime-src/lib | ||
| 27 | + | ||
| 28 | + | ||
| 29 | +#Get libavutil version and extract major, minor and micro | ||
| 30 | +LIBAVUTIL_VERSION := $(shell pkg-config --modversion libavutil) | ||
| 31 | +LIBAVUTIL_MAJOR := $(shell echo "$(LIBAVUTIL_VERSION)" | awk -F. '{print $$1}') | ||
| 32 | +LIBAVUTIL_MINOR := $(shell echo "$(LIBAVUTIL_VERSION)" | awk -F. '{print $$2}') | ||
| 33 | +LIBAVUTIL_MICRO := $(shell echo "$(LIBAVUTIL_VERSION)" | awk -F. '{print $$3}') | ||
| 34 | +#Check if libavutil version is 57.28.100 or above | ||
| 35 | +FFMPEG_51_AND_ABOVE = $(shell echo "$(LIBAVUTIL_MAJOR) $(LIBAVUTIL_MINOR) $(LIBAVUTIL_MICRO)" | awk '{if ($$1 > 57 || ($$1 == 57 && $$2 > 28) || ($$1 == 57 && $$2 == 28 && $$3 >= 100)) print "TRUE"; else print "FALSE"}') | ||
| 36 | +ifeq ($(FFMPEG_51_AND_ABOVE), FALSE) | ||
| 37 | +$(error FFmpeg version should be n5.1 or above!) | ||
| 38 | +endif | ||
| 39 | + | ||
| 40 | +EXAMPLES=sherpa-onnx-ffmpeg | ||
| 41 | + | ||
| 42 | +OBJS=$(addsuffix .o,$(EXAMPLES)) | ||
| 43 | + | ||
| 44 | +.phony: all clean | ||
| 45 | + | ||
| 46 | +all: $(EXAMPLES) | ||
| 47 | + @echo $(EXAMPLES) | ||
| 48 | + $(RM) $(OBJS) | ||
| 49 | + | ||
| 50 | +$(EXAMPLES): $(OBJS) | ||
| 51 | + $(CC) $(addsuffix .o,$@) $(CFLAGS) $(LDLIBS) -o $@ | ||
| 52 | + | ||
| 53 | +%.o : %.c | ||
| 54 | + ${CC} ${CFLAGS} -c -o $@ $< | ||
| 55 | + | ||
| 56 | +clean: | ||
| 57 | + $(RM) $(EXAMPLES) $(OBJS) | ||
| 58 | + | ||
| 59 | +build_info: | ||
| 60 | + @echo "libavutil version: $(LIBAVUTIL_VERSION)" | ||
| 61 | + @echo "Supported examples: $(EXAMPLES)" |
ffmpeg-examples/README.md
0 → 100644
| 1 | +# Introduction | ||
| 2 | + | ||
| 3 | +You can use `sherpa-onnx-ffmpeg` to decode a wav, mp3, or even a URL. | ||
| 4 | + | ||
| 5 | +See <https://github.com/ossrs/srs> | ||
| 6 | +for more supported formats and protocols, e.g., | ||
| 7 | +RTMP/WebRTC/HLS/HTTP-FLV/SRT/MPEG-DASH/GB28181. | ||
| 8 | + | ||
| 9 | + | ||
| 10 | +## How to use | ||
| 11 | + | ||
| 12 | +Please have a look at | ||
| 13 | + | ||
| 14 | +``` | ||
| 15 | +./run.sh | ||
| 16 | +``` |
ffmpeg-examples/how-to-fix-errors.md
0 → 100644
| 1 | +# Fixes for errors | ||
| 2 | + | ||
| 3 | +To fix the following error: | ||
| 4 | +``` | ||
| 5 | +Package libavdevice was not found in the pkg-config search path. | ||
| 6 | +``` | ||
| 7 | +please run | ||
| 8 | + | ||
| 9 | +``` | ||
| 10 | +sudo apt-get install libavdevice-dev | ||
| 11 | +``` | ||
| 12 | + | ||
| 13 | +To fix the following error: | ||
| 14 | +``` | ||
| 15 | +Makefile:37: *** FFmpeg version should be n5.1 or above!. Stop. | ||
| 16 | +``` | ||
| 17 | +please run | ||
| 18 | +``` | ||
| 19 | +sudo apt-get install software-properties-common | ||
| 20 | +sudo add-apt-repository ppa:savoury1/ffmpeg4 | ||
| 21 | +sudo add-apt-repository ppa:savoury1/ffmpeg5 | ||
| 22 | +sudo apt-get update | ||
| 23 | +sudo apt-get install ffmpeg --reinstall | ||
| 24 | +sudo apt-get install libavutil-dev --reinstall | ||
| 25 | +``` | ||
| 26 | + | ||
| 27 | +To fix the following error: | ||
| 28 | +``` | ||
| 29 | +ModuleNotFoundError: No module named 'apt_pkg' | ||
| 30 | +``` | ||
| 31 | +please run: | ||
| 32 | +``` | ||
| 33 | +sudo apt-get install python-apt | ||
| 34 | +``` |
ffmpeg-examples/run.sh
0 → 100755
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +if [ ! -d ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 ]; then | ||
| 6 | + echo "Please download the pre-trained model for testing." | ||
| 7 | + echo "You can refer to" | ||
| 8 | + echo "" | ||
| 9 | + echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english" | ||
| 10 | + echo "for help" | ||
| 11 | + exit 1 | ||
| 12 | +fi | ||
| 13 | + | ||
| 14 | +if [[ ! -f ../build/lib/libsherpa-onnx-core.a && ! -f ../build/lib/libsherpa-onnx-core.dylib && ! -f ../build/lib/libsherpa-onnx-core.so ]]; then | ||
| 15 | + echo "Please build sherpa-onnx first. You can use" | ||
| 16 | + echo "" | ||
| 17 | + echo " cd /path/to/sherpa-onnx" | ||
| 18 | + echo " mkdir build" | ||
| 19 | + echo " cd build" | ||
| 20 | + echo " cmake .." | ||
| 21 | + echo " make -j4" | ||
| 22 | + exit 1 | ||
| 23 | +fi | ||
| 24 | + | ||
| 25 | +if [ ! -f ./sherpa-onnx-ffmpeg ]; then | ||
| 26 | + make | ||
| 27 | +fi | ||
| 28 | + | ||
| 29 | +../ffmpeg-examples/sherpa-onnx-ffmpeg \ | ||
| 30 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ | ||
| 31 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \ | ||
| 32 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \ | ||
| 33 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \ | ||
| 34 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/4.wav | ||
| 35 | + | ||
| 36 | +echo "Decoding a URL" | ||
| 37 | + | ||
| 38 | +../ffmpeg-examples/sherpa-onnx-ffmpeg \ | ||
| 39 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ | ||
| 40 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \ | ||
| 41 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \ | ||
| 42 | + ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \ | ||
| 43 | + https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/resolve/main/test_wavs/4.wav |
ffmpeg-examples/sherpa-onnx-ffmpeg.c
0 → 100644
| 1 | +// ffmpeg-examples/sherpa-onnx-ffmpeg.c | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | +#include <stdio.h> | ||
| 5 | +#include <stdlib.h> | ||
| 6 | +#include <string.h> | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/c-api/c-api.h" | ||
| 9 | + | ||
| 10 | + | ||
| 11 | +/* | ||
| 12 | + * Copyright (c) 2010 Nicolas George | ||
| 13 | + * Copyright (c) 2011 Stefano Sabatini | ||
| 14 | + * Copyright (c) 2012 Clément Bœsch | ||
| 15 | + * | ||
| 16 | + * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| 17 | + * of this software and associated documentation files (the "Software"), to deal | ||
| 18 | + * in the Software without restriction, including without limitation the rights | ||
| 19 | + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| 20 | + * copies of the Software, and to permit persons to whom the Software is | ||
| 21 | + * furnished to do so, subject to the following conditions: | ||
| 22 | + * | ||
| 23 | + * The above copyright notice and this permission notice shall be included in | ||
| 24 | + * all copies or substantial portions of the Software. | ||
| 25 | + * | ||
| 26 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| 27 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| 28 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
| 29 | + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| 30 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| 31 | + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
| 32 | + * THE SOFTWARE. | ||
| 33 | + */ | ||
| 34 | + | ||
| 35 | +/** | ||
| 36 | + * @file audio decoding and filtering usage example | ||
| 37 | + * @example sherpa-onnx-ffmpeg.c | ||
| 38 | + * | ||
| 39 | + * Demux, decode and filter an audio input (file or URL), resample it to | ||
| 40 | + * 16 kHz mono s16, and feed the samples to the sherpa-onnx recognizer. | ||
| 41 | + */ | ||
| 42 | + | ||
| 43 | +#include <unistd.h> | ||
| 44 | +extern "C" { | ||
| 45 | +#include <libavcodec/avcodec.h> | ||
| 46 | +#include <libavformat/avformat.h> | ||
| 47 | +#include <libavfilter/buffersink.h> | ||
| 48 | +#include <libavfilter/buffersrc.h> | ||
| 49 | +#include <libavutil/channel_layout.h> | ||
| 50 | +#include <libavutil/opt.h> | ||
| 51 | +} | ||
| 52 | + | ||
| 53 | +static const char *filter_descr = "aresample=16000,aformat=sample_fmts=s16:channel_layouts=mono"; | ||
| 54 | + | ||
| 55 | +static AVFormatContext *fmt_ctx; | ||
| 56 | +static AVCodecContext *dec_ctx; | ||
| 57 | +AVFilterContext *buffersink_ctx; | ||
| 58 | +AVFilterContext *buffersrc_ctx; | ||
| 59 | +AVFilterGraph *filter_graph; | ||
| 60 | +static int audio_stream_index = -1; | ||
| 61 | + | ||
| 62 | +static int open_input_file(const char *filename) | ||
| 63 | +{ | ||
| 64 | + const AVCodec *dec; | ||
| 65 | + int ret; | ||
| 66 | + | ||
| 67 | + if ((ret = avformat_open_input(&fmt_ctx, filename, NULL, NULL)) < 0) { | ||
| 68 | + av_log(NULL, AV_LOG_ERROR, "Cannot open input file %s\n", filename); | ||
| 69 | + return ret; | ||
| 70 | + } | ||
| 71 | + | ||
| 72 | + if ((ret = avformat_find_stream_info(fmt_ctx, NULL)) < 0) { | ||
| 73 | + av_log(NULL, AV_LOG_ERROR, "Cannot find stream information\n"); | ||
| 74 | + return ret; | ||
| 75 | + } | ||
| 76 | + | ||
| 77 | + /* select the audio stream */ | ||
| 78 | + ret = av_find_best_stream(fmt_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, &dec, 0); | ||
| 79 | + if (ret < 0) { | ||
| 80 | + av_log(NULL, AV_LOG_ERROR, "Cannot find an audio stream in the input file\n"); | ||
| 81 | + return ret; | ||
| 82 | + } | ||
| 83 | + audio_stream_index = ret; | ||
| 84 | + | ||
| 85 | + /* create decoding context */ | ||
| 86 | + dec_ctx = avcodec_alloc_context3(dec); | ||
| 87 | + if (!dec_ctx) | ||
| 88 | + return AVERROR(ENOMEM); | ||
| 89 | + avcodec_parameters_to_context(dec_ctx, fmt_ctx->streams[audio_stream_index]->codecpar); | ||
| 90 | + | ||
| 91 | + /* init the audio decoder */ | ||
| 92 | + if ((ret = avcodec_open2(dec_ctx, dec, NULL)) < 0) { | ||
| 93 | + av_log(NULL, AV_LOG_ERROR, "Cannot open audio decoder\n"); | ||
| 94 | + return ret; | ||
| 95 | + } | ||
| 96 | + | ||
| 97 | + return 0; | ||
| 98 | +} | ||
| 99 | + | ||
| 100 | +static int init_filters(const char *filters_descr) | ||
| 101 | +{ | ||
| 102 | + char args[512]; | ||
| 103 | + int ret = 0; | ||
| 104 | + const AVFilter *abuffersrc = avfilter_get_by_name("abuffer"); | ||
| 105 | + const AVFilter *abuffersink = avfilter_get_by_name("abuffersink"); | ||
| 106 | + AVFilterInOut *outputs = avfilter_inout_alloc(); | ||
| 107 | + AVFilterInOut *inputs = avfilter_inout_alloc(); | ||
| 108 | + static const enum AVSampleFormat out_sample_fmts[] = { AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_NONE }; | ||
| 109 | + static const int out_sample_rates[] = { 16000, -1 }; | ||
| 110 | + const AVFilterLink *outlink; | ||
| 111 | + AVRational time_base = fmt_ctx->streams[audio_stream_index]->time_base; | ||
| 112 | + | ||
| 113 | + filter_graph = avfilter_graph_alloc(); | ||
| 114 | + if (!outputs || !inputs || !filter_graph) { | ||
| 115 | + ret = AVERROR(ENOMEM); | ||
| 116 | + goto end; | ||
| 117 | + } | ||
| 118 | + | ||
| 119 | + /* buffer audio source: the decoded frames from the decoder will be inserted here. */ | ||
| 120 | + if (dec_ctx->ch_layout.order == AV_CHANNEL_ORDER_UNSPEC) | ||
| 121 | + av_channel_layout_default(&dec_ctx->ch_layout, dec_ctx->ch_layout.nb_channels); | ||
| 122 | + ret = snprintf(args, sizeof(args), | ||
| 123 | + "time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=", | ||
| 124 | + time_base.num, time_base.den, dec_ctx->sample_rate, | ||
| 125 | + av_get_sample_fmt_name(dec_ctx->sample_fmt)); | ||
| 126 | + av_channel_layout_describe(&dec_ctx->ch_layout, args + ret, sizeof(args) - ret); | ||
| 127 | + ret = avfilter_graph_create_filter(&buffersrc_ctx, abuffersrc, "in", | ||
| 128 | + args, NULL, filter_graph); | ||
| 129 | + if (ret < 0) { | ||
| 130 | + av_log(NULL, AV_LOG_ERROR, "Cannot create audio buffer source\n"); | ||
| 131 | + goto end; | ||
| 132 | + } | ||
| 133 | + | ||
| 134 | + /* buffer audio sink: to terminate the filter chain. */ | ||
| 135 | + ret = avfilter_graph_create_filter(&buffersink_ctx, abuffersink, "out", | ||
| 136 | + NULL, NULL, filter_graph); | ||
| 137 | + if (ret < 0) { | ||
| 138 | + av_log(NULL, AV_LOG_ERROR, "Cannot create audio buffer sink\n"); | ||
| 139 | + goto end; | ||
| 140 | + } | ||
| 141 | + | ||
| 142 | + ret = av_opt_set_int_list(buffersink_ctx, "sample_fmts", out_sample_fmts, -1, | ||
| 143 | + AV_OPT_SEARCH_CHILDREN); | ||
| 144 | + if (ret < 0) { | ||
| 145 | + av_log(NULL, AV_LOG_ERROR, "Cannot set output sample format\n"); | ||
| 146 | + goto end; | ||
| 147 | + } | ||
| 148 | + | ||
| 149 | + ret = av_opt_set(buffersink_ctx, "ch_layouts", "mono", | ||
| 150 | + AV_OPT_SEARCH_CHILDREN); | ||
| 151 | + if (ret < 0) { | ||
| 152 | + av_log(NULL, AV_LOG_ERROR, "Cannot set output channel layout\n"); | ||
| 153 | + goto end; | ||
| 154 | + } | ||
| 155 | + | ||
| 156 | + ret = av_opt_set_int_list(buffersink_ctx, "sample_rates", out_sample_rates, -1, | ||
| 157 | + AV_OPT_SEARCH_CHILDREN); | ||
| 158 | + if (ret < 0) { | ||
| 159 | + av_log(NULL, AV_LOG_ERROR, "Cannot set output sample rate\n"); | ||
| 160 | + goto end; | ||
| 161 | + } | ||
| 162 | + | ||
| 163 | + /* | ||
| 164 | + * Set the endpoints for the filter graph. The filter_graph will | ||
| 165 | + * be linked to the graph described by filters_descr. | ||
| 166 | + */ | ||
| 167 | + | ||
| 168 | + /* | ||
| 169 | + * The buffer source output must be connected to the input pad of | ||
| 170 | + * the first filter described by filters_descr; since the first | ||
| 171 | + * filter input label is not specified, it is set to "in" by | ||
| 172 | + * default. | ||
| 173 | + */ | ||
| 174 | + outputs->name = av_strdup("in"); | ||
| 175 | + outputs->filter_ctx = buffersrc_ctx; | ||
| 176 | + outputs->pad_idx = 0; | ||
| 177 | + outputs->next = NULL; | ||
| 178 | + | ||
| 179 | + /* | ||
| 180 | + * The buffer sink input must be connected to the output pad of | ||
| 181 | + * the last filter described by filters_descr; since the last | ||
| 182 | + * filter output label is not specified, it is set to "out" by | ||
| 183 | + * default. | ||
| 184 | + */ | ||
| 185 | + inputs->name = av_strdup("out"); | ||
| 186 | + inputs->filter_ctx = buffersink_ctx; | ||
| 187 | + inputs->pad_idx = 0; | ||
| 188 | + inputs->next = NULL; | ||
| 189 | + | ||
| 190 | + if ((ret = avfilter_graph_parse_ptr(filter_graph, filters_descr, | ||
| 191 | + &inputs, &outputs, NULL)) < 0) | ||
| 192 | + goto end; | ||
| 193 | + | ||
| 194 | + if ((ret = avfilter_graph_config(filter_graph, NULL)) < 0) | ||
| 195 | + goto end; | ||
| 196 | + | ||
| 197 | + /* Print summary of the sink buffer | ||
| 198 | + * Note: args buffer is reused to store channel layout string */ | ||
| 199 | + outlink = buffersink_ctx->inputs[0]; | ||
| 200 | + av_channel_layout_describe(&outlink->ch_layout, args, sizeof(args)); | ||
| 201 | + av_log(NULL, AV_LOG_INFO, "Output: srate:%dHz fmt:%s chlayout:%s\n", | ||
| 202 | + (int)outlink->sample_rate, | ||
| 203 | + (char *)av_x_if_null(av_get_sample_fmt_name((AVSampleFormat)outlink->format), "?"), | ||
| 204 | + args); | ||
| 205 | + | ||
| 206 | +end: | ||
| 207 | + avfilter_inout_free(&inputs); | ||
| 208 | + avfilter_inout_free(&outputs); | ||
| 209 | + | ||
| 210 | + return ret; | ||
| 211 | +} | ||
| 212 | + | ||
| 213 | +static void sherpa_decode_frame(const AVFrame *frame, SherpaOnnxOnlineRecognizer *recognizer, | ||
| 214 | + SherpaOnnxOnlineStream* stream) | ||
| 215 | +{ | ||
| 216 | +#define N 3200 // 0.2 s of audio. Sample rate is fixed to 16 kHz | ||
| 217 | + static float samples[N]; | ||
| 218 | + static int nb_samples = 0; | ||
| 219 | + const int16_t *p = (int16_t*)frame->data[0]; | ||
| 220 | + | ||
| 221 | + if (frame->nb_samples + nb_samples > N) { | ||
| 222 | + AcceptWaveform(stream, 16000, samples, nb_samples); | ||
| 223 | + while (IsOnlineStreamReady(recognizer, stream)) { | ||
| 224 | + DecodeOnlineStream(recognizer, stream); | ||
| 225 | + } | ||
| 226 | + | ||
| 227 | + | ||
| 228 | + if (IsEndpoint(recognizer, stream)) { | ||
| 229 | + SherpaOnnxOnlineRecognizerResult *r = | ||
| 230 | + GetOnlineStreamResult(recognizer, stream); | ||
| 231 | + if (strlen(r->text)) { | ||
| 232 | + fprintf(stderr, "%s\n", r->text); | ||
| 233 | + } | ||
| 234 | + DestroyOnlineRecognizerResult(r); | ||
| 235 | + } | ||
| 236 | + nb_samples = 0; | ||
| 237 | + } | ||
| 238 | + | ||
| 239 | + for (int i = 0; i < frame->nb_samples; i++) { | ||
| 240 | + samples[nb_samples++] = p[i] / 32768.; | ||
| 241 | + } | ||
| 242 | +} | ||
| 243 | + | ||
| 244 | +static inline char *__av_err2str(int errnum) | ||
| 245 | +{ | ||
| 246 | + static char str[AV_ERROR_MAX_STRING_SIZE]; | ||
| 247 | + memset(str, 0, sizeof(str)); | ||
| 248 | + return av_make_error_string(str, AV_ERROR_MAX_STRING_SIZE, errnum); | ||
| 249 | +} | ||
| 250 | + | ||
| 251 | +int main(int argc, char **argv) | ||
| 252 | +{ | ||
| 253 | + int ret; | ||
| 254 | + int num_threads = 4; | ||
| 255 | + AVPacket *packet = av_packet_alloc(); | ||
| 256 | + AVFrame *frame = av_frame_alloc(); | ||
| 257 | + AVFrame *filt_frame = av_frame_alloc(); | ||
| 258 | + const char *kUsage = | ||
| 259 | + "\n" | ||
| 260 | + "Usage:\n" | ||
| 261 | + " ./sherpa-onnx-ffmpeg \\\n" | ||
| 262 | + " /path/to/tokens.txt \\\n" | ||
| 263 | + " /path/to/encoder.onnx\\\n" | ||
| 264 | + " /path/to/decoder.onnx\\\n" | ||
| 265 | + " /path/to/joiner.onnx\\\n" | ||
| 266 | + " /path/to/foo.wav [num_threads]" | ||
| 267 | + "\n\n" | ||
| 268 | + "Please refer to \n" | ||
| 269 | + "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html\n" | ||
| 270 | + "for a list of pre-trained models to download.\n"; | ||
| 271 | + | ||
| 272 | + | ||
| 273 | + if (!packet || !frame || !filt_frame) { | ||
| 274 | + fprintf(stderr, "Could not allocate frame or packet\n"); | ||
| 275 | + exit(1); | ||
| 276 | + } | ||
| 277 | + | ||
| 278 | + if (argc < 6 || argc > 7) { | ||
| 279 | + fprintf(stderr, "%s\n", kUsage); | ||
| 280 | + return -1; | ||
| 281 | + } | ||
| 282 | + | ||
| 283 | + SherpaOnnxOnlineRecognizerConfig config; | ||
| 284 | + config.model_config.tokens = argv[1]; | ||
| 285 | + config.model_config.encoder = argv[2]; | ||
| 286 | + config.model_config.decoder = argv[3]; | ||
| 287 | + config.model_config.joiner = argv[4]; | ||
| 288 | + | ||
| 289 | + if (argc == 7 && atoi(argv[6]) > 0) { | ||
| 290 | + num_threads = atoi(argv[6]); | ||
| 291 | + } | ||
| 292 | + config.model_config.num_threads = num_threads; | ||
| 293 | + config.model_config.debug = 0; | ||
| 294 | + | ||
| 295 | + config.feat_config.sample_rate = 16000; | ||
| 296 | + config.feat_config.feature_dim = 80; | ||
| 297 | + | ||
| 298 | + config.enable_endpoint = 1; | ||
| 299 | + config.rule1_min_trailing_silence = 2.4; | ||
| 300 | + config.rule2_min_trailing_silence = 1.2; | ||
| 301 | + config.rule3_min_utterance_length = 300; | ||
| 302 | + | ||
| 303 | + SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&config); | ||
| 304 | + SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer); | ||
| 305 | + | ||
| 306 | + if ((ret = open_input_file(argv[5])) < 0) | ||
| 307 | + exit(1); | ||
| 308 | + | ||
| 309 | + if ((ret = init_filters(filter_descr)) < 0) | ||
| 310 | + exit(1); | ||
| 311 | + | ||
| 312 | + /* read all packets */ | ||
| 313 | + while (1) { | ||
| 314 | + if ((ret = av_read_frame(fmt_ctx, packet)) < 0) | ||
| 315 | + break; | ||
| 316 | + | ||
| 317 | + if (packet->stream_index == audio_stream_index) { | ||
| 318 | + ret = avcodec_send_packet(dec_ctx, packet); | ||
| 319 | + if (ret < 0) { | ||
| 320 | + av_log(NULL, AV_LOG_ERROR, "Error while sending a packet to the decoder\n"); | ||
| 321 | + break; | ||
| 322 | + } | ||
| 323 | + | ||
| 324 | + while (ret >= 0) { | ||
| 325 | + ret = avcodec_receive_frame(dec_ctx, frame); | ||
| 326 | + if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) { | ||
| 327 | + break; | ||
| 328 | + } else if (ret < 0) { | ||
| 329 | + av_log(NULL, AV_LOG_ERROR, "Error while receiving a frame from the decoder\n"); | ||
| 330 | + exit(1); | ||
| 331 | + } | ||
| 332 | + | ||
| 333 | + if (ret >= 0) { | ||
| 334 | + /* push the audio data from decoded frame into the filtergraph */ | ||
| 335 | + if (av_buffersrc_add_frame_flags(buffersrc_ctx, frame, AV_BUFFERSRC_FLAG_KEEP_REF) < 0) { | ||
| 336 | + av_log(NULL, AV_LOG_ERROR, "Error while feeding the audio filtergraph\n"); | ||
| 337 | + break; | ||
| 338 | + } | ||
| 339 | + | ||
| 340 | + /* pull filtered audio from the filtergraph */ | ||
| 341 | + while (1) { | ||
| 342 | + ret = av_buffersink_get_frame(buffersink_ctx, filt_frame); | ||
| 343 | + if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) | ||
| 344 | + break; | ||
| 345 | + if (ret < 0) | ||
| 346 | + exit(1); | ||
| 347 | + sherpa_decode_frame(filt_frame, recognizer, stream); | ||
| 348 | + av_frame_unref(filt_frame); | ||
| 349 | + } | ||
| 350 | + av_frame_unref(frame); | ||
| 351 | + } | ||
| 352 | + } | ||
| 353 | + } | ||
| 354 | + av_packet_unref(packet); | ||
| 355 | + } | ||
| 356 | + | ||
| 357 | + // add some tail padding | ||
| 358 | + float tail_paddings[4800] = {0}; // 0.3 seconds at 16 kHz sample rate | ||
| 359 | + AcceptWaveform(stream, 16000, tail_paddings, 4800); | ||
| 360 | + InputFinished(stream); | ||
| 361 | + | ||
| 362 | + while (IsOnlineStreamReady(recognizer, stream)) { | ||
| 363 | + DecodeOnlineStream(recognizer, stream); | ||
| 364 | + } | ||
| 365 | + | ||
| 366 | + SherpaOnnxOnlineRecognizerResult *r = | ||
| 367 | + GetOnlineStreamResult(recognizer, stream); | ||
| 368 | + if (strlen(r->text)) { | ||
| 369 | + fprintf(stderr, "%s\n", r->text); | ||
| 370 | + } | ||
| 371 | + | ||
| 372 | + DestroyOnlineRecognizerResult(r); | ||
| 373 | + | ||
| 374 | + DestoryOnlineStream(stream); | ||
| 375 | + DestroyOnlineRecognizer(recognizer); | ||
| 376 | + | ||
| 377 | + avfilter_graph_free(&filter_graph); | ||
| 378 | + avcodec_free_context(&dec_ctx); | ||
| 379 | + avformat_close_input(&fmt_ctx); | ||
| 380 | + av_packet_free(&packet); | ||
| 381 | + av_frame_free(&frame); | ||
| 382 | + av_frame_free(&filt_frame); | ||
| 383 | + | ||
| 384 | + if (ret < 0 && ret != AVERROR_EOF) { | ||
| 385 | + fprintf(stderr, "Error occurred: %s\n", __av_err2str(ret)); | ||
| 386 | + exit(1); | ||
| 387 | + } | ||
| 388 | + | ||
| 389 | + return 0; | ||
| 390 | +} |
-
请 注册 或 登录 后发表评论