需引入 javacv、vosk 相关依赖。
关于 javacv 依赖,网上有很多精简方案,下面被注释掉的部分就是一种可行的精简方案。从视频中提取音频无需另外安装 ffmpeg,只需引入依赖即可。而 vosk 需要先下载模型才能使用,模型下载比较慢,可先用小模型跑通。
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- javacv.version and system.windowsx64 are referenced only by the commented-out
         alternative below; the active javacv-platform dependency pins its own version. -->
    <javacv.version>1.5.6</javacv.version>
    <system.windowsx64>windows-x86_64</system.windowsx64>
</properties>

<!--
  Optional slimmed-down alternative to the javacv-platform dependency
  (uses the windows-x86_64 classifier only).

  javacv + javacpp core libraries:

  <dependency>
      <groupId>org.bytedeco</groupId>
      <artifactId>javacv</artifactId>
      <version>${javacv.version}</version>
  </dependency>
  <dependency>
      <groupId>org.bytedeco</groupId>
      <artifactId>javacpp-platform</artifactId>
      <version>${javacv.version}</version>
  </dependency>

  Minimal ffmpeg package; requires the javacv + javacpp core libraries above:

  <dependency>
      <groupId>org.bytedeco</groupId>
      <artifactId>ffmpeg</artifactId>
      <version>4.4-${javacv.version}</version>
      <classifier>${system.windowsx64}</classifier>
  </dependency>

  Minimal opencv package; requires the javacv + javacpp core libraries above:

  <dependency>
      <groupId>org.bytedeco</groupId>
      <artifactId>opencv</artifactId>
      <version>4.5.1-${javacv.version}</version>
      <classifier>${system.windowsx64}</classifier>
  </dependency>
  <dependency>
      <groupId>org.bytedeco</groupId>
      <artifactId>openblas</artifactId>
      <version>0.3.13-${javacv.version}</version>
      <classifier>${system.windowsx64}</classifier>
  </dependency>
  <dependency>
      <groupId>org.bytedeco</groupId>
      <artifactId>flycapture</artifactId>
      <version>2.13.3.31-${javacv.version}</version>
      <classifier>${system.windowsx64}</classifier>
  </dependency>
-->

<dependencies>
    <!-- Extract audio from video -->
    <dependency>
        <groupId>org.bytedeco</groupId>
        <artifactId>javacv-platform</artifactId>
        <version>1.5.10</version>
    </dependency>

    <!-- Read audio file information -->
    <dependency>
        <groupId>org</groupId>
        <artifactId>jaudiotagger</artifactId>
        <version>2.0.3</version>
    </dependency>

    <dependency>
        <groupId>net.java.dev.jna</groupId>
        <artifactId>jna</artifactId>
        <version>5.13.0</version>
    </dependency>

    <dependency>
        <groupId>com.alphacephei</groupId>
        <artifactId>vosk</artifactId>
        <version>0.3.45</version>
    </dependency>

    <!-- JAVE2 (Java Audio Video Encoder) is a Java wrapper around the ffmpeg project. -->
    <dependency>
        <groupId>ws.schild</groupId>
        <artifactId>jave-core</artifactId>
        <version>3.1.1</version>
    </dependency>

    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.83</version>
    </dependency>
</dependencies>
视频提取音频
package org.example;

import org.bytedeco.ffmpeg.global.avcodec;
import org.bytedeco.javacv.FFmpegFrameGrabber;
import org.bytedeco.javacv.FFmpegFrameRecorder;
import org.bytedeco.javacv.Frame;public class Test {public static void extractVoice(String sourceFileName, String audioUrl) throws FFmpegFrameGrabber.Exception, FFmpegFrameRecorder.Exception {//抓取资源FFmpegFrameGrabber frameGrabber = new FFmpegFrameGrabber(sourceFileName);Frame frame = null;FFmpegFrameRecorder recorder = null;frameGrabber.start();//转录为单轨, 16K采样率, wav格式recorder = new FFmpegFrameRecorder(audioUrl, frameGrabber.getAudioChannels());recorder.setFormat(frameGrabber.getFormat());recorder.setSampleRate(frameGrabber.getSampleRate());//frameGrabber.getSampleRate()//recorder.setAudioBitrate(128000);// 音频比特率recorder.setTimestamp(frameGrabber.getTimestamp());recorder.setVideoCodec(avcodec.AV_CODEC_ID_NONE); // 不录制视频recorder.start();int index = 0;while (true) {frame = frameGrabber.grabSamples();if (frame == null) break;if (frame.samples != null) {recorder.recordSamples(frame.sampleRate, frame.audioChannels, frame.samples);recorder.setTimestamp(frameGrabber.getTimestamp());}index++;}recorder.stop();recorder.release();frameGrabber.stop();frameGrabber.release();}public static void main(String[] args) throws FFmpegFrameGrabber.Exception, FFmpegFrameRecorder.Exception {String videoFilePath = "I:\\workspace\\test.mp4"; // 视频文件路径String audioOutputPath = "I:\\workspace\\test_audio.wav"; // 输出的音频文件路径long s = System.currentTimeMillis();extractVoice(videoFilePath, audioOutputPath);System.out.println(System.currentTimeMillis() - s);}}
音频提取文字
模型可到 VOSK Models 页面下载,解压后即可使用。大模型下载较慢。
VOSK Models:https://alphacephei.com/vosk/models
package org.example;

import com.alibaba.fastjson.JSON;
import org.vosk.LibVosk;
import org.vosk.LogLevel;
import org.vosk.Model;
import org.vosk.Recognizer;import javax.sound.sampled.*;
import java.io.*;
import java.util.Optional;

/**
 * Demo that transcribes a WAV file offline with Vosk.
 *
 * <p>{@link #main(String[])} also plays the audio through the default output line while it is
 * being recognized; {@link #main1(String[])} only recognizes.
 */
public class Test3 {

    /** Directory of the unpacked Vosk model used when no path is given on the command line. */
    private static final String DEFAULT_MODEL_PATH = "I:\\workspace\\vosk-model-small-cn-0.22";

    /** WAV file used when no path is given on the command line. */
    private static final String DEFAULT_AUDIO_PATH = "I:\\workspace\\test_audio.wav";

    /** Number of bytes handed to the recognizer per iteration. */
    private static final int CHUNK_SIZE = 4096;

    /**
     * Transcribes the audio file while playing it back, then prints the recognized text.
     *
     * @param args optional: {@code args[0]} model directory, {@code args[1]} WAV file; the
     *             built-in demo paths are used for any argument that is missing
     * @throws RuntimeException wrapping the I/O or audio-format error if the model or the
     *                          audio file cannot be read
     */
    public static void main(String[] args) {
        LibVosk.setLogLevel(LogLevel.DEBUG);
        try {
            String text = transcribe(
                    argOrDefault(args, 0, DEFAULT_MODEL_PATH),
                    argOrDefault(args, 1, DEFAULT_AUDIO_PATH),
                    true);
            System.out.println(text);
        } catch (IOException | UnsupportedAudioFileException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Extracts the recognized text from one Vosk JSON result.
     *
     * @param result JSON string returned by the recognizer
     * @return the value of {@code VoskResult.getText()}, or an empty string when the parsed
     *         object or its text is {@code null}
     */
    private static String getResult(String result) {
        VoskResult vr = JSON.parseObject(result, VoskResult.class);
        return Optional.ofNullable(vr).map(VoskResult::getText).orElse("");
    }

    /**
     * Transcribes the audio file without playback, then prints the recognized text.
     *
     * @param argv optional: {@code argv[0]} model directory, {@code argv[1]} WAV file; the
     *             built-in demo paths are used for any argument that is missing
     * @throws IOException                   if the model or the audio file cannot be read
     * @throws UnsupportedAudioFileException if the audio file is not a recognized audio format
     */
    public static void main1(String[] argv) throws IOException, UnsupportedAudioFileException {
        LibVosk.setLogLevel(LogLevel.DEBUG);
        String text = transcribe(
                argOrDefault(argv, 0, DEFAULT_MODEL_PATH),
                argOrDefault(argv, 1, DEFAULT_AUDIO_PATH),
                false);
        System.out.println(text);
    }

    /** Returns {@code args[index]} when present, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return args != null && args.length > index ? args[index] : fallback;
    }

    /**
     * Feeds the whole audio file to a Vosk recognizer and concatenates the recognized text.
     *
     * <p>The recognizer is created with the sample rate read from the file itself, and the
     * stream is down-mixed to mono 16-bit PCM first, so the bytes handed to Vosk match the
     * rate it was told about.
     *
     * @param modelPath directory of the unpacked Vosk model
     * @param audioPath audio file to transcribe
     * @param playBack  whether to also write the audio to the default output line
     */
    private static String transcribe(String modelPath, String audioPath, boolean playBack)
            throws IOException, UnsupportedAudioFileException {
        StringBuilder text = new StringBuilder();
        try (Model model = new Model(modelPath);
             AudioInputStream source = AudioSystem.getAudioInputStream(
                     new BufferedInputStream(new FileInputStream(audioPath)));
             AudioInputStream pcm = toMonoPcm(source);
             Recognizer recognizer = new Recognizer(model, pcm.getFormat().getSampleRate())) {

            SourceDataLine speakers = playBack ? openSpeakers(pcm.getFormat()) : null;
            try {
                byte[] chunk = new byte[CHUNK_SIZE];
                int read;
                // read() returns -1 at end of stream; stop there instead of passing it on.
                while ((read = pcm.read(chunk)) >= 0) {
                    if (speakers != null) {
                        speakers.write(chunk, 0, read);
                    }
                    if (recognizer.acceptWaveForm(chunk, read)) {
                        text.append(getResult(recognizer.getResult()));
                    } else {
                        // NOTE(review): kept from the original; presumably this contributes an
                        // empty string because partial results carry no "text" field. Confirm
                        // against VoskResult.
                        text.append(getResult(recognizer.getPartialResult()));
                    }
                }
                text.append(getResult(recognizer.getFinalResult()));
            } finally {
                if (speakers != null) {
                    speakers.drain();
                    speakers.close();
                }
            }
        }
        return text.toString();
    }

    /**
     * Returns {@code source} as mono, 16-bit, signed, little-endian PCM at its own sample
     * rate, converting only when the stream is not already in that format.
     *
     * @throws IllegalArgumentException if the installed audio converters cannot perform the
     *                                  conversion
     */
    private static AudioInputStream toMonoPcm(AudioInputStream source) {
        AudioFormat actual = source.getFormat();
        float rate = actual.getSampleRate();
        AudioFormat wanted =
                new AudioFormat(AudioFormat.Encoding.PCM_SIGNED, rate, 16, 1, 2, rate, false);
        return actual.matches(wanted) ? source : AudioSystem.getAudioInputStream(wanted, source);
    }

    /**
     * Opens and starts an output line for {@code format}.
     *
     * @return the started line, or {@code null} when no suitable output line is available;
     *         playback is optional, so the caller carries on without it
     */
    private static SourceDataLine openSpeakers(AudioFormat format) {
        try {
            DataLine.Info info = new DataLine.Info(SourceDataLine.class, format);
            SourceDataLine line = (SourceDataLine) AudioSystem.getLine(info);
            line.open(format);
            line.start();
            return line;
        } catch (LineUnavailableException | IllegalArgumentException e) {
            System.err.println("Audio playback unavailable, continuing without it: " + e);
            return null;
        }
    }
}
感谢网上各位大佬能分享这些信息
测试可行,识别率没有做过对比、大模型也没有试过。这里也就提供一种可行的离线解决方案。