Java实现pdf/word文字识别,调用OCR提取图片文字聚合

embedded/2024/11/9 17:08:40/
java">@Tag(name = "pdf/word/图片文字识别")
word">public word">class OcrController word">extends BaseController {@Autowiredword">private OcrService ocrService;@Autowiredword">private BaiduOcrServiceImpl baiduOcrService;/*** pdf/word文字识别** @param file* @return*/@PostMapping("/recognize-text")@Operation(summary = "pdf/word识别文字", description = "识别")word">public String recognizeText(@RequestParam("file") MultipartFile file) {word">return ocrService.recognizeText(file);}}
java">word">package com.jt.console.service.impl;word">import com.jt.common.beans.ServiceAssert;
word">import com.jt.console.service.OcrService;
word">import org.apache.pdfbox.cos.COSName;
word">import org.apache.pdfbox.pdmodel.PDDocument;
word">import org.apache.pdfbox.pdmodel.PDPage;
word">import org.apache.pdfbox.pdmodel.PDPageTree;
word">import org.apache.pdfbox.pdmodel.PDResources;
word">import org.apache.pdfbox.pdmodel.graphics.PDXObject;
word">import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
word">import org.apache.pdfbox.text.PDFTextStripper;
word">import org.apache.poi.hwpf.HWPFDocument;
word">import org.apache.poi.hwpf.extractor.WordExtractor;
word">import org.apache.poi.openxml4j.util.ZipSecureFile;
word">import org.apache.poi.xwpf.usermodel.*;
word">import org.springframework.beans.factory.annotation.Autowired;
word">import org.springframework.stereotype.Service;
word">import org.springframework.web.multipart.MultipartFile;word">import javax.imageio.ImageIO;
word">import java.awt.image.BufferedImage;
word">import java.io.File;
word">import java.io.FileOutputStream;
word">import java.io.IOException;
word">import java.io.InputStream;
word">import java.net.URLEncoder;
word">import java.util.Base64;word">import word">static com.jt.console.service.impl.BaiduOcrServiceImpl.formatOcrResult;/*** pdf/word/图片识别* @author chenchao* @date 2024/8/12 16:17*/
@Service
word">public word">class OcrServiceImpl word">implements OcrService {@Autowiredword">private BaiduOcrServiceImpl baiduOcrService;/*** 对于一些表格和公式的处理会有识别错乱问题* 识别上传文件中的文本内容* @param file 上传的文件* @return 提取的文本内容或错误信息*/@Overrideword">public String recognizeText(MultipartFile file) {String contentType = file.getContentType();word">if (contentType == word">null) {ServiceAssert.isTrue(false, "文件类型不支持");word">return word">null;}InputStream inputStream = word">null;word">try {inputStream = file.getInputStream();word">if (contentType.equals("application/pdf")) {word">return extractTextFromPdf(inputStream);} word">else word">if (contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document") ||contentType.equals("application/x-tika-ooxml")) {word">return extractTextFromDocx(inputStream);} word">else word">if (contentType.equals("application/msword")) {word">return extractTextFromDoc(inputStream);} word">else {ServiceAssert.isTrue(false, "不支持的文件类型");word">return word">null;}} word">catch (Exception e) {e.printStackTrace();ServiceAssert.isTrue(false, "处理文件出错");word">return word">null;} word">finally {word">if (inputStream != word">null) {word">try {inputStream.close();} word">catch (IOException e) {e.printStackTrace();}}}}/*** 从 PDF 文件中提取文本内容* @param inputStream PDF 文件的输入流* @return 提取的文本内容* @throws IOException 读取文件时发生的异常*/word">private String extractTextFromPdf(InputStream inputStream) word">throws IOException {StringBuilder text = word">new StringBuilder();word">try (PDDocument document = PDDocument.load(inputStream)) {// 禁止显示与 CMap 表相关的特定警告System.setProperty("org.apache.pdfbox.logging.SILENT", "true");PDFTextStripper pdfStripper = word">new PDFTextStripper();text.append(pdfStripper.getText(document));// 如果您需要从 PDF 中提取图像,请取消注释下面的行// extractImagesFromPdf(document);}word">return text.toString();}/*** 从 DOCX 文件中提取文本内容* @param inputStream DOCX 文件的输入流* @return 提取的文本内容* @throws IOException 读取文件时发生的异常*/word">private String extractTextFromDocx(InputStream 
inputStream) word">throws IOException {StringBuilder text = word">new StringBuilder();ZipSecureFile.setMinInflateRatio(0.001); // For safetyword">try (XWPFDocument document = word">new XWPFDocument(inputStream)) {// Extract text from paragraphsdocument.getParagraphs().forEach(paragraph -> text.append(paragraph.getText()).append("\n"));// Extract text from tablesword">for (XWPFTable table : document.getTables()) {word">for (XWPFTableRow row : table.getRows()) {word">for (XWPFTableCell cell : row.getTableCells()) {text.append(cell.getText()).append("\t");}text.append("\n");}}// 如果您需要从 DOCX 中提取图像,请取消注释下面的行// extractImagesFromDocx(document);}word">return text.toString();}/*** 从 DOC 文件中提取文本内容* @param inputStream DOC 文件的输入流* @return 提取的文本内容* @throws IOException 读取文件时发生的异常*/word">private String extractTextFromDoc(InputStream inputStream) word">throws IOException {StringBuilder text = word">new StringBuilder();word">try (HWPFDocument document = word">new HWPFDocument(inputStream)) {WordExtractor extractor = word">new WordExtractor(document);String[] paragraphs = extractor.getParagraphText();word">for (String paragraph : paragraphs) {text.append(paragraph).append("\n");}}word">return text.toString();}/*** 从 PDF 文件中提取图片* @param document PDF 文档对象* @throws IOException 读取文件时发生的异常*/word">private word">void extractImagesFromPdf(PDDocument document) word">throws IOException {PDPageTree pages = document.getPages();word">int imageCounter = 0;word">for (PDPage page : pages) {PDResources resources = page.getResources();word">for (COSName xObjectName : resources.getXObjectNames()) {PDXObject xObject = resources.getXObject(xObjectName);word">if (xObject word">instanceof PDImageXObject) {PDImageXObject image = (PDImageXObject) xObject;BufferedImage bufferedImage = image.getImage();// Save image to fileFile imageFile = word">new File("image" + (++imageCounter) + ".png");word">try (FileOutputStream fos = word">new FileOutputStream(imageFile)) {ImageIO.write(bufferedImage, "PNG", 
fos);}}}}}/*** 从 DOCX 文件中提取图片* @param document DOCX 文档对象* @throws IOException 读取文件时发生的异常*/word">public String extractImagesFromDocx(XWPFDocument document, word">boolean urlEncode) word">throws IOException {StringBuilder recognitionResults = word">new StringBuilder();word">int imageCounter = 0;word">for (XWPFPictureData pictureData : document.getAllPictures()) {word">byte[] bytes = pictureData.getData();// 将图片数据转换为 Base64 编码String base64Image = Base64.getEncoder().encodeToString(bytes);// 如果需要 URL 编码word">if (urlEncode) {base64Image = URLEncoder.encode(base64Image, "utf-8");}// 识别图片String ocrResult = baiduOcrService.recognizeImage(base64Image);String formattedResult = formatOcrResult(ocrResult);recognitionResults.append("Image ").append(++imageCounter).append(": ").append(formattedResult).append("\n");}word">return recognitionResults.toString();}}
java">word">package com.jt.console.service.impl;word">import com.alibaba.fastjson.JSON;
word">import com.alibaba.fastjson.JSONObject;
word">import com.jt.common.beans.ServiceAssert;
word">import okhttp3.*;
word">import org.springframework.beans.factory.annotation.Value;
word">import org.springframework.stereotype.Service;
word">import org.springframework.web.multipart.MultipartFile;word">import java.io.IOException;
word">import java.net.URLEncoder;
word">import java.util.Base64;
word">import java.util.List;
word">import java.util.Arrays;/*** 百度OCR识别实现类*/
@Service("baiduOcrServiceImpl")
word">public word">class BaiduOcrServiceImpl {@Value("${baidu.ocr.apiKey}")word">private String API_KEY;  // 客户端id@Value("${baidu.ocr.secretKey}")word">private String SECRET_KEY; // 客户端秘钥// 支持的图片格式列表word">private word">static word">final List<String> SUPPORTED_FORMATS = Arrays.asList("png", "jpg", "jpeg", "bmp", "gif");// 构建 OkHttpClient 实例word">private word">static word">final OkHttpClient HTTP_CLIENT = word">new OkHttpClient().newBuilder().build();// 获取 Access Tokenword">private String getAccessToken() word">throws IOException {MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded");RequestBody body = RequestBody.create(mediaType, "grant_type=client_credentials&client_id=" + API_KEY+ "&client_secret=" + SECRET_KEY);Request request = word">new Request.Builder().url("https://aip.baidubce.com/oauth/2.0/token").method("POST", body).addHeader("Content-Type", "application/x-www-form-urlencoded").build();Response response = HTTP_CLIENT.newCall(request).execute();word">if (!response.isSuccessful()) {//throw new IOException("Unexpected code " + response);// 自定义提示信息String errorMessage = "OCR request failed. 
Status code: " + response.code() + ", Message: " + response.message();ServiceAssert.isTrue(false, errorMessage);}String responseBody = response.body().string();JSONObject jsonObject = JSON.parseObject(responseBody);word">return jsonObject.getString("access_token");}// 调用 OCR 接口,返回结果word">public String recognizeImage(String base64Image) word">throws IOException {MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded");RequestBody body = RequestBody.create(mediaType, "image=" + base64Image + "&detect_direction=false&paragraph=false&probability=false");Request request = word">new Request.Builder().url("https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic?access_token=" + getAccessToken()).method("POST", body).addHeader("Content-Type", "application/x-www-form-urlencoded").addHeader("Accept", "application/json").build();word">try (Response response = HTTP_CLIENT.newCall(request).execute()) {word">if (!response.isSuccessful()) {//throw new IOException("Unexpected code " + response);// 自定义提示信息String errorMessage = "Failed to obtain access token. 
Status code: " + response.code() + ", Message: " + response.message();ServiceAssert.isTrue(false, errorMessage);}word">return formatOcrResult(response.body().string());}}// 将 MultipartFile 转换为 Base64 编码word">public String convertToBase64(MultipartFile file, word">boolean urlEncode) word">throws IOException {// 检查图片格式String filename = file.getOriginalFilename();word">if (filename == word">null) {ServiceAssert.isTrue(false, "文件名为空");}String extension = filename.substring(filename.lastIndexOf('.') + 1).toLowerCase();word">if (!SUPPORTED_FORMATS.contains(extension)) {ServiceAssert.isTrue(false, "不支持的图片格式: " + extension);}// 从 MultipartFile 中获取字节数组word">byte[] bytes = file.getBytes();// 将字节数组编码为 Base64 字符串String base64 = Base64.getEncoder().encodeToString(bytes);// 如果需要 URL 编码word">if (urlEncode) {base64 = URLEncoder.encode(base64, "utf-8");}word">return base64;}//组装返回OCR识别的结果word">public word">static String formatOcrResult(String ocrResult) {StringBuilder resultText = word">new StringBuilder();word">try {// 解析 OCR 结果JSONObject jsonObject = JSON.parseObject(ocrResult);// 检查是否包含 words_result 数组word">if (jsonObject.containsKey("words_result")) {word">var wordsResult = jsonObject.getJSONArray("words_result");word">if (wordsResult != word">null && !wordsResult.isEmpty()) {word">for (word">int i = 0; i < wordsResult.size(); i++) {JSONObject wordObject = wordsResult.getJSONObject(i);String word = wordObject.getString("words");word">if (word != word">null && !word.isEmpty()) {resultText.append(word).append(" ");}}} word">else {// 如果没有识别到文字,直接返回空值word">return "";}} word">else {// OCR 结果中不包含 words_result,也返回空值word">return "";}} word">catch (Exception e) {ServiceAssert.isTrue(false,e.getMessage());//resultText.append("处理 OCR 结果时出错:").append(e.getMessage());}word">return resultText.toString().trim();}
}

http://www.ppmy.cn/embedded/94887.html

相关文章

鸿蒙(API 12 Beta3版)【音频编码】

开发者可以调用本模块的Native API接口,完成音频编码,即将音频PCM编码压缩成不同的格式。 接口不限制PCM数据的来源,开发者可以调用麦克风录制获取、也可以导入编辑后的PCM数据,通过音频编码,输出对应格式的码流…

小邦教你5种方法能在面试中加分

秋招进行到现阶段,相信很多同学都已经进入了面试环节。 面试如何去准备,以及面试过程中如何去应对,大家肯定都搜过相关面经,都知道该怎么做。或者参加了好几场面试,也都有一定的经验了。 小邦来跟大家讲讲面试…

[Java]面向对象-抽象类/方法接口适配器设计模式

抽象类 一个类中如果存在抽象方法,那么该类就必须声明为抽象类 定义格式: 如果一个类包含抽象方法,那么该类必须是抽象类。注意:抽象类不一定有抽象方法,但是有抽象方法的类必须定义成抽象类。 abstract class 类名…

LeetCode - 54 - 螺旋矩阵

力扣54题 题目描述: 给你一个 m 行 n 列的矩阵 matrix ,请按照 顺时针螺旋顺序 ,返回矩阵中的所有元素。 题解思路: 54题和59题 螺旋矩阵Ⅱ 有些微区别,59是 n×n 的方形矩阵,但是54需要考虑行和列不相等的情况…

springmail发送邮件如何实现邮件动态内容?

springmail发送邮件怎么样?springmail发信优化方法? SpringMail作为一个强大的邮件发送框架,提供了多种方式来实现邮件内容的动态生成。AokSend将探讨如何通过SpringMail发送邮件,并动态生成邮件内容以满足不同的需求。 springm…

Unity安卓IOS根据不同国家语言显示不同的APP名字

安卓篇 把res文件放在Plugins下,然后修改strings.xml里的app名字即可; 如果需要别的国家,增加文件夹即可 IOS篇 Info.plist中增加Boolean类型的Application has localized display name,值为YES 然后把多语言放在ATT弹窗的多语言里面 CFBun…

haproxy整理

haproxy 1.1 haproxy简介 HAProxy是法国开发者 威利塔罗(Willy Tarreau) 在2000年使用C语言开发的一个开源软件 是一款具备高并发(万级以上)、高性能的TCP和HTTP负载均衡器,支持基于cookie的持久性,自动故障切换,支持正则表达式及web状态统…

MyBatis-Plus 提供的一个通用服务层实现类

一、代码示例 @Service public class CarriageServiceImpl extends ServiceImpl<CarriageMapper, CarriageEntity> implements CarriageService { @Override public List<CarriageDTO> findAll() { return List.of(); } } 在这段代码中,CarriageServiceImpl …