文章目录
- 参考文章
- 技术栈
- 需求
- 解析发票类型
- 1. 最终项目结构
- 1.1 说明
- 2. 相关代码
- 2.1 导入相应的maven依赖
- 2.2 实体类
- 2.3 工具类
- 2.4 三层架构
- controller
- service
- mapper
参考文章
参考文章
技术栈
SpringBoot+Vue
需求
本文实现从 PDF 电子发票中提取部分字段(发票号码、开票日期、合计金额、合计税额、备注信息等),并将提取结果自动回填到前端页面的对应位置。
解析发票类型
1. 最终项目结构
新建一个Springboot项目
1.1 说明
- 实体 entity 包下
- 主要有两个实体,分别是发票实体NewInvoice,和发票备注实体Note
- 工具 utils 包下
- PDF类用于解析PDF文件
- InvoiceRegexEnum 类是发票信息的正则表达式枚举类,提取发票信息所用到的正则表达式都在这个枚举类中
2. 相关代码
2.1 导入相应的maven依赖
<!-- 添加依赖开始位置-->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.21</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>fontbox</artifactId>
    <version>2.0.21</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>jempbox</artifactId>
    <version>1.8.13</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>xmpbox</artifactId>
    <version>2.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>preflight</artifactId>
    <version>2.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-tools</artifactId>
    <version>2.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>5.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>5.0.0</version>
</dependency>
<dependency>
    <groupId>com.google.zxing</groupId>
    <artifactId>core</artifactId>
    <version>3.1.0</version>
</dependency>
<dependency>
    <groupId>com.google.zxing</groupId>
    <artifactId>javase</artifactId>
    <version>3.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.12.0</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.11.0</version>
</dependency>
<dependency>
    <groupId>commons-codec</groupId>
    <artifactId>commons-codec</artifactId>
</dependency>
<!-- 添加依赖结束位置-->
2.2 实体类
电子发票实体类NewInvoice
package com.example.pdf.entity;import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;import java.math.BigDecimal;/*** 电子发票实体类*/
@Data
@AllArgsConstructor
@NoArgsConstructor
public class NewInvoice {
// private String fileAbsolutePath; // 文件绝对路径private String number; // 发票号码private String date; // 开票日期
// private String sellerName; // 销售方名称private BigDecimal amount; // 合计金额private BigDecimal taxAmount; // 合计税额private BigDecimal totalAmount; // 价税合计金额private Note Detail; // 发票备注信息private String note; // 备注}
备注实体类Note
package com.example.pdf.entity;import lombok.Data;/*** 发票备注信息*/
/**
 * Structured remark ("备注") information of an invoice.
 *
 * <p>Lombok's {@code @Data} generates the accessors as well as
 * {@code equals}, {@code hashCode} and {@code toString}.
 */
@Data
public class Note {

    /** Buyer's bank. */
    private String buyer;

    /** Buyer's bank account number. */
    private String buyerAccount;

    /** Seller's bank. */
    private String seller;

    /** Seller's bank account number. */
    private String sellerAccount;

    /** Payee. */
    private String payee;

    /** Checker (reviewer). */
    private String checker;
}
2.3 工具类
正则表达式枚举类
package com.example.pdf.utils;/*** 正则表达式枚举类*/
public enum InvoiceRegexEnum {

    /** Machine number, invoice code, invoice number, issue date and checksum. */
    REGULAR_A("机器编号:(?<machineNumber>\\d{12})|发票代码:(?<code>\\d{12})|发票号码:(?<number>\\d{8})|:(?<date>\\d{4}年\\d{2}月\\d{2}日)|校验码:(?<checksum>\\d{20}|\\S{4,})"),

    /** New-style invoices: 20-digit invoice number, issue date and seller name. */
    REGULAR_A_NEW("发票号码:(?<number>\\d{20})|:(?<date>\\d{4}年\\d{2}月\\d{2}日)|(售名称|销名称):(?<name>\\S*)"),

    /** Backup pattern for the invoice number. */
    REGULAR_A_1("(国制|制普通发票)(?<number>\\d{8})"),

    /** Invoice number that is split across lines. */
    REGULAR_A_1R("发票号码:(?<number>\\d{7})[\\s\\S]*?(\\d+)"),

    /** Backup pattern for the issue date. */
    REGULAR_A_2("开票日期:(?<date>\\d{4}\\d{2}月\\d{2}日)"),

    /** Backup pattern for the invoice code. */
    REGULAR_A_3("发票代码(?<code>\\d{12})"),

    /** Invoice code that is split across lines. */
    REGULAR_A_3R("发票代码:(?<code>\\d{10})[\\s\\S]*?(\\d+)"),

    /** Amount and tax amount, for text shaped like "合计¥?amount¥?tax". */
    REGULAR_B("合计¥?(?<amount>[^ \\f\\n\\r\\t\\v*]*)(?:¥?(?<taxAmount>\\S*)|\\*+)\\s"),

    /** Amount only; used to correct a wrong match. */
    REGULAR_BR("合计¥(?<amount>\\d+\\.\\d+)"),

    /** Backup pattern for amount and tax amount. */
    REGULAR_B_1("合\\u0020*计\\u0020*¥?(?<amount>[^ ]*)\\u0020+¥?(?:(?<taxAmount>\\S*)|\\*+)\\s"),

    /** Total including tax, for text shaped like "价税合计(大写)XXX(小写)¥YYY". */
    REGULAR_C("价税合计\\u0028大写\\u0029(?<amountString>\\S*)\\u0028小写\\u0029¥?(?<amount>\\S*)\\s"),

    /** Payee, reviewer and drawer, for text shaped like "收款人:xxx复核:xxx开票人:xxx销售方". */
    REGULAR_D("收款人:(?<payee>\\S*)复核:(?<reviewer>\\S*)开票人:(?<drawer>\\S*)销售方"),

    /** Invoice type, for text shaped like "xxx通发票". */
    REGULAR_E("(?<p>\\S*)通发票"),

    /** Invoice type, for text shaped like "xxx用发票". */
    REGULAR_E_1("(?<p>\\S*)用发票"),

    /** Auxiliary pattern for invoice type extraction. */
    REGULAR_E_AUX("(?:国|统|一|发|票|监|制)"),

    /** Buyer information. */
    REGULAR_F("名称:(?<name>\\S*)|纳税人识别号:(?<code>\\S*)|地址、电话:(?<address>\\S*)|开户行及账号:(?<account>\\S*)|电子支付标识:(?<account2>\\S*)"),

    /** Seller name on Shenzhen invoices. */
    REGULAR_FR("名称:(?<name>\\S*)"),

    /** Text elements other than amount and tax amount. */
    REGULAR_G("^(-?\\d+)(\\.\\d+)?$"),

    /** Remark: buyer's bank. */
    REGULAR_A_NOTE_BUYER("购方开户银行:(?<buyer>[^;]+);"),

    /** Remark: buyer's bank account. */
    REGULAR_A_NOTE_BUYERACCOUNT("银行账号:(?<buyerAccount>\\d+)(?=[,;])"),

    /** Remark: seller's bank. */
    REGULAR_A_NOTE_SELLER("销方开户银行:(?<seller>.*?)(?=[,;]|\\Z)"),

    /** Remark: seller's bank account. */
    REGULAR_A_NOTE_SELLERACCOUNT("银行账号:(?<sellerAccount>\\d+)(?=[,;]|\\Z)"),

    /** Remark: payee. */
    REGULAR_A_NOTE_PAYEE("收款人:(?<payee>.*?)(?=[,;]|\\Z)"),

    /** Remark: checker. */
    REGULAR_A_NOTE_CHECKER("复核人:(?<checker>.*?)(?=[,;]|\\Z)"),

    /** Checks whether a detail-item string meets a specific condition. */
    REGULAR_H("\\S+\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*"),

    /** Detail-item check, variant 1 (same expression as {@link #REGULAR_H_3}). */
    REGULAR_H_1("^ *\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*"),

    /** Detail-item check, variant 2. */
    REGULAR_H_2("\\S+\\d+%[\\-\\d]+\\S*"),

    /** Detail-item check, variant 3 (same expression as {@link #REGULAR_H_1}). */
    REGULAR_H_3("^ *\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*");

    /** Regular expression source text carried by this constant. */
    private final String expression;

    InvoiceRegexEnum(String expression) {
        this.expression = expression;
    }

    /**
     * Returns the regular expression source of this constant.
     *
     * @return the pattern string, suitable for {@code Pattern.compile}
     */
    public String getRegex() {
        return this.expression;
    }
}
解析PDF工具类
package com.example.pdf.utils;import com.example.pdf.entity.NewInvoice;
import com.example.pdf.entity.Note;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.web.multipart.MultipartFile;import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.math.BigDecimal;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;import static com.example.pdf.utils.InvoiceRegexEnum.*;@Slf4j
public class PDF {private static final String PDF_EXTENSION = ".pdf";/*** 调用该方法将前端接受到的文件暂存** @param file*/public static NewInvoice parsePdfFile(MultipartFile file) {NewInvoice newInvoice = new NewInvoice();try {// 创建一个临时文件Path tempFile = Files.createTempFile("tempPrefix", ".pdf");File tempFilePath = tempFile.toFile();// 将MultipartFile的内容写入到临时文件try (FileOutputStream fos = new FileOutputStream(tempFilePath)) {fos.write(file.getBytes());}// 现在你可以使用临时文件的路径来调用你的解析方法newInvoice = extractInvoiceInformation(tempFilePath.getAbsolutePath());// 删除临时文件,或者在某些情况下保留它tempFilePath.delete();} catch (IOException e) {// 处理异常e.printStackTrace();}// 返回值return newInvoice;}/*** 提取发票信息*//*** 提取发票信息** @param filePath 发票地址* @return*/public static NewInvoice extractInvoiceInformation(String filePath) {// 指定要处理的文件夹路径NewInvoice newInvoice1 = newPdfProcessInvoicesInFile(filePath);String note = "";if (newInvoice1.getDetail().getBuyer() != null) {note = "购方开户银行:" + newInvoice1.getDetail().getBuyer() + "; ";} else {note = "";}if (newInvoice1.getDetail().getBuyerAccount() != null) {note = note + "银行账号:" + newInvoice1.getDetail().getBuyerAccount() + "; ";} else {note = note + "";}if (newInvoice1.getDetail().getSeller() != null) {note = note + "销方开户银行:" + newInvoice1.getDetail().getSeller() + "; ";} else {note = note + "";}if (newInvoice1.getDetail().getSellerAccount() != null) {note = note + "银行账号:" + newInvoice1.getDetail().getSellerAccount() + "; ";} else {note = note + "";}if (newInvoice1.getDetail().getPayee() != null) {note = note + " 收款人:" + newInvoice1.getDetail().getPayee() + ";" + " ";} else {note = note + "";}if (newInvoice1.getDetail().getChecker() != null) {note = note + " 复核人:" + newInvoice1.getDetail().getChecker() + ";";} else {note = note + "";}newInvoice1.setNote(note);return newInvoice1;}/*** 处理指定的PDF发票文件** @param filePath 文件路径* @return 包含提取信息的 NewInvoice 列表*/public static NewInvoice newPdfProcessInvoicesInFile(String filePath) {File file = new File(filePath);NewInvoice returnResult 
= new NewInvoice();if (isPdfFile(file)) {NewInvoice result = extractInvoice(file.getAbsolutePath()); // 提取文件内容if (result != null) {returnResult = createProcessedInvoice(result);// 创建一个新的发票对象} else {handleExtractionError(file);}}return returnResult;}/*** 创建一个处理后的 NewInvoice 对象** @param result 原始的 NewInvoice 对象* @return 处理后的 NewInvoice 对象*/private static NewInvoice createProcessedInvoice(NewInvoice result) {NewInvoice returnResult = new NewInvoice();returnResult.setNumber(result.getNumber());returnResult.setDate(result.getDate());returnResult.setTotalAmount(result.getTotalAmount());returnResult.setAmount(result.getAmount());returnResult.setTaxAmount(result.getTaxAmount());returnResult.setDetail(result.getDetail());return returnResult;}/*** 处理提取失败的情况,输出错误信息** @param file 提取失败的文件*/private static void handleExtractionError(File file) {log.warn("文件: {}\t提取失败~~~\n", file.getName());}/*** 检查文件是否为PDF文件** @param file 要检查的文件* @return 如果是PDF文件,返回 true,否则返回 false*/private static boolean isPdfFile(File file) {return file.isFile() && file.getName().toLowerCase().endsWith(PDF_EXTENSION);}/*** 从本地文件或URL中提取发票信息。** @param filePath 本地文件路径或发票的URL。* @return 包含提取信息的 NewInvoice 对象。*/private static NewInvoice extractInvoice(String filePath) {File sourceFile = new File(filePath);if (!sourceFile.exists()) {log.error("指定的源文件不存在");}NewInvoice result = null;try {// 调用函数解析PDF ,返回 发票对象【基本信息】result = extract(sourceFile);} catch (Exception e) {e.printStackTrace();result = new NewInvoice();}return result;}/*** 解析PDF 文件,返回发票对象** @param file PDF文件* @return* @throws IOException*/public static NewInvoice extract(File file) throws IOException {NewInvoice invoice = new NewInvoice(); // 新建发票对象// 接收一个表示 PDF 文件路径的字符串作为参数,并返回一个 PDDocument 对象。// 这个对象代表了整个PDF 文档,可以通过这个对象来访问文档的各个部分PDDocument doc = PDDocument.load(file);// 从 PDDocument 对象 doc 中获取第一页,并将这个页面对象赋值给PDPage类型的变量// PDPage 对象代表了文档中的一个页面PDPage firstPage = doc.getPage(0);// 获取页面裁剪框宽度,并将宽度四舍五入为整数// 【页面裁剪宽度定义了页面上用于显示内容的区域】int pageWidth = 
Math.round(firstPage.getCropBox().getWidth());// PDFTextStripper 用于从PDF文档中提取文本的工具PDFTextStripper textStripper = new PDFTextStripper(); // 创建一个实例textStripper.setSortByPosition(true); // 提取文本时按照物理位置进行排序// 提取整个文档的所有文本内容,并将这些文本内容作为一个长字符串返回String fullText = textStripper.getText(doc);// 页面翻转? 不重要if (firstPage.getRotation() != 0) {pageWidth = Math.round(firstPage.getCropBox().getHeight());}// 处理文本中可能有错误的符号String allText = replace(fullText).replaceAll("(", "(").replaceAll(")", ")").replaceAll("¥", "¥");// 提取 新版发票的机器编码、发票代码、发票号码、开票日期和检验码{Pattern pattern = Pattern.compile(REGULAR_A_NEW.getRegex()); // 新版发票的机器编码、发票代码、发票号码、开票日期和检验码的提取正则Pattern patternNumber = Pattern.compile(REGULAR_A_1.getRegex());// 发票号码备用提取正则Pattern patternDate = Pattern.compile(REGULAR_A_2.getRegex()); // 开票日期备用提取正则// matcer 类对于输入字符串进行解释和匹配操作,这些操作是基于某个Pattern对象定义的规则(正则表达式)进行的。// 检查allText 字符串中是否匹配pattern中定义的正则表达式的文本Matcher matcher = pattern.matcher(allText);while (matcher.find()) {// 在输入字符串allText中查找与模式匹配的第一个子序列// 如果 提取到发票号码,则设置发票号码if (matcher.group("number") != null) {invoice.setNumber(matcher.group("number"));} else if (matcher.group("date") != null) {String rawDate = matcher.group("date"); // 发票日期,解析日期并设置日期try {SimpleDateFormat inputDateFormat = new SimpleDateFormat("yyyy年MM月dd日");SimpleDateFormat outputDateFormat = new SimpleDateFormat("yyyy-MM-dd");Date parsedDate = inputDateFormat.parse(rawDate);String formattedDate = outputDateFormat.format(parsedDate);invoice.setDate(formattedDate);} catch (ParseException e) {System.out.println("无法解析日期:" + rawDate);}}// 如果没有提取到的话使用备用在进行提取if (matcher.group("number") == null) {Matcher matcher2 = patternNumber.matcher(allText);if (matcher2.find()) {invoice.setNumber(matcher2.group("number"));}}if (matcher.group("date") == null) {Matcher matcher3 = patternDate.matcher(allText);if (matcher3.find()) {String rawDate = matcher3.group("date");try {SimpleDateFormat inputDateFormat = new SimpleDateFormat("yyyyMM月dd日");SimpleDateFormat outputDateFormat = new 
SimpleDateFormat("yyyy-MM-dd");Date parsedDate = inputDateFormat.parse(rawDate);String formattedDate = outputDateFormat.format(parsedDate);invoice.setDate(formattedDate);} catch (Exception e) {System.out.println("无法解析日期:" + rawDate);}}}}}// 提取 金额、税额等{Pattern pattern = Pattern.compile(REGULAR_B.getRegex()); // 金额、税额提取正则,匹配形如“合计¥?金额¥?税额”的文本Matcher matcher = pattern.matcher(allText);if (matcher.find()) {try {invoice.setAmount(new BigDecimal(matcher.group("amount")));} catch (Exception e) {// 不处理}try {invoice.setTaxAmount(new BigDecimal(matcher.group("taxAmount")));} catch (Exception e) {invoice.setTaxAmount(new BigDecimal(0));}}}// 如果没有提取到,则再使用备用的正则进行提取if (null == invoice.getAmount()) {Pattern pattern = Pattern.compile(REGULAR_B_1.getRegex());Matcher matcher = pattern.matcher(fullText);if (matcher.find()) {try {invoice.setAmount(new BigDecimal(matcher.group("amount")));} catch (Exception e) {invoice.setAmount(new BigDecimal(0));}try {invoice.setTaxAmount(new BigDecimal(matcher.group("taxAmount")));} catch (Exception e) {invoice.setTaxAmount(new BigDecimal(0));}}}invoice.setTotalAmount(invoice.getAmount().add(invoice.getTaxAmount()));// 先创建一个发票备注实例Note note = new Note();// 提取发票备注信息{// 提取购方开户银行Pattern patternBuyer = Pattern.compile(REGULAR_A_NOTE_BUYER.getRegex()); // 提取备注信息Pattern patternBuyerAccount = Pattern.compile(REGULAR_A_NOTE_BUYERACCOUNT.getRegex()); // 提取备注信息Pattern patternSeller = Pattern.compile(REGULAR_A_NOTE_SELLER.getRegex()); // 提取备注信息Pattern patternSellerAccount = Pattern.compile(REGULAR_A_NOTE_SELLERACCOUNT.getRegex()); // 提取备注信息Pattern patternPayee = Pattern.compile(REGULAR_A_NOTE_PAYEE.getRegex()); // 提取备注信息Pattern patternChecker = Pattern.compile(REGULAR_A_NOTE_CHECKER.getRegex()); // 提取备注信息Matcher matcher0 = patternBuyer.matcher(allText);if (matcher0.find()) {// 如果查询到的话就设置备注信息try {note.setBuyer(new String(matcher0.group("buyer")));} catch (Exception e) {// 不处理}}Matcher matcher1 = patternBuyerAccount.matcher(allText);if (matcher1.find()) {// 
如果查询到的话就设置备注信息try {note.setBuyerAccount(new String(matcher1.group("buyerAccount")));} catch (Exception e) {// 不处理}}Matcher matcher2 = patternSeller.matcher(allText);if (matcher2.find()) {// 如果查询到的话就设置备注信息try {note.setSeller(new String(matcher2.group("seller")));} catch (Exception e) {// 不处理}}Matcher matcher3 = patternSellerAccount.matcher(allText);if (matcher3.find()) {// 如果查询到的话就设置备注信息try {note.setSellerAccount(new String(matcher3.group("sellerAccount")));} catch (Exception e) {// 不处理}}Matcher matcher4 = patternPayee.matcher(allText);if (matcher4.find()) {// 如果查询到的话就设置备注信息try {note.setPayee(new String(matcher4.group("payee")));} catch (Exception e) {// 不处理}}Matcher matcher5 = patternChecker.matcher(allText);if (matcher5.find()) {// 如果查询到的话就设置备注信息try {note.setChecker(new String(matcher5.group("checker")));} catch (Exception e) {// 不处理}}}invoice.setDetail(note);return invoice;}/*** 替换字符串中的空格、全角空格、冒号和特殊空白字符为标准字符。** @param str 要进行替换的字符串* @return 替换后的字符串*/private static String replace(String str) {return str.replaceAll(" ", "").replaceAll(" ", "").replaceAll(":", ":").replaceAll(" ", "");}}
2.4 三层架构
controller
package com.example.pdf.controller;import com.example.pdf.entity.NewInvoice;
import com.example.pdf.service.InvoiceService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;@RestController
@RequestMapping("/invoice")
public class InvoiceController {@AutowiredInvoiceService invoiceService;/*** @param*/
// @PostMapping
// public void insert() {
// invoiceService.save();
// }@CrossOrigin(origins = "http://localhost:8081", allowedHeaders = "*", allowCredentials = "true")@PostMapping("/upload")public ResponseEntity<Object> uploadFile(@RequestParam("file") MultipartFile file) {try {// 调用你的文件解析服务NewInvoice parsedData = invoiceService.parsePdfFile(file);// 返回解析后的数据return ResponseEntity.ok(parsedData);} catch (Exception e) {return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("Error parsing file");}}}
service
service
package com.example.pdf.service;import com.example.pdf.entity.NewInvoice;
import org.springframework.web.multipart.MultipartFile;public interface InvoiceService {
// void save();NewInvoice parsePdfFile(MultipartFile file);
}
serviceImpl
package com.example.pdf.service.impl;import com.example.pdf.entity.NewInvoice;
import com.example.pdf.mapper.InvoiceMapper;
import com.example.pdf.service.InvoiceService;
import com.example.pdf.utils.PDF;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;@Service
public class InvoiceServiceImpl implements InvoiceService {@AutowiredInvoiceMapper invoiceMapper;
//
// @Override
// public void save(){
// //获得发票对象
// InvoiceSubset invoiceSubset = extractInvoiceInformation("D:\\00-sqq\\idea\\test\\dzfp_24952000000116465179_深圳必维华法商品检定有限公司东莞分公司_20240726105216.pdf");
// //调用mapper将发票对象存入到数据库中
// invoiceMapper.save(invoiceSubset);
// }/*** 调用解析文件的方法,解析上传的文件* @param file* @return*/@Overridepublic NewInvoice parsePdfFile(MultipartFile file) {NewInvoice newInvoice = PDF.parsePdfFile(file);return newInvoice;}
}
mapper
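因为没有操作数据库,所以没有使用到 mapper 层。注意:上面的 InvoiceServiceImpl 中仍然通过 @Autowired 注入了 InvoiceMapper,如果项目中没有创建该接口,需要删除对应的 import 和字段,否则项目无法编译或启动。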