引擎切换pdf识别简历分析

ops/2024/10/20 12:39:29/

文章目录

    • 1.EasyCode生成interview_history的crud
        • 1.在模板设置中手动指定逻辑删除的值
        • 2.生成代码,进行测试
    • 2.PDF识别关键字
        • 1.引入依赖
        • 2.代码概览
        • 3.PDFUtil.java
        • 4.keyword
    • 3.策略模式实现引擎切换&简历分析
        • 1.req和vo
        • 2.策略模式准备
          • 1.引擎策略枚举 EngineEnum.java
          • 2.引擎策略能力接口 InterviewEngine.java
          • 3.本地引擎具体策略 JiChiInterviewEngine.java
        • 3.业务
          • 1.InterviewController.java
          • 2.InterviewHistoryService.java
          • 3.InterviewHistoryServiceImpl.java
        • 4.测试

1.EasyCode生成interview_history的crud

1.在模板设置中手动指定逻辑删除的值

CleanShot 2024-07-21 at 16.11.30@2x

2.生成代码,进行测试

CleanShot 2024-07-21 at 16.36.26@2x

2.PDF识别关键字

1.引入依赖
        <!-- pdf解析器 --><dependency><groupId>org.apache.pdfbox</groupId><artifactId>pdfbox</artifactId><version>2.0.24</version></dependency>
2.代码概览

CleanShot 2024-07-22 at 14.37.06@2x

3.PDFUtil.java
java">package com.sunxiansheng.interview.server.util;import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Pattern;@Slf4j
public class PDFUtil {private static Pattern pattern = Pattern.compile("\\s*|\t|\r|\n");/*** 获取pdf的text*/public static String getPdfText(String pdfUrl) {PDDocument document = null;String text = "";try {URL url = new URL(pdfUrl);HttpURLConnection htpcon = (HttpURLConnection) url.openConnection();htpcon.setRequestMethod("GET");htpcon.setDoOutput(true);htpcon.setDoInput(true);htpcon.setUseCaches(false);htpcon.setConnectTimeout(10000);htpcon.setReadTimeout(10000);InputStream in = htpcon.getInputStream();document = PDDocument.load(in);PDFTextStripper stripper = new PDFTextStripper();stripper.setSortByPosition(true);stripper.setStartPage(0);stripper.setEndPage(Integer.MAX_VALUE);text = stripper.getText(document);text = pattern.matcher(text).replaceAll("");if (log.isInfoEnabled()) {log.info("识别到的pdf为{}", text);}} catch (Exception e) {log.error("获取pdf转为文字错误:{}", e.getMessage(), e);} finally {if (document != null) {try {document.close();} catch (Exception e) {log.error("close error", e);}}}return text;}}
4.keyword
1.EndType.java
java">package com.sunxiansheng.interview.server.util.keyword;/*** 结束类型定义** @author minghu.zhang* @date 11:37 2020/11/11**/
public enum EndType {

    /** More characters follow this node; no word ends here. */
    HAS_NEXT,

    /** A word ends at this node. */
    IS_END
}
2.FlagIndex.java
java">package com.sunxiansheng.interview.server.util.keyword;import java.util.List;/*** 敏感词标记** @author minghu.zhang*/
public class FlagIndex {

    /** Whether a word was matched. */
    private boolean flag;

    /** Whether the matched word is a whitelist word. */
    private boolean isWhiteWord;

    /** Positions (in the scanned text) of the matched characters. */
    private List<Integer> index;

    /** @return {@code true} when a word was matched */
    public boolean isFlag() {
        return this.flag;
    }

    /** @param flag whether a word was matched */
    public void setFlag(boolean flag) {
        this.flag = flag;
    }

    /** @return {@code true} when the matched word is a whitelist word */
    public boolean isWhiteWord() {
        return this.isWhiteWord;
    }

    /** @param whiteWord whether the matched word is a whitelist word */
    public void setWhiteWord(boolean whiteWord) {
        this.isWhiteWord = whiteWord;
    }

    /** @return the positions of the matched characters, or {@code null} if never set */
    public List<Integer> getIndex() {
        return this.index;
    }

    /** @param index the positions of the matched characters */
    public void setIndex(List<Integer> index) {
        this.index = index;
    }
}
3.WordType.java
java">package com.sunxiansheng.interview.server.util.keyword;/*** 词汇类型** @author minghu.zhang* @date 11:37 2020/11/11**/
public enum WordType {

    /** Blacklist word. */
    BLACK,

    /** Whitelist word. */
    WHITE
}
4.KeyWordUtil.java
java">package com.sunxiansheng.interview.server.util.keyword;import com.baomidou.mybatisplus.core.toolkit.CollectionUtils;import java.util.*;public class KeyWordUtil {/*** 敏感词字典*/private final static Map wordMap = new HashMap(1024);private static boolean init = false;public static boolean isInit() {return init;}/*** 获取敏感词列表** @param text 输入文本*/public static List<String> buildKeyWordsLists(final String text) {List<String> wordList = new ArrayList<>();char[] charset = text.toCharArray();for (int i = 0; i < charset.length; i++) {FlagIndex fi = getFlagIndex(charset, i, 0);if (fi.isFlag()) {if (fi.isWhiteWord()) {i += fi.getIndex().size() - 1;} else {StringBuilder builder = new StringBuilder();for (int j : fi.getIndex()) {char word = text.charAt(j);builder.append(word);}wordList.add(builder.toString());}}}return wordList;}/*** 获取标记索引** @param charset 输入文本* @param begin   检测起始* @param skip    文本距离*/private static FlagIndex getFlagIndex(final char[] charset, final int begin, final int skip) {FlagIndex fi = new FlagIndex();Map current = wordMap;boolean flag = false;int count = 0;List<Integer> index = new ArrayList<>();for (int i = begin; i < charset.length; i++) {char word = charset[i];Map mapTree = (Map) current.get(word);if (count > skip || (i == begin && Objects.isNull(mapTree))) {break;}if (Objects.nonNull(mapTree)) {current = mapTree;count = 0;index.add(i);} else {count++;if (flag && count > skip) {break;}}if ("1".equals(current.get("isEnd"))) {flag = true;}if ("1".equals(current.get("isWhiteWord"))) {fi.setWhiteWord(true);break;}}fi.setFlag(flag);fi.setIndex(index);return fi;}public static void addWord(Collection<String> wordList) {init = true;if (CollectionUtils.isEmpty(wordList)) {return;}WordType wordType = WordType.BLACK;Map nowMap;Map<String, String> newWorMap;// 迭代keyWordSetfor (String key : wordList) {nowMap = wordMap;for (int i = 0; i < key.length(); i++) {// 转换成char型char keyChar = key.charAt(i);// 获取Object wordMap = nowMap.get(keyChar);// 如果存在该key,直接赋值if 
(wordMap != null) {nowMap = (Map) wordMap;} else {// 不存在则构建一个map,同时将isEnd设置为0,因为他不是最后一个newWorMap = new HashMap<>(4);// 不是最后一个newWorMap.put("isEnd", String.valueOf(EndType.HAS_NEXT.ordinal()));nowMap.put(keyChar, newWorMap);nowMap = newWorMap;}if (i == key.length() - 1) {// 最后一个nowMap.put("isEnd", String.valueOf(EndType.IS_END.ordinal()));nowMap.put("isWhiteWord", String.valueOf(wordType.ordinal()));}}}}
}

3.策略模式实现引擎切换&简历分析

1.req和vo
1.InterviewReq.java
java">package com.sunxiansheng.interview.api.req;import com.sunxiansheng.interview.api.enums.EngineEnum;
import lombok.Getter;
import lombok.Setter;import java.io.Serializable;@Getter
@Setter
public class InterviewReq implements Serializable {

    /** URL of the PDF to analyse. */
    private String url;

    /** Name of the analysis engine; defaults to the name of {@link EngineEnum#JI_CHI}. */
    private String engine = EngineEnum.JI_CHI.name();
}
2.InterviewVO.java
java">package com.sunxiansheng.interview.api.vo;import lombok.Data;
import lombok.Getter;
import lombok.Setter;import java.io.Serializable;
import java.util.List;@Getter
@Setter
public class InterviewVO implements Serializable {

    /** Question list. */
    private List<Interview> questionList;

    /**
     * A single question entry.
     *
     * <p>Implements {@link Serializable} because the enclosing VO is serializable and holds
     * instances of this class; without it, Java serialization of a populated VO fails with
     * {@code NotSerializableException}.
     */
    @Data
    public static class Interview implements Serializable {

        /** Keyword recognised in the PDF, formatted as "category name-label name". */
        private String keyWord;

        /** Id of the category the label belongs to. */
        private Long categoryId;

        /** Id of the label. */
        private Long labelId;
    }
}
2.策略模式准备
1.引擎策略枚举 EngineEnum.java
java">package com.sunxiansheng.interview.api.enums;import lombok.Getter;/*** 引擎*/
@Getter
public enum EngineEnum {

    /** Local engine (implemented by JiChiInterviewEngine). */
    JI_CHI,

    /** NOTE(review): presumably an Alibaba-hosted AI engine; no implementation is visible here. */
    ALI_BL
}
2.引擎策略能力接口 InterviewEngine.java
java">package com.sunxiansheng.interview.server.service;import com.sunxiansheng.interview.api.enums.EngineEnum;
import com.sunxiansheng.interview.api.vo.InterviewVO;import java.util.List;/*** 引擎能力接口*/
public interface InterviewEngine {

    /**
     * Identifies which engine this implementation is.
     *
     * @return the engine type
     */
    EngineEnum engineType();

    /**
     * Derives interview keywords from the keywords found in a resume.
     *
     * @param KeyWords keywords extracted from the resume
     * @return the analysis result
     */
    InterviewVO analyse(List<String> KeyWords);
}
3.本地引擎具体策略 JiChiInterviewEngine.java
java">package com.sunxiansheng.interview.server.service.impl;import com.sunxiansheng.interview.api.enums.EngineEnum;
import com.sunxiansheng.interview.api.vo.InterviewVO;
import com.sunxiansheng.interview.server.entity.po.SubjectCategory;
import com.sunxiansheng.interview.server.entity.po.SubjectLabel;
import com.sunxiansheng.interview.server.mapper.SubjectMapper;
import com.sunxiansheng.interview.server.service.InterviewEngine;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;import javax.annotation.PostConstruct;
import javax.annotation.Resource;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors;/*** Description: 本地的引擎* @Author sun* @Create 2024/7/22 14:23* @Version 1.0*/
@Component
public class JiChiInterviewEngine implements InterviewEngine {@Resourceprivate SubjectMapper subjectMapper;/*** 所有的标签和*/private List<SubjectLabel> labels;private Map<Long, SubjectCategory> categoryMap;// bean装载后初始化@PostConstructpublic void init() {labels = subjectMapper.listAllLabel();// 收集成map,key为分类id,value为分类对象categoryMap = subjectMapper.listAllCategory().stream().collect(Collectors.toMap(SubjectCategory::getId, Function.identity()));}/*** 枚举标识自己是本地引擎* @return*/@Overridepublic EngineEnum engineType() {return EngineEnum.JI_CHI;}/*** 根据关键词分析简历* @param KeyWords* @return*/@Overridepublic InterviewVO analyse(List<String> KeyWords) {// 判空if (CollectionUtils.isEmpty(KeyWords)) {return new InterviewVO();}// 首先过滤出所有是关键词的标签List<SubjectLabel> includedLabels = labels.stream().filter(item -> {return KeyWords.contains(item.getLabelName());}).collect(Collectors.toList());// map成InterviewVO.InterviewList<InterviewVO.Interview> collect = includedLabels.stream().map(// label为是关键词的标签label -> {InterviewVO.Interview interview = new InterviewVO.Interview();// 根据标签来获取这个标签所在的分类SubjectCategory subjectCategory = categoryMap.get(label.getCategoryId());// 如果分类不为空,则将分类名和标签名format成 "分类名-标签名" 的格式作为KeyWordif (Objects.nonNull(subjectCategory)) {interview.setKeyWord(String.format("%s-%s", subjectCategory.getCategoryName(), label.getLabelName()));} else {interview.setKeyWord(label.getLabelName());}interview.setCategoryId(label.getCategoryId());interview.setLabelId(label.getId());return interview;}).collect(Collectors.toList());InterviewVO interviewVO = new InterviewVO();interviewVO.setQuestionList(collect);return interviewVO;}
}
3.业务
1.InterviewController.java
java">package com.sunxiansheng.interview.server.controller;import com.alibaba.fastjson.JSON;
import com.google.common.base.Preconditions;
import com.sunxiansheng.interview.api.common.Result;
import com.sunxiansheng.interview.api.req.InterviewReq;
import com.sunxiansheng.interview.api.vo.InterviewVO;
import com.sunxiansheng.interview.server.convert.InterviewHistoryConvert;
import com.sunxiansheng.interview.server.entity.dto.InterviewHistoryDto;
import com.sunxiansheng.interview.server.entity.page.PageResult;
import com.sunxiansheng.interview.server.entity.req.InterviewHistoryReq;
import com.sunxiansheng.interview.server.entity.vo.InterviewHistoryVo;
import com.sunxiansheng.interview.server.service.InterviewHistoryService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.*;import javax.annotation.Resource;
import java.util.Objects;/*** 模拟面试信息 前端控制器** @author sun* @since 2024-07-21 16:03:42*/
@Slf4j
@RestController
@RequestMapping("/interview")
public class InterviewController {/*** 服务对象*/@Resourceprivate InterviewHistoryService interviewHistoryService;/*** 分页查询数据** @param req 筛选条件* @return 查询结果*/@GetMapping("/queryPage")public Result<PageResult<InterviewHistoryVo>> queryByPage(@RequestBody InterviewHistoryReq req) {try {// 打印日志if (log.isInfoEnabled()) {log.info("分页查询数据入参{}", JSON.toJSONString(req));}// ============================== Preconditions 参数校验 ==============================// ============================== Preconditions 参数校验 ==============================// 将req转换为dto(如果req的字段符合service层的规范,不转也可以)InterviewHistoryDto interviewHistoryDto = InterviewHistoryConvert.INSTANCE.convertReqToDto(req);// 调用service层PageResult<InterviewHistoryVo> interviewHistoryVoPageResult = this.interviewHistoryService.queryByPage(interviewHistoryDto);return Result.ok(interviewHistoryVoPageResult);} catch (Exception e) {// 打印error日志log.error("分页查询数据!错误原因{}", e.getMessage(), e);return Result.fail(e.getMessage());}}/*** 分析简历*/@PostMapping(value = "/analyse")public Result<InterviewVO> analyse(@RequestBody InterviewReq req) {try {if (log.isInfoEnabled()) {log.info("分析简历入参{}", JSON.toJSON(req));}Preconditions.checkArgument(!Objects.isNull(req), "参数不能为空!");Preconditions.checkArgument(!Objects.isNull(req.getEngine()), "引擎不能为空!");Preconditions.checkArgument(!Objects.isNull(req.getUrl()), "简历不能为空!");return Result.ok(interviewHistoryService.analyse(req));} catch (IllegalArgumentException e) {log.error("参数异常!错误原因{}", e.getMessage(), e);return Result.fail(e.getMessage());} catch (Exception e) {log.error("分析简历异常!错误原因{}", e.getMessage(), e);return Result.fail("分析简历异常!");}}}
2.InterviewHistoryService.java
java">package com.sunxiansheng.interview.server.service;import com.sunxiansheng.interview.api.req.InterviewReq;
import com.sunxiansheng.interview.api.vo.InterviewVO;
import com.sunxiansheng.interview.server.entity.dto.InterviewHistoryDto;
import com.sunxiansheng.interview.server.entity.page.PageResult;
import com.sunxiansheng.interview.server.entity.vo.InterviewHistoryVo;/*** 面试汇总记录表(InterviewHistory)service接口** @author sun* @since 2024-07-21 16:03:42*/
public interface InterviewHistoryService {

    /**
     * Runs a paged query.
     *
     * @param Dto filter conditions
     * @return the page of results
     */
    PageResult<InterviewHistoryVo> queryByPage(InterviewHistoryDto Dto);

    /**
     * Analyses a resume with the engine named in the request.
     *
     * @param req request carrying the PDF URL and the engine name
     * @return the analysis result
     */
    InterviewVO analyse(InterviewReq req);
}
3.InterviewHistoryServiceImpl.java
java">package com.sunxiansheng.interview.server.service.impl;import com.google.common.base.Preconditions;
import com.sunxiansheng.interview.api.req.InterviewReq;
import com.sunxiansheng.interview.api.vo.InterviewVO;
import com.sunxiansheng.interview.server.convert.InterviewHistoryConvert;
import com.sunxiansheng.interview.server.entity.dto.InterviewHistoryDto;
import com.sunxiansheng.interview.server.entity.page.PageResult;
import com.sunxiansheng.interview.server.entity.page.SunPageHelper;
import com.sunxiansheng.interview.server.entity.po.InterviewHistoryPo;
import com.sunxiansheng.interview.server.entity.po.SubjectLabel;
import com.sunxiansheng.interview.server.entity.vo.InterviewHistoryVo;
import com.sunxiansheng.interview.server.mapper.InterviewHistoryMapper;
import com.sunxiansheng.interview.server.mapper.SubjectMapper;
import com.sunxiansheng.interview.server.service.InterviewEngine;
import com.sunxiansheng.interview.server.service.InterviewHistoryService;
import com.sunxiansheng.interview.server.util.PDFUtil;
import com.sunxiansheng.interview.server.util.keyword.KeyWordUtil;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Service;import javax.annotation.Resource;
import java.util.*;
import java.util.stream.Collectors;/*** 面试汇总记录表(InterviewHistory)service实现类** @author sun* @since 2024-07-21 16:27:12*/
@Service("interviewHistoryService")
public class InterviewHistoryServiceImpl implements InterviewHistoryService, ApplicationContextAware {@Resourceprivate InterviewHistoryMapper interviewHistoryMapper;@Resourceprivate SubjectMapper subjectMapper;/*** 存放所有的引擎策略的map*/private static final Map<String, InterviewEngine> engineMap = new HashMap<>();/*** 在bean初始化之后立即被调用,这里用来得到所有的引擎对象并封装到map中,方便获取** @param applicationContext* @throws BeansException*/@Overridepublic void setApplicationContext(ApplicationContext applicationContext) throws BeansException {// 从上下文中获取所有具体的引擎Collection<InterviewEngine> engines = applicationContext.getBeansOfType(InterviewEngine.class).values();// 将这些引擎放到map中for (InterviewEngine engine : engines) {engineMap.put(engine.engineType().name(), engine);}}/*** 分页查询** @param interviewHistoryDto 筛选条件,需要携带pageNo和pageSize以及查询条件* @return 分页结果*/@Overridepublic PageResult<InterviewHistoryVo> queryByPage(InterviewHistoryDto interviewHistoryDto) {// 将dto转换为poInterviewHistoryPo interviewHistoryPo = InterviewHistoryConvert.INSTANCE.convertDtoToPo(interviewHistoryDto);// 使用 SunPageHelper 执行分页操作PageResult<InterviewHistoryPo> paginate = SunPageHelper.paginate(interviewHistoryDto.getPageNo(), interviewHistoryDto.getPageSize(),() -> interviewHistoryMapper.count(interviewHistoryPo),(offset, size) -> interviewHistoryMapper.queryPage(interviewHistoryPo, offset, size));// 将po转换为voPageResult<InterviewHistoryVo> interviewHistoryVoPageResult = InterviewHistoryConvert.INSTANCE.convertPageResult(paginate);return interviewHistoryVoPageResult;}/*** 分析简历** @param req* @return*/@Overridepublic InterviewVO analyse(InterviewReq req) {// 从pdf中获取关键词List<String> keyWords = buildKeyWords(req.getUrl());// 从map中获取处理的引擎InterviewEngine engine = engineMap.get(req.getEngine());Preconditions.checkArgument(!Objects.isNull(engine), "引擎不能为空!");// 使用获取到的引擎来分析简历return engine.analyse(keyWords);}/*** 分析pdf来获取关键词** @param url* @return*/private List<String> buildKeyWords(String url) {String pdfText = PDFUtil.getPdfText(url);if 
(!KeyWordUtil.isInit()) {// 数据库中查询所有标签,作为敏感词放到KeyWordUtilList<String> list = subjectMapper.listAllLabel().stream().map(SubjectLabel::getLabelName).collect(Collectors.toList());KeyWordUtil.addWord(list);}// 与数据库中查询出来的敏感词来进行比对,得到关键字列表return KeyWordUtil.buildKeyWordsLists(pdfText);}}
4.测试

CleanShot 2024-07-22 at 15.44.39@2x


http://www.ppmy.cn/ops/101022.html

相关文章

Ruby遇上GUI:探索Ruby桌面应用开发的新天地

标题：Ruby遇上GUI：探索Ruby桌面应用开发的新天地 在编程语言的多彩世界中，Ruby以其简洁和优雅著称，常被用于Web开发。然而，Ruby的潜力远不止于此，它同样能够在桌面应用开发领域大放异彩。本文将带您深入了…

浅谈【数据结构】图-最短路径问题

目录 1、最短路径问题 2、迪杰斯特拉算法 3、算法的步骤 谢谢帅气美丽且优秀的你看完我的文章还要点赞、收藏加关注 没错，说的就是你，不用再怀疑！！！ 希望我的文章内容能对你有帮助，一起努力吧…

基于SSM的垃圾分类管理系统的设计与实现 (含源码+sql+视频导入教程+论文)

👉文末查看项目功能视频演示获取源码sql脚本视频导入教程视频 1 、功能描述 基于SSM的垃圾分类管理系统拥有两种角色：用户和管理员。具体功能如下： 1.1 背景描述 基于SSM的垃圾分类管理系统采用Spring、SpringMVC和MyBatis框架，…

postman使用指北

粘贴 cURL 请求 环境设置 作用：方便切换不同环境，比如配置本地环境/测试环境/线上环境，通过切换环境就可以请求对应环境的接口 配置环境 切换环境请求 Pre-request Script 可以在发送请求之前执行一些脚本操作 1. 常用指令 // 获取请求方…

centos安装docker、docker-compose

centos安装docker、docker-compose 注意： 使用非root用户安装且用户有sudo权限 卸载旧版本 sudo yum remove docker \docker-client \docker-client-latest \docker-common \docker-latest \docker-latest-logrotate \docker-logrotate \docker-engine …

数据导出为Excel接口报错:java.io.IOException: UT010029: Stream is closed

在Spring框架中，开发过程中经常需要实现数据的导出功能，尤其是将数据导出为Excel文件。然而，在实现这样的功能时，可能会遇到一些意料之外的错误，比如java.io.IOException: UT010029: Stream is closed。本文将基于一个…

性能测试笔记

大多数用户期望网站加载时间3秒内 前端常见的性能优化点 1.静态资源增加缓存 2.静态资源使用CDN和独立域名 3.js压缩，并去掉存在的重复请求 4.js位置后移 5.css依赖减少，同时指定； 6.减少Banner图片个数，同时减小大小…

整流电路和电源滤波方法

1. 简介 什么是整流电路？ 整流电路是一种将交流电（AC）转换为直流电（DC）的电路，在电源设计、电子设备中广泛应用。本文将深入探讨整流电路的基本原理、常见类型及其在不同领域中的应用。 为什么整流电路重要…