Java 抽取 Word、PPT、Excel、PDF 文档中的文字

/*** 解析pdf文档，输出每行的文字** @param filePath* @return* @throws IOException*/public static List<String> getPdfWords(String filePath) throws IOException {File file = new File(filePath);PDDocument document = null;document = PDDocument.load(file);int pageSize = document.getNumberOfPages();//System.out.println("pageSize : " + pageSize);// 一页一页读取List<String> contents = new ArrayList<>();for (int i = 0; i < pageSize; i++) {// 文本内容PDFTextStripper stripper = new PDFTextStripper();// 设置按顺序输出stripper.setSortByPosition(true);stripper.setStartPage(i + 1);stripper.setEndPage(i + 1);String text = stripper.getText(document);String[] split = text.split("\r\n");List<String> content = Arrays.asList(split);contents.addAll(content);contents.add("------------------------------我是分页行-------------------------------");}return contents;}/*** 抽取word中的文字，同时支持doc和docx格式** @param path* @return*/public static String getWord(String path) {String resullt = "";//首先判断文件中的是doc/docxtry {if (path.endsWith(".doc")) {InputStream is = new FileInputStream(new File(path));WordExtractor re = new WordExtractor(is);resullt = re.getText();re.close();} else if (path.endsWith(".docx")) {OPCPackage opcPackage = POIXMLDocument.openPackage(path);POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);resullt = extractor.getText();extractor.close();} else {System.out.println("此文件不是word文件");}} catch (Exception e) {e.printStackTrace();}return resullt;}/*** 抽取excel文件中的内容，支持xls和xlsx格式** @param filePath* @throws Exception*/public static String getExcelWords(String filePath) throws Exception {InputStream is = new FileInputStream(new File(filePath));Workbook hssfWorkbook = null;if (filePath.endsWith("xlsx")) {//Excel 2007hssfWorkbook = new XSSFWorkbook(is);} else if (filePath.endsWith("xls")) {//Excel 2003hssfWorkbook = new HSSFWorkbook(is);}int numberOfSheets = hssfWorkbook.getNumberOfSheets();List list = new ArrayList();for (int i = 0; i < numberOfSheets; i++) {Sheet sheet = hssfWorkbook.getSheetAt(i);Row row = 
null;for (int j = 0; j < sheet.getPhysicalNumberOfRows(); j++) { // 获取每行row = sheet.getRow(j);if (row != null) {for (int k = 0; k < sheet.getRow(0).getPhysicalNumberOfCells(); k++) { // 获取每个单元格Cell cell = row.getCell(k);if (cell == null) {list.add("");continue;}switch (cell.getCellType()) {case Cell.CELL_TYPE_STRING:list.add(cell.getRichStringCellValue().getString());break;case Cell.CELL_TYPE_NUMERIC:if (DateUtil.isCellDateFormatted(cell)) {list.add(cell.getDateCellValue());} else {list.add(cell.getNumericCellValue());}break;case Cell.CELL_TYPE_BOOLEAN:list.add(cell.getBooleanCellValue());break;case Cell.CELL_TYPE_FORMULA:list.add(cell.getCellFormula());break;default:list.add("");break;}}}}}return list.toString();}/*** 抽取ppt中文字* @param path* @return* @throws IOException*/public static String getPptWords(String path) throws IOException {List<String> textList = Lists.newArrayList();SlideShow ppt;InputStream is = new FileInputStream(new File(path));if (path.endsWith(".ppt")) {ppt = new HSLFSlideShow(is);} else if (path.endsWith(".pptx")) {ppt = new XMLSlideShow(is);} else {//LOGGER.debug("此文件{}不是word文件", path);return "此文件不是PPt文件" + path;}List<Slide> slides = ppt.getSlides();for (Slide slide : slides) {List<Shape> shapes = slide.getShapes();for (Shape sh : shapes) {//如果是一个文本框if (sh instanceof TextShape) {TextShape shape = (TextShape) sh;textList.add(shape.getText());}}}return textList.toString();}

Java 抽取 Word、PPT、Excel、PDF 文档中的文字

相关文章

（C语言）三子棋（井字棋）娱乐教程

计算机械效率的三种方式,专题+三种四类简单机械的机械效率(共张ppt).ppt

计算机专业开题报告论证记录如何写,浙江大学软件学院研究生开题报告论证流程说明...

计算机应用基础第2单元,高教社山东版计算机应用基础第2单元.ppt

C语言模除循环,C语言程序设计模三循环程序设计训练.ppt

计算机组成原理在除法运算中商符,计算机组成原理第03章.ppt

技术人员近业务，会困死在一条船上吗？

唯库拼课课程大合集更新111门