/*** 解析pdf文档,输出每行的文字** @param filePath* @return* @throws IOException*/public static List<String> getPdfWords(String filePath) throws IOException {File file = new File(filePath);PDDocument document = null;document = PDDocument.load(file);int pageSize = document.getNumberOfPages();//System.out.println("pageSize : " + pageSize);// 一页一页读取List<String> contents = new ArrayList<>();for (int i = 0; i < pageSize; i++) {// 文本内容PDFTextStripper stripper = new PDFTextStripper();// 设置按顺序输出stripper.setSortByPosition(true);stripper.setStartPage(i + 1);stripper.setEndPage(i + 1);String text = stripper.getText(document);String[] split = text.split("\r\n");List<String> content = Arrays.asList(split);contents.addAll(content);contents.add("------------------------------我是分页行-------------------------------");}return contents;}/*** 抽取word中的文字,同时支持doc和docx格式** @param path* @return*/public static String getWord(String path) {String resullt = "";//首先判断文件中的是doc/docxtry {if (path.endsWith(".doc")) {InputStream is = new FileInputStream(new File(path));WordExtractor re = new WordExtractor(is);resullt = re.getText();re.close();} else if (path.endsWith(".docx")) {OPCPackage opcPackage = POIXMLDocument.openPackage(path);POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);resullt = extractor.getText();extractor.close();} else {System.out.println("此文件不是word文件");}} catch (Exception e) {e.printStackTrace();}return resullt;}/*** 抽取excel文件中的内容,支持xls和xlsx格式** @param filePath* @throws Exception*/public static String getExcelWords(String filePath) throws Exception {InputStream is = new FileInputStream(new File(filePath));Workbook hssfWorkbook = null;if (filePath.endsWith("xlsx")) {//Excel 2007hssfWorkbook = new XSSFWorkbook(is);} else if (filePath.endsWith("xls")) {//Excel 2003hssfWorkbook = new HSSFWorkbook(is);}int numberOfSheets = hssfWorkbook.getNumberOfSheets();List list = new ArrayList();for (int i = 0; i < numberOfSheets; i++) {Sheet sheet = hssfWorkbook.getSheetAt(i);Row row = null;for (int j = 0; j < sheet.getPhysicalNumberOfRows(); j++) { // 获取每行row = sheet.getRow(j);if (row != null) {for (int k = 0; k < sheet.getRow(0).getPhysicalNumberOfCells(); k++) { // 获取每个单元格Cell cell = row.getCell(k);if (cell == null) {list.add("");continue;}switch (cell.getCellType()) {case Cell.CELL_TYPE_STRING:list.add(cell.getRichStringCellValue().getString());break;case Cell.CELL_TYPE_NUMERIC:if (DateUtil.isCellDateFormatted(cell)) {list.add(cell.getDateCellValue());} else {list.add(cell.getNumericCellValue());}break;case Cell.CELL_TYPE_BOOLEAN:list.add(cell.getBooleanCellValue());break;case Cell.CELL_TYPE_FORMULA:list.add(cell.getCellFormula());break;default:list.add("");break;}}}}}return list.toString();}/*** 抽取ppt中文字* @param path* @return* @throws IOException*/public static String getPptWords(String path) throws IOException {List<String> textList = Lists.newArrayList();SlideShow ppt;InputStream is = new FileInputStream(new File(path));if (path.endsWith(".ppt")) {ppt = new HSLFSlideShow(is);} else if (path.endsWith(".pptx")) {ppt = new XMLSlideShow(is);} else {//LOGGER.debug("此文件{}不是word文件", path);return "此文件不是PPt文件" + path;}List<Slide> slides = ppt.getSlides();for (Slide slide : slides) {List<Shape> shapes = slide.getShapes();for (Shape sh : shapes) {//如果是一个文本框if (sh instanceof TextShape) {TextShape shape = (TextShape) sh;textList.add(shape.getText());}}}return textList.toString();}