一:Python 将 PDF 转换为图片(多进程)
# -*- coding:utf-8 -*-
# Author : yyzhang56
# All image <-> PDF conversion operations are defined in this script.
from multiprocessing import Pool
import os

# The ``fitz`` module is provided by the PyMuPDF package, which must be
# installed for this import to work.
import fitz

# Directory that holds the source PDFs. There is no trailing backslash:
# a raw string literal cannot end in a single backslash.
tmp = r'C:\Users\Downloads'
# Root directory under which one image folder per PDF is created.
export_file = r"D:\new_dates\数据"
# Not referenced by the code visible here; presumably the OCR output
# directory -- verify against the rest of the project.
save_path = r"D:\new_dates\ocr_result"

os.makedirs(export_file, exist_ok=True)

# File names (not full paths) of every PDF in ``tmp``. The extension is
# compared case-insensitively so that "X.PDF" is picked up as well.
pdf_dir = [
    i for i in os.listdir(tmp)
    if os.path.splitext(i)[-1].lower() == ".pdf"
]
def pdf_to_jpg(name):
    """Render every page of one PDF from ``tmp`` to a numbered image file.

    The images are written to ``<export_file>/<PDF name without extension>/``
    as ``1.jpg``, ``2.jpg``, ... in page order.

    Args:
        name: File name (not a full path) of a PDF located in ``tmp``.
    """
    # Build the full path of the PDF.
    pwd_name = os.path.join(tmp, name)
    # Images for this PDF go into a folder named after the PDF.
    pdf_name = os.path.join(export_file, os.path.splitext(name)[0])

    # A zoom factor of 2 on each axis gives an image with four times as
    # many pixels as the default rendering.
    zoom_x = 2.0
    zoom_y = 2.0
    trans = fitz.Matrix(zoom_x, zoom_y)

    # The context manager closes the document even when rendering fails.
    with fitz.open(pwd_name) as doc:
        # Create the image folder if it does not exist yet.
        os.makedirs(pdf_name, exist_ok=True)
        for page_no, page in enumerate(doc, start=1):
            pm = page.get_pixmap(matrix=trans, alpha=False)
            pic_pwd = os.path.join(pdf_name, '{}.jpg'.format(page_no))
            # Pixmap.save() derives the image format from the extension, so
            # the content matches the ".jpg" name (the previous writePNG()
            # call stored PNG data under a ".jpg" name).
            pm.save(pic_pwd)


def main():
    """Convert every PDF in ``pdf_dir`` using a pool of 10 worker processes.

    A failure in one PDF is reported on stdout and does not stop the others.
    """
    # Imported locally so main() does not rely on a module-level import.
    from multiprocessing import Pool

    pool = Pool(10)
    pending = [(i, pool.apply_async(pdf_to_jpg, (i,))) for i in pdf_dir]
    pool.close()
    pool.join()

    # apply_async() keeps a worker's exception inside its result object;
    # without get() a failed conversion would go completely unnoticed.
    for pdf_file, res in pending:
        try:
            res.get()
        except Exception as exc:  # boundary: report and carry on
            print('failed to convert {}: {!r}'.format(pdf_file, exc))


if __name__ == '__main__':
    main()  # enable this when PDF pages need to be rendered to images
二:图片转pdf
#Author:yyzhang56
from PIL import Image
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
# Convert a single image to PDF
# img = Image.open('优化.png')
# img.save('pypdf01.pdf', 'PDF') # save as PDF through the PIL library
# Convert several images to PDF
# ilst = ['D:/docOfStu/pypdf2-mindmap-01.jpg', 'D:/docOfStu/pypdf2-mindmap-02.jpg'] # list of images
# Merge every image in ``path`` into one PDF, ``./pypdf2.pdf``.
# Each image is first saved as a single-page PDF next to the image, and
# those pages are then appended to the combined document.
# NOTE(review): PdfFileReader / PdfFileWriter / appendPagesFromReader are
# the legacy PyPDF2 names this file imports; confirm they are still
# available in the installed PyPDF2 version.
path = r"C:\Users\Desktop\pdf\tif_pic"
# Sorted so the page order is reproducible; os.listdir() returns entries
# in arbitrary order. Per-image PDFs left over from an earlier run are
# skipped because they cannot be opened as images.
ilst = sorted(
    i for i in os.listdir(path)
    if os.path.splitext(i)[1].lower() != '.pdf'
)
out_pdf = PdfFileWriter()
sdfs = [os.path.join(path, i) for i in ilst]
print(sdfs)

# The per-image PDFs are kept open until the combined file has been
# written, then all of them are closed in the ``finally`` clause.
opened_pdfs = []
try:
    for f in sdfs:
        # Swap the extension whatever it is. The earlier
        # f.replace('.png', '.pdf') left non-PNG paths unchanged, so the
        # source image itself was overwritten and then parsed as a PDF.
        fw = os.path.splitext(f)[0] + '.pdf'
        with Image.open(f) as img:
            # Older Pillow releases cannot write RGBA images to PDF.
            if img.mode == 'RGBA':
                img = img.convert('RGB')
            img.save(fw)
        page_stream = open(fw, 'rb')
        opened_pdfs.append(page_stream)
        out_pdf.appendPagesFromReader(PdfFileReader(page_stream))
    # ``with`` guarantees the combined PDF is flushed and closed.
    with open('./pypdf2.pdf', 'wb') as merged_pdf:
        out_pdf.write(merged_pdf)
finally:
    for page_stream in opened_pdfs:
        page_stream.close()
三:pdf转docx
import os
from pdf2docx import Converter

# author--yyzhang

pdf_path = r"C:\Users\Desktop\pdf\files"  # directory with the source PDFs
docx_path = r"C:\Users\Desktop\pdf\docx"  # directory that receives the .docx files

# Create the output directory when it does not exist yet.
os.makedirs(docx_path, exist_ok=True)

# Listed unconditionally: the loop below needs it whether or not the
# output directory already existed.
pdf_files = os.listdir(pdf_path)

# Convert every PDF found in pdf_path.
for pdf_file in pdf_files:
    # Case-insensitive check, so ".pdf", ".PDF" and ".Pdf" all match.
    if not pdf_file.lower().endswith(".pdf"):
        continue
    head_name = os.path.splitext(pdf_file)[0]  # file name without extension
    docx_name = head_name + ".docx"  # name of the generated docx
    file_path = os.path.join(pdf_path, pdf_file)  # full path of the PDF
    docx_file_path = os.path.join(docx_path, docx_name)  # full path of the docx
    cv = Converter(file_path)
    try:
        cv.convert(docx_file_path, start=0, end=None)
    finally:
        # Close the converter even when the conversion raises.
        cv.close()