深入学习Python解析并解密PDF文件内容的方法(2)

# coding:utf-8
import os

from PyPDF2 import PdfFileReader
from PyPDF2 import PdfFileWriter


def get_reader(filename, password):
    """Open a PDF file and return a reader whose pages can be read.

    :param filename: path of the PDF file to open
    :param password: password to try when the PDF is encrypted; ``None``
        means no password is available
    :return: a ``PdfFileReader`` on success, or ``None`` when the file cannot
        be opened, is encrypted while no password was given, or the password
        is rejected
    """
    try:
        old_file = open(filename, 'rb')
        print('run jiemi1')
    except Exception as err:
        print('文件打开失败!' + str(err))
        return None

    # Create the reader instance.
    pdf_reader = PdfFileReader(old_file, strict=False)

    # Decrypt when necessary.
    if pdf_reader.isEncrypted:
        if password is None:
            print('%s文件被加密,需要密码!' % filename)
            old_file.close()
            return None
        # decrypt() returns 0 for a wrong password, 1 when the user password
        # matched and 2 when the owner password matched, so only 0 is a
        # failure.
        if pdf_reader.decrypt(password) == 0:
            print('%s密码不正确!' % filename)
            old_file.close()
            return None

    # The file is intentionally left open: the reader still needs the
    # underlying stream when its pages are copied later. The caller closes it
    # through ``pdf_reader.stream`` once it is done.
    return pdf_reader


def decrypt_pdf(filename, password, decrypted_filename=None):
    """Decrypt an encrypted PDF and write a copy that needs no password.

    :param filename: the encrypted PDF file
    :param password: the password of that file
    :param decrypted_filename: name of the decrypted output file; defaults to
        ``<filename without its extension>_decrypted.pdf``
    :return: None
    """
    # Build the reader and the writer.
    print('run jiemi')
    pdf_reader = get_reader(filename, password)
    if pdf_reader is None:
        return

    try:
        if not pdf_reader.isEncrypted:
            print('文件没有被加密,无需操作!')
            return

        pdf_writer = PdfFileWriter()
        pdf_writer.appendPagesFromReader(pdf_reader)

        if decrypted_filename is None:
            # splitext() strips only the final extension, so dots elsewhere
            # in the path (e.g. './dir/a.b.pdf') are preserved.
            decrypted_filename = os.path.splitext(filename)[0] + '_decrypted.pdf'

        # Write the new file and make sure the handle is flushed and closed.
        with open(decrypted_filename, 'wb') as new_file:
            pdf_writer.write(new_file)
    finally:
        # Release the input file that get_reader() left open for the reader.
        pdf_reader.stream.close()


decrypt_pdf(r'5b931164edc09a226b3a12c4.pdf', '')

  运行结果如下:

  新生成的文件如下:

  打开是这样的:

  这样一来,解密后的文件就可以正常打开,也可以被解析了。下面继续使用前面的PDF解析代码对它进行解析,结果如下:

  解析成功,那么会保存为txt格式。

但是这里要注意:如果把解密代码中的密码改为一个错误的密码,例如 abc,如下:

  那么会触发异常,代码结果表示如下:

  代码如下:

# coding:utf-8
import os

from PyPDF2 import PdfFileReader
from PyPDF2 import PdfFileWriter


def get_reader(filename, password):
    """Open a PDF file and return a reader whose pages can be read.

    :param filename: path of the PDF file to open
    :param password: password to try when the PDF is encrypted; ``None``
        means no password is available
    :return: a ``PdfFileReader`` on success, or ``None`` when the file cannot
        be opened, is encrypted while no password was given, or the password
        is rejected
    """
    try:
        old_file = open(filename, 'rb')
        print('run jiemi1')
    except Exception as err:
        print('文件打开失败!' + str(err))
        return None

    # Create the reader instance.
    pdf_reader = PdfFileReader(old_file, strict=False)

    # Decrypt when necessary.
    if pdf_reader.isEncrypted:
        if password is None:
            print('%s文件被加密,需要密码!' % filename)
            old_file.close()
            return None
        # decrypt() returns 0 for a wrong password, 1 when the user password
        # matched and 2 when the owner password matched, so only 0 is a
        # failure.
        if pdf_reader.decrypt(password) == 0:
            print('%s密码不正确!' % filename)
            old_file.close()
            return None

    # The file is intentionally left open: the reader still needs the
    # underlying stream when its pages are copied later. The caller closes it
    # through ``pdf_reader.stream`` once it is done.
    return pdf_reader


def decrypt_pdf(filename, password, decrypted_filename=None):
    """Decrypt an encrypted PDF and write a copy that needs no password.

    :param filename: the encrypted PDF file
    :param password: the password of that file
    :param decrypted_filename: name of the decrypted output file; defaults to
        ``<filename without its extension>_decrypted.pdf``
    :return: None
    """
    # Build the reader and the writer.
    print('run jiemi')
    pdf_reader = get_reader(filename, password)
    if pdf_reader is None:
        return

    try:
        if not pdf_reader.isEncrypted:
            print('文件没有被加密,无需操作!')
            return

        pdf_writer = PdfFileWriter()
        pdf_writer.appendPagesFromReader(pdf_reader)

        if decrypted_filename is None:
            # splitext() strips only the final extension, so dots elsewhere
            # in the path (e.g. './dir/a.b.pdf') are preserved.
            decrypted_filename = os.path.splitext(filename)[0] + '_decrypted.pdf'

        # Write the new file and make sure the handle is flushed and closed.
        with open(decrypted_filename, 'wb') as new_file:
            pdf_writer.write(new_file)
    finally:
        # Release the input file that get_reader() left open for the reader.
        pdf_reader.stream.close()


# 'abc' is a deliberately wrong password, used to show the failure path.
decrypt_pdf(r'5b931164edc09a226b3a12c4.pdf', 'abc')


四:PyPDF2的理论介绍

  PyPDF2 包含了 PdfFileReader、PdfFileMerger、PageObject、PdfFileWriter 四个常用的主要 Class。

具体分析:

PyPDF2 将读与写分成两个类来操作:

from PyPDF2 import PdfFileWriter, PdfFileReader

# Writing goes through a PdfFileWriter instance.
writer = PdfFileWriter()

# Reading goes through a PdfFileReader wrapped around a binary file object.
reader = PdfFileReader(
    open("document1.pdf", "rb")
)

官方实例:

# Official PyPDF2 example, ported to Python 3 syntax: build a new PDF from
# the pages of "document1.pdf" (rotated, watermarked, cropped), then encrypt
# it and write it to "PyPDF2-output.pdf".
from PyPDF2 import PdfFileWriter, PdfFileReader

output = PdfFileWriter()
input1 = PdfFileReader(open("document1.pdf", "rb"))

# print how many pages input1 has:
print("document1.pdf has %d pages." % input1.getNumPages())

# add page 1 from input1 to output document, unchanged
output.addPage(input1.getPage(0))

# add page 2 from input1, but rotated clockwise 90 degrees
output.addPage(input1.getPage(1).rotateClockwise(90))

# add page 3 from input1, rotated the other way:
output.addPage(input1.getPage(2).rotateCounterClockwise(90))
# alt: output.addPage(input1.getPage(2).rotateClockwise(270))

# add page 4 from input1, but first add a watermark from another PDF:
page4 = input1.getPage(3)
watermark = PdfFileReader(open("watermark.pdf", "rb"))
page4.mergePage(watermark.getPage(0))
output.addPage(page4)

# add page 5 from input1, but crop it to half size:
page5 = input1.getPage(4)
page5.mediaBox.upperRight = (
    page5.mediaBox.getUpperRight_x() / 2,
    page5.mediaBox.getUpperRight_y() / 2
)
output.addPage(page5)

# add some Javascript to launch the print window on opening this PDF.
# the password dialog may prevent the print dialog from being shown;
# comment out the encryption lines, if that's the case, to try this out
output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

# encrypt your new PDF and add a password
password = "secret"
output.encrypt(password)

# finally, write "output" to PyPDF2-output.pdf; the `with` block closes the
# file so the data is flushed to disk (Python 3 has no `file()` builtin)
with open("PyPDF2-output.pdf", "wb") as outputStream:
    output.write(outputStream)

五 :PdfFileReader类

class PyPDF2.PdfFileReader(stream, strict=True, warndest=None, overwriteWarnings=True)

初始化PdfFileReader对象。此操作可能需要一些时间,因为PDF流的交叉引用表被读入内存。

参数:

stream - File对象,或支持与File对象类似的标准读取(read)和定位(seek)方法的对象。也可以是表示PDF文件路径的字符串。

strict(bool) - 决定是否就所有问题向用户发出警告,并将一些本可纠正的问题视为致命错误。默认为True。

warndest - 记录警告的目的地(默认为 sys.stderr)。

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://www.heiqu.com/326c69ad83386fe7634601de1c129360.html