- # 从 pdf 中读取文本
- # 写 pdf
- # 加密解密 pdf
- # 合并 pdf, 加水印
- # pip install PyPDF2
%cd D:\python 全站 \ office
import PyPDF2
D:\python 全站 \ office
- pdf_obj = open('coop.pdf', 'rb')
- pdf = PyPDF2.PdfFileReader(pdf_obj)
- pdf.numPages
- 3
- page = pdf.getPage(0)
- page.extractText() # 提取文本
'\n\n \n \n1\\\n1\nN\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \n\n\n \n \n\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \n \n\n\n \n \n\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \n \n'
- # 提取中文 pip install pdfminer3k #支持中文
- from pdfminer.pdfinterp import PDFResourceManager, process_pdf # 资源管理
- from pdfminer.converter import TextConverter # 文本转换
- from pdfminer.layout import LAParams #布局
- from io import StringIO # 生成临时文件
def convert_pdf(path):
    """Extract the text of a PDF file with pdfminer and return it as a string.

    Args:
        path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        Everything the ``TextConverter`` wrote into the in-memory buffer
        while ``process_pdf`` ran over the file.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # StringIO is the in-memory sink the converter writes the text into.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            # The context manager closes the file even if parsing raises.
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            # Release the converter on both the success and the failure path.
            device.close()
        return retstr.getvalue()
- s = convert_pdf('coop.pdf')
- # print(s)
- # convert_pdf('coop.pdf')
- s.split('\n\x0c')
- ['测试语句 \n\n 第 1 页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n 测试语句 \n\n 第一页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n 测试语句 \n\n 第一页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n \n \n \n \n',
- '测试语句 \n\n 第 2 页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n \n \n \n \n',
- 'de8ug word \n\n 测试语句 \n\n 第 3 页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n \n \n',
- '']
# Write a PDF: take the second page of the reader opened earlier (`pdf`,
# backed by the still-open `pdf_obj`) and save it as a new one-page file.
try:
    pdf_writer = PyPDF2.PdfFileWriter()
    page = pdf.getPage(1)  # page indices are zero-based: 1 is the second page
    pdf_writer.addPage(page)
    with open('coop-1.pdf', 'wb') as f:
        pdf_writer.write(f)
finally:
    # The source stream is closed only after the write, as in the original
    # cell, but now also when extracting or writing the page fails.
    pdf_obj.close()
# Encrypt a PDF: copy every page of coop.pdf into a writer, protect the
# writer with a password, and save the result as coop-s.pdf.
with open('coop.pdf', 'rb') as f_in:
    pdf = PyPDF2.PdfFileReader(f_in)
    pdf_writer = PyPDF2.PdfFileWriter()

    total_pages = pdf.numPages
    for page_num in range(total_pages):
        source_page = pdf.getPage(page_num)
        pdf_writer.addPage(source_page)

    # Set the password once all pages have been added.
    pdf_writer.encrypt(user_pwd='hicoop')

    # Write while the input file is still open.
    with open('coop-s.pdf', 'wb') as f_out:
        pdf_writer.write(f_out)
# Decrypt: open the encrypted file and unlock it with the password before
# touching any page.
with open('coop-s.pdf', 'rb') as f_in:
    pdf = PyPDF2.PdfFileReader(f_in)
    print(pdf.isEncrypted)
    if pdf.isEncrypted:
        # decrypt() signals a wrong password by returning 0 instead of
        # raising, so check the result here rather than failing later on
        # page access with a less obvious error.
        if pdf.decrypt('hicoop') == 0:
            raise ValueError('wrong password for coop-s.pdf')
    pdf.getPage(0)  # pages can only be used normally after decryption
- True
# Merge PDFs / add a watermark: stamp the first page of a watermark PDF onto
# every page of coop.pdf and save the result as coop-watermarked.pdf.
#
# The watermark has to come from its own, already existing PDF. Reading it
# from 'coop-watermarked.pdf' (the file this cell writes) made PdfFileReader
# fail with "OSError: [Errno 22] Invalid argument" in the recorded run,
# presumably because that file was empty.
# NOTE(review): 'watermark.pdf' is a placeholder name; point it at the real
# one-page watermark file.
with open('coop.pdf', 'rb') as f_in:
    with open('watermark.pdf', 'rb') as f_w:
        pdf = PyPDF2.PdfFileReader(f_in)
        pdf_w = PyPDF2.PdfFileReader(f_w)

        pdf_write = PyPDF2.PdfFileWriter()
        # The same watermark page is merged into every page; fetch it once.
        watermark_page = pdf_w.getPage(0)
        for page_num in range(pdf.numPages):
            page = pdf.getPage(page_num)
            page.mergePage(watermark_page)
            pdf_write.addPage(page)

        # Write while both input files are still open.
        with open('coop-watermarked.pdf', 'wb') as f_out:
            pdf_write.write(f_out)
- ---------------------------------------------------------------------------
- OSError Traceback (most recent call last)
- <ipython-input-39-b87325251ec9> in <module>()
- 3 with open('coop-watermarked.pdf', 'rb') as f_w:
- 4 pdf = PyPDF2.PdfFileReader(f_in)
- ----> 5 pdf_w = PyPDF2.PdfFileReader(f_w)
- 6
- 7 pdf_write = PyPDF2.PdfFileWriter()
- c:\users\coop\miniconda3\envs\coop\lib\site-packages\PyPDF2\pdf.py in __init__(self, stream, strict, warndest, overwriteWarnings)
- 1082 stream = BytesIO(b_(fileobj.read()))
- 1083 fileobj.close()
- -> 1084 self.read(stream)
- 1085 self.stream = stream
- 1086
- c:\users\coop\miniconda3\envs\coop\lib\site-packages\PyPDF2\pdf.py in read(self, stream)
- 1687 if debug: print(">>read", stream)
- 1688 # start at the end:
- -> 1689 stream.seek(-1, 2)
- 1690 if not stream.tell():
- 1691 raise utils.PdfReadError('Cannot read an empty file')
- OSError: [Errno 22] Invalid argument
- python pdf
来源: http://www.bubuko.com/infodetail-2695160.html