python3 抽取PDF文字

2021-09-25 22:25:41 字數 3473 閱讀 3519

# -*- coding: utf-8 -*-

import datetime
import os
import re
import sys
import time
from datetime import timedelta
from io import StringIO

import pymysql
import redis
import requests
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

def isvailddate(date):
    """Return True if *date* is a well-formed date or date-time string.

    Strings containing a colon are parsed as ``YYYY-MM-DD HH:MM:SS``;
    all other strings are parsed as ``YYYY-MM-DD``.

    Args:
        date: The string to validate.

    Returns:
        True when the string parses with the selected format, False
        otherwise (including when *date* is not a string).
    """
    try:
        if ":" in date:
            time.strptime(date, "%Y-%m-%d %H:%M:%S")
        else:
            time.strptime(date, "%Y-%m-%d")
    except (ValueError, TypeError):
        # ValueError: the text does not match the format.
        # TypeError: a non-string (e.g. None) was passed in.
        return False
    return True

def get_day_of_day(str2date, n=0):
    """Return the date *n* days away from *str2date*.

    Args:
        str2date: A value that supports adding a ``timedelta``
            (for example ``datetime.date`` or ``datetime.datetime``).
        n: Day offset. Positive values move forward in time, negative
            values move backward, and 0 returns an equal date.

    Returns:
        *str2date* shifted by *n* days.
    """
    # timedelta accepts negative day counts, so a single addition covers
    # both directions; no separate branch for n < 0 is needed.
    return str2date + timedelta(days=n)

def valid_date(startdate, enddate):
    """Check that *startdate* is not later than *enddate*.

    Both arguments are parsed as ``YYYY-MM-DD`` strings.

    Args:
        startdate: Start of the range.
        enddate: End of the range.

    Returns:
        True when both strings parse and the start date is on or before
        the end date. False when the start date is later, or when either
        string cannot be parsed (the parse error is printed).
    """
    date_format = "%Y-%m-%d"
    try:
        # Both values are parsed inside the same try so that a malformed
        # end date is reported the same way as a malformed start date.
        start = time.strptime(startdate, date_format)
        end = time.strptime(enddate, date_format)
    except (ValueError, TypeError) as e:
        print(e)
        return False
    # Compare (year, month, day) directly; this avoids time.mktime, whose
    # result depends on the local time zone.
    return end[:3] >= start[:3]

def getcontet(pdfpath, pages=None):
    """Read a PDF file and return its text with all whitespace removed.

    Args:
        pdfpath: Path of the PDF file to read. When None, nothing is
            read and None is returned.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``. A falsy value becomes an empty set
            (presumably meaning "all pages" -- confirm against pdfminer).

    Returns:
        The extracted text with every run of whitespace stripped out, or
        None when *pdfpath* is None. If page processing fails, the error
        is printed and whatever text was produced so far is returned.

    Raises:
        OSError: If the file at *pdfpath* cannot be opened.
    """
    if pdfpath is None:
        return None
    print("pdfurl====" + pdfpath)

    pagenums = set(pages) if pages else set()

    output = StringIO()
    # Resource manager object shared by the converter and the interpreter.
    manager = PDFResourceManager()
    # Device object that writes the text it receives into `output`.
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        # Interpreter that feeds each page through the converter.
        interpreter = PDFPageInterpreter(manager, converter)
        with open(pdfpath, 'rb') as infile:
            try:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
            except Exception as e:
                # Best effort: report the failure and keep the text
                # collected before the error.
                print(e)
                print("pdf內容提取失敗")
        text = output.getvalue()
    finally:
        # Release the converter and the buffer even if open() raised.
        converter.close()
        output.close()

    return re.sub(r'\s+', '', text)

# NOTE(review): orphaned tail of a function. The `def` line, the code that
# produces `res` and `pdf_path`, and the `if` that the `else` below belongs to
# are not present in SOURCE, and the original indentation was lost, so the
# nesting of these statements cannot be recovered from here.
# Visible behaviour: write the body of `res` to `pdf_path` in binary mode,
# 100000 bytes per chunk, then return the path; the `else` branch returns None.
# Presumably `res` is a `requests` response -- TODO confirm.
with open(pdf_path, 'wb') as file:

for chunk in res.iter_content(100000):

file.write(chunk)

return pdf_path

else:

return

# Connection pool used by the Redis client below.
# NOTE(review): `redishost` and `redisport` are not defined in the visible
# part of the file -- presumably configuration constants; confirm.
rdp = redis.ConnectionPool(host=redishost, port=redisport, password="")

# Redis client that draws its connections from the pool.
r = redis.StrictRedis(connection_pool=rdp)

if __name__ == "__main__":
    # Expect exactly two arguments after the script name:
    # the start date and the end date.
    if len(sys.argv) != 3:
        raise Exception("請按格式執行指令碼,如:python pdf_handle_store.py 2017-08-01 2017-08-30")

    # Validate each date string on its own.
    frist = isvailddate(sys.argv[1])
    if not frist:
        raise Exception("請輸入有效的日期")

    second = isvailddate(sys.argv[2])
    if not second:
        raise Exception("請輸入有效的日期")

    # The start date must not be later than the end date.
    result = valid_date(sys.argv[1], sys.argv[2])
    if not result:
        raise Exception("開始時間不能大於結束時間")

    startdate = sys.argv[1]
    enddate = sys.argv[2]

python3 將pdf檔案轉為text

PDF檔案儘管可以用Python提取文字,但存在加密的情況,那種PDF就是解析不了的。另外PDF更類似於圖片,所以即使可以用Python提取,結果也容易有問題,效果不敢保證。在Python3中解析PDF一般用pdfminer3k,也就是pdfminer的Python3版本。直接用pip安裝即可 pip ...

Python3檢驗pdf檔案是否有效

利用PyPDF2的PdfFileReader類別開啟PDF檔案,如果不拋異常,就認為此PDF檔案有效。有時開啟並不丟擲異常,但是會出現這種警告:UserWarning: startxref on same line as offset [pdf.py:1680]。這種情況PDF多半也是壞的,可進一步通過頁數判...

python3 切片 python3 切片

取一個list或tuple的部分元素是非常常見的操作。比如,一個list如下:L = ['Michael', 'Sarah', 'Tracy', 'Bob', 'Jack']。取前3個元素,應該怎麼做?笨辦法:[L[0], L[1], L[2]],得到 ['Michael', 'Sarah', 'Tracy']。之所以是笨辦法,是因為擴展一下,取前n個元素就沒轍了。取前n...