Convert a PDF CreationDate to a readable format in Python

I am working with PDF files in Python and I am reading their metadata using PDFMiner. I am extracting the information like this:

 # Print the CreationDate entry of a PDF's metadata using the PDFMiner API.
from pdfminer.pdfparser import PDFParser, PDFDocument

# The with-block closes the file even if parsing raises.
with open('diveintopython.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument()
    # Link the parser and the document to each other, then initialize.
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()
    # Read the metadata while the file is still open.
    # NOTE(review): presumably doc.info is fully loaded by now and stays
    # usable after the file is closed -- confirm against PDFMiner.
    print(doc.info[0]['CreationDate'])
    # And return this value "D:20130501200439+01'00'"

How to convert D:20130501200439+01'00' to a readable format in Python?

+4
source share
2 answers

Is "+01'00'" the time zone information? If you leave it out, you can create a datetime object as follows:

 >>>from time import mktime, strptime >>>from datetime import datetime ... >>>datestring = doc.info[0]['CreationDate'][2:-7] >>>ts = strptime(datestring, "%Y%m%d%H%M%S") >>>dt = datetime.fromtimestamp(mktime(ts)) datetime(2013, 5, 1, 20, 4, 30) 
+5
source

I found the format documented here. I also had to deal with time zones, because I have 160k documents from around the world. Here is my complete solution:

 # Parse PDF date strings such as "D:20120321183444+07'00'" into datetimes.
import datetime
import re

from dateutil.tz import tzoffset, tzutc

# Optional "D:" prefix, then YYYYMMDDHHmmSS, then an optional time zone:
# a sign (+, -, or Z/z for UTC) followed by optional HH and mm parts, each of
# which may be followed by an apostrophe.
# The hyphen in the sign class is escaped on purpose: an unescaped "+-z" is a
# character range from "+" to "z", which also accepts digits and letters.
pdf_date_pattern = re.compile(''.join([
    r"(D:)?",
    r"(?P<year>\d\d\d\d)",
    r"(?P<month>\d\d)",
    r"(?P<day>\d\d)",
    r"(?P<hour>\d\d)",
    r"(?P<minute>\d\d)",
    r"(?P<second>\d\d)",
    r"(?P<tz_offset>[+\-zZ])?",
    r"(?P<tz_hour>\d\d)?",
    r"'?(?P<tz_minute>\d\d)?'?"]))


def transform_date(date_str):
    """
    Convert a pdf date such as "D:20120321183444+07'00'" into a usable datetime
    http://www.verypdf.com/pdfinfoeditor/pdf-date-format.htm
    (D:YYYYMMDDHHmmSSOHH'mm')

    A "Z"/"z" sign or a missing time zone is treated as UTC.  Missing hour or
    minute parts of the offset count as zero.

    :param date_str: pdf date string
    :return: timezone-aware datetime object, or None if date_str does not
        start with a date in the expected format
    :raises ValueError: if the matched fields do not form a valid date/time
        (raised by the datetime constructor, e.g. for month 13)
    """
    match = pdf_date_pattern.match(date_str)
    if match is None:
        return None

    fields = match.groupdict()
    # Remove the time zone pieces so only datetime() keyword arguments remain.
    sign = fields.pop('tz_offset')
    tz_hour = int(fields.pop('tz_hour') or 0)
    tz_minute = int(fields.pop('tz_minute') or 0)

    if sign is None or sign in 'zZ':
        tzinfo = tzutc()
    else:
        direction = 1 if sign == '+' else -1
        tzinfo = tzoffset(None, direction * (3600 * tz_hour + 60 * tz_minute))

    date_parts = {name: int(value) for name, value in fields.items()}
    return datetime.datetime(tzinfo=tzinfo, **date_parts)
+2
source

Source: https://habr.com/ru/post/1480300/


All Articles