I found the format documented here. I also had to deal with time zones, because I have 160k documents from around the world. Here is my complete solution:
import datetime import re from dateutil.tz import tzutc, tzoffset pdf_date_pattern = re.compile(''.join([ r"(D:)?", r"(?P<year>\d\d\d\d)", r"(?P<month>\d\d)", r"(?P<day>\d\d)", r"(?P<hour>\d\d)", r"(?P<minute>\d\d)", r"(?P<second>\d\d)", r"(?P<tz_offset>[+-zZ])?", r"(?P<tz_hour>\d\d)?", r"'?(?P<tz_minute>\d\d)?'?"])) def transform_date(date_str): """ Convert a pdf date such as "D:20120321183444+07'00'" into a usable datetime http://www.verypdf.com/pdfinfoeditor/pdf-date-format.htm (D:YYYYMMDDHHmmSSOHH'mm') :param date_str: pdf date string :return: datetime object """ global pdf_date_pattern match = re.match(pdf_date_pattern, date_str) if match: date_info = match.groupdict() for k, v in date_info.iteritems():
source share