I am trying to extract page and header data from a docx file. A file consists of several hundred pages, each of which has a table and a title. The header has relevant information that must be associated with each table. I can extract the header and table data, I just cannot reliably connect them together.
Here is what I have so far, using win32com:
# getting the table page number
from win32com.client import Dispatch

# Selector passed to Range.Information; 3 == wdActiveEndPageNumber.
WD_ACTIVE_END_PAGE_NUMBER = 3

app = Dispatch("Word.Application")
# `filename` is the path of the .docx to inspect; it is defined by the caller.
doc = app.Documents.Open(filename)
# Word collections are 1-based, so Tables(1) is the first table in the document.
table_1_page = doc.Tables(1).Range.Information(WD_ACTIVE_END_PAGE_NUMBER)
The problem arises because the TextFrames and headers are duplicated on multiple pages, so when I call:
# getting the header page number
# Walk down from the first section's header to the text range of its first shape.
section_header = doc.Sections(1).Headers(1)
frame_text_range = section_header.Shapes(1).TextFrame.TextRange
# 3 == wdActiveEndPageNumber; the value is evaluated but not stored, as in an
# interactive session.
frame_text_range.Information(3)
I get only one of the pages on which the TextFrame occurs. Which page is returned seems somewhat arbitrary: sometimes it is the first page, other times it is the last, but it is not predictable.
I spent a little time reading the object model here. Ultimately, it would be nice to capture all of the elements displayed on a given page without having to reinvent the wheel.
EDIT 10/25/16: per request, here is a minimal working example:
# filename docx_parser.py
import os
from traceback import format_exc

import pythoncom


class OpenDoc(object):
    """Open a .docx in a hidden Word instance and read its tables and headers.

    The Word application is started in ``__init__`` and shut down by
    ``close()`` (also invoked from ``__del__``). All indexes passed to Word
    collections are 1-based.
    """

    # Selector passed to Range.Information; 3 == wdActiveEndPageNumber.
    WD_ACTIVE_END_PAGE_NUMBER = 3

    # Characters stripped from both ends of every cell's text: BEL (chr(7))
    # and carriage return (chr(13)).
    CELL_END_MARKS = chr(7) + chr(13)

    def __init__(self, docx_path):
        """Start Word invisibly and open *docx_path*.

        Args:
            docx_path: Path of the document to open. It is stored unchanged
                in ``self.path``; an absolute version is handed to Word.
        """
        import win32com.client as win32

        self.path = docx_path
        # Set both attributes before any COM call so close()/__del__ stay
        # safe when Dispatch or Open raises part-way through construction.
        self.word = None
        self.doc = None
        self.word = win32.Dispatch("Word.Application")
        self.word.Visible = 0
        # Word resolves relative paths against its own working directory,
        # not the script's, so pass an absolute path. Keep the document that
        # Open() returns instead of relying on ActiveDocument.
        self.doc = self.word.Documents.Open(os.path.abspath(docx_path))

    def get_table_count(self):
        """Return the number of tables in the document."""
        return self.doc.Tables.Count

    def count_table_rows(self, table):
        """Return the number of rows of *table*."""
        return table.Rows.Count

    def count_table_columns(self, table):
        """Return the number of columns of *table*."""
        return table.Columns.Count

    def get_headers(self, section_num=1):
        """Yield ``(text, page_num)`` for each shape in a section's header.

        Args:
            section_num: 1-based index of the section whose header (header
                index 1) is read. Defaults to the first section.

        Yields:
            The shape's text-frame text and the page number Word reports for
            that text range. NOTE(review): per the discussion accompanying
            this code, a header text frame is repeated on several pages and
            Word reports only one of them, so ``page_num`` is not a reliable
            key for matching a header to a table.
        """
        headers = self.doc.Sections(section_num).Headers(1)
        shape_count = headers.Shapes.Count
        for shape_num in range(1, shape_count + 1):
            t_range = headers.Shapes(shape_num).TextFrame.TextRange
            text = t_range.Text
            page_num = t_range.Information(self.WD_ACTIVE_END_PAGE_NUMBER)
            yield text, page_num

    def get_table_text(self, table):
        """Yield one list of cell strings per row of *table*.

        A cell that Word refuses to return (``pythoncom.com_error``) is
        represented by an empty string so every row keeps ``Columns.Count``
        entries.
        """
        col_count = self.count_table_columns(table)
        row_count = self.count_table_rows(table)
        for row in range(1, row_count + 1):
            row_data = []
            for col in range(1, col_count + 1):
                try:
                    cell_text = table.Cell(Row=row, Column=col).Range.Text
                except pythoncom.com_error:
                    # Deliberate best effort: keep the column position and
                    # carry on (presumably hit for merged or missing cells).
                    row_data.append("")
                else:
                    row_data.append(cell_text.strip(self.CELL_END_MARKS))
            yield row_data

    def get_all_table_text(self):
        """Yield, for each table in the document, the list of its rows."""
        for table in self.get_tables():
            yield list(self.get_table_text(table))

    def get_tables(self):
        """Yield every table of the document in collection order."""
        for table in self.doc.Tables:
            yield table

    def close(self):
        """Quit the Word instance; safe to call more than once."""
        word = getattr(self, "word", None)
        if word is None:
            return
        # Drop the references first so a failing Quit() is not retried by
        # __del__.
        self.word = None
        self.doc = None
        word.Quit()

    def __del__(self):
        self.close()


def main():
    """Print every table and every header shape of ``sample.docx``."""
    try:
        path = r"sample.docx"
        open_doc = OpenDoc(path)
        try:
            for table_num, table_text in enumerate(open_doc.get_all_table_text()):
                print("\n-------------- Table %s ----------------" % (table_num + 1))
                for row_data in table_text:
                    print(", ".join(row_data))
            for header_text, page_num in open_doc.get_headers():
                print("header page number: %s, text: %s" % (page_num, header_text))
        finally:
            # Shut Word down deterministically instead of waiting for __del__.
            open_doc.close()
    except Exception:
        # Top-level boundary: show the traceback and keep the console open.
        print(format_exc())
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter("")


if __name__ == "__main__":
    main()