ankushshah89 · emrothenberg · Jan 7, 2026
diff --git a/README.md b/README.md
@@ -17,6 +17,8 @@ a. From command line:
 docx2txt file.docx
 # extract text and images
 docx2txt -i /tmp/img_dir file.docx
+# extract text and split on page breaks
+docx2txt -s file.docx
 ```
 b. From python:
 ```python
@@ -26,5 +28,7 @@ import docx2txt
 text = docx2txt.process("file.docx")
 
 # extract text and write images in /tmp/img_dir
-text = docx2txt.process("file.docx", "/tmp/img_dir") 
+text = docx2txt.process("file.docx", img_dir="/tmp/img_dir") 
+# extract text, split on page breaks, and write images in /tmp/img_dir
+text = docx2txt.process("file.docx", split_pages=True, img_dir="/tmp/img_dir") 
 ```
diff --git a/docx2txt/docx2txt.py b/docx2txt/docx2txt.py
@@ -2,6 +2,7 @@
 
 import argparse
 import re
+from typing import List, Literal, Union, cast, overload
 import xml.etree.ElementTree as ET
 import zipfile
 import os
@@ -16,6 +17,8 @@ def process_args():
                                                  'to extract text and images '
                                                  'from docx files.')
     parser.add_argument("docx", help="path of the docx file")
+    parser.add_argument('-s', '--split_pages',
+                        help='Split text on page breaks')
     parser.add_argument('-i', '--img_dir', help='path of directory '
                                                 'to extract images')
 
@@ -47,30 +50,61 @@ def qn(tag):
     return '{{{}}}{}'.format(uri, tagroot)
 
 
-def xml2text(xml):
+@overload
+def xml2text(xml, split_pages: Literal[True]) -> List[str]: ...
+
+
+@overload
+def xml2text(xml, split_pages: Literal[False]) -> str: ...
+
+
+def xml2text(xml, split_pages: bool):
     """
     A string representing the textual content of this run, with content
     child elements like ``<w:tab/>`` translated to their Python
     equivalent.
     Adapted from: https://github.com/python-openxml/python-docx/
     """
     text = u''
+    texts = []
+
     root = ET.fromstring(xml)
     for child in root.iter():
         if child.tag == qn('w:t'):
             t_text = child.text
             text += t_text if t_text is not None else ''
         elif child.tag == qn('w:tab'):
             text += '\t'
+        elif split_pages and child.tag == qn('w:br') and list(child.attrib.values())[0] == "page":
+            texts.append(text)
+            text = u''
         elif child.tag in (qn('w:br'), qn('w:cr')):
             text += '\n'
         elif child.tag == qn("w:p"):
             text += '\n\n'
-    return text
+    return texts if split_pages else text
 
 
-def process(docx, img_dir=None):
-    text = u''
+def strip_list(lst: list):
+    while lst and not lst[-1]:
+        lst.pop()
+
+    while lst and not lst[0]:
+        lst.pop(0)
+
+    return lst
+
+
+@overload
+def process(docx, split_pages: Literal[True], img_dir=None) -> List[str]: ...
+
+
+@overload
+def process(docx, split_pages: Literal[False], img_dir=None) -> str: ...
+
+
+def process(docx, split_pages=False, img_dir=None):
+    text: Union[list[str], str] = [] if split_pages else ""
 
     # unzip the docx in memory
     zipf = zipfile.ZipFile(docx)
@@ -81,18 +115,33 @@ def process(docx, img_dir=None):
     header_xmls = 'word/header[0-9]*.xml'
     for fname in filelist:
         if re.match(header_xmls, fname):
-            text += xml2text(zipf.read(fname))
+            if split_pages:
+                text = cast(list[str], text)
+                text.extend(xml2text(zipf.read(fname), split_pages))
+            else:
+                text = cast(str, text)
+                text += xml2text(zipf.read(fname), split_pages)
 
     # get main text
     doc_xml = 'word/document.xml'
-    text += xml2text(zipf.read(doc_xml))
+    if split_pages:
+        text = cast(list[str], text)
+        text.extend(xml2text(zipf.read(doc_xml), split_pages))
+    else:
+        text = cast(str, text)
+        text += xml2text(zipf.read(doc_xml), split_pages)
 
     # get footer text
     # there can be 3 footer files in the zip
     footer_xmls = 'word/footer[0-9]*.xml'
     for fname in filelist:
         if re.match(footer_xmls, fname):
-            text += xml2text(zipf.read(fname))
+            if split_pages:
+                text = cast(list[str], text)
+                text.extend(xml2text(zipf.read(fname), split_pages))
+            else:
+                text = cast(str, text)
+                text += xml2text(zipf.read(fname), split_pages)
 
     if img_dir is not None:
         # extract images
@@ -104,10 +153,10 @@ def process(docx, img_dir=None):
                     dst_f.write(zipf.read(fname))
 
     zipf.close()
-    return text.strip()
+    return [t.strip() for t in strip_list(cast(list[str], text))] if split_pages else cast(str, text).strip()
 
 
 if __name__ == '__main__':
     args = process_args()
-    text = process(args.docx, args.img_dir)
+    text = process(args.docx, args.split_pages, args.img_dir)
     sys.stdout.write(text.encode('utf-8'))