add module to export xml collation as docx

d-flood · Nov 1, 2021 · f08b9c0 · f08b9c0
1 parent 103eb43
commit f08b9c0
Show file tree

Hide file tree

Showing 9 changed files with 378 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ The standard tool for transcribing ancient New Testament manuscripts ([ITSEE's O
 Moving from transcription to collation to analysis requires several steps of intermediate conversion of the data along the way. Tendon is a collection of tools to help 'connect' these three basic tasks.
 
 ## What Tendon Does
-Tendon is a desktop app with nine distinct tools:
+Tendon is a desktop app with ten distinct tools:
 1. Convert a plain text transcription of a chapter or other unit into single-verse JSON files properly formatted for use in the Collation Editor. This is the simplest way to get data into the Collation Editor.
 2. Get a consolidated plain text file from an entire folder of JSON files.
 3. Convert a repurposed superset of Markdown to TEI XML. Included is a graphical user interface (GUI) to my CLI [MarkdownTEI](https://github.com/d-flood/MarkdownTEI). This is presented as simple and offline alternative to the [Online Transcription Editor (OTE)](https://itsee-wce.birmingham.ac.uk/ote/transcriptiontool). MarkdownTEI converted files can even be uploaded to the OTE.
@@ -18,6 +18,7 @@ Tendon is a desktop app with nine distinct tools:
 7. Provide a simple way to view TEI XML transcriptions offline using the same styling as the [IGNTP online transcriptions](http://www.itseeweb.bham.ac.uk/epistulae/XML/igntp.xml).
 8. Provide an interface to conveniently edit the project configuration. This is how one chooses which witnesses to collate, and which witness should be the basetext.
 9. A simple graphical interface for the [open-cbgm](https://github.com/jjmccollum/open-cbgm-standalone) CLI application. This tool is windows only right now.
+10. Export the TEI XML output of the ITSEE Collation Editor to a Microsoft Word DOCX file suitable for print publication.
 
 ## Installation
 ### Windows standalone version
@@ -180,6 +181,9 @@ This tab is for getting and viewing pre-genealogical coherence. It can be export
   1. "Save as CSV" will save the data as a CSV file that can then be opened in Excel or any spreadsheet software.
   2. "View Plain Text" opens a popup with the full text output of the open-cbgm as one would see when operating the CLI. ![Screenshot of plain text output of the "Compare Witnesses" tab](images/open-cbgm/compare_witnesses_plain_text_screenshot.png)
 
+#### Export XML Collation File to DOCX (Microsoft Word)
+This feature is working and ready to use; detailed instructions are still to come.
+
 #### More features of the open-cbgm will be added over time.
 
 ###### I structured and generated the standalone desktop apps with [Beeware's Briefcase](https://github.com/beeware/briefcase).
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.briefcase]
 project_name = "Tendon"
 bundle = "com.davidaflood.tendon"
-version = "0.17.1"
+version = "0.18"
 min_needed_to_update = "0.15"
 url = "https://github.com/d-flood/Tendon"
 license = "MIT license"
@@ -21,7 +21,8 @@ requires = [
     "Markdown==3.2.2",
     "markdown-del-ins",
     "PyGithub",
-    "toml"
+    "toml",
+    "python-docx==0.8.11"
 ]
 
 

diff --git a/src/tendon/app.py b/src/tendon/app.py
@@ -16,11 +16,12 @@
 from tendon.mac_layout import mac_layout
 from tendon.pc_layout import pc_layout
 import tendon.py.update_tendon as ut
+from tendon.py.export_to_docx.xml_to_docx_ui import export_to_docx
 
 # if platform.system() == 'Windows':
 from tendon.py.cbgm_interface.open_cbgm_ui import open_cbgm_ui
 
-__version = '0.17.1'
+__version = '0.18'
 #pylint: disable=no-member
 
 def open_new_window(function: FunctionType, window: sg.Window, main_dir, font, icon, include_main_dir=False):
@@ -88,7 +89,9 @@ def main():
             open_new_window(open_cbgm_ui, window, main_dir, font, icon)
 
         elif event == 'Check for Updates':
-            # ut.check_for_updates(__version)
             ut.check_for_updates(__version, window)
 
+        elif event == 'export_to_docx':
+            open_new_window(export_to_docx, window, main_dir, font, icon)
+
     window.close()
diff --git a/src/tendon/mac_layout.py b/src/tendon/mac_layout.py
@@ -16,6 +16,7 @@ def mac_layout():
         [sg.Button('View TEI Transcriptions', key='tei_server', size=bs)],
         [sg.Button('Configure Collation Editor', key='ce_config', size=bs)],
         [sg.Button('open-cbgm Interface', key='open-cbgm', size=bs)],
+        [sg.Button('Export Collation to DOCX', key='export_to_docx', size=bs)],
         [sg.Button('Close', size=(30, 1.5), pad=(10, 15))]
         ]
     return layout
diff --git a/src/tendon/pc_layout.py b/src/tendon/pc_layout.py
@@ -18,7 +18,8 @@ def pc_layout():
         [sg.Button('Reformat Collation File', key='reformat_xml', size=bs)],
         [sg.Button('View TEI Transcriptions', key='tei_server', size=bs)],
         [sg.Button('Configure Collation Editor', key='ce_config', size=bs)],
-        [sg.B('open-cbgm Interface', key='open-cbgm', size=bs)],
+        [sg.Button('open-cbgm Interface', key='open-cbgm', size=bs)],
+        [sg.Button('Export Collation to DOCX', key='export_to_docx', size=bs)],
         [sg.Stretch(), sg.Button('Close', pad=(20, 20), size=(20, 2)), sg.Stretch()]
     ]
     return layout
diff --git a/src/tendon/py/edit_settings.py b/src/tendon/py/edit_settings.py
@@ -39,7 +39,12 @@ def get_settings():
             'plain_text_dir': '',
             'cbgm_main_dir': '',
             'cbgm_cx_dir': '',
-            'pre_parse_regex': []
+            'pre_parse_regex': [],
+            'export_docx_folder': '',
+            'text_wits_separator': ' // ',
+            'rdg_n_text_separator': '\t',
+            'words_per_line': 10,
+            'text_bold': False,
         }
         save_settings(settings)
     return settings

diff --git a/src/tendon/py/export_to_docx/template.docx b/src/tendon/py/export_to_docx/template.docx
diff --git a/src/tendon/py/export_to_docx/xml_to_docx.py b/src/tendon/py/export_to_docx/xml_to_docx.py
@@ -0,0 +1,274 @@
+import os
+from pathlib import Path
+import re
+from typing import List
+
+from lxml import etree as et
+from docx import Document
+from natsort import natsorted
+import PySimpleGUIQt as sg
+
+from tendon.py.reformat_collation.itsee_to_open_cbgm import reformat_xml
+import tendon.py.edit_settings as es
+
+# Namespace prefixes in the '{uri}' form used to build qualified tag and
+# attribute names (e.g. f'{TEI_NS}ab', f'{XML_NS}id').
+TEI_NS = '{http://www.tei-c.org/ns/1.0}'
+XML_NS = '{http://www.w3.org/XML/1998/namespace}'
+
+# Maps every supported book label to the full book name printed in the
+# exported document. Three spellings are covered per book where relevant:
+#   * common abbreviations ('Rom', '1 Cor', '1Cor'),
+#   * 'Bnn' book numbers as found in INTF/IGNTP style references,
+#   * roman-numeral forms ('ICor', 'IIThess'), because get_xml_file rewrites
+#     a leading 1/2/3 in xml:id values to I/II/III before parsing.
+# Every value must be a human-readable book name, never another key.
+ABBR_TO_FULL = {
+    'Matt': 'Matthew',
+    'B01': 'Matthew',
+    'Mark': 'Mark',
+    'B02': 'Mark',
+    'Luke': 'Luke',
+    'B03': 'Luke',
+    'John': 'John',
+    'B04': 'John',
+    'Acts': 'Acts',
+    'B05': 'Acts',
+    'Rom': 'Romans',
+    'B06': 'Romans',
+    'Romans': 'Romans',
+    'R': 'Romans',
+    '1 Cor': '1 Corinthians',
+    '1Cor': '1 Corinthians',
+    'ICor': '1 Corinthians',
+    'B07': '1 Corinthians',
+    '1 Corinthians': '1 Corinthians',
+    '2 Cor': '2 Corinthians',
+    '2Cor': '2 Corinthians',
+    'IICor': '2 Corinthians',
+    '2 Corinthians': '2 Corinthians',
+    'B08': '2 Corinthians',
+    'Gal': 'Galatians',
+    'Galatians': 'Galatians',
+    'B09': 'Galatians',
+    'Eph': 'Ephesians',
+    'Ephesians': 'Ephesians',
+    'B10': 'Ephesians',
+    'Phil': 'Philippians',
+    'Philippians': 'Philippians',
+    'B11': 'Philippians',
+    'Col': 'Colossians',
+    'Colossians': 'Colossians',
+    'B12': 'Colossians',
+    '1 Thess': '1 Thessalonians',
+    '1Thess': '1 Thessalonians',
+    'IThess': '1 Thessalonians',
+    '1 Thessalonians': '1 Thessalonians',
+    'B13': '1 Thessalonians',
+    '2 Thess': '2 Thessalonians',
+    '2Thess': '2 Thessalonians',
+    'IIThess': '2 Thessalonians',
+    '2 Thessalonians': '2 Thessalonians',
+    'B14': '2 Thessalonians',
+    '1 Tim': '1 Timothy',
+    '1Tim': '1 Timothy',
+    'ITim': '1 Timothy',
+    '1 Timothy': '1 Timothy',
+    'B15': '1 Timothy',
+    '2 Tim': '2 Timothy',
+    '2Tim': '2 Timothy',
+    'IITim': '2 Timothy',
+    '2 Timothy': '2 Timothy',
+    'B16': '2 Timothy',
+    'Titus': 'Titus',
+    'B17': 'Titus',
+    'Phlm': 'Philemon',
+    'Philemon': 'Philemon',
+    'B18': 'Philemon',
+    'Heb': 'Hebrews',
+    'Hebrews': 'Hebrews',
+    'B19': 'Hebrews',
+    'Jas': 'James',
+    'James': 'James',
+    'B20': 'James',
+    '1 Pet': '1 Peter',
+    '1Pet': '1 Peter',
+    'IPet': '1 Peter',
+    '1 Peter': '1 Peter',
+    'B21': '1 Peter',
+    '2 Pet': '2 Peter',
+    '2Pet': '2 Peter',
+    'IIPet': '2 Peter',
+    '2 Peter': '2 Peter',
+    'B22': '2 Peter',
+    '1 John': '1 John',
+    'B23': '1 John',
+    '1John': '1 John',
+    'IJohn': '1 John',
+    '2 John': '2 John',
+    'B24': '2 John',
+    '2John': '2 John',
+    'IIJohn': '2 John',
+    '3 John': '3 John',
+    '3John': '3 John',
+    'IIIJohn': '3 John',
+    'B25': '3 John',
+    'Jude': 'Jude',
+    'B26': 'Jude',
+    'Rev': 'Revelation',
+    'Revelation': 'Revelation',
+    'B27': 'Revelation',
+}
+
+def get_xml_file(xml: str) -> et._Element:
+    """Parse the text of an XML collation file and return its root element.
+
+    The text is first normalised:
+      * an xml:id value starting with 1, 2 or 3 has that digit replaced by
+        I, II or III;
+      * every occurrence of 'subreading' is shortened to 'subr'.
+    The result is written to a temporary file in the current working
+    directory. When the text contains no '<teiHeader>' the temporary file is
+    passed through reformat_xml first (presumably converting raw Collation
+    Editor output - confirm against reformat_xml), and the path it returns
+    is the one that gets parsed.
+
+    Returns:
+        The root element, or None when reformat_xml fails.
+
+    Raises:
+        Whatever lxml raises when the (possibly reformatted) file is not
+        well-formed XML.
+    """
+    temp_cx_file = 'temp_xml_collation_file'
+    xml = xml.replace('xml:id="1', 'xml:id="I')
+    xml = xml.replace('xml:id="2', 'xml:id="II')
+    xml = xml.replace('xml:id="3', 'xml:id="III')
+    xml = xml.replace('subreading', 'subr')
+    # Every path this call may have created; all are removed on the way out,
+    # whether parsing succeeds, fails, or reformatting gives up.
+    created_files = [temp_cx_file]
+    try:
+        with open(temp_cx_file, 'w', encoding='utf-8') as file:
+            file.write(xml)
+        if '<teiHeader>' not in xml:
+            try:
+                temp_cx_file = reformat_xml(temp_cx_file)
+            except Exception:
+                # Deliberate best effort: callers receive None for input that
+                # cannot be reformatted. KeyboardInterrupt/SystemExit are no
+                # longer swallowed as they were by the former bare except.
+                return None
+            created_files.append(temp_cx_file)
+        parser = et.XMLParser(remove_blank_text=True, encoding='UTF-8')
+        tree = et.parse(temp_cx_file, parser) #type: et._ElementTree
+        return tree.getroot()
+    finally:
+        for path in dict.fromkeys(created_files):
+            if path and os.path.exists(path):
+                os.remove(path)
+
+def get_document():
+    """Return a new python-docx Document based on the bundled template.
+
+    The template is 'template.docx' in the same directory as this module;
+    the other functions in this module look up their paragraph styles
+    ('reference', 'table cell', 'index', 'reading') on the returned document.
+    """
+    template = Path(__file__).parent.joinpath('template.docx')
+    return Document(template.as_posix())
+
+def load_xml_file(xml_file: str):
+    """Read xml_file as UTF-8 text and return whatever get_xml_file makes of it."""
+    xml_text = Path(xml_file).read_text(encoding='utf-8')
+    return get_xml_file(xml_text)
+
+def construct_full_ref(ab: et._Element):
+    """Build a printable verse reference from the xml:id of an <ab> element.
+
+    A trailing '-APP' marker is removed from the id first. Two id styles are
+    recognised:
+
+    * INTF/IGNTP style, e.g. 'B06K1V1' -> 'Romans 1:1'. The B/K/V markers
+      are dropped so only the numbers are printed.
+    * Abbreviation style, e.g. 'Rom1.1', 'Rom.1.1' or 'ICor13.4'
+      -> 'Romans 1:1' / '1 Corinthians 13:4'.
+
+    Returns:
+        The formatted reference, or the id itself (minus '-APP') when the
+        book cannot be resolved through ABBR_TO_FULL. An element without an
+        xml:id yields an empty string.
+    """
+    ref = (ab.get(f'{XML_NS}id') or '').replace('-APP', '') #type: str
+    if ref.startswith('B'): # then it is an INTF/IGNTP style reference... probably
+        book = re.search(r'B\d+', ref)
+        chapter = re.search(r'K(\d+)', ref)
+        verse = re.search(r'V(\d+)', ref)
+        if book and chapter and verse:
+            full_book = ABBR_TO_FULL.get(book.group(0))
+            if not full_book:
+                return ref
+            return f'{full_book} {chapter.group(1)}:{verse.group(1)}'
+        # Starts with 'B' but is not B..K..V..; try it as an abbreviation.
+    # Optional leading digit so ids such as '1Cor1.1' resolve as well; a bare
+    # one-letter abbreviation ('R') is accepted too.
+    book = re.search(r'\d?[a-zA-Z]+', ref)
+    if not book:
+        return ref
+    book = book.group(0)
+    full_book = ABBR_TO_FULL.get(book)
+    if not full_book:
+        return ref
+    # Drop the abbreviation once, then any separator that followed it, so
+    # 'Rom.1.1' does not come out as 'Romans :1:1'.
+    reference = ref.replace(book, '', 1).lstrip('. ').replace('.', ':')
+    return f'{full_book} {reference}'
+
+def print_reference(document: Document, ab: et._Element):
+    """Append the reference built from ab as a paragraph in the 'reference' style."""
+    heading = document.add_paragraph(construct_full_ref(ab))
+    heading.style = document.styles['reference']
+
+def group_basetext_words(basetext: str, words_per_line: int) -> List[list]:
+    """Split basetext on whitespace and group the words into rows.
+
+    Each row holds words_per_line words; the last row holds whatever is left
+    over. A row is only closed when it reaches exactly words_per_line words,
+    so a value below 1 puts every word into a single row. An empty basetext
+    gives an empty list.
+    """
+    rows = []
+    row = []
+    for token in basetext.split():
+        row.append(token)
+        if len(row) == words_per_line:
+            rows.append(row)
+            row = []
+    if row:
+        rows.append(row)
+    return rows
+
+def construct_basetext(ab: et._Element) -> str:
+    """Assemble the basetext of a verse from the children of its <ab> element.
+
+    Text is taken, in document order, from
+      * every <seg> child, and
+      * the <lem> of every <app> child, unless that <lem> has type="om".
+    Children without text, and <app> elements without a <lem>, are skipped
+    instead of raising.
+
+    Returns:
+        The collected pieces joined by single spaces.
+    """
+    pieces = []
+    for elem in ab:
+        if elem.tag == f'{TEI_NS}seg':
+            text = elem.text
+        elif elem.tag == f'{TEI_NS}app':
+            lem = elem.find(f'{TEI_NS}lem')
+            if lem is None or lem.get('type') == 'om':
+                continue
+            text = lem.text
+        else:
+            continue
+        if text:  # elem.text is None for empty elements; ' '.join rejects None
+            pieces.append(text)
+    return ' '.join(pieces)
+
+def print_basetext(document: Document, ab: et._Element, words_per_line: int):
+    """Append the basetext of ab to the document as a table of numbered words.
+
+    One table row is added per group returned by group_basetext_words. Each
+    cell holds a word, a newline and the word's index; indices are the even
+    numbers 2, 4, 6, ... and keep counting across rows. Every cell paragraph
+    gets the 'table cell' style.
+
+    The table has words_per_line columns, widened if a row is longer than
+    that, so a words_per_line setting above 10 no longer raises IndexError.
+    Note that a setting below 10 now gives a narrower table than the former
+    fixed 10 columns.
+    """
+    lines = group_basetext_words(construct_basetext(ab), words_per_line)
+    widest_line = max((len(line) for line in lines), default=0)
+    table = document.add_table(rows=0, cols=max(words_per_line, widest_line, 1))
+    cell_style = document.styles['table cell']
+    index = 2
+    for line in lines:
+        row_cells = table.add_row().cells
+        for column, word in enumerate(line):
+            row_cells[column].text = f"{word}\n{index}"
+            row_cells[column].paragraphs[0].style = cell_style
+            index += 2
+
+def print_app(document: Document, app: et._Element):
+    """Append the word-index label of app as a paragraph in the 'index' style.
+
+    The label is the 'from' attribute alone when it equals 'to', otherwise
+    'from' and 'to' joined by an en dash.
+    """
+    start, end = app.get('from'), app.get('to')
+    label = start if start == end else f'{start}–{end}'
+    paragraph = document.add_paragraph(label)
+    paragraph.style = document.styles['index']
+
+def sort_by_ga(wits: List[str]):
+    """Order witness sigla by category, natural-sorted within each category.
+
+    Categories, in output order and as decided by the first character:
+      1. papyri       - starts with 'p' or 'P'
+      2. majuscules   - starts with '0'
+      3. minuscules   - starts with any other digit
+      4. lectionaries - starts with 'l' or 'L'
+      5. editions     - everything else
+    (The name suggests Gregory-Aland numbering - presumably the sigla follow
+    that scheme; confirm against the collation data.)
+
+    Empty strings are dropped; they used to raise IndexError on wit[0].
+
+    Returns:
+        A new list; the input list is not modified.
+    """
+    papyri = []
+    majuscules = []
+    minuscules = []
+    lectionaries = []
+    editions = []
+    for wit in wits:
+        if not wit:
+            continue
+        if wit.lower().startswith('p'):
+            papyri.append(wit)
+        elif wit.startswith('0'):
+            majuscules.append(wit)
+        elif wit[0].isdigit():
+            minuscules.append(wit)
+        elif wit.lower().startswith('l'):
+            lectionaries.append(wit)
+        else:
+            editions.append(wit)
+    return natsorted(papyri) + natsorted(majuscules) + natsorted(minuscules) + natsorted(lectionaries) + natsorted(editions)
+
+def print_rdg(
+    document, rdg: et._Element, 
+    text_wits_separator: str, 
+    rdg_n_text_separator: str, 
+    text_bold: bool
+    ):
+    """Append one reading as a paragraph in the 'reading' style.
+
+    The paragraph consists of four runs:
+      1. the reading name - the 'n' attribute with all digits removed - in
+         italics,
+      2. rdg_n_text_separator,
+      3. the reading text, or the 'type' attribute when the reading has no
+         text; bold when text_bold is true,
+      4. text_wits_separator followed by the witnesses from the 'wit'
+         attribute, ordered by sort_by_ga and joined by single spaces.
+
+    Missing 'n', 'type' or 'wit' attributes are treated as empty rather than
+    raising.
+    """
+    greek_text = rdg.text or rdg.get('type') or ''
+    p = document.add_paragraph()
+    p.style = document.styles['reading']
+    rdg_name = re.sub(r'\d', '', rdg.get('n') or '')
+    p.add_run(rdg_name).italic = True
+    p.add_run(rdg_n_text_separator)
+    p.add_run(greek_text).bold = text_bold
+    # split() without an argument collapses runs of whitespace, so doubled
+    # spaces in the attribute never produce empty sigla.
+    wits = ' '.join(sort_by_ga((rdg.get('wit') or '').split()))
+    p.add_run(f"{text_wits_separator}{wits}")
+
+def save_docx(document: Document, settings: dict):
+    """Ask the user where to save the document, then save it there.
+
+    The save dialog opens in the folder stored under 'export_docx_folder' in
+    settings. '.docx' is appended when the chosen name does not already end
+    with it, and the folder of the chosen file is written back to the
+    'export_docx_folder' setting before saving.
+
+    Returns:
+        The path the document was saved to, or None when no file was chosen.
+    """
+    chosen = sg.popup_get_file(
+        '',
+        no_window=True,
+        save_as=True,
+        initial_folder=settings.get('export_docx_folder'),
+        file_types=(('DOCX Files', '*.docx'),),
+    )
+    if not chosen:
+        return
+    target = chosen if chosen.endswith('.docx') else f'{chosen}.docx'
+    es.edit_settings('export_docx_folder', Path(target).parent.as_posix())
+    document.save(target)
+    return target
+
+def export_xml_to_docx(xml_filename: str):
+    """Export an XML collation file to a DOCX file chosen by the user.
+
+    For every <ab> element directly under the root this writes the verse
+    reference, the basetext table, and then, for each <app> child, its index
+    label followed by one paragraph per <rdg>. Formatting options come from
+    the saved settings; a key missing from the settings falls back to the
+    default that edit_settings.get_settings uses for a fresh settings file.
+
+    Returns:
+        The path of the saved DOCX file, or None when the XML could not be
+        loaded or the user did not choose a file.
+    """
+    settings = es.get_settings()
+    root = load_xml_file(xml_filename)
+    if root is None:
+        # get_xml_file returns None when the input could not be reformatted.
+        return None
+    words_per_line = settings.get('words_per_line', 10)
+    text_wits_separator = settings.get('text_wits_separator', ' // ')
+    rdg_n_text_separator = settings.get('rdg_n_text_separator', '\t')
+    text_bold = settings.get('text_bold', False)
+    document = get_document()
+    for ab in root.findall(f'{TEI_NS}ab'):
+        print_reference(document, ab)
+        print_basetext(document, ab, words_per_line)
+        for app in ab.findall(f'{TEI_NS}app'):
+            print_app(document, app)
+            for rdg in app.findall(f'{TEI_NS}rdg'): #type: et._Element
+                print_rdg(
+                    document, rdg, text_wits_separator,
+                    rdg_n_text_separator, text_bold
+                    )
+    return save_docx(document, settings)