Skip to content

Commit

Permalink
add module to export xml collation as docx
Browse files Browse the repository at this point in the history
  • Loading branch information
d-flood committed Nov 1, 2021
1 parent 103eb43 commit f08b9c0
Show file tree
Hide file tree
Showing 9 changed files with 378 additions and 7 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ The standard tool for transcribing ancient New Testament manuscripts ([ITSEE's O
Moving from transcription to collation to analysis requires several steps of intermediate conversion of the data along the way. Tendon is a collection of tools to help 'connect' these three basic tasks.

## What Tendon Does
Tendon is a desktop app with nine distinct tools:
Tendon is a desktop app with ten distinct tools:
1. Convert a plain text transcription of a chapter or other unit into single-verse JSON files properly formatted for use in the Collation Editor. This is the simplest way to get data into the Collation Editor.
2. Get a consolidated plain text file from an entire folder of JSON files.
3. Convert a repurposed superset of Markdown to TEI XML. Included is a graphical user interface (GUI) to my CLI [MarkdownTEI](https://github.com/d-flood/MarkdownTEI). This is presented as a simple, offline alternative to the [Online Transcription Editor (OTE)](https://itsee-wce.birmingham.ac.uk/ote/transcriptiontool). MarkdownTEI-converted files can even be uploaded to the OTE.
Expand All @@ -18,6 +18,7 @@ Tendon is a desktop app with nine distinct tools:
7. Provide a simple way to view TEI XML transcriptions offline using the same styling as the [IGNTP online transcriptions](http://www.itseeweb.bham.ac.uk/epistulae/XML/igntp.xml).
8. Provide an interface to conveniently edit the project configuration. This is how one chooses which witnesses to collate, and which witness should be the basetext.
9. A simple graphical interface for the [open-cbgm](https://github.com/jjmccollum/open-cbgm-standalone) CLI application. This tool is currently Windows-only.
10. Export the TEI XML output of the ITSEE Collation Editor to a Microsoft Word DOCX file suitable for print publication.

## Installation
### Windows standalone version
Expand Down Expand Up @@ -180,6 +181,9 @@ This tab is for getting and viewing pre-genealogical coherence. It can be export
1. "Save as CSV" will save the data as a CSV file that can then be opened in Excel or any spreadsheet software.
2. "View Plain Text" opens a popup with the full text output of the open-cbgm as one would see when operating the CLI. ![Screenshot of plain text output of the "Compare Witnesses" tab](images/open-cbgm/compare_witnesses_plain_text_screenshot.png)

#### Export XML Collation File to DOCX (Microsoft Word)
This feature is working and ready; detailed instructions are forthcoming.

#### More features of the open-cbgm will be added over time.

###### I structured and generated the standalone desktop apps with [Beeware's Briefcase](https://github.com/beeware/briefcase).
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool.briefcase]
project_name = "Tendon"
bundle = "com.davidaflood.tendon"
version = "0.17.1"
version = "0.18"
min_needed_to_update = "0.15"
url = "https://github.com/d-flood/Tendon"
license = "MIT license"
Expand All @@ -21,7 +21,8 @@ requires = [
"Markdown==3.2.2",
"markdown-del-ins",
"PyGithub",
"toml"
"toml",
"python-docx==0.8.11"
]


Expand Down
7 changes: 5 additions & 2 deletions src/tendon/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@
from tendon.mac_layout import mac_layout
from tendon.pc_layout import pc_layout
import tendon.py.update_tendon as ut
from tendon.py.export_to_docx.xml_to_docx_ui import export_to_docx

# if platform.system() == 'Windows':
from tendon.py.cbgm_interface.open_cbgm_ui import open_cbgm_ui

__version = '0.17.1'
__version = '0.18'
#pylint: disable=no-member

def open_new_window(function: FunctionType, window: sg.Window, main_dir, font, icon, include_main_dir=False):
Expand Down Expand Up @@ -88,7 +89,9 @@ def main():
open_new_window(open_cbgm_ui, window, main_dir, font, icon)

elif event == 'Check for Updates':
# ut.check_for_updates(__version)
ut.check_for_updates(__version, window)

elif event == 'export_to_docx':
open_new_window(export_to_docx, window, main_dir, font, icon)

window.close()
1 change: 1 addition & 0 deletions src/tendon/mac_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def mac_layout():
[sg.Button('View TEI Transcriptions', key='tei_server', size=bs)],
[sg.Button('Configure Collation Editor', key='ce_config', size=bs)],
[sg.Button('open-cbgm Interface', key='open-cbgm', size=bs)],
[sg.Button('Export Collation to DOCX', key='export_to_docx', size=bs)],
[sg.Button('Close', size=(30, 1.5), pad=(10, 15))]
]
return layout
3 changes: 2 additions & 1 deletion src/tendon/pc_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ def pc_layout():
[sg.Button('Reformat Collation File', key='reformat_xml', size=bs)],
[sg.Button('View TEI Transcriptions', key='tei_server', size=bs)],
[sg.Button('Configure Collation Editor', key='ce_config', size=bs)],
[sg.B('open-cbgm Interface', key='open-cbgm', size=bs)],
[sg.Button('open-cbgm Interface', key='open-cbgm', size=bs)],
[sg.Button('Export Collation to DOCX', key='export_to_docx', size=bs)],
[sg.Stretch(), sg.Button('Close', pad=(20, 20), size=(20, 2)), sg.Stretch()]
]
return layout
7 changes: 6 additions & 1 deletion src/tendon/py/edit_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,12 @@ def get_settings():
'plain_text_dir': '',
'cbgm_main_dir': '',
'cbgm_cx_dir': '',
'pre_parse_regex': []
'pre_parse_regex': [],
'export_docx_folder': '',
'text_wits_separator': ' // ',
'rdg_n_text_separator': '\t',
'words_per_line': 10,
'text_bold': False,
}
save_settings(settings)
return settings
Expand Down
Binary file added src/tendon/py/export_to_docx/template.docx
Binary file not shown.
274 changes: 274 additions & 0 deletions src/tendon/py/export_to_docx/xml_to_docx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
import os
from pathlib import Path
import re
from typing import List

from lxml import etree as et
from docx import Document
from natsort import natsorted
import PySimpleGUIQt as sg

from tendon.py.reformat_collation.itsee_to_open_cbgm import reformat_xml
import tendon.py.edit_settings as es

# Clark-notation namespace prefixes used to build qualified tag/attribute names.
TEI_NS = '{http://www.tei-c.org/ns/1.0}'
XML_NS = '{http://www.w3.org/XML/1998/namespace}'

# Maps every supported book identifier to the full book name printed in the
# exported document. Three spellings are accepted per book where relevant:
#   * a conventional abbreviation or the full name ('Rom', 'Romans'),
#   * a 'B' + two-digit book number ('B06'), looked up for ids starting with 'B',
#   * a roman-numeral form ('ICor'), because get_xml_file() rewrites a leading
#     1/2/3 of an xml:id to I/II/III before the id is looked up here.
ABBR_TO_FULL = {
    'Matt': 'Matthew',
    'B01': 'Matthew',
    'Mark': 'Mark',
    'B02': 'Mark',
    'Luke': 'Luke',
    'B03': 'Luke',
    'John': 'John',
    'B04': 'John',
    'Acts': 'Acts',
    'B05': 'Acts',
    'Rom': 'Romans',
    'B06': 'Romans',
    'Romans': 'Romans',
    'R': 'Romans',
    '1 Cor': '1 Corinthians',
    '1Cor': '1 Corinthians',
    'ICor': '1 Corinthians',
    'B07': '1 Corinthians',
    '1 Corinthians': '1 Corinthians',
    '2 Cor': '2 Corinthians',
    '2Cor': '2 Corinthians',
    'IICor': '2 Corinthians',
    '2 Corinthians': '2 Corinthians',
    'B08': '2 Corinthians',
    'Gal': 'Galatians',
    'Galatians': 'Galatians',
    'B09': 'Galatians',
    'Eph': 'Ephesians',
    'Ephesians': 'Ephesians',
    'B10': 'Ephesians',
    'Phil': 'Philippians',
    'Philippians': 'Philippians',
    'B11': 'Philippians',
    'Col': 'Colossians',
    'Colossians': 'Colossians',
    'B12': 'Colossians',
    '1 Thess': '1 Thessalonians',
    '1Thess': '1 Thessalonians',
    'IThess': '1 Thessalonians',
    '1 Thessalonians': '1 Thessalonians',
    'B13': '1 Thessalonians',
    '2 Thess': '2 Thessalonians',
    '2Thess': '2 Thessalonians',
    'IIThess': '2 Thessalonians',
    '2 Thessalonians': '2 Thessalonians',
    'B14': '2 Thessalonians',
    '1 Tim': '1 Timothy',
    '1Tim': '1 Timothy',
    'ITim': '1 Timothy',
    '1 Timothy': '1 Timothy',
    'B15': '1 Timothy',
    '2 Tim': '2 Timothy',
    '2Tim': '2 Timothy',
    'IITim': '2 Timothy',
    '2 Timothy': '2 Timothy',
    'B16': '2 Timothy',
    'Titus': 'Titus',
    'B17': 'Titus',
    # Previously mapped to the book *number* ('B18'), which was printed verbatim.
    'Phlm': 'Philemon',
    'Philemon': 'Philemon',
    'B18': 'Philemon',
    # Previously mapped to the book *number* ('B19'), which was printed verbatim.
    'Heb': 'Hebrews',
    'Hebrews': 'Hebrews',
    'B19': 'Hebrews',
    'Jas': 'James',
    'James': 'James',
    'B20': 'James',
    '1 Pet': '1 Peter',
    '1Pet': '1 Peter',
    'IPet': '1 Peter',
    '1 Peter': '1 Peter',
    'B21': '1 Peter',
    '2 Pet': '2 Peter',
    '2Pet': '2 Peter',
    'IIPet': '2 Peter',
    '2 Peter': '2 Peter',
    'B22': '2 Peter',
    '1 John': '1 John',
    'B23': '1 John',
    '1John': '1 John',
    'IJohn': '1 John',
    '2 John': '2 John',
    'B24': '2 John',
    '2John': '2 John',
    'IIJohn': '2 John',
    # Spelled with a space so it matches the '1 John' / '2 John' output format.
    '3 John': '3 John',
    '3John': '3 John',
    'IIIJohn': '3 John',
    'B25': '3 John',
    'Jude': 'Jude',
    'B26': 'Jude',
    'Rev': 'Revelation',
    'Revelation': 'Revelation',
    'B27': 'Revelation',
}

def get_xml_file(xml: str) -> et._Element:
    """Parse a collation XML string and return its root element.

    The text is first normalised, written to a temporary file in the current
    working directory and, when it has no ``<teiHeader>``, passed through
    ``reformat_xml`` before being parsed with lxml.

    Args:
        xml: The full text of the collation file.

    Returns:
        The root element of the parsed document, or ``None`` when
        ``reformat_xml`` raises.

    Raises:
        Whatever ``et.parse`` raises for a document that cannot be parsed.
    """
    written_file = 'temp_xml_collation_file'
    temp_cx_file = written_file
    # A leading 1/2/3 in an xml:id becomes a roman numeral, presumably because
    # an xml:id may not start with a digit -- TODO confirm.
    xml = xml.replace('xml:id="1', 'xml:id="I')
    xml = xml.replace('xml:id="2', 'xml:id="II')
    xml = xml.replace('xml:id="3', 'xml:id="III')
    xml = xml.replace('subreading', 'subr')
    with open(written_file, 'w', encoding='utf-8') as file:
        file.write(xml)
    try:
        if re.search('<teiHeader>', xml) is None:
            try:
                # reformat_xml returns the name of the file to parse.
                temp_cx_file = reformat_xml(temp_cx_file)
            except Exception:
                # Callers are signalled with None when the reformat fails.
                return None
        parser = et.XMLParser(remove_blank_text=True, encoding='UTF-8')
        tree = et.parse(temp_cx_file, parser) #type: et._ElementTree
        return tree.getroot()
    finally:
        # Clean up on every exit path (success, failed reformat, parse error)
        # so that no temporary file is left behind.
        for leftover in {written_file, temp_cx_file}:
            if leftover and os.path.exists(leftover):
                os.remove(leftover)

def get_document():
    """Open the bundled ``template.docx`` and return it as a Document.

    The template is looked up in the directory containing this module; its
    path is printed before the document is opened.
    """
    template = (Path(__file__).parent / 'template.docx').as_posix()
    print(template)
    return Document(template)

def load_xml_file(xml_file: str):
    """Read ``xml_file`` as UTF-8 text and return what ``get_xml_file`` makes of it."""
    with open(xml_file, encoding='utf-8') as handle:
        contents = handle.read()
    return get_xml_file(contents)

def construct_full_ref(ab: et._Element):
    """Build a human-readable verse reference from an ``<ab>`` element's xml:id.

    Two id shapes are recognised (after a ``-APP`` suffix is removed):

    * ids starting with ``B`` that contain ``B<digits>``, ``K<digits>`` and
      ``V<digits>`` parts, e.g. ``B06K1V2`` -> ``Romans 1:2``;
    * ids made of a book abbreviation followed by the chapter/verse, e.g.
      ``Rom1.2`` -> ``Romans 1:2``.

    Args:
        ab: The verse element whose ``xml:id`` attribute is read.

    Returns:
        ``"<full book name> <chapter>:<verse>"`` when the id can be resolved
        through ``ABBR_TO_FULL``; otherwise the id itself (minus ``-APP``).
    """
    ref = ab.get(f'{XML_NS}id').replace('-APP', '') #type: str
    if ref.startswith('B'): # then it is an INTF/IGNTP style reference... probably
        book = re.search(r'B\d+', ref)
        # Capture only the digits so the K/V marker letters are not printed.
        chapter = re.search(r'K(\d+)', ref)
        verse = re.search(r'V(\d+)', ref)
        full_book = ABBR_TO_FULL.get(book.group(0)) if book else None
        if not (full_book and chapter and verse):
            # Not the expected shape after all: fall back to the raw id,
            # exactly as the abbreviation branch below does.
            return ref
        return f'{full_book} {chapter.group(1)}:{verse.group(1)}'
    # One arbitrary character followed by letters, so that abbreviations with
    # a leading digit such as '1Cor' are matched as a whole.
    book = re.search(r'.[a-zA-Z]+', ref)
    if not book:
        return ref
    book = book.group(0)
    full_book = ABBR_TO_FULL.get(book)
    if not full_book:
        return ref
    reference = ref.replace(book, '').replace('.', ':')
    return f'{full_book} {reference}'

def print_reference(document: Document, ab: et._Element):
    """Append the verse reference of ``ab`` as a paragraph styled 'reference'."""
    heading = document.add_paragraph(construct_full_ref(ab))
    heading.style = document.styles['reference']

def group_basetext_words(basetext: str, words_per_line: int) -> List[list]:
    """Split ``basetext`` on whitespace and chunk the words into lines.

    Args:
        basetext: The text to split.
        words_per_line: Maximum number of words in each returned line.

    Returns:
        A list of word lists; every list except possibly the last holds
        exactly ``words_per_line`` words. Empty text gives an empty list.
    """
    words = basetext.split()
    if words_per_line < 1:
        # No usable chunk size: all words end up on one single line.
        return [words] if words else []
    return [
        words[start:start + words_per_line]
        for start in range(0, len(words), words_per_line)
    ]

def construct_basetext(ab: et._Element) -> str:
    """Return the base text of a verse as one space-separated string.

    Walks the direct children of ``ab`` in document order: a ``<seg>``
    contributes its text, an ``<app>`` contributes the text of its ``<lem>``
    unless that lemma has ``type="om"``. Other children are ignored.

    Children that carry no text (and ``<app>`` elements without a ``<lem>``)
    are skipped instead of raising.

    Args:
        ab: The verse element to read.

    Returns:
        The collected texts joined with single spaces.
    """
    seg_tag = f'{TEI_NS}seg'
    app_tag = f'{TEI_NS}app'
    lem_tag = f'{TEI_NS}lem'
    basetext = []
    for elem in ab:
        if elem.tag == seg_tag:
            text = elem.text
        elif elem.tag == app_tag:
            lem = elem.find(lem_tag)
            if lem is None or lem.get('type') == 'om':
                continue
            text = lem.text
        else:
            continue
        # An element without text has ``text is None``; joining that would
        # raise a TypeError.
        if text is not None:
            basetext.append(text)
    return ' '.join(basetext)

def print_basetext(document: Document, ab: et._Element, words_per_line: int):
    """Append the verse's base text to ``document`` as a table of indexed words.

    Each cell holds one word with its index on the line below it; indices are
    the even numbers 2, 4, 6, ... running across the whole verse.

    Args:
        document: The document being built.
        ab: The verse element whose base text is printed.
        words_per_line: Number of words placed on each table row.
    """
    lines = group_basetext_words(construct_basetext(ab), words_per_line)
    # Keep the original 10-column layout, but widen the table when a row
    # holds more words than that; indexing past the last cell would raise.
    widest_line = max((len(line) for line in lines), default=0)
    table = document.add_table(rows=0, cols=max(10, widest_line))
    index = 2
    for line in lines:
        row_cells = table.add_row().cells
        for cell, word in zip(row_cells, line):
            cell.text = f"{word}\n{index}"
            cell.paragraphs[0].style = document.styles['table cell']
            index += 2

def print_app(document: Document, app: et._Element):
    """Append the word-index label of a variation unit, styled 'index'.

    The label is the ``from`` attribute of ``app`` when it equals the ``to``
    attribute, otherwise the range ``from–to``.

    Args:
        document: The document being built.
        app: The ``<app>`` element whose ``from``/``to`` attributes are read.
    """
    app_from = app.get('from')
    app_to = app.get('to')
    if app_from == app_to:
        index = app_from
    else:
        # Separate the two ends with an en dash; without a separator a unit
        # spanning 2 to 4 would read "24".
        index = f'{app_from}\u2013{app_to}'
    p = document.add_paragraph(index)
    p.style = document.styles['index']

def sort_by_ga(wits: List[str]):
    """Order witness sigla by category, naturally sorted within each category.

    Categories, in output order, are decided by the start of each siglum:

    1. papyri       - starts with 'p' or 'P'
    2. majuscules   - starts with '0'
    3. minuscules   - starts with any other digit
    4. lectionaries - starts with 'l' or 'L'
    5. editions     - everything else

    Empty strings (produced, for example, by splitting a witness list that
    contains doubled spaces) are dropped rather than raising an IndexError.

    Args:
        wits: The witness sigla to sort.

    Returns:
        A new list with the sigla in the order described above.
    """
    papyri = []
    majuscules = []
    minuscules = []
    lectionaries = []
    editions = []
    for wit in wits:
        if not wit:
            continue
        lowered = wit.lower()
        if lowered.startswith('p'):
            papyri.append(wit)
        elif wit.startswith('0'):
            majuscules.append(wit)
        elif wit[0].isdigit():
            minuscules.append(wit)
        elif lowered.startswith('l'):
            lectionaries.append(wit)
        else:
            editions.append(wit)
    return natsorted(papyri) + natsorted(majuscules) + natsorted(minuscules) + natsorted(lectionaries) + natsorted(editions)

def print_rdg(
    document, rdg: et._Element,
    text_wits_separator: str,
    rdg_n_text_separator: str,
    text_bold: bool
    ):
    """Append one reading as a paragraph styled 'reading'.

    The paragraph consists of four runs: the reading name (the ``n``
    attribute with digits removed, in italics), ``rdg_n_text_separator``,
    the reading text (or the ``type`` attribute when the element has no
    text; bold according to ``text_bold``) and finally
    ``text_wits_separator`` followed by the sorted witnesses.
    """
    reading_text = rdg.text or rdg.get('type')
    paragraph = document.add_paragraph()
    paragraph.style = document.styles['reading']

    label = re.sub(r'\d', '', rdg.get('n'))
    label_run = paragraph.add_run(label)
    label_run.italic = True

    paragraph.add_run(rdg_n_text_separator)

    text_run = paragraph.add_run(reading_text)
    text_run.bold = text_bold

    witnesses = ' '.join(sort_by_ga(rdg.get('wit').split(' ')))
    paragraph.add_run(f"{text_wits_separator}{witnesses}")

def save_docx(document: Document, settings: dict):
    """Ask the user where to save ``document`` and write it there.

    The save dialog starts in the folder stored under ``export_docx_folder``
    in ``settings``; the folder actually chosen is written back to the
    settings. A ``.docx`` extension is appended when it is missing.

    Returns:
        The path the document was saved to, or ``None`` when no file name
        was chosen.
    """
    chosen = sg.popup_get_file('', no_window=True, save_as=True, initial_folder=settings.get('export_docx_folder'), file_types=(('DOCX Files', '*.docx'),))
    if not chosen:
        return None
    target = chosen if chosen.endswith('.docx') else f'{chosen}.docx'
    target_dir = Path(target).parent.as_posix()
    es.edit_settings('export_docx_folder', target_dir)
    document.save(target)
    return target

def export_xml_to_docx(xml_filename: str):
    """Convert the collation in ``xml_filename`` to a DOCX file chosen by the user.

    For every ``<ab>`` directly under the root this prints the reference and
    the base-text table, then, for each ``<app>`` in it, the index label
    followed by one paragraph per ``<rdg>``. Formatting options come from the
    saved settings.

    Returns:
        Whatever ``save_docx`` returns: the saved file's path, or ``None``
        when no file name was chosen.
    """
    settings = es.get_settings()
    document = get_document()
    # NOTE(review): load_xml_file can yield None (get_xml_file returns None
    # when the reformat step fails); that case is not handled here.
    root = load_xml_file(xml_filename)
    verse_tag = f'{TEI_NS}ab'
    unit_tag = f'{TEI_NS}app'
    reading_tag = f'{TEI_NS}rdg'
    for verse in root.findall(verse_tag):
        print_reference(document, verse)
        print_basetext(document, verse, settings['words_per_line'])
        for unit in verse.findall(unit_tag):
            print_app(document, unit)
            for reading in unit.findall(reading_tag):
                print_rdg(
                    document,
                    reading,
                    settings['text_wits_separator'],
                    settings['rdg_n_text_separator'],
                    settings['text_bold'],
                )
    return save_docx(document, settings)
Loading

0 comments on commit f08b9c0

Please sign in to comment.