Add support for UBXF format

Also support conversion of USFM `srcloc` attribute to internal format and back.
schierlm · Sep 25, 2024 · 93eeed0 · 93eeed0
1 parent 5bc4b4a
commit 93eeed0
Show file tree

Hide file tree

Showing 4 changed files with 598 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -94,6 +94,7 @@ In addition, the following other formats are supported, with varying accuracy:
   - **[USFM 2](https://markups.paratext.org/usfm/)**: import and export
   - **[USX 2/USX 3](https://markups.paratext.org/usx/)**: import and export
 - **[USFX](https://ebible.org/usfx/)**: import and export
+- **[UBXF](https://resource-container.readthedocs.io/en/latest/ubxf.html)**: import (and conversion to "normal" USFM)
 - **[SwordSearcher](https://www.swordsearcher.com/) ([Forge](https://www.swordsearcher.com/forge/))**: export only
 - **[MySword](https://www.mysword.info/)**: import and export
 - **[Obsidian](https://obsidian.md/)**: export only
@@ -310,6 +311,30 @@ when these occur):
     - 1-4.7 becomes 1-4
     - 1.4-7 becomes 1
 
+UBXF Support
+------------
+
+UBXF bibles are based on USFM 3 but contain extra alignment milestones that can be used to
+align the text to another UBXF bible. In general, the `\w` tags in UBXF bibles only contain
+lemma and morphology information if the bible is in a source language - other bibles contain
+alignment information to another Bible instead. BibleMulticonverter can perform the following
+operations on UBXF files:
+
+- Add `srcloc` attributes to `\w` tags of source bibles, simply by counting them within a verse
+- Create a database file that contains word information from a (source language) bible and maps
+  it to Lemma/Strongs/Morphology
+- Use this database file to augment alignment milestones in a translated bible with those data
+- Create new `\w` tags for parts between alignment milestones, if your bible does not have them
+  and/or you want them grouped by source word not by translated word.
+- Fill `\w` tag attributes based on the augmentations in the alignment milestones (to convert to
+  a format that does not support alignment milestones, i.e. most formats)
+- Convert grammar tags (Strongs, morphology) from the format used by
+  [unfoldingWord](https://unfoldingword.us/) to normal Strongs/RMAC/WIVU tagging used by other formats
+
+Depending on which UBXF translations you have and what format you ultimately want to convert them
+to, you will have to decide which of the steps above to perform and which to skip.
+
+
 E-Sword export
 --------------
 

diff --git a/biblemulticonverter/src/main/java/biblemulticonverter/MainModuleRegistry.java b/biblemulticonverter/src/main/java/biblemulticonverter/MainModuleRegistry.java
@@ -34,6 +34,7 @@ public Collection<Module<ExportFormat>> getExportFormats() {
 		result.add(new Module<ExportFormat>("ScrambledParatextDump", "Like ParatextDump, but with scrambled text; for tests with non-free bibles.", ScrambledParatextDump.HELP_TEXT, ScrambledParatextDump.class));
 		result.add(new Module<ExportFormat>("ParatextStripped", "Export parts of a Paratext bible", ParatextStripped.HELP_TEXT, ParatextStripped.class));
 		result.add(new Module<ExportFormat>("ParatextValidate", "Validate a Paratext bible", ParatextValidate.HELP_TEXT, ParatextValidate.class));
+		result.add(new Module<ExportFormat>("UBXFConverter", "Convert some tags in an UBXF bible", UBXFConverter.HELP_TEXT, UBXFConverter.class));
 		result.add(new Module<ExportFormat>("Volksbibel2000", "Export format for reimporting into Volksbibel 2000", Volksbibel2000.HELP_TEXT, Volksbibel2000.class));
 		result.add(new Module<ExportFormat>("OnLineBible", "Export format for importing into OnLine Bible", OnLineBible.HELP_TEXT, OnLineBible.class));
 		result.add(new Module<ExportFormat>("BrowserBible", "Export format for The Browser Bible 3 by Digital Bible Society", BrowserBible.HELP_TEXT, BrowserBible.class));

diff --git a/...ticonverter/src/main/java/biblemulticonverter/format/paratext/AbstractParatextFormat.java b/...ticonverter/src/main/java/biblemulticonverter/format/paratext/AbstractParatextFormat.java
@@ -70,6 +70,8 @@ public abstract class AbstractParatextFormat implements RoundtripFormat {
 			"toc1", "toc2", "toc3", "toca1", "toca2", "toca3")
 	);
 
+	private static final String[] srclocPrefixes = System.getProperty("biblemulticonverter.paratext.srclocprefixes", "src").split(",");
+
 	protected static String getBibleName(List<ParatextBook> books) {
 		String bibleName = null;
 		for (ParatextBook book : books) {
@@ -804,11 +806,38 @@ public ParatextCharacterContentVisitor<RuntimeException> visitAutoClosingFormatt
 						}
 					}
 				}
+				List<Integer> srclocs = new ArrayList<>();
+				String srclocAttribute = attributes.get("srcloc");
+				if (srclocAttribute != null) {
+					Pattern srclocPattern = null;
+					if (ctx.currentVerse != null && ctx.bk != null && srclocPrefixes.length > 0) {
+						StringBuilder prefixes = new StringBuilder("(");
+						for(String pfx : srclocPrefixes) {
+							if (prefixes.length() > 1)
+								prefixes.append('|');
+							prefixes.append(Pattern.quote(pfx));
+						}
+						prefixes.append(")");
+						srclocPattern = Pattern.compile(prefixes.toString() + Pattern.quote(
+							 ":" + ParatextID.fromBookID(ctx.bk.getId()).getNumber() + "." +
+									ctx.cnum + "." + ctx.currentVerse.getNumber() + ".") + "[0-9]+");
+					}
+					for (String srcloc : srclocAttribute.split("[, ]")) {
+						if (srcloc.matches("[0-9]+")) {
+							srclocs.add(Integer.parseInt(srcloc));
+						} else if (srclocPattern != null && srclocPattern.matcher(srcloc).matches()) {
+							srclocs.add(Integer.parseInt(srcloc.replaceFirst(".*\\.", "")));
+						} else {
+							System.out.println("Skipping unsupported srcloc: " + srcloc);
+						}
+					}
+				}
 				int[] strongsArray = strongs.isEmpty() ? null : strongs.stream().mapToInt(s -> s).toArray();
+				int[] srclocArray = srclocs.isEmpty() ? null : srclocs.stream().mapToInt(s -> s).toArray();
 				if (rmacs.isEmpty() && strongsArray == null) {
 					newVisitor = getCurrentVisitor().visitCSSFormatting(kind.getCss());
 				} else {
-					newVisitor = getCurrentVisitor().visitGrammarInformation(strongsPrefixes.toString().toCharArray(), strongsArray, rmacs.isEmpty() ? null : rmacs.toArray(new String[rmacs.size()]), null);
+					newVisitor = getCurrentVisitor().visitGrammarInformation(strongsPrefixes.toString().isEmpty() ? null : strongsPrefixes.toString().toCharArray(), strongsArray, rmacs.isEmpty() ? null : rmacs.toArray(new String[rmacs.size()]), srclocArray);
 					if (exportAllTags) {
 						newVisitor = newVisitor.visitCSSFormatting("-bmc-usfm-tag: " + kind.getTag());
 					}
@@ -897,6 +926,7 @@ private static class ParatextExportContext {
 		private ParatextBook book;
 		private ParagraphKind currentParagraph;
 		private ParatextCharacterContent charContent;
+		private VerseIdentifier currentVerse;
 
 		public ParatextExportContext(ParatextBook book) {
 			this.book = book;
@@ -914,6 +944,7 @@ public void endChapter(int cnum) {
 
 		public void startVerse(VerseIdentifier verse) {
 			book.getContent().add(new ParatextBook.VerseStart(verse, verse.verse()));
+			currentVerse = verse;
 			charContent = null;
 		}
 
@@ -1193,6 +1224,18 @@ else if (rmac[0].matches(Utils.WIVU_REGEX))
 					throw new IllegalStateException("Invalid morph format: "+rmac[0]);
 				formatting.getAttributes().put("x-morph", prefix + String.join(",", Arrays.asList(rmac)));
 			}
+			if (sourceIndices != null) {
+				String srclocLongPrefix = ctx.currentVerse == null ? null :
+					srclocPrefixes[0] + ":" + ctx.book.getId().getNumber() + "." +
+						ctx.currentVerse.chapter + "." + ctx.currentVerse.verse() + ".";
+				StringBuilder sb = new StringBuilder();
+				for (int i = 0; i < sourceIndices.length; i++) {
+					if (sb.length() != 0)
+						sb.append(",");
+					sb.append(srclocLongPrefix + sourceIndices[i]);
+				}
+				formatting.getAttributes().put("srcloc", sb.toString());
+			}
 			getCharContent().getContent().add(formatting);
 			return new ParatextExportVisitor("in formatting", nt, null, formatting, null);
 		}