Skip to content

Commit

Permalink
Add support for UBXF format
Browse files Browse the repository at this point in the history
Also support conversion of USFM `srcloc` attribute to internal format
and back.
  • Loading branch information
schierlm committed Sep 25, 2024
1 parent 5bc4b4a commit 93eeed0
Show file tree
Hide file tree
Showing 4 changed files with 598 additions and 1 deletion.
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ In addition, the following other formats are supported, with varying accuracy:
- **[USFM 2](https://markups.paratext.org/usfm/)**: import and export
- **[USX 2/USX 3](https://markups.paratext.org/usx/)**: import and export
- **[USFX](https://ebible.org/usfx/)**: import and export
- **[UBXF](https://resource-container.readthedocs.io/en/latest/ubxf.html)**: import (and conversion to "normal" USFM)
- **[SwordSearcher](https://www.swordsearcher.com/) ([Forge](https://www.swordsearcher.com/forge/))**: export only
- **[MySword](https://www.mysword.info/)**: import and export
- **[Obsidian](https://obsidian.md/)**: export only
Expand Down Expand Up @@ -310,6 +311,30 @@ when these occur):
- 1-4.7 becomes 1-4
- 1.4-7 becomes 1

UBXF Support
------------

UBXF bibles are based on USFM 3 but contain extra alignment milestones that can be used to
align the text to another UBXF bible. In general, the `\w` tags in UBXF bibles only contain
lemma and morphology information if the bible is in a source language - other bibles contain
alignment information to another bible instead. BibleMultiConverter can perform the following
operations on UBXF files:

- Add `srcloc` attributes to `\w` tags of source bibles, simply by counting them within a verse
- Create a database file that contains word information from a (source language) bible and maps
it to Lemma/Strongs/Morphology
- Use this database file to augment alignment milestones in a translated bible with those data
- Create new `\w` tags for parts between alignment milestones, if your bible does not have them
and/or you want them grouped by source word rather than by translated word.
- Fill `\w` tag attributes based on the augmentations in the alignment milestones (to convert to
a format that does not support alignment milestones, i.e. most formats)
- Convert grammar tags (Strongs, morphology) from the format used by
[unfoldingWord](https://unfoldingword.us/) to normal Strongs/RMAC/WIVU tagging used by other formats

Depending on which UBXF translations you have and what format you ultimately want to convert them
to, you will have to decide which of the steps above to perform and which ones to skip.


E-Sword export
--------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ public Collection<Module<ExportFormat>> getExportFormats() {
result.add(new Module<ExportFormat>("ScrambledParatextDump", "Like ParatextDump, but with scrambled text; for tests with non-free bibles.", ScrambledParatextDump.HELP_TEXT, ScrambledParatextDump.class));
result.add(new Module<ExportFormat>("ParatextStripped", "Export parts of a Paratext bible", ParatextStripped.HELP_TEXT, ParatextStripped.class));
result.add(new Module<ExportFormat>("ParatextValidate", "Validate a Paratext bible", ParatextValidate.HELP_TEXT, ParatextValidate.class));
result.add(new Module<ExportFormat>("UBXFConverter", "Convert some tags in an UBXF bible", UBXFConverter.HELP_TEXT, UBXFConverter.class));
result.add(new Module<ExportFormat>("Volksbibel2000", "Export format for reimporting into Volksbibel 2000", Volksbibel2000.HELP_TEXT, Volksbibel2000.class));
result.add(new Module<ExportFormat>("OnLineBible", "Export format for importing into OnLine Bible", OnLineBible.HELP_TEXT, OnLineBible.class));
result.add(new Module<ExportFormat>("BrowserBible", "Export format for The Browser Bible 3 by Digital Bible Society", BrowserBible.HELP_TEXT, BrowserBible.class));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ public abstract class AbstractParatextFormat implements RoundtripFormat {
"toc1", "toc2", "toc3", "toca1", "toca2", "toca3")
);

private static final String[] srclocPrefixes = System.getProperty("biblemulticonverter.paratext.srclocprefixes", "src").split(",");

protected static String getBibleName(List<ParatextBook> books) {
String bibleName = null;
for (ParatextBook book : books) {
Expand Down Expand Up @@ -804,11 +806,38 @@ public ParatextCharacterContentVisitor<RuntimeException> visitAutoClosingFormatt
}
}
}
List<Integer> srclocs = new ArrayList<>();
String srclocAttribute = attributes.get("srcloc");
if (srclocAttribute != null) {
Pattern srclocPattern = null;
if (ctx.currentVerse != null && ctx.bk != null && srclocPrefixes.length > 0) {
StringBuilder prefixes = new StringBuilder("(");
for(String pfx : srclocPrefixes) {
if (prefixes.length() > 1)
prefixes.append('|');
prefixes.append(Pattern.quote(pfx));
}
prefixes.append(")");
srclocPattern = Pattern.compile(prefixes.toString() + Pattern.quote(
":" + ParatextID.fromBookID(ctx.bk.getId()).getNumber() + "." +
ctx.cnum + "." + ctx.currentVerse.getNumber() + ".") + "[0-9]+");
}
for (String srcloc : srclocAttribute.split("[, ]")) {
if (srcloc.matches("[0-9]+")) {
srclocs.add(Integer.parseInt(srcloc));
} else if (srclocPattern != null && srclocPattern.matcher(srcloc).matches()) {
srclocs.add(Integer.parseInt(srcloc.replaceFirst(".*\\.", "")));
} else {
System.out.println("Skipping unsupported srcloc: " + srcloc);
}
}
}
int[] strongsArray = strongs.isEmpty() ? null : strongs.stream().mapToInt(s -> s).toArray();
int[] srclocArray = srclocs.isEmpty() ? null : srclocs.stream().mapToInt(s -> s).toArray();
if (rmacs.isEmpty() && strongsArray == null) {
newVisitor = getCurrentVisitor().visitCSSFormatting(kind.getCss());
} else {
newVisitor = getCurrentVisitor().visitGrammarInformation(strongsPrefixes.toString().toCharArray(), strongsArray, rmacs.isEmpty() ? null : rmacs.toArray(new String[rmacs.size()]), null);
newVisitor = getCurrentVisitor().visitGrammarInformation(strongsPrefixes.toString().isEmpty() ? null : strongsPrefixes.toString().toCharArray(), strongsArray, rmacs.isEmpty() ? null : rmacs.toArray(new String[rmacs.size()]), srclocArray);
if (exportAllTags) {
newVisitor = newVisitor.visitCSSFormatting("-bmc-usfm-tag: " + kind.getTag());
}
Expand Down Expand Up @@ -897,6 +926,7 @@ private static class ParatextExportContext {
private ParatextBook book;
private ParagraphKind currentParagraph;
private ParatextCharacterContent charContent;
private VerseIdentifier currentVerse;

public ParatextExportContext(ParatextBook book) {
this.book = book;
Expand All @@ -914,6 +944,7 @@ public void endChapter(int cnum) {

/**
 * Opens a new verse in the book that is being built.
 *
 * Appends a verse start marker to the book content, records the verse as
 * the current one and drops the reference to the current character content.
 *
 * @param verse identifier of the verse that starts here
 */
public void startVerse(VerseIdentifier verse) {
    ParatextBook.VerseStart verseStart = new ParatextBook.VerseStart(verse, verse.verse());
    this.book.getContent().add(verseStart);
    // Remembered so that later export steps can refer to the verse
    // (e.g. when composing srcloc attribute values).
    this.currentVerse = verse;
    // NOTE(review): presumably forces a fresh character content container
    // for text following the verse marker - confirm against getCharContent().
    this.charContent = null;
}

Expand Down Expand Up @@ -1193,6 +1224,18 @@ else if (rmac[0].matches(Utils.WIVU_REGEX))
throw new IllegalStateException("Invalid morph format: "+rmac[0]);
formatting.getAttributes().put("x-morph", prefix + String.join(",", Arrays.asList(rmac)));
}
if (sourceIndices != null) {
String srclocLongPrefix = ctx.currentVerse == null ? null :
srclocPrefixes[0] + ":" + ctx.book.getId().getNumber() + "." +
ctx.currentVerse.chapter + "." + ctx.currentVerse.verse() + ".";
StringBuilder sb = new StringBuilder();
for (int i = 0; i < sourceIndices.length; i++) {
if (sb.length() != 0)
sb.append(",");
sb.append(srclocLongPrefix + sourceIndices[i]);
}
formatting.getAttributes().put("srcloc", sb.toString());
}
getCharContent().getContent().add(formatting);
return new ParatextExportVisitor("in formatting", nt, null, formatting, null);
}
Expand Down
Loading

0 comments on commit 93eeed0

Please sign in to comment.