Add OSHB import format

schierlm · Mar 12, 2024 · 634e4f2 · 634e4f2
1 parent 7622542
commit 634e4f2
Show file tree

Hide file tree

Showing 3 changed files with 100 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -74,7 +74,9 @@ In addition, the following other formats are supported, with varying accuracy:
 - **[BrowserBible](https://github.com/digitalbiblesociety/browserbible-3/)**: export only
 - **[Quick Bible](http://www.bibleforandroid.com/)**: export only
 - **[SWORD](https://www.crosswire.org/sword) modules**: import only (see below for details)
-- **[MorphGNT](https://github.com/morphgnt/sblgnt)**: import only
+- **Original Languages with tagging**: import only
+  - [MorphGNT](https://github.com/morphgnt/sblgnt)
+  - [OpenScriptures Hebrew Bible (OSHB) MorphHB](https://github.com/openscriptures/morphhb)
 - **[MyBible.Zone](https://mybible.zone/index-eng.php)** ([more bibles](http://www.ph4.org/b4_index.php)): import and export (in a special SQLite edition)
 - **[Bible Analyzer](http://www.bibleanalyzer.com/)**: export only (text export for
   bibles and dictionaries, SQLite export for bibles)

diff --git a/biblemulticonverter/src/main/java/biblemulticonverter/MainModuleRegistry.java b/biblemulticonverter/src/main/java/biblemulticonverter/MainModuleRegistry.java
@@ -14,6 +14,7 @@ public Collection<Module<ImportFormat>> getImportFormats() {
 		List<Module<ImportFormat>> result = new ArrayList<ModuleRegistry.Module<ImportFormat>>();
 		result.add(new Module<ImportFormat>("StrongDictionary", "Importer for creating a Strong's dictionary from public domain resources.", StrongDictionary.HELP_TEXT, StrongDictionary.class));
 		result.add(new Module<ImportFormat>("MorphGNT", "Importer for MorphGNT", MorphGNT.HELP_TEXT, MorphGNT.class));
+		result.add(new Module<ImportFormat>("OSHB", "Importer for OpenScriptures Hebrew Bible MorphBB", OSHB.HELP_TEXT, OSHB.class));
 		return result;
 	}
 

diff --git a/biblemulticonverter/src/main/java/biblemulticonverter/format/OSHB.java b/biblemulticonverter/src/main/java/biblemulticonverter/format/OSHB.java
@@ -0,0 +1,96 @@
+package biblemulticonverter.format;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+
+import org.w3c.dom.*;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
+
+import biblemulticonverter.data.Bible;
+import biblemulticonverter.data.Book;
+import biblemulticonverter.data.BookID;
+import biblemulticonverter.data.Chapter;
+import biblemulticonverter.data.FormattedText.ExtraAttributePriority;
+import biblemulticonverter.data.FormattedText.Visitor;
+import biblemulticonverter.data.Verse;
+
+/**
+ * Importer that reads one XML file per book from a directory and converts
+ * every {@code <verse>} element into a {@link Verse}. Words ({@code <w>})
+ * keep the numbers found in their {@code lemma} attribute and the value of
+ * their {@code morph} attribute as grammar information; {@code <seg>}
+ * elements become plain text and {@code <note>} elements become footnotes.
+ */
+public class OSHB implements ImportFormat {
+
+	public static final String[] HELP_TEXT = {
+			"Importer for OpenScriptures Hebrew Bible MorphBB",
+			"",
+			"Usage: OSHB <directory>",
+			"",
+			"Download OSHB from <https://github.com/openscriptures/morphhb>."
+	};
+
+	/**
+	 * Imports the books with Zefania IDs 1 to 39 from the files
+	 * {@code <directory>/<osisID>.xml}.
+	 *
+	 * @param directory directory containing one XML file per book
+	 * @return the imported bible, named {@code "OSHB"}
+	 * @throws RuntimeException if a verse {@code osisID} is not of the form
+	 *             {@code book.chapter.verse} for the book being read, if a
+	 *             chapter/verse/lemma number cannot be parsed, or if a verse
+	 *             contains anything other than whitespace, {@code <w>},
+	 *             {@code <seg>} or {@code <note>}
+	 * @throws Exception if a book file cannot be read or parsed
+	 */
+	@Override
+	public Bible doImport(File directory) throws Exception {
+		Bible bible = new Bible("OSHB");
+		// NOTE(review): the parser is created with default factory settings; if
+		// the directory may contain untrusted XML, consider restricting DTD and
+		// external entity processing - confirm against the expected input files.
+		DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+		XPath xpath = javax.xml.xpath.XPathFactory.newInstance().newXPath();
+		// Zefania IDs 1..39 - presumably the 39 books of the Hebrew Bible / Old Testament.
+		for (int zefID = 1; zefID < 40; zefID++) {
+			BookID bid = BookID.fromZefId(zefID);
+			Book book = new Book(bid.getOsisID(), bid, bid.getEnglishName(), bid.getEnglishName());
+			bible.getBooks().add(book);
+			File bookFile = new File(directory, bid.getOsisID() + ".xml");
+			Document doc = docBuilder.parse(bookFile);
+			NodeList verses = (NodeList) xpath.evaluate("//verse", doc, XPathConstants.NODESET);
+			for (int i = 0; i < verses.getLength(); i++) {
+				Element verse = (Element) verses.item(i);
+				String osisID = verse.getAttribute("osisID");
+				String[] parts = osisID.split("\\.");
+				if (parts.length != 3 || !parts[0].equals(book.getAbbr()))
+					throw new RuntimeException("Unexpected verse osisID \"" + osisID + "\" in " + bookFile);
+				int cnum = Integer.parseInt(parts[1]);
+				int vnum = Integer.parseInt(parts[2]);
+				// Create chapters on demand so that the chapter number maps to index cnum - 1.
+				while (book.getChapters().size() < cnum)
+					book.getChapters().add(new Chapter());
+				Verse v = new Verse("" + vnum);
+				book.getChapters().get(cnum - 1).getVerses().add(v);
+				Visitor<RuntimeException> vv = v.getAppendVisitor();
+				// A single space separates consecutive items, but none precedes the first one.
+				boolean spaceAllowed = false;
+				for (Node ww = verse.getFirstChild(); ww != null; ww = ww.getNextSibling()) {
+					if (ww instanceof Text && ww.getTextContent().trim().isEmpty()) {
+						// Whitespace between elements carries no content.
+						continue;
+					}
+					if (!(ww instanceof Element)) {
+						// Fail with context instead of an opaque ClassCastException.
+						throw new RuntimeException("Unexpected " + ww.getNodeName() + " node in verse " + osisID + " of " + bookFile);
+					}
+					Element w = (Element) ww;
+					if (spaceAllowed)
+						vv.visitText(" ");
+					spaceAllowed = true;
+					String name = w.getNodeName();
+					if (name.equals("seg")) {
+						vv.visitText(w.getTextContent().trim());
+					} else if (name.equals("note")) {
+						// Collapse line breaks and indentation inside the note into single spaces.
+						vv.visitFootnote().visitText(w.getTextContent().replaceAll("[\r\n\t ]+", " ").trim());
+					} else if (name.equals("w")) {
+						int[] strong = parseStrongNumbers(w.getAttribute("lemma"));
+						vv.visitGrammarInformation(null, strong, new String[] { w.getAttribute("morph") }, null).visitText(w.getTextContent());
+					} else {
+						throw new RuntimeException("Unexpected element <" + name + "> in verse " + osisID + " of " + bookFile);
+					}
+				}
+				v.finished();
+			}
+		}
+		return bible;
+	}
+
+	/**
+	 * Extracts every run of decimal digits from a lemma attribute value.
+	 *
+	 * @param lemma raw value of the {@code lemma} attribute (may be empty)
+	 * @return the numbers in order of appearance, or {@code null} if the
+	 *         value contains no digits
+	 */
+	private static int[] parseStrongNumbers(String lemma) {
+		// Splitting on non-digits can yield empty fragments (e.g. for a leading prefix); drop them.
+		List<String> snums = new ArrayList<>(Arrays.asList(lemma.split("[^0-9]+")));
+		snums.removeIf(s -> s.isEmpty());
+		if (snums.isEmpty())
+			return null;
+		int[] strong = new int[snums.size()];
+		for (int j = 0; j < strong.length; j++) {
+			strong[j] = Integer.parseInt(snums.get(j));
+		}
+		return strong;
+	}
+}