diff --git a/README.md b/README.md index 250db86..8d605fd 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,9 @@ modules without loss of data: - **RoundtripHTML**: HTML format that can be read back if desired (originally intended for publishing on free website hosters, but with the advent of free file hosters this feature is pretty much obsolete). +- **RoundtripStructuredHTML**: Similar to above, but converts the structure (i. e. + paragraphs, headlines, tables) as toplevel elements, and not the verses. Often results + in better HTML when converting from Paratext formats. - **RoundtripODT**: Export as an editable .odt (OpenOffice/LibreOffice Document Text), which can be edited in LibreOffice (tested with LibreOffice 6.0) and later imported again. Large bibles can take a minute or so to open in LibreOffice 6, which @@ -187,7 +190,8 @@ limited to the old intermediate format. | `RoundtripTaggedText` | complete | unit tests | | `RoundtripXML` | complete | unit tests | | `AbstractParatextFormat` | partial *(maybe complete?)* | unit tests | -| `RoundtripHTML` | partial *(Show `ga-` attributes)* | unit tests | +| `RoundtripHTML` | partial *(Show `ga-` attributes)* | partial | +| `RoundtripStructuredHTML` | partial *(Show `ga-` attributes)* | partial | | `RoundtripODT` | partial *(Handling of hyperlinks)* | unit tests | | `AbstractVersificationDetector` | complete | none | | `AugmentGrammar` | complete | none | diff --git a/biblemulticonverter/src/main/java/biblemulticonverter/MainModuleRegistry.java b/biblemulticonverter/src/main/java/biblemulticonverter/MainModuleRegistry.java index 62cd2e8..3df1549 100644 --- a/biblemulticonverter/src/main/java/biblemulticonverter/MainModuleRegistry.java +++ b/biblemulticonverter/src/main/java/biblemulticonverter/MainModuleRegistry.java @@ -82,6 +82,7 @@ public Collection> getRoundtripFormats() { result.add(new Module("RoundtripTaggedText", "A text-format that consistently uses numbered tags to make automated editing easy.", RoundtripTaggedText.HELP_TEXT, RoundtripTaggedText.class)); result.add(new Module("SoftProjector", "Bible format used by SoftProjector", SoftProjector.HELP_TEXT, SoftProjector.class)); result.add(new Module("BebliaXML", "Beblia XML format.", BebliaXML.HELP_TEXT, BebliaXML.class)); + result.add(new Module("RoundtripStructuredHTML", "Roundtrip HTML Export using structured paragraphs", RoundtripStructuredHTML.HELP_TEXT, RoundtripStructuredHTML.class)); return result; } diff --git a/biblemulticonverter/src/main/java/biblemulticonverter/format/AbstractHTMLVisitor.java b/biblemulticonverter/src/main/java/biblemulticonverter/format/AbstractHTMLVisitor.java index f87bdff..198f207 100644 --- a/biblemulticonverter/src/main/java/biblemulticonverter/format/AbstractHTMLVisitor.java +++ b/biblemulticonverter/src/main/java/biblemulticonverter/format/AbstractHTMLVisitor.java @@ -37,8 +37,12 @@ protected String getNextFootnoteTarget() { return null; } + protected void prepareForInlineOutput(boolean endTag) throws IOException { + } + @Override public void visitVerseSeparator() throws IOException { + prepareForInlineOutput(false); writer.write("/"); } @@ -53,6 +57,7 @@ public void visitStart() throws IOException { @Override public void visitText(String text) throws IOException { + prepareForInlineOutput(false); writer.write(text.replace("&", "&").replace("<", "<").replace(">", ">")); } @@ -74,6 +79,7 @@ public FormattedText.Visitor visitFormattingInstruction(FormattingI endTag = ""; } } + prepareForInlineOutput(false); writer.write(startTag); pushSuffix(endTag); return this; @@ -85,6 +91,7 @@ protected String createFormattingInstructionStartTag(FormattingInstructionKind k @Override public Visitor visitCSSFormatting(String css) throws IOException { + prepareForInlineOutput(false); writer.write(""); pushSuffix(""); return this; @@ -93,6 +100,7 @@ public Visitor visitCSSFormatting(String css) throws IOException { @Override public void visitRawHTML(RawHTMLMode mode, String raw) throws IOException { if (!mode.equals(Boolean.getBoolean("rawhtml.online") ? RawHTMLMode.OFFLINE : RawHTMLMode.ONLINE)) { + prepareForInlineOutput(false); writer.write(raw); } } @@ -104,6 +112,7 @@ public Visitor visitVariationText(String[] variations) throws IOExc @Override public Visitor visitHyperlink(HyperlinkType type, String target) throws IOException { + prepareForInlineOutput(false); if (type == HyperlinkType.ANCHOR) { writer.write(""); } else { @@ -123,6 +132,7 @@ public Visitor visitExtraAttribute(ExtraAttributePriority prio, Str @Override public boolean visitEnd() throws IOException { + prepareForInlineOutput(true); writer.write(suffixStack.remove(suffixStack.size() - 1)); return false; } diff --git a/biblemulticonverter/src/main/java/biblemulticonverter/format/AbstractStructuredHTMLVisitor.java b/biblemulticonverter/src/main/java/biblemulticonverter/format/AbstractStructuredHTMLVisitor.java new file mode 100644 index 0000000..c26909c --- /dev/null +++ b/biblemulticonverter/src/main/java/biblemulticonverter/format/AbstractStructuredHTMLVisitor.java @@ -0,0 +1,161 @@ +package biblemulticonverter.format; + +import java.io.IOException; +import java.io.Writer; + +import biblemulticonverter.data.FormattedText; +import biblemulticonverter.data.FormattedText.ExtendedLineBreakKind; +import biblemulticonverter.data.FormattedText.LineBreakKind; +import biblemulticonverter.data.FormattedText.Visitor; +import biblemulticonverter.format.AbstractStructuredHTMLVisitor.StructuredHTMLState; + +/** + * Helper class for converting {@link FormattedText} to structured HTML. Unlike + * {@link AbstractHTMLVisitor}, this also implements headlines and line breaks + * (including tables and paragraph styles using inline CSS). State is tracked in + * a StructuredHTMLState object that may be passed to subsequent visitors or + * "closed". + */ +public abstract class AbstractStructuredHTMLVisitor extends AbstractHTMLVisitor { + + protected final StructuredHTMLState state; + + protected AbstractStructuredHTMLVisitor(StructuredHTMLState state) { + super(state.getWriter(), ""); + this.state = state; + } + + @Override + protected void prepareForInlineOutput(boolean endTag) throws IOException { + if (!endTag) { + state.ensureOpen(); + } else if (suffixStack.size() == 2) { + state.closeHeadline(); + } + } + + public static void startHeadline(StructuredHTMLState state) throws IOException { + state.closeAll(); + state.openState = StructuredHTMLOpenState.Headline; + } + + @Override + public Visitor visitHeadline(int depth) throws IOException { + if (suffixStack.size() == 1) { + startHeadline(state); + } + writer.write(""); + pushSuffix(""); + return this; + } + + public static void visitLineBreak(AbstractHTMLVisitor v, StructuredHTMLState state, ExtendedLineBreakKind kind, int indent) throws IOException { + if (kind == ExtendedLineBreakKind.TABLE_ROW_FIRST_CELL || kind == ExtendedLineBreakKind.TABLE_ROW_NEXT_CELL) { + if (kind == ExtendedLineBreakKind.TABLE_ROW_FIRST_CELL && state.openState == StructuredHTMLOpenState.TableCell) { + v.writer.write(""); + state.colNumber = 1; + } else if (state.openState == StructuredHTMLOpenState.TableCell) { + v.writer.write(""); + } else { + state.closeAll(); + v.writer.write(""); + state.openState = StructuredHTMLOpenState.TableCell; + state.colNumber = 1; + } + if (state.colNumber == 1 && kind == ExtendedLineBreakKind.TABLE_ROW_NEXT_CELL) { + System.out.println("WARNING: Table cell without table row start"); + state.colNumber++; // for roundtrip formats + } + v.writer.write("
0) { + v.writer.write(" colspan=\"" + indent + "\""); + state.colNumber += indent - 1; + } else if (indent == ExtendedLineBreakKind.INDENT_CENTER) { + v.writer.write(" style=\"text-align: center;\""); + } else if (indent == ExtendedLineBreakKind.INDENT_RIGHT_JUSTIFIED) { + v.writer.write(" style=\"text-align: right;\""); + } + v.writer.write(">"); + } else { + state.closeAll(); + state.openState = StructuredHTMLOpenState.Para; + v.writer.write("

0) { + v.writer.write(" style=\"text-indent: " + indent + "em;\""); + } else if (indent == ExtendedLineBreakKind.INDENT_CENTER) { + v.writer.write(" style=\"text-align: center;\""); + } else if (indent == ExtendedLineBreakKind.INDENT_RIGHT_JUSTIFIED) { + v.writer.write(" style=\"text-align: right;\""); + } + v.writer.write(">"); + } + } + + @Override + public void visitLineBreak(ExtendedLineBreakKind kind, int indent) throws IOException { + if (suffixStack.size() == 1 && kind != ExtendedLineBreakKind.NEWLINE) { + visitLineBreak(this, state, kind, indent); + } else { + prepareForInlineOutput(false); + if (!kind.isSameParagraph()) { + writer.write("

"); + } else { + writer.write("
"); + } + if (indent > 0) { + writer.write(""); + for (int i = 0; i < indent; i++) { + writer.write("   "); + } + writer.write(""); + } + } + } + + public static class StructuredHTMLState { + private final Writer writer; + private StructuredHTMLOpenState openState = StructuredHTMLOpenState.None; + private int colNumber = 0; + + public StructuredHTMLState(Writer writer) { + this.writer = writer; + } + + public Writer getWriter() { + return writer; + } + + public void ensureOpen() throws IOException { + if (openState == StructuredHTMLOpenState.None) { + writer.write("

"); + openState = StructuredHTMLOpenState.Para; + } + } + + public void closeHeadline() { + if (openState == StructuredHTMLOpenState.Headline) { + openState = StructuredHTMLOpenState.None; + } + } + + public void closeAll() throws IOException { + switch (openState) { + case Para: + writer.write("

"); + break; + case TableCell: + writer.write("
"); + break; + case None: + case Headline: + break; + } + openState = StructuredHTMLOpenState.None; + } + } + + private static enum StructuredHTMLOpenState { + None, TableCell, Para, Headline + } +} diff --git a/biblemulticonverter/src/main/java/biblemulticonverter/format/RoundtripHTML.java b/biblemulticonverter/src/main/java/biblemulticonverter/format/RoundtripHTML.java index 07b62e5..5115a37 100644 --- a/biblemulticonverter/src/main/java/biblemulticonverter/format/RoundtripHTML.java +++ b/biblemulticonverter/src/main/java/biblemulticonverter/format/RoundtripHTML.java @@ -148,29 +148,7 @@ public void doExport(Bible bible, String... exportArgs) throws Exception { bw.write("
\n"); bw.write("

" + bk.getAbbr() + (bk.getChapters().size() == 1 ? "" : " " + cnumber) + "

\n"); bw.write("\n"); - List footnotes = new ArrayList(); - if (ch.getProlog() != null) { - bw.write("
\n"); - ch.getProlog().accept(new RoundtripHTMLVisitor(bw, footnotes, "", "", xrefMap)); - bw.write("\n"); - bw.write("
\n"); - } - if (ch.getVerses().size() > 0) { - bw.write("
\n"); - for (Verse v : ch.getVerses()) { - bw.write("
"); - v.accept(new RoundtripHTMLVisitor(bw, footnotes, "" + v.getNumber() + " ", "", xrefMap)); - bw.write("
\n"); - } - bw.write("
\n"); - } - if (footnotes.size() > 0) { - bw.write("
\n"); - for (StringWriter footnote : footnotes) { - bw.write(footnote.toString() + "\n"); - } - bw.write("
\n"); - } + exportChapter(ch, bw, xrefMap); bw.write("\n"); bw.write(""); } @@ -229,6 +207,32 @@ public void doExport(Bible bible, String... exportArgs) throws Exception { } + protected void exportChapter(Chapter ch, BufferedWriter bw, Properties xrefMap) throws IOException { + List footnotes = new ArrayList(); + if (ch.getProlog() != null) { + bw.write("
\n"); + ch.getProlog().accept(new RoundtripHTMLVisitor(bw, footnotes, "", "", xrefMap)); + bw.write("\n"); + bw.write("
\n"); + } + if (ch.getVerses().size() > 0) { + bw.write("
\n"); + for (Verse v : ch.getVerses()) { + bw.write("
"); + v.accept(new RoundtripHTMLVisitor(bw, footnotes, "" + v.getNumber() + " ", "", xrefMap)); + bw.write("
\n"); + } + bw.write("
\n"); + } + if (footnotes.size() > 0) { + bw.write("
\n"); + for (StringWriter footnote : footnotes) { + bw.write(footnote.toString() + "\n"); + } + bw.write("
\n"); + } + } + private static String getTypeDir(BookID id) { if (id == BookID.DICTIONARY_ENTRY) return "dict"; @@ -286,51 +290,8 @@ public Bible doImport(File inputDir) throws Exception { for (Chapter ch : bk.getChapters()) { cnumber++; try (BufferedReader br = createReader(inputDir, getTypeDir(bk.getId()) + "/" + bk.getAbbr() + "_" + cnumber + ".html")) { - String line; List> footnotes = new ArrayList<>(); - while ((line = br.readLine()) != null) { - if (line.equals("
")) { - line = br.readLine(); - FormattedText prolog = new FormattedText(); - int end = parseLine(prolog.getAppendVisitor(), line, 0, footnotes); - ch.setProlog(prolog); - if (end != line.length()) - throw new IOException(line.substring(end)); - line = br.readLine(); - if (!line.equals("
")) - throw new IOException(line); - } else if (line.equals("
")) { - while ((line = br.readLine()) != null) { - if (line.equals("
")) - break; - if (!line.startsWith("
")) - throw new IOException(line); - line = line.substring(20, line.length() - 6); - int pos = line.indexOf("\">"); - Verse v = new Verse(line.substring(0, pos)); - int end = parseLine(v.getAppendVisitor(), line, pos + 2, footnotes); - if (end != line.length()) - throw new IOException(line.substring(end)); - ch.getVerses().add(v); - } - if (!line.equals("
")) - throw new IOException(line); - } else if (line.equals("
")) { - for (int i = 0; i < footnotes.size(); i++) { - line = br.readLine(); - String prefix = "
" + (i + 1) + " "; - if (!line.startsWith(prefix) || !line.endsWith("
")) - throw new IOException(line); - line = line.substring(prefix.length(), line.length() - 6); - int end = parseLine(footnotes.get(i), line, 0, null); - if (end != line.length()) - throw new IOException(line.substring(end)); - } - line = br.readLine(); - if (!line.equals("
")) - throw new IOException(line); - } - } + parseChapter(ch, br, footnotes); if (ch.getProlog() != null) ch.getProlog().finished(); for (Verse v : ch.getVerses()) @@ -341,11 +302,58 @@ public Bible doImport(File inputDir) throws Exception { return bible; } + protected void parseChapter(Chapter ch, BufferedReader br, List> footnotes) throws IOException { + String line; + while ((line = br.readLine()) != null) { + if (line.equals("
")) { + line = br.readLine(); + FormattedText prolog = new FormattedText(); + int end = parseLine(prolog.getAppendVisitor(), line, 0, footnotes); + ch.setProlog(prolog); + if (end != line.length()) + throw new IOException(line.substring(end)); + line = br.readLine(); + if (!line.equals("
")) + throw new IOException(line); + } else if (line.equals("
")) { + while ((line = br.readLine()) != null) { + if (line.equals("
")) + break; + if (!line.startsWith("
")) + throw new IOException(line); + line = line.substring(20, line.length() - 6); + int pos = line.indexOf("\">"); + Verse v = new Verse(line.substring(0, pos)); + int end = parseLine(v.getAppendVisitor(), line, pos + 2, footnotes); + if (end != line.length()) + throw new IOException(line.substring(end)); + ch.getVerses().add(v); + } + if (!line.equals("
")) + throw new IOException(line); + } else if (line.equals("
")) { + for (int i = 0; i < footnotes.size(); i++) { + line = br.readLine(); + String prefix = "
" + (i + 1) + " "; + if (!line.startsWith(prefix) || !line.endsWith("
")) + throw new IOException(line); + line = line.substring(prefix.length(), line.length() - 6); + int end = parseLine(footnotes.get(i), line, 0, null); + if (end != line.length()) + throw new IOException(line.substring(end)); + } + line = br.readLine(); + if (!line.equals("
")) + throw new IOException(line); + } + } + } + private static BufferedReader createReader(File directory, String name) throws IOException { return new BufferedReader(new InputStreamReader(new FileInputStream(new File(directory, name)), StandardCharsets.UTF_8)); } - private int parseLine(Visitor visitor, String line, int pos, List> footnotes) throws IOException { + protected int parseLine(Visitor visitor, String line, int pos, List> footnotes) throws IOException { while (pos < line.length()) { if (line.charAt(pos) != '<') { int endPos = line.indexOf('<', pos); @@ -641,34 +649,21 @@ public boolean isImportExportRoundtrip() { return true; } - private static class RoundtripHTMLVisitor extends AbstractHTMLVisitor { + protected static abstract class AbstractRoundtripHTMLVisitor extends AbstractHTMLVisitor { - private final String prefix; - private final List footnotes; - private final Properties xrefMap; + protected final Properties xrefMap; - private RoundtripHTMLVisitor(Writer writer, List footnotes, String prefix, String suffix, Properties xrefMap) { + protected AbstractRoundtripHTMLVisitor(Writer writer, String suffix, Properties xrefMap) { super(writer, suffix); - this.footnotes = footnotes; - this.prefix = prefix; this.xrefMap = xrefMap; } - protected String getNextFootnoteTarget() { - return "#fn" + (footnotes.size() + 1); - } - @Override public void visitVerseSeparator() throws IOException { + prepareForInlineOutput(false); writer.write("/"); } - @Override - public void visitStart() throws IOException { - if (suffixStack.size() == 1) - writer.write(prefix); - } - protected String createFormattingInstructionStartTag(FormattingInstructionKind kind) { return ""; } @@ -679,6 +674,7 @@ public void visitRawHTML(RawHTMLMode mode, String raw) throws IOException { while (raw.contains("endraw " + marker + "-->")) { marker = (int) (Math.random() * 1000000); } + prepareForInlineOutput(false); writer.write("\n"); + ch.getProlog().accept(new RoundtripStructuredHTMLVisitor(state, footnotes, footnoteStates, "", xrefMap)); + bw.write("\n"); + bw.write("\n"); + } + if (ch.getVerses().size() > 0) { + for (Verse v : ch.getVerses()) { + bw.write(""); + v.accept(new RoundtripStructuredHTMLVisitor(state, footnotes, footnoteStates, "" + v.getNumber() + " ", xrefMap)); + bw.write("\n"); + } + } + bw.write(""); + state.closeAll(); + bw.write("\n"); + bw.write("\n"); + for (StructuredHTMLState footnoteState : footnoteStates) { + footnoteState.closeAll(); + } + if (footnotes.size() > 0) { + bw.write("
\n"); + for (StringWriter footnote : footnotes) { + bw.write(footnote.toString() + "
\n"); + } + bw.write("\n"); + } + } + + protected void parseChapter(Chapter ch, BufferedReader br, List> footnotes) throws IOException { + String line; + ParseState state = new ParseState(ParseState.State.NONE); + while ((line = br.readLine()) != null) { + if (line.equals("")) { + line = br.readLine(); + FormattedText prolog = new FormattedText(); + parseStructuredLine(state, prolog.getAppendVisitor(), line, footnotes); + ch.setProlog(prolog); + line = br.readLine(); + if (!line.equals("")) + throw new IOException(line); + } else if (line.startsWith("")) + throw new IOException(line); + line = line.substring(26, line.length() - 27); + int pos = line.indexOf(" -->"); + String vnum = line.substring(0, pos); + Verse v = new Verse(vnum); + line = line.substring(pos + 4).replace("", ""); + parseStructuredLine(state, v.getAppendVisitor(), line, footnotes); + ch.getVerses().add(v); + } else if (line.equals("
")) { + for (int i = 0; i < footnotes.size(); i++) { + line = br.readLine(); + String prefix = "

" + (i + 1) + " "; + if (!line.startsWith(prefix) || !line.endsWith("

")) + throw new IOException(line); + line = line.substring(prefix.length(), line.length() - 6); + ParseState fnState = new ParseState(ParseState.State.PARAGRAPH); + parseStructuredLine(fnState, footnotes.get(i), line, null); + if (fnState.state != ParseState.State.NONE) + throw new IOException(line); + } + line = br.readLine(); + if (!line.equals("
")) + throw new IOException(line); + } else if (line.startsWith("") && line.endsWith("")) { + String suffix = line.substring(30, line.length() - 28); + if (suffix.equals("

") && state.state == ParseState.State.PARAGRAPH) { + state.state = ParseState.State.NONE; + } else if (suffix.equals("") && state.state == ParseState.State.TABLECELL) { + state.state = ParseState.State.NONE; + } else if (suffix.isEmpty() && state.state == ParseState.State.NONE) { + // all fine + } else { + throw new IOException(suffix); + } + } else if (line.startsWith("