Skip to content

Commit

Permalink
Add RoundtripStructuredHTML format
Browse files Browse the repository at this point in the history
  • Loading branch information
schierlm committed Oct 25, 2024
1 parent 5d30477 commit 9f3633b
Show file tree
Hide file tree
Showing 7 changed files with 610 additions and 98 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ modules without loss of data:
- **RoundtripHTML**: HTML format that can be read back if desired (originally
intended for publishing on free website hosters, but with the advent of free
file hosters this feature is pretty much obsolete).
- **RoundtripStructuredHTML**: Similar to the above, but uses the structure (i.e.
paragraphs, headlines, tables) rather than the verses as top-level elements. Often results
in better HTML when converting from Paratext formats.
- **RoundtripODT**: Export as an editable .odt (OpenOffice/LibreOffice Document
Text), which can be edited in LibreOffice (tested with LibreOffice 6.0) and later
imported again. Large bibles can take a minute or so to open in LibreOffice 6, which
Expand Down Expand Up @@ -187,7 +190,8 @@ limited to the old intermediate format.
| `RoundtripTaggedText` | complete | unit tests |
| `RoundtripXML` | complete | unit tests |
| `AbstractParatextFormat` | partial *(maybe complete?)* | unit tests |
| `RoundtripHTML` | partial *(Show `ga-` attributes)* | unit tests |
| `RoundtripHTML` | partial *(Show `ga-` attributes)* | partial |
| `RoundtripStructuredHTML` | partial *(Show `ga-` attributes)* | partial |
| `RoundtripODT` | partial *(Handling of hyperlinks)* | unit tests |
| `AbstractVersificationDetector` | complete | none |
| `AugmentGrammar` | complete | none |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ public Collection<Module<RoundtripFormat>> getRoundtripFormats() {
result.add(new Module<RoundtripFormat>("RoundtripTaggedText", "A text-format that consistently uses numbered tags to make automated editing easy.", RoundtripTaggedText.HELP_TEXT, RoundtripTaggedText.class));
result.add(new Module<RoundtripFormat>("SoftProjector", "Bible format used by SoftProjector", SoftProjector.HELP_TEXT, SoftProjector.class));
result.add(new Module<RoundtripFormat>("BebliaXML", "Beblia XML format.", BebliaXML.HELP_TEXT, BebliaXML.class));
result.add(new Module<RoundtripFormat>("RoundtripStructuredHTML", "Roundtrip HTML Export using structured paragraphs", RoundtripStructuredHTML.HELP_TEXT, RoundtripStructuredHTML.class));
return result;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,12 @@ protected String getNextFootnoteTarget() {
return null;
}

/**
 * Hook that the visit methods of this class invoke before they write to the
 * writer. The default implementation does nothing; subclasses may override it
 * to emit or track surrounding markup.
 *
 * @param endTag
 *            {@code false} when inline content or a start tag is about to be
 *            written, {@code true} when {@code visitEnd()} is about to write
 *            a closing suffix
 * @throws IOException
 *             declared so that overriding implementations may write output
 */
protected void prepareForInlineOutput(boolean endTag) throws IOException {
}

/**
 * Writes the verse separator as a gray slash.
 */
@Override
public void visitVerseSeparator() throws IOException {
    final String separatorMarkup = "<font color=\"#808080\">/</font>";
    prepareForInlineOutput(false);
    writer.write(separatorMarkup);
}

Expand All @@ -53,6 +57,7 @@ public void visitStart() throws IOException {

/**
 * Writes plain text, replacing the characters {@code &}, {@code <} and
 * {@code >} with their HTML entities.
 */
@Override
public void visitText(String text) throws IOException {
    prepareForInlineOutput(false);
    // Ampersands go first so the entities inserted afterwards are not escaped again.
    String escaped = text.replace("&", "&amp;");
    escaped = escaped.replace("<", "&lt;");
    escaped = escaped.replace(">", "&gt;");
    writer.write(escaped);
}

Expand All @@ -74,6 +79,7 @@ public FormattedText.Visitor<IOException> visitFormattingInstruction(FormattingI
endTag = "</a>";
}
}
prepareForInlineOutput(false);
writer.write(startTag);
pushSuffix(endTag);
return this;
Expand All @@ -85,6 +91,7 @@ protected String createFormattingInstructionStartTag(FormattingInstructionKind k

@Override
public Visitor<IOException> visitCSSFormatting(String css) throws IOException {
prepareForInlineOutput(false);
writer.write("<span class=\"css\" style=\"" + css + "\">");
pushSuffix("</span>");
return this;
Expand All @@ -93,6 +100,7 @@ public Visitor<IOException> visitCSSFormatting(String css) throws IOException {
/**
 * Writes raw HTML verbatim unless its mode is the one suppressed by the
 * {@code rawhtml.online} system property: when the property is {@code true},
 * {@code OFFLINE} content is skipped, otherwise {@code ONLINE} content is
 * skipped.
 */
@Override
public void visitRawHTML(RawHTMLMode mode, String raw) throws IOException {
    RawHTMLMode suppressedMode = Boolean.getBoolean("rawhtml.online") ? RawHTMLMode.OFFLINE : RawHTMLMode.ONLINE;
    if (mode.equals(suppressedMode)) {
        return;
    }
    prepareForInlineOutput(false);
    writer.write(raw);
}
Expand All @@ -104,6 +112,7 @@ public Visitor<IOException> visitVariationText(String[] variations) throws IOExc

@Override
public Visitor<IOException> visitHyperlink(HyperlinkType type, String target) throws IOException {
prepareForInlineOutput(false);
if (type == HyperlinkType.ANCHOR) {
writer.write("<a name=\"" + target + "\">");
} else {
Expand All @@ -123,6 +132,7 @@ public Visitor<IOException> visitExtraAttribute(ExtraAttributePriority prio, Str

/**
 * Closes the innermost open element by writing the suffix on top of the
 * suffix stack and removing it from the stack.
 *
 * @return always {@code false}
 */
@Override
public boolean visitEnd() throws IOException {
    // The hook runs while the suffix is still on the stack.
    prepareForInlineOutput(true);
    final int topIndex = suffixStack.size() - 1;
    writer.write(suffixStack.remove(topIndex));
    return false;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
package biblemulticonverter.format;

import java.io.IOException;
import java.io.Writer;

import biblemulticonverter.data.FormattedText;
import biblemulticonverter.data.FormattedText.ExtendedLineBreakKind;
import biblemulticonverter.data.FormattedText.LineBreakKind;
import biblemulticonverter.data.FormattedText.Visitor;
import biblemulticonverter.format.AbstractStructuredHTMLVisitor.StructuredHTMLState;

/**
 * Base visitor that turns {@link FormattedText} into structured HTML. On top of
 * the inline markup produced by {@link AbstractHTMLVisitor}, it renders
 * headlines and line breaks as block-level elements (paragraphs and tables,
 * styled via inline CSS). The block element that is currently open is recorded
 * in a {@link StructuredHTMLState}, which can be handed on to a subsequent
 * visitor or closed explicitly.
 */
public abstract class AbstractStructuredHTMLVisitor extends AbstractHTMLVisitor {

    /** Record of the block element that is currently open on the writer. */
    protected final StructuredHTMLState state;

    /**
     * Creates a visitor that writes to the writer of the given state.
     *
     * @param state
     *            open-element state this visitor continues from
     */
    protected AbstractStructuredHTMLVisitor(StructuredHTMLState state) {
        super(state.getWriter(), "");
        this.state = state;
    }

    /**
     * Before inline content is written, makes sure some block element is open.
     * Before an end tag is written while exactly two suffixes are on the stack,
     * leaves the headline state (if it was active).
     */
    @Override
    protected void prepareForInlineOutput(boolean endTag) throws IOException {
        if (endTag) {
            if (suffixStack.size() == 2) {
                state.closeHeadline();
            }
            return;
        }
        state.ensureOpen();
    }

    /**
     * Closes whatever block element is open and marks the state as being inside
     * a headline. The headline tag itself is not written here.
     *
     * @param state
     *            state to update
     * @throws IOException
     *             if closing the open element fails
     */
    public static void startHeadline(StructuredHTMLState state) throws IOException {
        state.closeAll();
        state.openState = StructuredHTMLOpenState.Headline;
    }

    /**
     * Writes an {@code <hN>} start tag, where N is the depth capped at 6, and
     * pushes the matching end tag. When only one suffix is on the stack, the
     * currently open block element is closed first.
     */
    @Override
    public Visitor<IOException> visitHeadline(int depth) throws IOException {
        if (suffixStack.size() == 1) {
            startHeadline(state);
        }
        final int level = Math.min(depth, 6);
        writer.write("<h" + level + ">");
        pushSuffix("</h" + level + ">");
        return this;
    }

    /**
     * Renders a line break as a block-level element: table row kinds open a
     * table cell, every other kind opens a new paragraph.
     *
     * @param v
     *            visitor whose writer receives the start tags
     * @param state
     *            open-element state to update
     * @param kind
     *            kind of line break
     * @param indent
     *            positive indent, or one of the alignment constants of
     *            {@link ExtendedLineBreakKind}
     * @throws IOException
     *             if writing fails
     */
    public static void visitLineBreak(AbstractHTMLVisitor v, StructuredHTMLState state, ExtendedLineBreakKind kind, int indent) throws IOException {
        boolean tableKind = kind == ExtendedLineBreakKind.TABLE_ROW_FIRST_CELL || kind == ExtendedLineBreakKind.TABLE_ROW_NEXT_CELL;
        if (tableKind) {
            openTableCell(v, state, kind == ExtendedLineBreakKind.TABLE_ROW_FIRST_CELL, indent);
        } else {
            openParagraph(v, state, kind, indent);
        }
    }

    /**
     * Ends the previous cell (and row or foreign block element, as needed) and
     * writes the start tag of the next table cell.
     */
    private static void openTableCell(AbstractHTMLVisitor v, StructuredHTMLState state, boolean firstCell, int indent) throws IOException {
        boolean insideTable = state.openState == StructuredHTMLOpenState.TableCell;
        if (!insideTable) {
            // No table open yet: close the current element and start a new table.
            state.closeAll();
            v.writer.write("<table><tr>");
            state.openState = StructuredHTMLOpenState.TableCell;
            state.colNumber = 1;
        } else if (firstCell) {
            v.writer.write("</td></tr><tr>");
            state.colNumber = 1;
        } else {
            v.writer.write("</td>");
        }
        if (!firstCell && state.colNumber == 1) {
            System.out.println("WARNING: Table cell without table row start");
            state.colNumber++; // for roundtrip formats
        }
        v.writer.write("<td class=\"col" + state.colNumber + "\"");
        state.colNumber++;
        if (indent > 0) {
            // A positive indent is the column span of the cell.
            v.writer.write(" colspan=\"" + indent + "\"");
            state.colNumber += indent - 1;
        } else {
            writeAlignment(v, indent);
        }
        v.writer.write(">");
    }

    /**
     * Closes the open block element and writes the start tag of a paragraph
     * whose CSS class is derived from the lower-cased code of the kind.
     */
    private static void openParagraph(AbstractHTMLVisitor v, StructuredHTMLState state, ExtendedLineBreakKind kind, int indent) throws IOException {
        state.closeAll();
        state.openState = StructuredHTMLOpenState.Para;
        v.writer.write("<p class=\"para-" + Character.toLowerCase(kind.getCode()) + "\"");
        if (indent > 0) {
            v.writer.write(" style=\"text-indent: " + indent + "em;\"");
        } else {
            writeAlignment(v, indent);
        }
        v.writer.write(">");
    }

    /**
     * Writes a text-align style attribute if the indent is the center or
     * right-justified constant; writes nothing otherwise.
     */
    private static void writeAlignment(AbstractHTMLVisitor v, int indent) throws IOException {
        if (indent == ExtendedLineBreakKind.INDENT_CENTER) {
            v.writer.write(" style=\"text-align: center;\"");
        } else if (indent == ExtendedLineBreakKind.INDENT_RIGHT_JUSTIFIED) {
            v.writer.write(" style=\"text-align: right;\"");
        }
    }

    /**
     * Handles a line break. With a single suffix on the stack, every kind
     * except {@code NEWLINE} becomes a block-level element; in all other cases
     * the break is written inline as {@code <br>} tags followed by
     * non-breaking spaces for a positive indent.
     */
    @Override
    public void visitLineBreak(ExtendedLineBreakKind kind, int indent) throws IOException {
        boolean structural = suffixStack.size() == 1 && kind != ExtendedLineBreakKind.NEWLINE;
        if (structural) {
            visitLineBreak(this, state, kind, indent);
            return;
        }
        prepareForInlineOutput(false);
        writer.write(kind.isSameParagraph() ? "<br>" : "<br><br>");
        if (indent <= 0) {
            return;
        }
        writer.write("<span class=\"indent\">");
        for (int remaining = indent; remaining > 0; remaining--) {
            writer.write("&nbsp;&nbsp;&nbsp;");
        }
        writer.write("</span>");
    }

    /**
     * Tracks which block-level element is currently open on a writer, plus the
     * number of the next table column.
     */
    public static class StructuredHTMLState {
        private final Writer writer;
        private StructuredHTMLOpenState openState = StructuredHTMLOpenState.None;
        private int colNumber = 0;

        /**
         * Creates a state with no open element.
         *
         * @param writer
         *            writer the HTML is written to
         */
        public StructuredHTMLState(Writer writer) {
            this.writer = writer;
        }

        /**
         * @return the writer this state writes to
         */
        public Writer getWriter() {
            return writer;
        }

        /**
         * Opens a plain paragraph if no element is open; does nothing
         * otherwise.
         *
         * @throws IOException
         *             if writing fails
         */
        public void ensureOpen() throws IOException {
            if (openState != StructuredHTMLOpenState.None) {
                return;
            }
            writer.write("<p>");
            openState = StructuredHTMLOpenState.Para;
        }

        /**
         * Resets the state to "nothing open" if it is inside a headline.
         * Nothing is written.
         */
        public void closeHeadline() {
            if (openState == StructuredHTMLOpenState.Headline) {
                openState = StructuredHTMLOpenState.None;
            }
        }

        /**
         * Writes the end tags of the open paragraph or table, if any, and
         * resets the state to "nothing open".
         *
         * @throws IOException
         *             if writing fails
         */
        public void closeAll() throws IOException {
            if (openState == StructuredHTMLOpenState.Para) {
                writer.write("</p>");
            } else if (openState == StructuredHTMLOpenState.TableCell) {
                writer.write("</td></tr></table>");
            }
            // None and Headline have no end tag to write here.
            openState = StructuredHTMLOpenState.None;
        }
    }

    /** The kinds of block-level element that can be open. */
    private enum StructuredHTMLOpenState {
        None, TableCell, Para, Headline
    }
}
Loading

0 comments on commit 9f3633b

Please sign in to comment.