Skip to content

Commit

Permalink
Fix with unwrap_html + update tests
Browse files Browse the repository at this point in the history
  • Loading branch information
thomasst committed Jun 28, 2024
1 parent 89176b6 commit 273addf
Show file tree
Hide file tree
Showing 9 changed files with 194 additions and 48 deletions.
12 changes: 7 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,16 @@ max-branches = 16

[tool.ruff.lint.per-file-ignores]
"tests/test_quotequail.py" = ["E501", "PT009"]
"tests/test_unwrap_html.py" = ["E501"]

[tool.mypy]
python_version = "3.10"
enable_error_code = "possibly-undefined"
files = "quotequail"
follow_imports = "normal"
ignore_missing_imports = true
no_implicit_optional = true
pretty = true
python_version = "3.10"
show_error_context = true
strict_equality = true
follow_imports = "normal"
warn_unreachable = true
show_error_context = true
pretty = true
files = "quotequail"
80 changes: 41 additions & 39 deletions quotequail/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,46 +141,48 @@ def unwrap_html(html: str) -> dict[str, str] | None:

unwrap_result = _internal.unwrap(lines, 1, _patterns.MIN_HEADER_LINES, 1)

if unwrap_result:
typ, top_range, hdrs, main_range, bottom_range, needs_unindent = (
unwrap_result
if not unwrap_result:
return None

typ, top_range, hdrs, main_range, bottom_range, needs_unindent = (
unwrap_result
)

result = {
"type": typ,
}

top_range_slice = _html.trim_slice(lines, top_range)
main_range_slice = _html.trim_slice(lines, main_range)
bottom_range_slice = _html.trim_slice(lines, bottom_range)

if top_range_slice:
top_tree = _html.slice_tree(
tree, start_refs, end_refs, top_range_slice, html_copy=html
)
html_top = _html.render_html_tree(top_tree)
if html_top:
result["html_top"] = html_top

if bottom_range_slice:
bottom_tree = _html.slice_tree(
tree, start_refs, end_refs, bottom_range_slice, html_copy=html
)
html_bottom = _html.render_html_tree(bottom_tree)
if html_bottom:
result["html_bottom"] = html_bottom

result = {
"type": typ,
}

top_range_slice = _html.trim_slice(lines, top_range)
main_range_slice = _html.trim_slice(lines, main_range)
bottom_range_slice = _html.trim_slice(lines, bottom_range)

if top_range_slice:
top_tree = _html.slice_tree(
tree, start_refs, end_refs, top_range_slice, html_copy=html
)
html_top = _html.render_html_tree(top_tree)
if html_top:
result["html_top"] = html_top

if bottom_range_slice:
bottom_tree = _html.slice_tree(
tree, start_refs, end_refs, bottom_range_slice, html_copy=html
)
html_bottom = _html.render_html_tree(bottom_tree)
if html_bottom:
result["html_bottom"] = html_bottom

if main_range_slice:
main_tree = _html.slice_tree(
tree, start_refs, end_refs, main_range_slice
)
if needs_unindent:
_html.unindent_tree(main_tree)
html = _html.render_html_tree(main_tree)
if html:
result["html"] = html

if hdrs:
result.update(hdrs)
if main_range_slice:
main_tree = _html.slice_tree(
tree, start_refs, end_refs, main_range_slice
)
if needs_unindent:
_html.unindent_tree(main_tree)
html = _html.render_html_tree(main_tree)
if html:
result["html"] = html

if hdrs:
result.update(hdrs)

return result
6 changes: 2 additions & 4 deletions quotequail/_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,10 +187,8 @@ def slice_tree(
else:
new_tree = tree

if start_ref:
include_start = start_ref[1] is Position.Begin
if end_ref:
include_end = end_ref[1] is Position.End
include_start = start_ref[1] is Position.Begin if start_ref else False
include_end = end_ref[1] is Position.End if end_ref else False

# If start_ref is the same as end_ref, and we don't include the element,
# we are removing the entire tree. We need to handle this separately,
Expand Down
14 changes: 14 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import os

import pytest


@pytest.fixture()
def read_file():
    """Provide a helper that loads a test data file as text.

    The returned callable takes a file name, resolves it inside the
    ``files`` directory that sits next to this module, and returns the
    file's contents decoded as UTF-8.
    """
    fixtures_dir = os.path.join(os.path.dirname(__file__), "files")

    def _read_file(name: str) -> str:
        # Read raw bytes and decode explicitly so the result does not
        # depend on the platform's default text encoding.
        with open(os.path.join(fixtures_dir, name), "rb") as handle:
            raw = handle.read()
        return raw.decode("utf8")

    return _read_file
1 change: 1 addition & 0 deletions tests/files/apple_forward.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<html><head><meta http-equiv="Content-Type" content="text/html charset=utf-8"></head><body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;" class="">test<div class=""><br class=""></div><div class="">blah<br class=""><div><br class=""><div><br class=""><blockquote type="cite" class=""><div class="">Begin forwarded message:</div><br class="Apple-interchange-newline"><div style="margin-top: 0px; margin-right: 0px; margin-bottom: 0px; margin-left: 0px;" class=""><span style="font-family: -webkit-system-font, Helvetica Neue, Helvetica, sans-serif; color:rgba(0, 0, 0, 1.0);" class=""><b class="">From: </b></span><span style="font-family: -webkit-system-font, Helvetica Neue, Helvetica, sans-serif;" class="">Foo Bar &lt;<a href="mailto:foo@bar.example" class="">foo@bar.example</a>&gt;<br class=""></span></div><div style="margin-top: 0px; margin-right: 0px; margin-bottom: 0px; margin-left: 0px;" class=""><span style="font-family: -webkit-system-font, Helvetica Neue, Helvetica, sans-serif; color:rgba(0, 0, 0, 1.0);" class=""><b class="">Subject: </b></span><span style="font-family: -webkit-system-font, Helvetica Neue, Helvetica, sans-serif;" class=""><b class="">The Subject</b><br class=""></span></div><div style="margin-top: 0px; margin-right: 0px; margin-bottom: 0px; margin-left: 0px;" class=""><span style="font-family: -webkit-system-font, Helvetica Neue, Helvetica, sans-serif; color:rgba(0, 0, 0, 1.0);" class=""><b class="">Date: </b></span><span style="font-family: -webkit-system-font, Helvetica Neue, Helvetica, sans-serif;" class="">March 24, 2016 at 20:16:25 GMT+1<br class=""></span></div><div style="margin-top: 0px; margin-right: 0px; margin-bottom: 0px; margin-left: 0px;" class=""><span style="font-family: -webkit-system-font, Helvetica Neue, Helvetica, sans-serif; color:rgba(0, 0, 0, 1.0);" class=""><b class="">To: </b></span><span style="font-family: -webkit-system-font, Helvetica Neue, Helvetica, sans-serif;" 
class="">John Doe &lt;<a href="mailto:john@doe.example" class="">john@doe.example</a>&gt;<br class=""></span></div><br class=""><div class=""><div dir="ltr" class="">Text of the original email</div>
1 change: 1 addition & 0 deletions tests/files/apple_reply.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<html><head><meta http-equiv="Content-Type" content="text/html charset=us-ascii"></head><body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;" class="">Foo<div class=""><br class=""></div><div class="">Bar</div><div class=""><br class=""></div><div class=""><div><blockquote type="cite" class=""><div class="">On 2016-03-25, at 23:01, John Doe &lt;<a href="mailto:john@doe.example" class="">john@doe.example</a>&gt; wrote:</div><br class="Apple-interchange-newline"><div class=""><meta http-equiv="Content-Type" content="text/html charset=us-ascii" class=""><div style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;" class="">Some <b class="">important</b> email<br class=""></div></div></blockquote></div><br class=""></div></body></html>
1 change: 1 addition & 0 deletions tests/files/gmail_forward.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<html><head></head><body><div dir="ltr">test<div><br></div><div>blah</div><div><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Foo Bar</b> <span dir="ltr">&lt;<a href="mailto:foo@bar.example">foo@bar.example</a>&gt;</span><br>Date: Thu, Mar 24, 2016 at 5:17 PM<br>Subject: The Subject<br>To: John Doe &lt;<a href="mailto:john@doe.example">john@doe.example</a>&gt;<br><br><br><div dir="ltr">Some text<div><br></div><div><br></div></div></div><br></div></div></body></html>
9 changes: 9 additions & 0 deletions tests/files/gmail_reply.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<html><head></head><body><div dir="ltr">foo<div><br></div><div>bar</div></div><div class="gmail_extra"><br><div class="gmail_quote">On Wed, Mar 16, 2016 at 12:49 AM, Foo Bar <span dir="ltr">&lt;<a href="mailto:foo@bar.example" target="_blank">foo@bar.example</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Hi,<br>
<br>This is the reply<br>
<br>
Thanks a lot!<br>
<span class="HOEnZb"><font color="#888888">Foo<br>
<br>
</font></span></blockquote></div><br><br clear="all"><div><br></div>-- <br><div class="gmail_signature"><div dir="ltr"><div><div dir="ltr"><b>John Doe</b></div><div dir="ltr"><b>Senior Director</b><div>Some Company</div></div></div></div></div>
</div>
</body></html>
118 changes: 118 additions & 0 deletions tests/test_unwrap_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import pytest

from quotequail import unwrap_html


@pytest.mark.parametrize(
    ("html", "expected"),
    [
        # Markup without any quoted or forwarded message: expect None.
        ("<p>html text</p>", None),
        # A bare forwarded-message header block: expect the type plus the
        # parsed headers and no html/html_top/html_bottom keys.
        (
            "Begin forwarded message:<br>\n<br>\nFrom: someone@example.com<br>\nTo: anyone@example.com<br>\nSubject: You won<br>\n",
            {
                "type": "forward",
                "from": "someone@example.com",
                "to": "anyone@example.com",
                "subject": "You won",
            },
        ),
    ],
)
def test_unwrap_html_simple(html, expected):
    """Check ``unwrap_html`` against small inline HTML snippets."""
    unwrapped = unwrap_html(html)
    assert unwrapped == expected


@pytest.mark.parametrize(
    ("file", "expected"),
    [
        # Forward fixture: apple_forward.html
        (
            "apple_forward.html",
            {
                "type": "forward",
                "subject": "The Subject",
                "date": "March 24, 2016 at 20:16:25 GMT+1",
                "from": "Foo Bar <foo@bar.example>",
                "to": "John Doe <john@doe.example>",
                "html_top": '<html><head></head><body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;" class="">test<div class=""><br class=""></div><div class="">blah</div></body></html>',
                "html": '<html><head></head><body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;" class=""><div class=""><div><div><div><div class=""><div dir="ltr" class="">Text of the original email</div></div></div></div></div></div></body></html>',
            },
        ),
        # Forward fixture: gmail_forward.html
        (
            "gmail_forward.html",
            {
                "type": "forward",
                "subject": "The Subject",
                "date": "Thu, Mar 24, 2016 at 5:17 PM",
                "from": "Foo Bar <foo@bar.example>",
                "to": "John Doe <john@doe.example>",
                "html_top": '<html><head></head><body><div dir="ltr">test<div><br></div><div>blah</div></div></body></html>',
                "html": '<html><head></head><body><div dir="ltr"><div><div class="gmail_quote"><div dir="ltr">Some text</div></div></div></div></body></html>',
            },
        ),
        # Reply fixture: apple_reply.html
        (
            "apple_reply.html",
            {
                "type": "reply",
                "from": "John Doe <john@doe.example>",
                "date": "2016-03-25, at 23:01",
                "html": '<html><head></head><body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;" class=""><div class=""><div><div><div class=""><div style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;" class="">Some <b class="">important</b> email</div></div></div></div></div></body></html>',
                "html_top": '<html><head></head><body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;" class="">Foo<div class=""><br class=""></div><div class="">Bar</div></body></html>',
            },
        ),
        # Reply fixture: gmail_reply.html (the only case expecting html_bottom)
        (
            "gmail_reply.html",
            {
                "type": "reply",
                "from": "Foo Bar <foo@bar.example>",
                "date": "Wed, Mar 16, 2016 at 12:49 AM",
                "html_top": '<html><head></head><body><div dir="ltr">foo<div><br></div><div>bar</div></div></body></html>',
                "html": '<html><head></head><body><div class="gmail_extra"><div class="gmail_quote"><div>Hi,<br>\n<br>This is the reply<br>\n<br>\nThanks a lot!<br>\n<span class="HOEnZb"><font color="#888888">Foo</font></span></div></div></div></body></html>',
                "html_bottom": '<html><head></head><body><div class="gmail_extra">-- <br><div class="gmail_signature"><div dir="ltr"><div><div dir="ltr"><b>John Doe</b></div><div dir="ltr"><b>Senior Director</b><div>Some Company</div></div></div></div></div>\n</div>\n</body></html>',
            },
        ),
    ],
)
def test_unwrap_html_file(read_file, file, expected):
    """Check ``unwrap_html`` on a stored fixture against the full expected dict."""
    unwrapped = unwrap_html(read_file(file))
    assert unwrapped == expected


def test_outlook_forward(read_file):
    """Unwrap the ``outlook_forward.html`` fixture and check every part."""
    unwrapped = unwrap_html(read_file("outlook_forward.html"))

    # Type and header values, checked in the listed order.
    expected_fields = {
        "type": "forward",
        "from": "John Doe",
        "to": "Foo Bar (foo@bar.example)",
        "date": "Wednesday, July 09, 2014 10:27 AM",
        "subject": "The subject!",
    }
    for key, value in expected_fields.items():
        assert unwrapped[key] == value

    # The rendered fragments are compared against stored reference files.
    expected_html = read_file("outlook_forward_unwrapped.html")
    assert unwrapped["html"] == expected_html
    expected_top = read_file("outlook_forward_unwrapped_top.html")
    assert unwrapped["html_top"] == expected_top
    assert "html_bottom" not in unwrapped


def test_thunderbird_forward(read_file):
    """Unwrap the ``thunderbird_forward.html`` fixture and check every part."""
    unwrapped = unwrap_html(read_file("thunderbird_forward.html"))

    # Type and header values, checked in the listed order.
    expected_fields = {
        "type": "forward",
        "from": "John Doe <johndoe@example.com>",
        "to": "Foo Bar <foobar@example.com>",
        "date": "Tue, 3 May 2016 14:54:27 +0200 (CEST)",
        "subject": "Re: Example subject",
    }
    for key, value in expected_fields.items():
        assert unwrapped[key] == value

    # Only the main body is expected; no top or bottom fragment.
    assert "html_top" not in unwrapped
    expected_html = read_file("thunderbird_forward_unwrapped.html")
    assert unwrapped["html"] == expected_html
    assert "html_bottom" not in unwrapped


def test_mailru_forward(read_file):
    """Unwrap the ``mailru_forward.html`` fixture and check every part."""
    unwrapped = unwrap_html(read_file("mailru_forward.html"))

    # Type and header values (non-ASCII text), checked in the listed order.
    expected_fields = {
        "type": "forward",
        "from": "Иван Иванов <ivanivanov@example.com>",
        "to": "Петр Петров <petrpetrov@example.com>",
        "date": "Среда, 14 июня 2017, 15:19 +03:00",
        "subject": "Тестовая тема",
    }
    for key, value in expected_fields.items():
        assert unwrapped[key] == value

    # Only the main body is expected; no top or bottom fragment.
    assert "html_top" not in unwrapped
    expected_html = read_file("mailru_forward_unwrapped.html")
    assert unwrapped["html"] == expected_html
    assert "html_bottom" not in unwrapped

0 comments on commit 273addf

Please sign in to comment.