Skip to content

Commit

Permalink
Escape quoted URLs
Browse files Browse the repository at this point in the history
- Unquote URL before checking existence
- Add tests
  • Loading branch information
BMPixel committed Sep 9, 2024
1 parent 9afd437 commit 4de9e32
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 31 deletions.
57 changes: 39 additions & 18 deletions moffee/utils/file_helper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import re
import shutil
from urllib.parse import urlparse
from urllib.parse import unquote, urlparse, quote
from pathlib import Path
import uuid

Expand Down Expand Up @@ -31,7 +31,7 @@ def redirect_paths(document: str, document_path: str, resource_dir: str = ".") -
- The resource dir (if it exists as an absolute path)
- The resource dir relative to the document (Otherwise)
:param document: Markdown document string
:param document: HTML document string
:param document_path: Path to the document
:param resource_dir: Optional resource path
:return: Document string with all urls redirected.
Expand All @@ -45,10 +45,9 @@ def is_absolute_url(url):
def make_absolute(base, relative):
return os.path.abspath(os.path.normpath(os.path.join(base, relative)))

def replace_url(match):
url = match.group(1)
def redirect_url(url):
if is_absolute_url(url):
return match.group(0)
return url

# Try different base paths to make the URL absolute
base_paths = [
Expand All @@ -60,23 +59,35 @@ def replace_url(match):
for base in base_paths:
absolute_url = make_absolute(base, url)
if os.path.exists(absolute_url) or is_absolute_url(absolute_url):
return match.group(0).replace(url, absolute_url)
return absolute_url

return match.group(0)
return url

# Regular expression to find markdown links
# import ipdb; ipdb.set_trace(context=15)
url_pattern = re.compile(r'"(.+?)"')
soup = BeautifulSoup(document, "html.parser")

# Substitute all URLs in the document using the replace_url function
redirected_document = url_pattern.sub(replace_url, document)
# Tags and attributes to check for URLs
tag_attr_pairs = [
("img", "src"),
("link", "href"),
("script", "src"),
("a", "href"),
]

return redirected_document
for tag, attr in tag_attr_pairs:
for element in soup.find_all(tag):
if element.has_attr(attr):
original_url = element[attr]
decoded_url = unquote(original_url)
redirected_url = redirect_url(decoded_url)
element[attr] = redirected_url

return str(soup)


def copy_assets(document: str, target_dir: str) -> str:
"""
Copy all asset resources in an HTML document to target_dir, then update URLs to target_dir/uuid_originalname.ext
Handles encoded URLs.
:param document: HTML document to process
:param target_dir: Target directory
Expand All @@ -103,24 +114,34 @@ def copy_assets(document: str, target_dir: str) -> str:
if element.has_attr(attr):
original_path = element[attr]

# Skip if it's an external URL or a non-file path
if urlparse(original_path).scheme or not os.path.isfile(original_path):
# Decode the URL
decoded_path = unquote(original_path)

# Skip if it's an external URL
if urlparse(decoded_path).scheme:
continue

# Convert to absolute path if it's relative
absolute_path = os.path.abspath(decoded_path)

# Skip if it's not a file
if not os.path.isfile(absolute_path):
continue

if original_path not in path_mapping:
# Generate a new filename
original_filename = os.path.basename(original_path)
original_filename = os.path.basename(absolute_path)
name, ext = os.path.splitext(original_filename)
new_filename = f"{str(uuid.uuid4())[:8]}_{name}{ext}"
new_path = os.path.join(target_dir, new_filename)

# Copy the file
shutil.copy2(original_path, new_path)
shutil.copy2(absolute_path, new_path)

# Store the mapping
path_mapping[original_path] = new_path

# Update the attribute with the new path
element[attr] = path_mapping[original_path]
element[attr] = quote(path_mapping[original_path])

return str(soup)
58 changes: 58 additions & 0 deletions tests/test_copy_assets.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from email.utils import unquote
import os
import pytest
import shutil
import tempfile
from urllib.parse import quote

from moffee.utils.file_helper import (
copy_assets,
Expand All @@ -26,6 +28,21 @@ def setup_test_environment():
shutil.rmtree(temp_dir)


@pytest.fixture
def setup_file_with_spaces():
    """Provide a temporary directory holding one text file whose name contains spaces.

    Yields a ``(temp_dir, sample_file_path, sample_file_name)`` tuple and
    removes the temporary directory once the test has finished.
    """
    spaced_name = "sample file with spaces.txt"
    workspace = tempfile.mkdtemp()
    spaced_path = os.path.join(workspace, spaced_name)

    # Materialise the file so path-existence checks against it succeed.
    with open(spaced_path, "w") as handle:
        handle.write("This is a sample file with spaces in its name")

    yield workspace, spaced_path, spaced_name

    # Teardown: drop the directory together with everything created inside it.
    shutil.rmtree(workspace)


def test_copy_assets_updates_links(setup_test_environment):
temp_dir, sample_image_path, sample_pdf_path = setup_test_environment
target_dir = os.path.join(temp_dir, "asset_resources")
Expand Down Expand Up @@ -163,3 +180,44 @@ def test_copy_assets_ignores_urls_in_text_content():
assert updated_doc.count(sample_file_path) == 2

shutil.rmtree(temp_dir)


def test_copy_assets_handles_encoded_urls(setup_file_with_spaces):
    """Check that copy_assets copies a percent-encoded local asset and rewrites its links.

    The document references a file whose name contains spaces through a
    percent-encoded path; after processing, exactly one file must exist in
    the target directory and the document must point at its quoted new path.
    """
    workspace, spaced_path, spaced_name = setup_file_with_spaces
    assets_dir = os.path.join(workspace, "asset_resources")

    # Percent-encode the path so the spaces in the file name become %20.
    quoted_path = quote(spaced_path)

    # NOTE(review): `unquote` is imported from `email.utils` at the top of this
    # module, not from `urllib.parse`. That helper only strips surrounding
    # quotes / angle brackets, so this presumably leaves the path
    # percent-encoded and the <a> ends up with the same URL as the <img>.
    # Confirm whether the decoded form was intended here.
    href_value = unquote(quoted_path)

    html_doc = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Test Document</title>
</head>
<body>
<h1>Test HTML Document</h1>
<img src="{quoted_path}" alt="Sample Image">
<a href="{href_value}">Download Sample</a>
</body>
</html>
"""

    rewritten = copy_assets(html_doc, assets_dir)

    # The target directory must have been created by the call.
    assert os.path.exists(assets_dir)

    # Exactly one asset is expected to have been copied into it.
    copied = os.listdir(assets_dir)
    assert len(copied) == 1

    # Links in the document must now use the quoted path of the copied file.
    expected_link = quote(os.path.join(assets_dir, copied[0]))
    assert expected_link in rewritten

    # Neither the encoded original URL nor the raw spaced file name may remain.
    assert quoted_path not in rewritten
    assert spaced_name not in rewritten
53 changes: 40 additions & 13 deletions tests/test_redirect_paths.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from urllib.parse import quote
import pytest
import tempfile
import os
Expand Down Expand Up @@ -31,39 +32,65 @@ def test_redirect_paths(setup_test_env):
temp_dir, doc_path, res_dir = setup_test_env

document = """
Image Path: "image.png"
Image in resource: "image2.png"
URL: "http://example.com"
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Sample Document</title>
<link rel="stylesheet" href="styles.css">
</head>
<body>
<h1>Sample Images and URL</h1>
<img src="image.png" alt="Sample Image">
<img src="resources/image2.png" alt="Image in resource">
<a href="http://example.com">Example URL</a>
<script src="script.js"></script>
</body>
</html>"
"""

redirected_document = redirect_paths(document, doc_path, res_dir)

expected_path_image1 = os.path.abspath(os.path.join(temp_dir, "image.png"))
expected_path_image2 = os.path.abspath(os.path.join(res_dir, "image2.png"))

assert f'"{expected_path_image1}"' in redirected_document
assert '"http://example.com"' in redirected_document
assert f'"{expected_path_image2}"' in redirected_document
assert quote(f"{expected_path_image1}") in redirected_document
assert "http://example.com" in redirected_document
assert quote(f"{expected_path_image2}") in redirected_document


def test_redirect_paths_no_res_dir(setup_test_env):
temp_dir, doc_path, res_dir = setup_test_env

document = """
Image Path: "image.png"
Image in resource: "image2.png"
URL: "http://example.com"
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Sample Document</title>
<link rel="stylesheet" href="styles.css">
</head>
<body>
<h1>Sample Images and URL</h1>
<img src="image.png" alt="Sample Image">
<img src="resources/image2.png" alt="Image in resource">
<a href="http://example.com">Example URL</a>
<script src="script.js"></script>
</body>
</html>"
"""

redirected_document = redirect_paths(document, doc_path)

expected_path_image1 = os.path.abspath(os.path.join(temp_dir, "image.png"))
expected_path_image2 = os.path.abspath(os.path.join(res_dir, "image2.png"))

assert f'"{expected_path_image1}"' in redirected_document
assert '"http://example.com"' in redirected_document
assert not f'"{expected_path_image2}"' in redirected_document
assert f'"image2.png"' in redirected_document
assert quote(f"{expected_path_image1}") in redirected_document
assert "http://example.com" in redirected_document
assert quote(f"{expected_path_image2}") in redirected_document
assert quote(f"image2.png") in redirected_document


def test_redirect_paths_trivial(setup_test_env):
Expand Down

0 comments on commit 4de9e32

Please sign in to comment.