Escape quoted URLs

- Unquote URL before checking existence - Add tests
BMPixel · Sep 9, 2024 · 4de9e32 · 4de9e32
1 parent 9afd437
commit 4de9e32
Show file tree

Hide file tree

Showing 3 changed files with 137 additions and 31 deletions.
diff --git a/moffee/utils/file_helper.py b/moffee/utils/file_helper.py
@@ -1,7 +1,7 @@
 import os
 import re
 import shutil
-from urllib.parse import urlparse
+from urllib.parse import unquote, urlparse, quote
 from pathlib import Path
 import uuid
 
@@ -31,7 +31,7 @@ def redirect_paths(document: str, document_path: str, resource_dir: str = ".") -
     - The resource dir (if it exists as an absolute path)
     - The resource dir relative to the document (Otherwise)
 
-    :param document: Markdown document string
+    :param document: HTML document string
     :param document_path: Path to the document
     :param resource_dir: Optional resource path
     :return: Document string with all urls redirected.
@@ -45,10 +45,9 @@ def is_absolute_url(url):
     def make_absolute(base, relative):
         return os.path.abspath(os.path.normpath(os.path.join(base, relative)))
 
-    def replace_url(match):
-        url = match.group(1)
+    def redirect_url(url):
         if is_absolute_url(url):
-            return match.group(0)
+            return url
 
         # Try different base paths to make the URL absolute
         base_paths = [
@@ -60,23 +59,35 @@ def replace_url(match):
         for base in base_paths:
             absolute_url = make_absolute(base, url)
             if os.path.exists(absolute_url) or is_absolute_url(absolute_url):
-                return match.group(0).replace(url, absolute_url)
+                return absolute_url
 
-        return match.group(0)
+        return url
 
-    # Regular expression to find markdown links
-    # import ipdb; ipdb.set_trace(context=15)
-    url_pattern = re.compile(r'"(.+?)"')
+    soup = BeautifulSoup(document, "html.parser")
 
-    # Substitute all URLs in the document using the replace_url function
-    redirected_document = url_pattern.sub(replace_url, document)
+    # Tags and attributes to check for URLs
+    tag_attr_pairs = [
+        ("img", "src"),
+        ("link", "href"),
+        ("script", "src"),
+        ("a", "href"),
+    ]
 
-    return redirected_document
+    for tag, attr in tag_attr_pairs:
+        for element in soup.find_all(tag):
+            if element.has_attr(attr):
+                original_url = element[attr]
+                decoded_url = unquote(original_url)
+                redirected_url = redirect_url(decoded_url)
+                element[attr] = redirected_url
+
+    return str(soup)
 
 
 def copy_assets(document: str, target_dir: str) -> str:
     """
     Copy all asset resources in an HTML document to target_dir, then update URLs to target_dir/uuid_originalname.ext
+    Handles encoded URLs.
 
     :param document: HTML document to process
     :param target_dir: Target directory
@@ -103,24 +114,34 @@ def copy_assets(document: str, target_dir: str) -> str:
             if element.has_attr(attr):
                 original_path = element[attr]
 
-                # Skip if it's an external URL or a non-file path
-                if urlparse(original_path).scheme or not os.path.isfile(original_path):
+                # Decode the URL
+                decoded_path = unquote(original_path)
+
+                # Skip if it's an external URL
+                if urlparse(decoded_path).scheme:
+                    continue
+
+                # Convert to absolute path if it's relative
+                absolute_path = os.path.abspath(decoded_path)
+
+                # Skip if it's not a file
+                if not os.path.isfile(absolute_path):
                     continue
 
                 if original_path not in path_mapping:
                     # Generate a new filename
-                    original_filename = os.path.basename(original_path)
+                    original_filename = os.path.basename(absolute_path)
                     name, ext = os.path.splitext(original_filename)
                     new_filename = f"{str(uuid.uuid4())[:8]}_{name}{ext}"
                     new_path = os.path.join(target_dir, new_filename)
 
                     # Copy the file
-                    shutil.copy2(original_path, new_path)
+                    shutil.copy2(absolute_path, new_path)
 
                     # Store the mapping
                     path_mapping[original_path] = new_path
 
                 # Update the attribute with the new path
-                element[attr] = path_mapping[original_path]
+                element[attr] = quote(path_mapping[original_path])
 
     return str(soup)
diff --git a/tests/test_copy_assets.py b/tests/test_copy_assets.py
@@ -1,7 +1,9 @@
+from email.utils import unquote
 import os
 import pytest
 import shutil
 import tempfile
+from urllib.parse import quote
 
 from moffee.utils.file_helper import (
     copy_assets,
@@ -26,6 +28,21 @@ def setup_test_environment():
     shutil.rmtree(temp_dir)
 
 
+@pytest.fixture
+def setup_file_with_spaces():
+    temp_dir = tempfile.mkdtemp()
+
+    # Create a sample file with a space in its name
+    sample_file_name = "sample file with spaces.txt"
+    sample_file_path = os.path.join(temp_dir, sample_file_name)
+    with open(sample_file_path, "w") as f:
+        f.write("This is a sample file with spaces in its name")
+
+    yield temp_dir, sample_file_path, sample_file_name
+
+    shutil.rmtree(temp_dir)
+
+
 def test_copy_assets_updates_links(setup_test_environment):
     temp_dir, sample_image_path, sample_pdf_path = setup_test_environment
     target_dir = os.path.join(temp_dir, "asset_resources")
@@ -163,3 +180,44 @@ def test_copy_assets_ignores_urls_in_text_content():
     assert updated_doc.count(sample_file_path) == 2
 
     shutil.rmtree(temp_dir)
+
+
+def test_copy_assets_handles_encoded_urls(setup_file_with_spaces):
+    temp_dir, sample_file_path, sample_file_name = setup_file_with_spaces
+    target_dir = os.path.join(temp_dir, "asset_resources")
+
+    # Encode the file path
+    encoded_file_path = quote(sample_file_path)
+
+    html_doc = f"""
+    <!DOCTYPE html>
+    <html lang="en">
+    <head>
+        <meta charset="UTF-8">
+        <title>Test Document</title>
+    </head>
+    <body>
+        <h1>Test HTML Document</h1>
+        <img src="{encoded_file_path}" alt="Sample Image">
+        <a href="{unquote(encoded_file_path)}">Download Sample</a>
+    </body>
+    </html>
+    """
+
+    updated_doc = copy_assets(html_doc, target_dir)
+
+    # Check that the target directory is created
+    assert os.path.exists(target_dir)
+
+    # Verify that the file has been moved
+    moved_files = os.listdir(target_dir)
+    assert len(moved_files) == 1
+
+    # Check if document URLs are updated with new file names
+    assert quote(os.path.join(target_dir, moved_files[0])) in updated_doc
+
+    # Verify that the encoded URL is no longer in the updated document
+    assert encoded_file_path not in updated_doc
+
+    # Verify that the original file name (with spaces) is not in the updated document
+    assert sample_file_name not in updated_doc
diff --git a/tests/test_redirect_paths.py b/tests/test_redirect_paths.py
@@ -1,3 +1,4 @@
+from urllib.parse import quote
 import pytest
 import tempfile
 import os
@@ -31,39 +32,65 @@ def test_redirect_paths(setup_test_env):
     temp_dir, doc_path, res_dir = setup_test_env
 
     document = """
-Image Path: "image.png"
-Image in resource: "image2.png"
-URL: "http://example.com"
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Sample Document</title>
+    <link rel="stylesheet" href="styles.css">
+</head>
+<body>
+    <h1>Sample Images and URL</h1>
+    <img src="image.png" alt="Sample Image">
+    <img src="resources/image2.png" alt="Image in resource">
+    <a href="http://example.com">Example URL</a>
+    <script src="script.js"></script>
+</body>
+</html>"
 """
 
     redirected_document = redirect_paths(document, doc_path, res_dir)
 
     expected_path_image1 = os.path.abspath(os.path.join(temp_dir, "image.png"))
     expected_path_image2 = os.path.abspath(os.path.join(res_dir, "image2.png"))
 
-    assert f'"{expected_path_image1}"' in redirected_document
-    assert '"http://example.com"' in redirected_document
-    assert f'"{expected_path_image2}"' in redirected_document
+    assert quote(f"{expected_path_image1}") in redirected_document
+    assert "http://example.com" in redirected_document
+    assert quote(f"{expected_path_image2}") in redirected_document
 
 
 def test_redirect_paths_no_res_dir(setup_test_env):
     temp_dir, doc_path, res_dir = setup_test_env
 
     document = """
-Image Path: "image.png"
-Image in resource: "image2.png"
-URL: "http://example.com"
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Sample Document</title>
+    <link rel="stylesheet" href="styles.css">
+</head>
+<body>
+    <h1>Sample Images and URL</h1>
+    <img src="image.png" alt="Sample Image">
+    <img src="resources/image2.png" alt="Image in resource">
+    <a href="http://example.com">Example URL</a>
+    <script src="script.js"></script>
+</body>
+</html>"
 """
 
     redirected_document = redirect_paths(document, doc_path)
 
     expected_path_image1 = os.path.abspath(os.path.join(temp_dir, "image.png"))
     expected_path_image2 = os.path.abspath(os.path.join(res_dir, "image2.png"))
 
-    assert f'"{expected_path_image1}"' in redirected_document
-    assert '"http://example.com"' in redirected_document
-    assert not f'"{expected_path_image2}"' in redirected_document
-    assert f'"image2.png"' in redirected_document
+    assert quote(f"{expected_path_image1}") in redirected_document
+    assert "http://example.com" in redirected_document
+    assert quote(f"{expected_path_image2}") in redirected_document
+    assert quote(f"image2.png") in redirected_document
 
 
 def test_redirect_paths_trivial(setup_test_env):