diff --git a/moffee/utils/file_helper.py b/moffee/utils/file_helper.py index 4d749d6..b0c6675 100644 --- a/moffee/utils/file_helper.py +++ b/moffee/utils/file_helper.py @@ -1,7 +1,7 @@ import os import re import shutil -from urllib.parse import urlparse +from urllib.parse import unquote, urlparse, quote from pathlib import Path import uuid @@ -31,7 +31,7 @@ def redirect_paths(document: str, document_path: str, resource_dir: str = ".") - - The resource dir (if it exists as an absolute path) - The resource dir relative to the document (Otherwise) - :param document: Markdown document string + :param document: HTML document string :param document_path: Path to the document :param resource_dir: Optional resource path :return: Document string with all urls redirected. @@ -45,10 +45,9 @@ def is_absolute_url(url): def make_absolute(base, relative): return os.path.abspath(os.path.normpath(os.path.join(base, relative))) - def replace_url(match): - url = match.group(1) + def redirect_url(url): if is_absolute_url(url): - return match.group(0) + return url # Try different base paths to make the URL absolute base_paths = [ @@ -60,23 +59,35 @@ def replace_url(match): for base in base_paths: absolute_url = make_absolute(base, url) if os.path.exists(absolute_url) or is_absolute_url(absolute_url): - return match.group(0).replace(url, absolute_url) + return absolute_url - return match.group(0) + return url - # Regular expression to find markdown links - # import ipdb; ipdb.set_trace(context=15) - url_pattern = re.compile(r'"(.+?)"') + soup = BeautifulSoup(document, "html.parser") - # Substitute all URLs in the document using the replace_url function - redirected_document = url_pattern.sub(replace_url, document) + # Tags and attributes to check for URLs + tag_attr_pairs = [ + ("img", "src"), + ("link", "href"), + ("script", "src"), + ("a", "href"), + ] - return redirected_document + for tag, attr in tag_attr_pairs: + for element in soup.find_all(tag): + if element.has_attr(attr): + original_url = element[attr] + decoded_url = unquote(original_url) + redirected_url = redirect_url(decoded_url) + element[attr] = redirected_url + + return str(soup) def copy_assets(document: str, target_dir: str) -> str: """ Copy all asset resources in an HTML document to target_dir, then update URLs to target_dir/uuid_originalname.ext + Handles encoded URLs. :param document: HTML document to process :param target_dir: Target directory @@ -103,24 +114,34 @@ def copy_assets(document: str, target_dir: str) -> str: if element.has_attr(attr): original_path = element[attr] - # Skip if it's an external URL or a non-file path - if urlparse(original_path).scheme or not os.path.isfile(original_path): + # Decode the URL + decoded_path = unquote(original_path) + + # Skip if it's an external URL + if urlparse(decoded_path).scheme: + continue + + # Convert to absolute path if it's relative + absolute_path = os.path.abspath(decoded_path) + + # Skip if it's not a file + if not os.path.isfile(absolute_path): continue if original_path not in path_mapping: # Generate a new filename - original_filename = os.path.basename(original_path) + original_filename = os.path.basename(absolute_path) name, ext = os.path.splitext(original_filename) new_filename = f"{str(uuid.uuid4())[:8]}_{name}{ext}" new_path = os.path.join(target_dir, new_filename) # Copy the file - shutil.copy2(original_path, new_path) + shutil.copy2(absolute_path, new_path) # Store the mapping path_mapping[original_path] = new_path # Update the attribute with the new path - element[attr] = path_mapping[original_path] + element[attr] = quote(path_mapping[original_path]) return str(soup) diff --git a/tests/test_copy_assets.py b/tests/test_copy_assets.py index 0b0b0f3..4e3e11d 100644 --- a/tests/test_copy_assets.py +++ b/tests/test_copy_assets.py @@ -1,7 +1,9 @@ +from email.utils import unquote import os import pytest import shutil import tempfile +from urllib.parse import quote from moffee.utils.file_helper import ( copy_assets, @@ -26,6 +28,21 @@ def setup_test_environment(): shutil.rmtree(temp_dir) +@pytest.fixture +def setup_file_with_spaces(): + temp_dir = tempfile.mkdtemp() + + # Create a sample file with a space in its name + sample_file_name = "sample file with spaces.txt" + sample_file_path = os.path.join(temp_dir, sample_file_name) + with open(sample_file_path, "w") as f: + f.write("This is a sample file with spaces in its name") + + yield temp_dir, sample_file_path, sample_file_name + + shutil.rmtree(temp_dir) + + def test_copy_assets_updates_links(setup_test_environment): temp_dir, sample_image_path, sample_pdf_path = setup_test_environment target_dir = os.path.join(temp_dir, "asset_resources") @@ -163,3 +180,44 @@ def test_copy_assets_ignores_urls_in_text_content(): assert updated_doc.count(sample_file_path) == 2 shutil.rmtree(temp_dir) + + +def test_copy_assets_handles_encoded_urls(setup_file_with_spaces): + temp_dir, sample_file_path, sample_file_name = setup_file_with_spaces + target_dir = os.path.join(temp_dir, "asset_resources") + + # Encode the file path + encoded_file_path = quote(sample_file_path) + + html_doc = f""" + + + + + Test Document + + +

Test HTML Document

+ Sample Image + Download Sample + + + """ + + updated_doc = copy_assets(html_doc, target_dir) + + # Check that the target directory is created + assert os.path.exists(target_dir) + + # Verify that the file has been moved + moved_files = os.listdir(target_dir) + assert len(moved_files) == 1 + + # Check if document URLs are updated with new file names + assert quote(os.path.join(target_dir, moved_files[0])) in updated_doc + + # Verify that the encoded URL is no longer in the updated document + assert encoded_file_path not in updated_doc + + # Verify that the original file name (with spaces) is not in the updated document + assert sample_file_name not in updated_doc diff --git a/tests/test_redirect_paths.py b/tests/test_redirect_paths.py index 4618c8c..db45f7a 100644 --- a/tests/test_redirect_paths.py +++ b/tests/test_redirect_paths.py @@ -1,3 +1,4 @@ +from urllib.parse import quote import pytest import tempfile import os @@ -31,9 +32,22 @@ def test_redirect_paths(setup_test_env): temp_dir, doc_path, res_dir = setup_test_env document = """ -Image Path: "image.png" -Image in resource: "image2.png" -URL: "http://example.com" + + + + + + Sample Document + + + +

Sample Images and URL

+ Sample Image + Image in resource + Example URL + + +" """ redirected_document = redirect_paths(document, doc_path, res_dir) @@ -41,18 +55,31 @@ def test_redirect_paths(setup_test_env): expected_path_image1 = os.path.abspath(os.path.join(temp_dir, "image.png")) expected_path_image2 = os.path.abspath(os.path.join(res_dir, "image2.png")) - assert f'"{expected_path_image1}"' in redirected_document - assert '"http://example.com"' in redirected_document - assert f'"{expected_path_image2}"' in redirected_document + assert quote(f"{expected_path_image1}") in redirected_document + assert "http://example.com" in redirected_document + assert quote(f"{expected_path_image2}") in redirected_document def test_redirect_paths_no_res_dir(setup_test_env): temp_dir, doc_path, res_dir = setup_test_env document = """ -Image Path: "image.png" -Image in resource: "image2.png" -URL: "http://example.com" + + + + + + Sample Document + + + +

Sample Images and URL

+ Sample Image + Image in resource + Example URL + + +" """ redirected_document = redirect_paths(document, doc_path) @@ -60,10 +87,10 @@ def test_redirect_paths_no_res_dir(setup_test_env): expected_path_image1 = os.path.abspath(os.path.join(temp_dir, "image.png")) expected_path_image2 = os.path.abspath(os.path.join(res_dir, "image2.png")) - assert f'"{expected_path_image1}"' in redirected_document - assert '"http://example.com"' in redirected_document - assert not f'"{expected_path_image2}"' in redirected_document - assert f'"image2.png"' in redirected_document + assert quote(f"{expected_path_image1}") in redirected_document + assert "http://example.com" in redirected_document + assert quote(f"{expected_path_image2}") in redirected_document + assert quote(f"image2.png") in redirected_document def test_redirect_paths_trivial(setup_test_env):