diff --git a/data_managers/data_manager_gtdbtk_database_installer/data_manager/gtdbtk_database_installer.py b/data_managers/data_manager_gtdbtk_database_installer/data_manager/gtdbtk_database_installer.py index 526bf14a505..75a3c8e712d 100644 --- a/data_managers/data_manager_gtdbtk_database_installer/data_manager/gtdbtk_database_installer.py +++ b/data_managers/data_manager_gtdbtk_database_installer/data_manager/gtdbtk_database_installer.py @@ -45,6 +45,60 @@ def is_urlfile(url): return False +def extract_tar_iteratively(tarball, target_directory): + """ + Extracts a .tar, .tar.gz, or .tar.bz2 archive iteratively in a memory-efficient manner. + + This function processes the contents of the archive member-by-member, ensuring only + one file or directory is loaded into memory at any given time. It handles the creation + of directories and symbolic links, and streams large files to disk in chunks to avoid + memory overload. + + Args: + tarball (str): Path to the tar archive (e.g., .tar, .tar.gz, .tar.bz2) to be extracted. + target_directory (str): The destination directory where the archive content + will be extracted. + + Raises: + OSError: If there is an issue with file or directory creation, or writing to disk. + tarfile.TarError: If there is an issue opening or reading the tar archive. + + Example Usage: + extract_tar_iteratively("archive.tar.gz", "/path/to/extract") + + Notes: + - The function supports symbolic and hard links present in the tar archive. + - It ensures that directories are created before files are extracted. + - Large files are streamed to disk in 1 MB chunks to minimize memory usage. + - This function does not return anything but will populate the target directory with + the extracted content. + """ + + with tarfile.open(tarball, "r:*") as fh: + for member in fh: + # Full path to where the member should be extracted + member_path = os.path.join(target_directory, member.name) + + if member.isdir(): + # If it's a directory, ensure it exists + os.makedirs(member_path, exist_ok=True) + elif member.isfile(): + # If it's a file, extract it in chunks to avoid memory spikes + with fh.extractfile(member) as source, open( + member_path, "wb" + ) as target: + shutil.copyfileobj( + source, target, length=1024 * 1024 + ) # 1 MB chunks + elif member.issym() or member.islnk(): + # Handle symlinks or hard links if necessary + target_link = os.path.join(target_directory, member.name) + if member.issym(): + os.symlink(member.linkname, target_link) + elif member.islnk(): + os.link(member.linkname, target_link) + + def url_download(url, target_directory, meta): # download the url @@ -59,7 +113,7 @@ def url_download(url, target_directory, meta): src = urlopen(req) with open(tarball, "wb") as dst: while True: - chunk = src.read(2**10) + chunk = src.read(2**16) # Read in 64 KB chunks instead of 1 KB if chunk: dst.write(chunk) else: @@ -74,9 +128,7 @@ def url_download(url, target_directory, meta): if meta: # extract the content of *.tar.gz into the target dir if tarfile.is_tarfile(tarball): - fh = tarfile.open(tarball, "r:*") - fh.extractall(target_directory) - fh.close() + extract_tar_iteratively(tarball, target_directory) os.remove(tarball) return target_directory # return path to output folder # extract the content of *.gz into the target dir @@ -96,9 +148,7 @@ def url_download(url, target_directory, meta): # handle the DB # extract the content of the folder in the tar.gz into the target dir if tarfile.is_tarfile(tarball): - fh = tarfile.open(tarball, "r:*") - fh.extractall(target_directory) - fh.close() + extract_tar_iteratively(tarball, target_directory) os.remove(tarball) else: # handle the test case for the DB diff --git a/data_managers/data_manager_gtdbtk_database_installer/data_manager/gtdbtk_database_installer.xml b/data_managers/data_manager_gtdbtk_database_installer/data_manager/gtdbtk_database_installer.xml index 3de58286982..52b73776ebe 100644 --- a/data_managers/data_manager_gtdbtk_database_installer/data_manager/gtdbtk_database_installer.xml +++ b/data_managers/data_manager_gtdbtk_database_installer/data_manager/gtdbtk_database_installer.xml @@ -2,7 +2,7 @@ 202 - 3 + 4 20.09