remove aria2 in favor of wget
closes #21
scottstanie committed Aug 25, 2023
1 parent ba09d26 commit c19fb6d
Showing 3 changed files with 23 additions and 11 deletions.
conda-env.yml (2 changes: 1 addition & 1 deletion)
@@ -5,7 +5,7 @@ dependencies:
- python>=3.8
- pip>=21.3 # https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/#editable-installation
- git # for pip install, due to setuptools_scm
- - aria2 # ASF Downloading
+ - wget # ASF Downloading
# - isce3>=0.14.0
# - compass>=0.4.1
# - dask>=2022.6.0
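Swapping the aria2 package for wget means downloads now depend on the wget binary being present in the environment. A small, hypothetical guard (not part of this commit) could fail fast if it is missing:

import shutil

# Hypothetical pre-flight check, not in the commit: fail early if the wget
# binary that conda-env.yml now provides is not actually on PATH.
if shutil.which("wget") is None:
    raise RuntimeError("wget not found on PATH; install it, e.g. via conda-env.yml")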
src/sweets/core.py (13 changes: 5 additions & 8 deletions)
@@ -253,7 +253,7 @@ def _download_water_mask(self) -> Future:
create_water_mask, self._water_mask_filename, self._dem_bbox
)

- def _download_rslcs(self) -> Future:
+ def _download_rslcs(self) -> list[Path]:
"""Download Sentinel zip files from ASF."""
self.log_dir.mkdir(parents=True, exist_ok=True)
# The final name will depend on if we're unzipping or not
@@ -264,18 +264,15 @@ def _download_rslcs(self) -> Future:
f"Found {len(existing_files)} existing files in"
f" {self.asf_query.out_dir}. Skipping download."
)
- return self._client.submit(lambda: existing_files)
+ return existing_files

# If we didn't have any, we need to download them
# TODO: how should we handle partial/failed downloads... do we really
# want to re-search for them each time?
# Maybe there can be a "force" flag to re-download everything?
# or perhaps an API search, then if the number matches, we can skip
# rather than let aria2c start and do the checksums
- rslc_futures = self._client.submit(
-     self.asf_query.download, log_dir=self.log_dir
- )
- return rslc_futures
+ return self.asf_query.download(log_dir=self.log_dir)

@log_runtime
def _geocode_slcs(self, slc_files, dem_file, burst_db_file):
@@ -543,12 +540,12 @@ def run(self, starting_step: int = 1):
dem_fut = self._download_dem()
burst_db_fut = self._download_burst_db()
water_mask_future = self._download_water_mask()
- rslc_futures = self._download_rslcs()
# Gather the futures once everything is downloaded
burst_db_file = burst_db_fut.result()
dem_fut.result()
wait([water_mask_future])
- rslc_files = rslc_futures.result()

+ rslc_files = self._download_rslcs()

# Second step:
if starting_step <= 2:
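The core.py change makes _download_rslcs a plain blocking call that returns list[Path], while the DEM, burst-DB, and water-mask downloads are still submitted to self._client as futures and gathered with .result()/wait(). A minimal sketch of that mixed pattern, using a concurrent.futures executor as a stand-in for the project's client (names below are illustrative, not the project's API):

# Sketch of the new calling pattern in `run`, with a ThreadPoolExecutor
# standing in for the project's client object.
from concurrent.futures import ThreadPoolExecutor, wait
from pathlib import Path


def download_dem() -> Path:           # stand-in for self._download_dem()
    return Path("dem.tif")


def download_rslcs() -> list[Path]:   # stand-in for self._download_rslcs()
    return [Path("rslc_0.zip")]       # placeholder output


with ThreadPoolExecutor() as client:
    dem_future = client.submit(download_dem)   # DEM still runs as a future
    rslc_files = download_rslcs()              # RSLCs: a direct list[Path], no .result()
    wait([dem_future])
    dem_file = dem_future.result()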
src/sweets/download.py (19 changes: 17 additions & 2 deletions)
@@ -22,6 +22,7 @@
import subprocess
import sys
import zipfile
+ from concurrent.futures import ThreadPoolExecutor
from datetime import date, datetime
from functools import lru_cache
from pathlib import Path
@@ -192,6 +193,20 @@ def _download_with_aria(self, urls, log_dir: Filename = Path(".")):
with open(log_filename, "w") as f:
subprocess.run(aria_cmd, shell=True, stdout=f, stderr=f, text=True)

+ def _download_with_wget(self, urls, log_dir: Filename = Path(".")):
+     def download_url(idx_url_pair):
+         idx, u = idx_url_pair
+         log_filename = Path(log_dir) / f"wget_{idx:02d}.log"
+         with open(log_filename, "w") as f:
+             wget_cmd = f'wget -nc -c "{u}" -P "{self.out_dir}"'
+             logger.info(f"({idx} / {len(urls)}): Downloading {u} with wget")
+             logger.info(wget_cmd)
+             subprocess.run(wget_cmd, shell=True, stdout=f, stderr=f, text=True)
+
+     # Parallelize the download using ThreadPoolExecutor
+     with ThreadPoolExecutor(max_workers=3) as executor:
+         list(executor.map(download_url, enumerate(urls)))

@log_runtime
def download(self, log_dir: Filename = Path(".")) -> list[Path]:
# Start by saving data available as geojson
@@ -206,8 +221,8 @@ def download(self, log_dir: Filename = Path(".")) -> list[Path]:
self.out_dir.mkdir(parents=True, exist_ok=True)
file_names = [self.out_dir / f for f in self._file_names(results)]

- # NOTE: aria should skip already-downloaded files
- self._download_with_aria(urls, log_dir=log_dir)
+ # TODO: use aria if available? or just make wget parallel...
+ self._download_with_wget(urls, log_dir=log_dir)

if self.unzip:
# Change to .SAFE extension
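The replacement download path boils down to mapping a thread pool over the URL list and shelling out to wget once per URL, with a separate log file per download. Below is a standalone sketch of the same pattern; the URLs and directories are placeholders, wget is assumed to be on PATH, and it passes an argument list rather than shell=True, but it is otherwise the idea behind _download_with_wget above:

import subprocess
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

urls = ["https://example.com/granule_1.zip", "https://example.com/granule_2.zip"]  # placeholders
out_dir = Path("data")
log_dir = Path("logs")
out_dir.mkdir(parents=True, exist_ok=True)
log_dir.mkdir(parents=True, exist_ok=True)


def fetch(idx_url):
    idx, url = idx_url
    # -nc: never overwrite an existing file, -c: resume a partial download,
    # -P: directory to save into. One log file per URL.
    cmd = ["wget", "-nc", "-c", url, "-P", str(out_dir)]
    with open(log_dir / f"wget_{idx:02d}.log", "w") as f:
        subprocess.run(cmd, stdout=f, stderr=f, text=True, check=False)


# At most 3 concurrent downloads; threads suffice since wget does the I/O.
with ThreadPoolExecutor(max_workers=3) as executor:
    list(executor.map(fetch, enumerate(urls)))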
