Skip to content

Commit

Permalink
Implement skip if file exists (#85)
Browse files Browse the repository at this point in the history
* Fix skip if exists

* Fix link

* Update README.md
  • Loading branch information
g4brielvs authored Jul 18, 2024
1 parent 7f14143 commit ef39047
Show file tree
Hide file tree
Showing 5 changed files with 7,731 additions and 6,251 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
[![docs](https://github.com/worldbank/blackmarblepy/actions/workflows/gh-pages.yml/badge.svg)](https://github.com/worldbank/blackmarblepy/actions/workflows/gh-pages.yml)
[![tests](https://github.com/worldbank/blackmarblepy/actions/workflows/tests.yml/badge.svg)](https://github.com/worldbank/blackmarblepy/actions/workflows/tests.yml)
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/worldbank/blackmarblepy/main.svg)](https://results.pre-commit.ci/latest/github/worldbank/blackmarblepy/main)
[![downloads](https://static.pepy.tech/badge/blackmarblepy/month)](https://pepy.tech/project/blackmarblepy)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10667907.svg)](https://zenodo.org/doi/10.5281/zenodo.10667907)
[![Downloads](https://static.pepy.tech/badge/blackmarblepy)](https://pepy.tech/project/blackmarblepy)
[![GitHub Repo stars](https://img.shields.io/github/stars/worldbank/blackmarblepy)](https://github.com/worldbank/blackmarblepy)

**BlackMarblePy** is a Python package that provides a simple way to use nighttime lights data from NASA's Black Marble project. [Black Marble](https://blackmarble.gsfc.nasa.gov) is a [NASA Earth Science Data Systems (ESDS)](https://www.earthdata.nasa.gov) project that provides a product suite of daily, monthly and yearly global [nighttime lights](https://www.earthdata.nasa.gov/learn/backgrounders/nighttime-lights). This package automates the process of downloading all relevant tiles from the [NASA LAADS DAAC](https://www.earthdata.nasa.gov/eosdis/daacs/laads) to cover a region of interest, converting the raw files (in HDF5 format), to georeferenced rasters, and mosaicing rasters together when needed.
**BlackMarblePy** is a Python package that provides a simple way to use nighttime lights data from NASA's Black Marble project. [Black Marble](https://blackmarble.gsfc.nasa.gov) is a [NASA Earth Science Data Systems (ESDS)](https://www.earthdata.nasa.gov) project that provides a product suite of daily, monthly and yearly global [nighttime lights](https://www.earthdata.nasa.gov/learn/backgrounders/nighttime-lights). This package automates the process of downloading all relevant tiles from the [NASA LAADS DAAC](https://www.earthdata.nasa.gov/eosdis/daacs/laads) to cover a region of interest, converting the raw files (in HDF5 format), to georeferenced rasters, and mosaicking rasters together when needed.

## Features

Expand Down Expand Up @@ -140,7 +140,7 @@ Robert Marty

## Citation

When using **BlackMarblePy**, your support is much appreciated! Please consider using the following citation or download [bibliography.bib](bibliography.bib):
When using **BlackMarblePy**, your support is much appreciated! Please consider using the following citation or download [bibliography.bib](https://raw.githubusercontent.com/worldbank/blackmarblepy/main/docs/bibliography.bib):

```bibtex
@misc{blackmarblepy,
Expand Down
13,872 changes: 7,676 additions & 6,196 deletions notebooks/blackmarblepy.ipynb

Large diffs are not rendered by default.

60 changes: 36 additions & 24 deletions src/blackmarble/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ async def get_manifest(
def _download_file(
self,
name: str,
skip_if_exists: bool = True,
):
"""Download NASA Black Marble file
Expand All @@ -150,25 +151,25 @@ def _download_file(
url = f"{self.URL}{name}"
name = name.split("/")[-1]

with open(filename := Path(self.directory, name), "wb+") as f:
with httpx.stream(
"GET",
url,
headers={"Authorization": f"Bearer {self.bearer}"},
) as response:
total = int(response.headers["Content-Length"])
with tqdm(
total=total,
unit="B",
unit_scale=True,
leave=None,
) as pbar:
pbar.set_description(f"Retrieving {name}...")
for chunk in response.iter_raw():
f.write(chunk)
pbar.update(len(chunk))

return filename
if not (filename := Path(self.directory, name)).exists() or not skip_if_exists:
with open(filename, "wb+") as f:
with httpx.stream(
"GET",
url,
headers={"Authorization": f"Bearer {self.bearer}"},
) as response:
total = int(response.headers["Content-Length"])
with tqdm(
total=total,
unit="B",
unit_scale=True,
leave=None,
) as pbar:
pbar.set_description(f"Downloading {name}...")
for chunk in response.iter_raw():
f.write(chunk)
pbar.update(len(chunk))
return filename

def download(
self,
Expand All @@ -177,12 +178,13 @@ def download(
date_range: List[datetime.date],
skip_if_exists: bool = True,
):
"""Download (in parallel) from NASA Black Marble archive
"""
Downloads files in parallel from the NASA Black Marble archive (the file manifest is fetched asynchronously).
Parameters
----------
gdf: geopandas.GeoDataFrame
Region of Interest
Region of Interest. Converted to EPSG:4326 and intersected with Black Marble tiles
product: Product
NASA Black Marble Product ID (e.g., VNP46A1)
Expand All @@ -192,22 +194,32 @@ def download(
skip_if_exists: bool, default=True
Whether to skip downloading data if file already exists
Returns
-------
list: List[pathlib.Path]
List of downloaded H5 filenames.
"""
# Convert to EPSG:4326 and intersect with self.TILES
gdf = geopandas.overlay(
gdf.to_crs("EPSG:4326").dissolve(), self.TILES, how="intersection"
)

# Fetch manifest data asynchronously
bm_files_df = asyncio.run(self.get_manifest(gdf, product_id, date_range))

# Filter files to those intersecting with Black Marble tiles
bm_files_df = bm_files_df[
bm_files_df["name"].str.contains("|".join(gdf["TileID"]))
]
names = bm_files_df["fileURL"].tolist()

args = [(name,) for name in names]
# Prepare arguments for parallel download
names = bm_files_df["fileURL"].tolist()
args = [(name, skip_if_exists) for name in names]
return pqdm(
args,
self._download_file,
n_jobs=16,
n_jobs=4, # os.cpu_count(),
argument_type="args",
desc="Downloading...",
)
19 changes: 7 additions & 12 deletions src/blackmarble/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@ def bm_extract(
variable: Optional[str] = None,
drop_values_by_quality_flag: List[int] = [],
check_all_tiles_exist: bool = True,
file_directory: Optional[Path] = None,
file_prefix: Optional[str] = None,
file_skip_if_exists: bool = True,
output_directory: Optional[Path] = None,
output_skip_if_exists: bool = True,
):
"""Extract and aggregate nighttime lights zonal statistics from `NASA Black Marble <https://blackmarble.gsfc.nasa.gov>`_.
Expand Down Expand Up @@ -76,13 +75,10 @@ def bm_extract(
check_all_tiles_exist: bool, default=True
Check whether all Black Marble nighttime light tiles exist for the region of interest. Sometimes not all tiles are available, so the full region of interest may not be covered. By default (True), it skips cases where not all tiles are available.
file_directory: pathlib.Path, optional
Where to produce output. By default, the output will be produced onto a temporary directory.
output_directory: pathlib.Path, optional
Directory to produce output. By default, the output will be produced onto a temporary directory.
file_directory_prefix: str, optional
Prefix
file_skip_if_exists: bool, default=True
output_skip_if_exists: bool, default=True
Whether to skip downloading or extracting data if the data file for that date already exists.
bearer
Expand All @@ -102,9 +98,8 @@ def bm_extract(
variable,
drop_values_by_quality_flag,
check_all_tiles_exist,
file_directory,
file_prefix,
file_skip_if_exists,
output_directory,
output_skip_if_exists,
)

results = []
Expand Down
25 changes: 9 additions & 16 deletions src/blackmarble/raster.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ def h5_to_geotiff(
variable: str = None,
drop_values_by_quality_flag: List[int] = [255],
output_directory: Path = None,
output_prefix: str = None,
):
"""
Convert HDF5 file to GeoTIFF for a selected (or default) variable from NASA Black Marble data
Expand All @@ -164,9 +163,6 @@ def h5_to_geotiff(
output_directory : Path, optional
Directory to save the output GeoTIFF file. If None, uses the same directory as input file.
output_prefix : str, optional
Prefix for the output file name. If None, uses the input file name.
Returns
------
output_path: Path
Expand Down Expand Up @@ -265,9 +261,8 @@ def bm_raster(
variable: Optional[str] = None,
drop_values_by_quality_flag: List[int] = [],
check_all_tiles_exist: bool = True,
file_directory: Optional[Path] = None,
file_prefix: Optional[str] = None,
file_skip_if_exists: bool = True,
output_directory: Optional[Path] = None,
output_skip_if_exists: bool = True,
):
"""Create a stack of nighttime lights rasters by retrieving from `NASA Black Marble <https://blackmarble.gsfc.nasa.gov>`_ data.
Expand Down Expand Up @@ -318,13 +313,10 @@ def bm_raster(
check_all_tiles_exist: bool, default=True
Check whether all Black Marble nighttime light tiles exist for the region of interest. Sometimes not all tiles are available, so the full region of interest may not be covered. By default (True), it skips cases where not all tiles are available.
file_directory: pathlib.Path, optional
Where to produce output. By default, the output will be produced onto a temporary directory.
file_prefix: str, optional
Prefix
output_directory: pathlib.Path, optional
Directory to produce output. By default, the output will be produced onto a temporary directory.
file_skip_if_exists: bool, default=True
output_skip_if_exists: bool, default=True
Whether to skip downloading or extracting data if the data file for that date already exists.
Returns
Expand All @@ -348,9 +340,11 @@ def bm_raster(
date_range = sorted(set([d.replace(day=1, month=1) for d in date_range]))

# Download and construct Dataset
with file_directory if file_directory else tempfile.TemporaryDirectory() as d:
with output_directory if output_directory else tempfile.TemporaryDirectory() as d:
downloader = BlackMarbleDownloader(bearer, d)
pathnames = downloader.download(gdf, product_id, date_range)
pathnames = downloader.download(
gdf, product_id, date_range, output_skip_if_exists
)

datasets = []
for date in tqdm(date_range, desc="COLLATING RESULTS | Processing..."):
Expand All @@ -364,7 +358,6 @@ def bm_raster(
f,
variable=variable,
drop_values_by_quality_flag=drop_values_by_quality_flag,
output_prefix=file_prefix,
output_directory=d,
),
)
Expand Down

0 comments on commit ef39047

Please sign in to comment.