Skip to content

Commit

Permalink
Merge pull request #31 from microsoft/download_cmip6
Browse files Browse the repository at this point in the history
Download and preprocess CMIP6 data
  • Loading branch information
rejuvyesh authored Sep 30, 2023
2 parents 5533b8c + 3f97ae7 commit 3407b91
Show file tree
Hide file tree
Showing 44 changed files with 1,049 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,6 @@ dmypy.json

# experiments
exps

# snakemake logs
.snakemake
16 changes: 15 additions & 1 deletion docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,21 @@

### Data Preparation

The code for downloading and preprocessing CMIP6 data is coming soon
First install `snakemake` following [these instructions](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html)

To download and regrid a CMIP6 dataset to a common resolution (e.g., 1.40625 degrees), go to the corresponding directory inside `snakemake_configs` and run:
```bash
snakemake all --configfile config_2m_temperature.yml --cores 8
```
This command will download and regrid the `2m_temperature` data in parallel using 8 CPU cores. Pass a different file to `--configfile` for other variables. After downloading and regridding, run the following script to preprocess the `.nc` files into `.npz` format for pretraining ClimaX:
```bash
# Convert the regridded .nc files into sharded .npz files.
# Every line except the last must end with a backslash so the shell treats
# the whole invocation as a single command.
python src/data_preprocessing/nc2np_equally_cmip6.py \
    --dataset mpi \
    --path /data/CMIP6/MPI-ESM/1.40625deg/ \
    --num_shards 10 \
    --save_dir /data/CMIP6/MPI-ESM/1.40625deg_np_10shards
```
in which `num_shards` denotes the number of chunks to break each `.nc` file into.

### Training

Expand Down
49 changes: 49 additions & 0 deletions snakemake_configs/AWI-ESM/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@

import sys

# One "<start>-<end>" time-range string per year from 1850 through 2014
# (presumably YYYYMMDDhhmm: <y>-01-01 06:00 to <y+1>-01-01 00:00).
# Each entry is used as the `year_str` wildcard value requested by rule `all`.
year_strings = [f'{y}01010600-{y+1}01010000' for y in range(1850, 2015)]

# Echo the loaded configuration for debugging. Write to stderr, not stdout:
# the Snakefile is evaluated on every invocation, and stdout output would
# corrupt piped results such as `snakemake --dag | dot`.
print(config, file=sys.stderr)

# Download one raw AWI-ESM-1-1-LR "historical" 6hrPlevPt NetCDF file.
#
# The remote variable (config[cmip_name]) and run id (config[run]) come from
# the config file; the time range comes from the `year_str` wildcard.
# The file is written to the rule's declared output via `{output}` so the
# path wget writes to can never diverge from the path Snakemake expects
# (previously it was rebuilt from config[name] instead of the `name` wildcard).
rule download:
    output:
        "{dataset}/raw/{name}/{name}_{year_str}_raw.nc",
    shell:
        "wget "
        "https://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/AWI/AWI-ESM-1-1-LR/historical/"
        "{config[run]}/6hrPlevPt/"
        "{config[cmip_name]}/gn/v20200212/"
        "{config[cmip_name]}_6hrPlevPt_AWI-ESM-1-1-LR_historical_{config[run]}_gn_{wildcards.year_str}.nc "
        "-O {output}"

# Regrid one raw NetCDF file to the resolution given by the `res` wildcard.
#
# Runs ../../src/data_preprocessing/regrid.py on the raw download and asks it
# to write into {dataset}/{res}deg/{name} with the temporary ending "nc.tmp";
# rule `delete` later renames that file to its final ".nc" name.
# The script path is relative, so this assumes Snakemake is run from the
# model directory under snakemake_configs.
# NOTE(review): --rename presumably renames the CMIP variable (cmip_name) to
# the ERA-style name (era_name), and --cmip 1 presumably switches on
# CMIP-specific handling -- confirm against regrid.py.
rule regrid:
input:
"{dataset}/raw/{name}/{name}_{year_str}_raw.nc"
output:
"{dataset}/{res}deg/{name}/{name}_{year_str}_{res}deg.nc.tmp"
shell:
"python ../../src/data_preprocessing/regrid.py \
--input_fns {input} \
--output_dir {wildcards.dataset}/{wildcards.res}deg/{wildcards.name} \
--ddeg_out {wildcards.res} \
--cmip 1 \
--rename {config[cmip_name]} {config[era_name]} \
--file_ending nc.tmp"

# Finalize regridded files by renaming each "*.nc.tmp" to "*.nc".
#
# Despite its name, this rule currently deletes nothing: the removal of the
# raw download is commented out at the end of the `run` block.
# Inputs and outputs are expanded over every resolution in config['res'];
# dataset, name and year_str remain wildcards (hence the doubled braces).
# Both lists are built from the same `res` list, so zip() pairs each
# temporary file with its final name.
rule delete:
input:
expand("{{dataset}}/{res}deg/{{name}}/{{name}}_{{year_str}}_{res}deg.nc.tmp",
res=config['res']),
output:
expand("{{dataset}}/{res}deg/{{name}}/{{name}}_{{year_str}}_{res}deg.nc",
res=config['res'])
priority: 100
run:
for i, o in zip(input, output):
shell("mv {i} {o}")
# shell("rm {wildcards.dataset}/raw/{wildcards.name}/{wildcards.name}_{wildcards.year_str}_raw.nc"),


# Target rule: request the final regridded ".nc" file for every combination
# of config['datadir'], config['res'], config['name'] and each entry of
# `year_strings`. It has no action of its own; it only lists the inputs that
# the other rules must produce.
rule all:
input:
expand("{datadir}/{res}deg/{name}/{name}_{year_str}_{res}deg.nc",
datadir=config['datadir'], res=config['res'], name=config['name'], year_str=year_strings)


8 changes: 8 additions & 0 deletions snakemake_configs/AWI-ESM/config_10m_u_component_of_wind.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
datadir: /data/CMIP6/AWI-ESM
name: 10m_u_component_of_wind
cmip_name: uas
era_name: u10
run: r1i1p1f1
res:
- 1.40625
# - 5.625
8 changes: 8 additions & 0 deletions snakemake_configs/AWI-ESM/config_10m_v_component_of_wind.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
datadir: /data/CMIP6/AWI-ESM
name: 10m_v_component_of_wind
cmip_name: vas
era_name: v10
run: r1i1p1f1
res:
- 1.40625
# - 5.625
8 changes: 8 additions & 0 deletions snakemake_configs/AWI-ESM/config_2m_temperature.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
datadir: /data/CMIP6/AWI-ESM
name: 2m_temperature
cmip_name: tas
era_name: t2m
run: r1i1p1f1
res:
- 1.40625
# - 5.625
8 changes: 8 additions & 0 deletions snakemake_configs/AWI-ESM/config_geopotential.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
datadir: /data/CMIP6/AWI-ESM
name: geopotential
cmip_name: zg
era_name: z
run: r1i1p1f1
res:
- 1.40625
# - 5.625
8 changes: 8 additions & 0 deletions snakemake_configs/AWI-ESM/config_specific_humidity.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
datadir: /data/CMIP6/AWI-ESM
name: specific_humidity
cmip_name: hus
era_name: q
run: r1i1p1f1
res:
- 1.40625
# - 5.625
8 changes: 8 additions & 0 deletions snakemake_configs/AWI-ESM/config_temperature.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
datadir: /data/CMIP6/AWI-ESM
name: temperature
cmip_name: ta
era_name: t
run: r1i1p1f1
res:
- 1.40625
# - 5.625
8 changes: 8 additions & 0 deletions snakemake_configs/AWI-ESM/config_u_component_of_wind.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
datadir: /data/CMIP6/AWI-ESM
name: u_component_of_wind
cmip_name: ua
era_name: u
run: r1i1p1f1
res:
- 1.40625
# - 5.625
8 changes: 8 additions & 0 deletions snakemake_configs/AWI-ESM/config_v_component_of_wind.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
datadir: /data/CMIP6/AWI-ESM
name: v_component_of_wind
cmip_name: va
era_name: v
run: r1i1p1f1
res:
- 1.40625
# - 5.625
50 changes: 50 additions & 0 deletions snakemake_configs/CMCC/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import sys

# One "<start>-<end>" time-range string per year from 1850 through 2014
# (presumably YYYYMMDDhhmm: <y>-01-01 06:00 to <y+1>-01-01 00:00).
# Each entry is used as the `year_str` wildcard value requested by rule `all`.
year_strings = [f'{y}01010600-{y+1}01010000' for y in range(1850, 2015)]

# Echo the loaded configuration for debugging. Write to stderr, not stdout:
# the Snakefile is evaluated on every invocation, and stdout output would
# corrupt piped results such as `snakemake --dag | dot`.
print(config, file=sys.stderr)

# Download one raw CMCC-CM2-HR4 "historical" 6hrPlevPt NetCDF file.
#
# The remote variable (config[cmip_name]) and run id (config[run]) come from
# the config file; the time range comes from the `year_str` wildcard.
# The file is written to the rule's declared output via `{output}` so the
# path wget writes to can never diverge from the path Snakemake expects
# (previously it was rebuilt from config[name] instead of the `name` wildcard).
rule download:
    output:
        "{dataset}/raw/{name}/{name}_{year_str}_raw.nc",
    shell:
        "wget "
        "https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/CMIP/CMCC/CMCC-CM2-HR4/historical/"
        "{config[run]}/6hrPlevPt/"
        "{config[cmip_name]}/gn/v20200904/"
        "{config[cmip_name]}_6hrPlevPt_CMCC-CM2-HR4_historical_{config[run]}_gn_{wildcards.year_str}.nc "
        "-O {output}"

# https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/CMIP/CMCC/CMCC-CM2-HR4/historical/r1i1p1f1/6hrPlevPt/ta/gn/v20200904/ta_6hrPlevPt_CMCC-CM2-HR4_historical_r1i1p1f1_gn_185001010600-185101010000.nc

# Regrid one raw NetCDF file to the resolution given by the `res` wildcard.
#
# Runs ../../src/data_preprocessing/regrid.py on the raw download and asks it
# to write into {dataset}/{res}deg/{name} with the temporary ending "nc.tmp";
# rule `delete` later renames that file to its final ".nc" name.
# The script path is relative, so this assumes Snakemake is run from the
# model directory under snakemake_configs.
# NOTE(review): --rename presumably renames the CMIP variable (cmip_name) to
# the ERA-style name (era_name), and --cmip 1 presumably switches on
# CMIP-specific handling -- confirm against regrid.py.
rule regrid:
input:
"{dataset}/raw/{name}/{name}_{year_str}_raw.nc"
output:
"{dataset}/{res}deg/{name}/{name}_{year_str}_{res}deg.nc.tmp"
shell:
"python ../../src/data_preprocessing/regrid.py \
--input_fns {input} \
--output_dir {wildcards.dataset}/{wildcards.res}deg/{wildcards.name} \
--ddeg_out {wildcards.res} \
--cmip 1 \
--rename {config[cmip_name]} {config[era_name]} \
--file_ending nc.tmp"

# Finalize regridded files by renaming each "*.nc.tmp" to "*.nc".
#
# Despite its name, this rule currently deletes nothing: the removal of the
# raw download is commented out at the end of the `run` block.
# Inputs and outputs are expanded over every resolution in config['res'];
# dataset, name and year_str remain wildcards (hence the doubled braces).
# Both lists are built from the same `res` list, so zip() pairs each
# temporary file with its final name.
rule delete:
input:
expand("{{dataset}}/{res}deg/{{name}}/{{name}}_{{year_str}}_{res}deg.nc.tmp",
res=config['res']),
output:
expand("{{dataset}}/{res}deg/{{name}}/{{name}}_{{year_str}}_{res}deg.nc",
res=config['res'])
priority: 100
run:
for i, o in zip(input, output):
shell("mv {i} {o}")
# shell("rm {wildcards.dataset}/raw/{wildcards.name}/{wildcards.name}_{wildcards.year_str}_raw.nc"),


# Target rule: request the final regridded ".nc" file for every combination
# of config['datadir'], config['res'], config['name'] and each entry of
# `year_strings`. It has no action of its own; it only lists the inputs that
# the other rules must produce.
rule all:
input:
expand("{datadir}/{res}deg/{name}/{name}_{year_str}_{res}deg.nc",
datadir=config['datadir'], res=config['res'], name=config['name'], year_str=year_strings)


8 changes: 8 additions & 0 deletions snakemake_configs/CMCC/config_geopotential.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
datadir: /data/CMIP6/CMCC
name: geopotential
cmip_name: zg
era_name: z
run: r1i1p1f1
res:
- 1.40625
# - 5.625
8 changes: 8 additions & 0 deletions snakemake_configs/CMCC/config_temperature.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
datadir: /data/CMIP6/CMCC
name: temperature
cmip_name: ta
era_name: t
run: r1i1p1f1
res:
- 1.40625
# - 5.625
8 changes: 8 additions & 0 deletions snakemake_configs/CMCC/config_u_component_of_wind.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
datadir: /data/CMIP6/CMCC
name: u_component_of_wind
cmip_name: ua
era_name: u
run: r1i1p1f1
res:
- 1.40625
# - 5.625
8 changes: 8 additions & 0 deletions snakemake_configs/CMCC/config_v_component_of_wind.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
datadir: /data/CMIP6/CMCC
name: v_component_of_wind
cmip_name: va
era_name: v
run: r1i1p1f1
res:
- 1.40625
# - 5.625
59 changes: 59 additions & 0 deletions snakemake_configs/HAMMOZ/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@

import sys

# Time-range strings used as the `year_str` wildcard values requested by
# rule `all`. Unlike a one-file-per-year layout, the ranges here are listed
# explicitly: eight 20-year spans followed by a final 5-year span
# (presumably YYYYMMDDhhmm start-end stamps).
year_strings = [
    '185001010600-187001010000',
    '187001010600-189001010000',
    '189001010600-191001010000',
    '191001010600-193001010000',
    '193001010600-195001010000',
    '195001010600-197001010000',
    '197001010600-199001010000',
    '199001010600-201001010000',
    '201001010600-201501010000',
]

# Echo the loaded configuration for debugging. Write to stderr, not stdout:
# the Snakefile is evaluated on every invocation, and stdout output would
# corrupt piped results such as `snakemake --dag | dot`.
print(config, file=sys.stderr)

# Download one raw MPI-ESM-1-2-HAM "historical" 6hrPlevPt NetCDF file.
#
# The remote variable (config[cmip_name]), run id (config[run]) and dataset
# version (config[version]) come from the config file; the time range comes
# from the `year_str` wildcard.
# The file is written to the rule's declared output via `{output}` so the
# path wget writes to can never diverge from the path Snakemake expects
# (previously it was rebuilt from config[name] instead of the `name` wildcard).
rule download:
    output:
        "{dataset}/raw/{name}/{name}_{year_str}_raw.nc",
    shell:
        "wget "
        "https://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/HAMMOZ-Consortium/MPI-ESM-1-2-HAM/historical/"
        "{config[run]}/6hrPlevPt/"
        "{config[cmip_name]}/gn/{config[version]}/"
        "{config[cmip_name]}_6hrPlevPt_MPI-ESM-1-2-HAM_historical_{config[run]}_gn_{wildcards.year_str}.nc "
        "-O {output}"

# Regrid one raw NetCDF file to the resolution given by the `res` wildcard.
#
# Runs ../../src/data_preprocessing/regrid.py on the raw download and asks it
# to write into {dataset}/{res}deg/{name} with the temporary ending "nc.tmp";
# rule `delete` later renames that file to its final ".nc" name.
# The script path is relative, so this assumes Snakemake is run from the
# model directory under snakemake_configs.
# NOTE(review): --rename presumably renames the CMIP variable (cmip_name) to
# the ERA-style name (era_name), and --cmip 1 presumably switches on
# CMIP-specific handling -- confirm against regrid.py.
rule regrid:
input:
"{dataset}/raw/{name}/{name}_{year_str}_raw.nc"
output:
"{dataset}/{res}deg/{name}/{name}_{year_str}_{res}deg.nc.tmp"
shell:
"python ../../src/data_preprocessing/regrid.py \
--input_fns {input} \
--output_dir {wildcards.dataset}/{wildcards.res}deg/{wildcards.name} \
--ddeg_out {wildcards.res} \
--cmip 1 \
--rename {config[cmip_name]} {config[era_name]} \
--file_ending nc.tmp"

# Finalize regridded files by renaming each "*.nc.tmp" to "*.nc".
#
# Despite its name, this rule currently deletes nothing: the removal of the
# raw download is commented out at the end of the `run` block.
# Inputs and outputs are expanded over every resolution in config['res'];
# dataset, name and year_str remain wildcards (hence the doubled braces).
# Both lists are built from the same `res` list, so zip() pairs each
# temporary file with its final name.
rule delete:
input:
expand("{{dataset}}/{res}deg/{{name}}/{{name}}_{{year_str}}_{res}deg.nc.tmp",
res=config['res']),
output:
expand("{{dataset}}/{res}deg/{{name}}/{{name}}_{{year_str}}_{res}deg.nc",
res=config['res'])
priority: 100
run:
for i, o in zip(input, output):
shell("mv {i} {o}")
# shell("rm {wildcards.dataset}/raw/{wildcards.name}/{wildcards.name}_{wildcards.year_str}_raw.nc"),


# Target rule: request the final regridded ".nc" file for every combination
# of config['datadir'], config['res'], config['name'] and each entry of
# `year_strings`. It has no action of its own; it only lists the inputs that
# the other rules must produce.
rule all:
input:
expand("{datadir}/{res}deg/{name}/{name}_{year_str}_{res}deg.nc",
datadir=config['datadir'], res=config['res'], name=config['name'], year_str=year_strings)


9 changes: 9 additions & 0 deletions snakemake_configs/HAMMOZ/config_10m_u_component_of_wind.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
datadir: /data/CMIP6/HAMMOZ
name: 10m_u_component_of_wind
cmip_name: uas
era_name: u10
run: r1i1p1f1
version: v20190627
res:
- 1.40625
# - 5.625
9 changes: 9 additions & 0 deletions snakemake_configs/HAMMOZ/config_10m_v_component_of_wind.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
datadir: /data/CMIP6/HAMMOZ
name: 10m_v_component_of_wind
cmip_name: vas
era_name: v10
run: r1i1p1f1
version: v20190627
res:
- 1.40625
# - 5.625
9 changes: 9 additions & 0 deletions snakemake_configs/HAMMOZ/config_2m_temperature.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
datadir: /data/CMIP6/HAMMOZ
name: 2m_temperature
cmip_name: tas
era_name: t2m
run: r1i1p1f1
version: v20190628
res:
- 1.40625
# - 5.625
9 changes: 9 additions & 0 deletions snakemake_configs/HAMMOZ/config_geopotential.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
datadir: /data/CMIP6/HAMMOZ
name: geopotential
cmip_name: zg
era_name: z
run: r1i1p1f1
version: v20190628
res:
- 1.40625
# - 5.625
9 changes: 9 additions & 0 deletions snakemake_configs/HAMMOZ/config_specific_humidity.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
datadir: /data/CMIP6/HAMMOZ
name: specific_humidity
cmip_name: hus
era_name: q
run: r1i1p1f1
version: v20190628
res:
- 1.40625
# - 5.625
9 changes: 9 additions & 0 deletions snakemake_configs/HAMMOZ/config_temperature.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
datadir: /data/CMIP6/HAMMOZ
name: temperature
cmip_name: ta
era_name: t
run: r1i1p1f1
version: v20190628
res:
- 1.40625
# - 5.625
9 changes: 9 additions & 0 deletions snakemake_configs/HAMMOZ/config_u_component_of_wind.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
datadir: /data/CMIP6/HAMMOZ
name: u_component_of_wind
cmip_name: ua
era_name: u
run: r1i1p1f1
version: v20190628
res:
- 1.40625
# - 5.625
9 changes: 9 additions & 0 deletions snakemake_configs/HAMMOZ/config_v_component_of_wind.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
datadir: /data/CMIP6/HAMMOZ
name: v_component_of_wind
cmip_name: va
era_name: v
run: r1i1p1f1
version: v20190628
res:
- 1.40625
# - 5.625
Loading

0 comments on commit 3407b91

Please sign in to comment.