Skip to content

Commit

Permalink
chore(scripts): update md5 and size script, use for v4 md5s
Browse files Browse the repository at this point in the history
  • Loading branch information
rileyhgrant committed Apr 26, 2024
1 parent b1ca536 commit f455fa6
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 19 deletions.
24 changes: 12 additions & 12 deletions browser/src/DownloadsPage/GnomadV4Downloads.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -15,43 +15,43 @@ import {
import Link from '../Link'

// Rows describing the per-chromosome exome VCF files: `chrom` label,
// display `size` string (GiB/MiB), and `md5` checksum as a hex string.
const exomeChromosomeVcfs = [
{ chrom: '1', size: '17.50 GiB', md5: '848be4d85c953bc73a8e4f0c97026a72' },
{ chrom: '2', size: '13.30 GiB', md5: 'e2f5d891a3374e88d1c3136f94bed0ea' },
{ chrom: '1', size: '17.5 GiB', md5: '848be4d85c953bc73a8e4f0c97026a72' },
{ chrom: '2', size: '13.3 GiB', md5: 'e2f5d891a3374e88d1c3136f94bed0ea' },
{ chrom: '3', size: '10.79 GiB', md5: 'a35b949b32453b4b5abd7b1de42e298a' },
{ chrom: '4', size: '7.18 GiB', md5: 'c7c7008a73acbb8fea68b82951842832' },
{ chrom: '5', size: '7.91 GiB', md5: 'c1016f56be62deb2e947fed4d31302dd' },
{ chrom: '6', size: '8.50 GiB', md5: 'f96bc8711ed085d63b8bd5396664adbc' },
{ chrom: '6', size: '8.5 GiB', md5: 'f96bc8711ed085d63b8bd5396664adbc' },
{ chrom: '7', size: '9.01 GiB', md5: 'c41cd52571b001cf7d3e388a668e4dff' },
{ chrom: '8', size: '6.78 GiB', md5: 'a47777fe141c01876cb170f2c2f2e6b6' },
{ chrom: '9', size: '7.49 GiB', md5: 'c5abd4d8aff12f2bf8b4ec844769782f' },
{ chrom: '10', size: '7.32 GiB', md5: '4befc8dc50ead888e8af24f556c9fdd6' },
{ chrom: '11', size: '10.84 GiB', md5: '3833ce3ab046afe92c9b55df93a61ec' },
{ chrom: '12', size: '9.90 GiB', md5: 'e530a9ed203cdcc914621ab7430774bb' },
{ chrom: '13', size: '3.30 GiB', md5: 'af1eab40c8be47c8c7c04dc73e0333e4' },
{ chrom: '11', size: '10.84 GiB', md5: '3833ce3ab046afe92c9b55df93a61ec8' },
{ chrom: '12', size: '9.9 GiB', md5: 'e530a9ed203cdcc914621ab7430774bb' },
{ chrom: '13', size: '3.3 GiB', md5: 'af1eab40c8be47c8c7c04dc73e0333e4' },
{ chrom: '14', size: '6.21 GiB', md5: 'd0e0fa71d94bf016a061ba4dc0bd869f' },
{ chrom: '15', size: '6.87 GiB', md5: '01b116e34b3815cfd1d3afa53a29a41b' },
{ chrom: '16', size: '9.10 GiB', md5: '44137843b2df39c8a654427181bda919' },
{ chrom: '16', size: '9.1 GiB', md5: '44137843b2df39c8a654427181bda919' },
{ chrom: '17', size: '11.19 GiB', md5: 'c61978218ab3eaa07b571eb9959f39d2' },
{ chrom: '18', size: '3.16 GiB', md5: '1ec708d5cae9657ccee0626763ed9946' },
{ chrom: '19', size: '11.75 GiB', md5: '50a37cfa9a9a3e030388bcf15bdabb79' },
{ chrom: '20', size: '4.43 GiB', md5: '605680cd99e469bdf5f0045bd22359c9' },
{ chrom: '21', size: '2.10 GiB', md5: '7ca6d51a42425b857eddb46d7bc5832d' },
{ chrom: '21', size: '2.1 GiB', md5: '7ca6d51a42425b857eddb46d7bc5832d' },
{ chrom: '22', size: '4.71 GiB', md5: 'dcf191563e69054a71bd4dc77862799a' },
{ chrom: 'X', size: '5.35 GiB', md5: '5b7b17d3d4cff22c20480a908c861a28' },
{ chrom: 'Y', size: '108.30 MiB', md5: ' d500cf5a73c53f02d1b95f1e092f2e49' },
{ chrom: 'Y', size: '108.3 MiB', md5: 'd500cf5a73c53f02d1b95f1e092f2e49' },
]

const genomeChromosomeVcfs = [
{ chrom: '1', size: '41.05 GiB', md5: '328b4578212afec2cde394a1b02d544f' },
{ chrom: '2', size: '43.43 GiB', md5: '518ca01e6757a68bc0abe76f85af644d' },
{ chrom: '2', size: '43.36 GiB', md5: '518ca01e6757a68bc0abe76f85af644d' },
{ chrom: '3', size: '36.56 GiB', md5: '716c181431a3c11a2eb18c5f50a3542d' },
{ chrom: '4', size: '33.56 GiB', md5: 'd9b913f3e30c8f410f9ce7dee5dee6d4' },
{ chrom: '5', size: '30.46 GiB', md5: 'a2a3b9014af5c8f9bbaaf743968e48f8' },
{ chrom: '6', size: '29.61 GiB', md5: 'e65c2aa321c5e272548fb3d901bae382' },
{ chrom: '7', size: '29.10 GiB', md5: '58ee22cf3dcc8cb8b493d218e19432cc' },
{ chrom: '7', size: '29.1 GiB', md5: '58ee22cf3dcc8cb8b493d218e19432cc' },
{ chrom: '8', size: '27.28 GiB', md5: '9854d9df22977cf5bac0f9fc05f4e8f5' },
{ chrom: '9', size: '23.06 GiB', md5: '6adfc9c47000cf66d1305392051b391d' },
{ chrom: '10', size: '25.00 GiB', md5: 'c2cd760130d2339f7135fc70700db1e1' },
{ chrom: '10', size: '25.0 GiB', md5: 'c2cd760130d2339f7135fc70700db1e1' },
{ chrom: '11', size: '24.58 GiB', md5: 'd1e7a4dcf3ff62eeffca57afefc5a33a' },
{ chrom: '12', size: '24.17 GiB', md5: '644bdbc5c53d9112edbacad401a28d1a' },
{ chrom: '13', size: '15.93 GiB', md5: '84b12f299210d2a7e390c56a234b7b68' },
Expand Down
64 changes: 57 additions & 7 deletions development/scripts/get_object_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
Usage:
./get_object_metadata.py gs://bucket/file_1 gs://bucket/file_2 ...
python ./get_object_metadata.py --urls gs://bucket/file_1 gs://bucket/file_2 ...
python ./get_object_metadata.py --bucket gs://gcp-public-data--gnomad/release/4.1/vcf/exomes
"""

import argparse
Expand All @@ -14,8 +16,8 @@
import subprocess


# Byte counts for one gibibyte and one mebibyte. Object sizes are divided by
# these to build the "<n> GiB" / "<n> MiB" display strings.
ONE_GIBIBYTE = 2 ** 30
ONE_MEBIBYTE = 2 ** 20
ONE_GIBIBYTE = 2**30
ONE_MEBIBYTE = 2**20


def fetch_object_metadata(url):
Expand Down Expand Up @@ -43,14 +45,62 @@ def fetch_object_metadata(url):
return info


def fetch_object_metadata_from_bucket(bucket_prefix):
    """Collect size and MD5 for every object listed under a bucket prefix.

    Runs ``gsutil ls -L`` once on ``bucket_prefix`` and parses its text
    output line by line.

    Args:
        bucket_prefix: URL handed to ``gsutil ls -L``, e.g.
            ``gs://bucket/path``.

    Returns:
        A dict keyed by the last ``/``-separated component of each line that
        starts with ``gs://``, exactly as printed (nothing is stripped from
        it). Each value is a dict that holds ``"size"`` (a string such as
        ``"17.5 GiB"`` or ``"108.3 MiB"``) and ``"md5"`` (hex digest) when
        the corresponding metadata lines were present for that entry.

    Raises:
        subprocess.CalledProcessError: if ``gsutil`` exits with a non-zero
            status.
    """
    output = subprocess.check_output(["gsutil", "ls", "-L", bucket_prefix]).decode("utf8")

    aggregated_info = {}
    # Metadata dict of the most recent "gs://" header line; None until one is seen.
    current = None

    for raw_line in output.splitlines():
        if raw_line.startswith("gs://"):
            current = aggregated_info[raw_line.split("/")[-1]] = {}
            continue

        if current is None:
            # Metadata without a preceding object header cannot be attributed
            # to any entry, so ignore it instead of failing on a None key.
            continue

        stripped = raw_line.strip()
        if stripped.startswith("Content-Length:"):
            size = int(stripped.split(":", 1)[1])
            if size >= ONE_GIBIBYTE:
                current["size"] = f"{round(size / ONE_GIBIBYTE, 2)} GiB"
            else:
                current["size"] = f"{round(size / ONE_MEBIBYTE, 2)} MiB"
        elif stripped.startswith("Hash (md5):"):
            # The listing carries the digest base64-encoded; store it as hex.
            encoded = stripped.split(":", 1)[1].strip()
            current["md5"] = base64.b64decode(encoded).hex()

    return aggregated_info


def _chromosome_sort_key(filename):
    """Return a sort key placing numeric chromosomes first, in numeric order.

    The chromosome label is taken from the sixth dot-separated field of
    ``filename`` with its first three characters dropped; the fourth field is
    used as a tie-breaker.
    """
    parts = filename.split(".")
    chrom = parts[5][3:]
    return (int(chrom) if chrom.isdigit() else float("inf"), parts[3])


def main():
    """Print object metadata for explicit URLs or for a whole bucket prefix.

    Exactly one of ``--urls`` or ``--bucket`` must be given. With ``--urls``,
    one JSON object per URL is printed. With ``--bucket``, the entries found
    under the prefix are printed as a bracketed list of
    ``{ chrom, size, md5 }`` rows ordered by chromosome.
    """
    parser = argparse.ArgumentParser()

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--urls", metavar="url", nargs="+")
    group.add_argument("--bucket")
    args = parser.parse_args()

    if args.urls:
        for url in args.urls:
            print(json.dumps(fetch_object_metadata(url)))
    elif args.bucket:
        aggregated_info = fetch_object_metadata_from_bucket(args.bucket)

        # The listing can yield an entry keyed by a bare ":"; drop it if it is
        # there rather than failing when it is not.
        # NOTE(review): presumably this comes from the prefix's own line in
        # the gsutil output -- confirm.
        aggregated_info.pop(":", None)

        # Keys keep the trailing ":" from the listing, hence the "tbi:" suffix.
        sorted_filenames = sorted(
            (name for name in aggregated_info if not name.endswith("tbi:")),
            key=_chromosome_sort_key,
        )

        print(f"Results for {args.bucket}\n")
        print("[")
        last_index = len(sorted_filenames) - 1
        for i, filename in enumerate(sorted_filenames):
            entry = aggregated_info[filename]
            chrom = filename.split(".")[5][3:]
            line = f" {{ chrom: '{chrom}', size: {json.dumps(entry['size'])}, md5: {json.dumps(entry['md5'])} }}"
            # Every row except the final one gets a trailing comma.
            print(line + "," if i < last_index else line)
        print("]")


if __name__ == "__main__":
Expand Down

0 comments on commit f455fa6

Please sign in to comment.