populationgenomics · vladsavelyev · Aug 6, 2022 · Sep 1, 2022 · Sep 1, 2022
diff --git a/scripts/hgdp_1kg/README.md b/scripts/hgdp_1kg/README.md
@@ -0,0 +1,25 @@
+# Ingesting population labels for HGDP and 1KG (thousand-genomes) datasets
+
+1. Download [HGDP population list](https://www.internationalgenome.org/data-portal/data-collection/hgdp) (54 populations -> Download the list) and save as `sources/igsr-hgdp-pops.tsv`
+
+1. Download [HGDP sample list](https://www.internationalgenome.org/data-portal/data-collection/hgdp) (828 samples -> Download the list) and save as `sources/igsr-hgdp-samples.tsv`
+
+1. Do the same for [SGDP](https://www.internationalgenome.org/data-portal/data-collection/sgdp) and for [30x GRCh38 1KG](https://www.internationalgenome.org/data-portal/data-collection/30x-grch38)
+
+1. SGDP external IDs used in NAGIM are illumina sequencing IDs, and extra metadata is needed to map them to sample IDs. Download [this metadata](https://github.com/kaspermunch/PopulationGenomicsCourse/blob/5a6a2a3850af755cff63661e95644e9381f8031e/Cluster/metadata/Simons_meta_ENArun.txt) and save it in the same folder.
+
+1. Run the Rmd notebook `tidy.Rmd` in RStudio to process 7 files above, harmonise the population labels and descriptions, and merge into 1 file `sources/samples-pops.tsv`.
+
+1. Download [1kg pedigree data](https://www.internationalgenome.org/faq/can-i-get-phenotype-gender-and-family-relationship-information-for-the-individuals) and cut 6 columns (otherwise metamist will error):
+
+```sh
+cut -f1-6 sources/20130606_g1k.ped > sources/20130606_g1k-cut16.ped
+```
+
+1. Run script to populate and update sample, participant, and family entries in metamist:
+
+```sh
+python ingest_metadata.py
+```
+
+The script will ask for your confirmation on each step.
diff --git a/scripts/hgdp_1kg/ingest_metadata.py b/scripts/hgdp_1kg/ingest_metadata.py
@@ -0,0 +1,130 @@
+"""
+Update metadata for hgdp and thousand-genomes datasets:
+population labels, sex, pedigree.
+"""
+
+import csv
+from cpg_utils import to_path
+
+from sample_metadata.model.participant_upsert import ParticipantUpsert
+from sample_metadata.model.participant_update_model import ParticipantUpdateModel
+from sample_metadata.model.participant_upsert_body import ParticipantUpsertBody
+from sample_metadata.apis import SampleApi, ParticipantApi, FamilyApi
+from sample_metadata.model.sample_batch_upsert import SampleBatchUpsert
+from sample_metadata.model.sample_update_model import SampleUpdateModel
+
+sapi = SampleApi()
+papi = ParticipantApi()
+fapi = FamilyApi()
+
+for project in ['thousand-genomes', 'hgdp']:
+    metamist_samples = sapi.get_samples(
+        body_get_samples={
+            'project_ids': [project],
+        }
+    )
+
+    pops_tsv_path = 'samples-pops.tsv'
+    meta_by_collaborator_id = {}
+    meta_by_illumina_id = {}
+    with to_path(pops_tsv_path).open() as fh:
+        tsv_file = csv.DictReader(fh, delimiter='\t')
+        for entry in tsv_file:
+            sid = entry['Sample ID (Collaborator)']
+            meta_by_collaborator_id[sid] = entry
+            illumina_id = entry['Sample ID (Illumina)']
+            if illumina_id != 'NA':
+                meta_by_illumina_id[illumina_id] = entry
+
+    meta_by_cpg_id = {}
+    for s in metamist_samples:
+        ext_id = s['external_id']
+        meta = meta_by_illumina_id.get(ext_id) or meta_by_collaborator_id.get(ext_id)
+        if not meta:
+            print(f'Error: metadata not found for sample {s}')
+            continue
+        meta_by_cpg_id[s['id']] = meta
+        meta_by_cpg_id[s['id']]['existing_ext_id'] = ext_id
+
+    # Pedigree
+    if project == 'thousand-genomes' and input('Load pedigree? (y/n): ').lower() == 'y':
+        ped_path = '20130606_g1k-cut16.ped'
+        with open(ped_path) as f:
+            fapi.import_pedigree(
+                project, f, has_header=True, create_missing_participants=True
+            )
+
+    # Participants
+    try:
+        pid_map = papi.get_participant_id_map_by_external_ids(
+            project=project, request_body=[s['external_id'] for s in metamist_samples]
+        )
+    except BaseException:
+        if input('Participant entries do not exist. Create? (y/n): ').lower() == 'y':
+            # Create new
+            body = ParticipantUpsertBody(
+                participants=[
+                    ParticipantUpsert(
+                        external_id=meta['Sample ID (Collaborator)'],
+                        reported_sex={'male': 1, 'female': 2}.get(
+                            meta['Sex'].lower(), 0
+                        ),
+                        meta={
+                            'Superpopulation name': meta['Superpopulation name'],
+                            'Population name': meta['Population name'],
+                            'Population description': meta['Population description'],
+                        },
+                        samples=[
+                            SampleBatchUpsert(
+                                id=cpg_id,
+                                sequences=[],
+                            ),
+                        ],
+                    )
+                    for cpg_id, meta in meta_by_cpg_id.items()
+                ]
+            )
+            print(papi.batch_upsert_participants(project, body))
+    else:
+        if input('Participant entries exist. Update? (y/n): ').lower() == 'y':
+            # Update existing
+            participant_update_models = {
+                meta['existing_ext_id']: ParticipantUpdateModel(
+                    external_id=meta['Sample ID (Collaborator)'],
+                    reported_sex={'male': 1, 'female': 2}.get(meta['Sex'].lower(), 0),
+                    meta={
+                        'Superpopulation name': meta['Superpopulation name'],
+                        'Population name': meta['Population name'],
+                        'Population description': meta['Population description'],
+                    },
+                )
+                for cpg_id, meta in meta_by_cpg_id.items()
+            }
+            print(f'Number of participants to update: {len(participant_update_models)}')
+            for i, (ext_id, model) in enumerate(participant_update_models.items()):
+                print(f'Participant #{i}: {ext_id}')
+                api_response = papi.update_participant(pid_map[ext_id], model)
+                print(api_response)
+
+    # Samples
+    if input('Update samples? (y/n): ').lower() == 'y':
+        sample_update_models = {
+            cpg_id: SampleUpdateModel(
+                meta={
+                    'Population name': None,
+                    'Superpopulation name': None,
+                    'Population description': None,
+                    'Sex': None,
+                    'subpop': None,
+                    'subcontinental_pop': None,
+                    'sex': None,
+                    'continental_pop': None,
+                },
+            )
+            for cpg_id, meta in meta_by_cpg_id.items()
+        }
+        print(f'Number of samples to update: {len(sample_update_models)}')
+        for i, (cpg_id, model) in enumerate(sample_update_models.items()):
+            print(f'Sample #{i}: {cpg_id}')
+            api_response = sapi.update_sample(cpg_id, model)
+            print(api_response)