Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reference dataset metadata: thousand-genomes and hgdp #265

Open
wants to merge 3 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions scripts/hgdp_1kg/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Ingesting population labels for HGDP and 1KG (thousand-genomes) datasets

1. Download [HGDP population list](https://www.internationalgenome.org/data-portal/data-collection/hgdp) (54 populations -> Download the list) and save as `sources/igsr-hgdp-pops.tsv`

1. Download [HGDP sample list](https://www.internationalgenome.org/data-portal/data-collection/hgdp) (828 samples -> Download the list) and save as `sources/igsr-hgdp-samples.tsv`

1. Do the same for [SGDP](https://www.internationalgenome.org/data-portal/data-collection/sgdp) and for [30x GRCh38 1KG](https://www.internationalgenome.org/data-portal/data-collection/30x-grch38)

1. SGDP external IDs used in NAGIM are illumina sequencing IDs, and extra metadata is needed to map them to sample IDs. Download [this metadata](https://github.com/kaspermunch/PopulationGenomicsCourse/blob/5a6a2a3850af755cff63661e95644e9381f8031e/Cluster/metadata/Simons_meta_ENArun.txt) and save it in the same folder.

1. Run the Rmd notebook `tidy.Rmd` in RStudio to process 7 files above, harmonise the population labels and descriptions, and merge into 1 file `sources/samples-pops.tsv`.

1. Download [1kg pedigree data](https://www.internationalgenome.org/faq/can-i-get-phenotype-gender-and-family-relationship-information-for-the-individuals) and cut 6 columns (otherwise metamist will error):

```sh
cut -f1-6 sources/20130606_g1k.ped > sources/20130606_g1k-cut16.ped
```

1. Run script to populate and update sample, participant, and family entries in metamist:

```sh
python ingest_metadata.py
```

The script will ask for your confirmation on each step.
130 changes: 130 additions & 0 deletions scripts/hgdp_1kg/ingest_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
"""
Update metadata for hgdp and thousand-genomes datasets:
population labels, sex, pedigree.
"""

import csv
from cpg_utils import to_path

from sample_metadata.model.participant_upsert import ParticipantUpsert
from sample_metadata.model.participant_update_model import ParticipantUpdateModel
from sample_metadata.model.participant_upsert_body import ParticipantUpsertBody
from sample_metadata.apis import SampleApi, ParticipantApi, FamilyApi
from sample_metadata.model.sample_batch_upsert import SampleBatchUpsert
from sample_metadata.model.sample_update_model import SampleUpdateModel

sapi = SampleApi()
papi = ParticipantApi()
fapi = FamilyApi()

for project in ['thousand-genomes', 'hgdp']:
metamist_samples = sapi.get_samples(
body_get_samples={
'project_ids': [project],
}
)

pops_tsv_path = 'samples-pops.tsv'
meta_by_collaborator_id = {}
meta_by_illumina_id = {}
with to_path(pops_tsv_path).open() as fh:
tsv_file = csv.DictReader(fh, delimiter='\t')
for entry in tsv_file:
sid = entry['Sample ID (Collaborator)']
meta_by_collaborator_id[sid] = entry
illumina_id = entry['Sample ID (Illumina)']
if illumina_id != 'NA':
meta_by_illumina_id[illumina_id] = entry

meta_by_cpg_id = {}
for s in metamist_samples:
ext_id = s['external_id']
meta = meta_by_illumina_id.get(ext_id) or meta_by_collaborator_id.get(ext_id)
if not meta:
print(f'Error: metadata not found for sample {s}')
continue
meta_by_cpg_id[s['id']] = meta
meta_by_cpg_id[s['id']]['existing_ext_id'] = ext_id

# Pedigree
if project == 'thousand-genomes' and input('Load pedigree? (y/n): ').lower() == 'y':
ped_path = '20130606_g1k-cut16.ped'
with open(ped_path) as f:
fapi.import_pedigree(
project, f, has_header=True, create_missing_participants=True
)

# Participants
try:
pid_map = papi.get_participant_id_map_by_external_ids(
project=project, request_body=[s['external_id'] for s in metamist_samples]
)
except BaseException:
if input('Participant entries do not exist. Create? (y/n): ').lower() == 'y':
# Create new
body = ParticipantUpsertBody(
participants=[
ParticipantUpsert(
external_id=meta['Sample ID (Collaborator)'],
reported_sex={'male': 1, 'female': 2}.get(
meta['Sex'].lower(), 0
),
meta={
'Superpopulation name': meta['Superpopulation name'],
'Population name': meta['Population name'],
'Population description': meta['Population description'],
},
samples=[
SampleBatchUpsert(
id=cpg_id,
sequences=[],
),
],
)
for cpg_id, meta in meta_by_cpg_id.items()
]
)
print(papi.batch_upsert_participants(project, body))
else:
if input('Participant entries exist. Update? (y/n): ').lower() == 'y':
# Update existing
participant_update_models = {
meta['existing_ext_id']: ParticipantUpdateModel(
external_id=meta['Sample ID (Collaborator)'],
reported_sex={'male': 1, 'female': 2}.get(meta['Sex'].lower(), 0),
meta={
'Superpopulation name': meta['Superpopulation name'],
'Population name': meta['Population name'],
'Population description': meta['Population description'],
},
)
for cpg_id, meta in meta_by_cpg_id.items()
}
print(f'Number of participants to update: {len(participant_update_models)}')
for i, (ext_id, model) in enumerate(participant_update_models.items()):
print(f'Participant #{i}: {ext_id}')
api_response = papi.update_participant(pid_map[ext_id], model)
print(api_response)

# Samples
if input('Update samples? (y/n): ').lower() == 'y':
sample_update_models = {
cpg_id: SampleUpdateModel(
meta={
'Population name': None,
'Superpopulation name': None,
'Population description': None,
'Sex': None,
'subpop': None,
'subcontinental_pop': None,
'sex': None,
'continental_pop': None,
},
)
for cpg_id, meta in meta_by_cpg_id.items()
}
print(f'Number of samples to update: {len(sample_update_models)}')
for i, (cpg_id, model) in enumerate(sample_update_models.items()):
print(f'Sample #{i}: {cpg_id}')
api_response = sapi.update_sample(cpg_id, model)
print(api_response)
Loading