Skip to content

Commit

Permalink
Merge pull request #329 from TranslatorSRI/cell_wikidata
Browse files Browse the repository at this point in the history
This hits wikidata to find CL/UMLS concordances via wikidata IDs. It writes a concord table between CL and UMLS.

It could also bring in Wikidata IDs, but I'm not sure that would be terribly useful.

It does a pretty good job: there are still lots of nodes that are either CL or UMLS, but it cleans up many unmerged cliques.
  • Loading branch information
gaurav authored Aug 7, 2024
2 parents 7995b55 + f24e97d commit d0502c6
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 4 deletions.
2 changes: 1 addition & 1 deletion config.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

"anatomy_prefixes": ["UBERON","GO","CL","UMLS","MESH","NCIT","SNOMEDCT"],
"anatomy_ids": ["UBERON","GO","CL","UMLS","MESH","NCIT"],
"anatomy_concords": ["UBERON","GO","CL","UMLS"],
"anatomy_concords": ["UBERON","GO","CL","UMLS", "WIKIDATA"],
"anatomy_outputs": ["AnatomicalEntity.txt", "Cell.txt", "CellularComponent.txt","GrossAnatomicalStructure.txt"],

"gene_labels": ["HGNC","NCBIGene","UMLS"],
Expand Down
36 changes: 35 additions & 1 deletion src/createcompendia/anatomy.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from collections import defaultdict
import requests

import src.datahandlers.obo as obo
from src.util import Text

from src.prefixes import MESH, NCIT, CL, GO, UBERON, SNOMEDCT
from src.prefixes import MESH, NCIT, CL, GO, UBERON, SNOMEDCT, WIKIDATA, UMLS
from src.categories import ANATOMICAL_ENTITY, GROSS_ANATOMICAL_STRUCTURE, CELL, CELLULAR_COMPONENT
from src.ubergraph import build_sets
from src.babel_utils import write_compendium, glom, get_prefixes, read_identifier_file, remove_overused_xrefs
Expand Down Expand Up @@ -96,6 +98,38 @@ def build_anatomy_obo_relationships(outdir):
build_sets(f'{UBERON}:0001062', {UBERON:uberon, GO:go, CL:cl},'xref', ignore_list=ignore_list)
build_sets(f'{GO}:0005575', {UBERON:uberon, GO:go, CL:cl},'xref', ignore_list=ignore_list)

def build_wikidata_cell_relationships(outdir):
    """Write a UMLS/CL concordance derived from Wikidata to ``{outdir}/WIKIDATA``.

    Queries the FRINK federated SPARQL endpoint for every Wikidata item that
    has both a ``P7963`` statement (bound to ``?cl``) and a ``P2892`` statement
    (bound to ``?umls``).  Each result row is turned into a
    ``(UMLS curie, CL curie)`` pair, and a pair is written as
    ``UMLS<TAB>eq<TAB>CL`` only when both of its identifiers take part in
    exactly one distinct pair.  Ambiguous pairs are printed and skipped.

    :param outdir: directory in which the ``WIKIDATA`` concord file is created;
        it must already exist.
    :raises requests.HTTPError: if the endpoint answers with an error status.
    :raises requests.Timeout: if the endpoint does not answer in time.
    """
    # This SPARQL returns all the wikidata items that have a UMLS identifier
    # and a CL identifier.  (Whitespace inside the query is not significant.)
    sparql = """PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wdtn: <http://www.wikidata.org/prop/direct-normalized/>
    SELECT * WHERE {
      ?wd wdtn:P7963 ?cl .
      ?wd wdt:P2892 ?umls .
    }"""
    frink_wikidata_url = "https://frink.apps.renci.org/federation/sparql"
    response = requests.post(
        frink_wikidata_url,
        params={'query': sparql},
        # Ask for the JSON results format explicitly rather than relying on
        # whatever the server happens to return by default.
        headers={'Accept': 'application/sparql-results+json'},
        # Without a timeout an unresponsive endpoint would hang the pipeline
        # forever; federated queries can be slow, so the limit is generous.
        timeout=600,
    )
    # Fail with a clear HTTP error instead of an obscure JSON/KeyError below.
    response.raise_for_status()
    rows = response.json()["results"]["bindings"]

    # If one wikidata entry has more than one CL or more than one UMLS, or if
    # the same CL/UMLS shows up with different partners on several wikidata
    # entries, the mapping is ambiguous.  Transform each row into curies and
    # collapse identical pairs first: the same (UMLS, CL) pair asserted by two
    # wikidata entries is still a consistent mapping and must not count as a
    # conflict with itself.  A dict is used so first-seen order is preserved.
    distinct_pairs = {}
    for row in rows:
        umls_curie = f'{UMLS}:{row["umls"]["value"]}'
        # ?cl comes from the "direct-normalized" property and is presumably an
        # OBO IRI, which Text.obo_to_curie turns into a CL curie.
        cl_curie = Text.obo_to_curie(row["cl"]["value"])
        distinct_pairs[(umls_curie, cl_curie)] = None
    pairs = list(distinct_pairs)

    # Count in how many distinct pairs each identifier takes part.
    counts = defaultdict(int)
    for umls_curie, cl_curie in pairs:
        counts[umls_curie] += 1
        counts[cl_curie] += 1

    # Write out only the mappings whose UMLS and CL are both unique.
    with open(f'{outdir}/{WIKIDATA}', 'w') as wd:
        for pair in pairs:
            if (counts[pair[0]] == 1) and (counts[pair[1]] == 1):
                wd.write(f'{pair[0]}\teq\t{pair[1]}\n')
            else:
                print(f'Pair {pair} is not unique {counts[pair[0]]} {counts[pair[1]]}')

def build_anatomy_umls_relationships(mrconso, idfile,outfile):
    """Build the anatomy UMLS concordance by delegating to ``umls.build_sets``.

    :param mrconso: passed through unchanged as the first argument of
        ``umls.build_sets`` (presumably the path to MRCONSO.RRF - confirm
        against the caller).
    :param idfile: passed through unchanged as the second argument.
    :param outfile: passed through unchanged as the third argument.
    """
    # Maps the three source keys handed to umls.build_sets onto the prefix
    # constants used in this module.
    source_to_prefix = {
        'SNOMEDCT_US': SNOMEDCT,
        'MSH': MESH,
        'NCI': NCIT,
    }
    umls.build_sets(mrconso, idfile, outfile, source_to_prefix)

Expand Down
4 changes: 2 additions & 2 deletions src/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,13 @@ def get_biolink_prefix_map():
raise RuntimeError(f"Biolink version {biolink_version} is not supported.")
elif biolink_version.startswith('3.'):
# biolink-model v3.* releases keeps the prefix map in a different place.
return curies.load_prefix_map(
return curies.Converter.from_prefix_map(
'https://raw.githubusercontent.com/biolink/biolink-model/v' + biolink_version +
'/prefix-map/biolink-model-prefix-map.json'
)
else:
# biolink-model v4.0.0 and beyond is in the /project directory.
return curies.load_prefix_map(
return curies.Converter.from_prefix_map(
f'https://raw.githubusercontent.com/biolink/biolink-model/v' + biolink_version +
'/project/prefixmap/biolink_model_prefix_map.json'
)
Expand Down
1 change: 1 addition & 0 deletions src/prefixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
SMPDB = 'SMPDB'
REACT = 'REACT'
WIKIPATHWAYS = 'WIKIPATHWAYS'
WIKIDATA = 'WIKIDATA'
TCDB = 'TCDB'
PUBCHEMCOMPOUND='PUBCHEM.COMPOUND'
CHEMBLCOMPOUND='CHEMBL.COMPOUND'
Expand Down
6 changes: 6 additions & 0 deletions src/snakefiles/anatomy.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ rule get_anatomy_obo_relationships:
run:
anatomy.build_anatomy_obo_relationships(config['intermediate_directory']+'/anatomy/concords')

# Produces the WIKIDATA concord file in the anatomy concords directory by
# calling anatomy.build_wikidata_cell_relationships. The rule declares no
# inputs; it only needs the intermediate directory from the config.
rule get_wikidata_cell_relationships:
output:
config['intermediate_directory']+'/anatomy/concords/WIKIDATA',
run:
anatomy.build_wikidata_cell_relationships(config['intermediate_directory']+'/anatomy/concords')

rule get_anatomy_umls_relationships:
input:
mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF",
Expand Down

0 comments on commit d0502c6

Please sign in to comment.