Merge pull request #329 from TranslatorSRI/cell_wikidata

This queries Wikidata to find CL/UMLS concordances via Wikidata IDs. It writes a concord table between CL and UMLS. It could also bring in the Wikidata IDs themselves, but it is not clear that would be terribly useful. It does a pretty good job: there are still lots of nodes that are only CL or only UMLS, but it cleans up many unmerged cliques.
TranslatorSRI · Aug 7, 2024 · d0502c6 · d0502c6
2 parents 7995b55 + f24e97d
commit d0502c6
Show file tree

Hide file tree

Showing 5 changed files with 45 additions and 4 deletions.
diff --git a/config.json b/config.json
@@ -14,7 +14,7 @@
 
   "anatomy_prefixes": ["UBERON","GO","CL","UMLS","MESH","NCIT","SNOMEDCT"],
   "anatomy_ids": ["UBERON","GO","CL","UMLS","MESH","NCIT"],
-  "anatomy_concords": ["UBERON","GO","CL","UMLS"],
+  "anatomy_concords": ["UBERON","GO","CL","UMLS", "WIKIDATA"],
   "anatomy_outputs": ["AnatomicalEntity.txt", "Cell.txt", "CellularComponent.txt","GrossAnatomicalStructure.txt"],
 
   "gene_labels": ["HGNC","NCBIGene","UMLS"],

diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py
@@ -1,8 +1,10 @@
 from collections import defaultdict
+import requests
 
 import src.datahandlers.obo as obo
+from src.util import Text
 
-from src.prefixes import MESH, NCIT, CL, GO, UBERON, SNOMEDCT
+from src.prefixes import MESH, NCIT, CL, GO, UBERON, SNOMEDCT, WIKIDATA, UMLS
 from src.categories import ANATOMICAL_ENTITY, GROSS_ANATOMICAL_STRUCTURE, CELL, CELLULAR_COMPONENT
 from src.ubergraph import build_sets
 from src.babel_utils import write_compendium, glom, get_prefixes, read_identifier_file, remove_overused_xrefs
@@ -96,6 +98,38 @@ def build_anatomy_obo_relationships(outdir):
         build_sets(f'{UBERON}:0001062', {UBERON:uberon, GO:go, CL:cl},'xref', ignore_list=ignore_list)
         build_sets(f'{GO}:0005575', {UBERON:uberon, GO:go, CL:cl},'xref', ignore_list=ignore_list)
 
+def build_wikidata_cell_relationships(outdir):
+    """Fetch CL/UMLS pairs from Wikidata and write the unambiguous ones as `eq` concords to `{outdir}/WIKIDATA`."""
+    # This sparql returns all the wikidata items that have a UMLS identifier (P2892) and a CL identifier (P7963)
+    sparql = """PREFIX wdt: <http://www.wikidata.org/prop/direct/>
+        PREFIX wdtn: <http://www.wikidata.org/prop/direct-normalized/>
+        SELECT * WHERE {
+          ?wd wdtn:P7963 ?cl .
+          ?wd wdt:P2892 ?umls .
+        }"""
+    frink_wikidata_url = "https://frink.apps.renci.org/federation/sparql"
+    # Bound the wait (requests never times out by default) and fail on an HTTP error status instead of
+    # trying to parse an error page as JSON. Raises requests.Timeout / requests.HTTPError respectively.
+    response = requests.post(frink_wikidata_url, params={'query': sparql}, timeout=600)
+    response.raise_for_status()
+    rows = response.json()["results"]["bindings"]
+    # A wikidata entry with more than one CL or more than one UMLS makes the same identifier show up in several rows, so
+    # count occurrences per identifier; only the UMLS and CL curies are needed (the ?wd binding is not written out).
+    counts = defaultdict(int)
+    pairs = []
+    for row in rows:
+        umls_curie = f'{UMLS}:{row["umls"]["value"]}'
+        cl_curie = Text.obo_to_curie(row["cl"]["value"])
+        pairs.append( (umls_curie, cl_curie) )
+        counts[umls_curie] += 1
+        counts[cl_curie] += 1
+    with open(f'{outdir}/{WIKIDATA}', 'w') as outf:
+        for pair in pairs:
+            if (counts[pair[0]] == 1) and (counts[pair[1]] == 1):
+                outf.write(f'{pair[0]}\teq\t{pair[1]}\n')
+            else:
+                print(f'Pair {pair} is not unique {counts[pair[0]]} {counts[pair[1]]}')
+
 def build_anatomy_umls_relationships(mrconso, idfile,outfile):
     umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT})
 

diff --git a/src/node.py b/src/node.py
@@ -37,13 +37,13 @@ def get_biolink_prefix_map():
         raise RuntimeError(f"Biolink version {biolink_version} is not supported.")
     elif biolink_version.startswith('3.'):
         # biolink-model v3.* releases keeps the prefix map in a different place.
-        return curies.load_prefix_map(
+        return curies.Converter.from_prefix_map(
             'https://raw.githubusercontent.com/biolink/biolink-model/v' + biolink_version +
             '/prefix-map/biolink-model-prefix-map.json'
         )
     else:
         # biolink-model v4.0.0 and beyond is in the /project directory.
-        return curies.load_prefix_map(
+        return curies.Converter.from_prefix_map(
             f'https://raw.githubusercontent.com/biolink/biolink-model/v' + biolink_version +
             '/project/prefixmap/biolink_model_prefix_map.json'
         )

diff --git a/src/prefixes.py b/src/prefixes.py
@@ -47,6 +47,7 @@
 SMPDB = 'SMPDB'
 REACT = 'REACT'
 WIKIPATHWAYS = 'WIKIPATHWAYS'
+WIKIDATA = 'WIKIDATA'
 TCDB = 'TCDB'
 PUBCHEMCOMPOUND='PUBCHEM.COMPOUND'
 CHEMBLCOMPOUND='CHEMBL.COMPOUND'

diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile
@@ -51,6 +51,12 @@ rule get_anatomy_obo_relationships:
     run:
         anatomy.build_anatomy_obo_relationships(config['intermediate_directory']+'/anatomy/concords')
 
+rule get_wikidata_cell_relationships:  # builds the WIKIDATA concord file for anatomy; declares no input files
+    output:
+        config['intermediate_directory']+'/anatomy/concords/WIKIDATA',  # the file the function below writes: <concords dir>/WIKIDATA
+    run:
+        anatomy.build_wikidata_cell_relationships(config['intermediate_directory']+'/anatomy/concords')  # takes the concords directory, not the file path
+
 rule get_anatomy_umls_relationships:
     input:
         mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF",