Skip to content

Commit

Permalink
Merge pull request #1074 from microbiomedata/issue-1070-content-from-…
Browse files Browse the repository at this point in the history
…mongo

doi remodeling and migration, mixed with CURIes coercion, RDF generation, and class-specific SPARQL generation
  • Loading branch information
turbomam authored Aug 30, 2023
2 parents 8b5d1d1 + 70c84f6 commit 19ba665
Show file tree
Hide file tree
Showing 236 changed files with 4,595 additions and 184,918 deletions.
15 changes: 15 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# routine: make squeaky-clean all only-test
# for regenerating src/schema/mixs.yaml: make squeaky-clean mixs-yaml-clean all only-test

# for releases: git add -f examples nmdc_schema project src
examples/output/
nmdc_schema/*json
nmdc_schema/*yaml
nmdc_schema/nmdc.py
nmdc_schema/nmdc_materialized_patterns*
nmdc_schema/nmdc_schema_accepting_legacy_ids*
project/
src/schema/mixs.yaml
nmdc_schema/gold-to-mixs.sssom.tsv


notebooks/neon_cache.sqlite
neon_cache.sqlite

Expand Down
73 changes: 30 additions & 43 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ PYMODEL = $(SCHEMA_NAME)
EXAMPLEDIR = examples
TEMPLATEDIR = doc-templates

.PHONY: all clean site test examples-clean site-copy test-python site-clean test-with-examples
.PHONY: all clean examples-clean install site site-clean site-copy squeaky-clean test test-python test-with-examples

# note: "help" MUST be the first target in the file,
# when the user types "make" they should get help info
Expand All @@ -41,12 +41,12 @@ status: check-config
@echo "Source: $(SOURCE_SCHEMA_PATH)"

# generate products and add everything to github
setup: install gen-project gen-examples gendoc git-init-add
setup: install gen-project gendoc git-init-add

# install any dependencies required for building
install:
poetry install
.PHONY: install


# ---
# Project Synchronization
Expand All @@ -73,56 +73,56 @@ create-data-harmonizer:
npm init data-harmonizer $(SOURCE_SCHEMA_PATH)

all: site
site: clean site-clean src/schema/mixs.yaml gen-project gendoc nmdc_schema/gold-to-mixs.sssom.tsv
site: clean site-clean gen-project gendoc nmdc_schema/gold-to-mixs.sssom.tsv
# may change files in nmdc_schema/ or project/. uncommitted changes are not tolerated by mkd-gh-deploy

%.yaml: gen-project

# was deploy: all mkd-gh-deploy
deploy: gendoc mkd-gh-deploy

## In future this will be done by conversion
#gen-examples:
# cp src/data/examples/* $(EXAMPLEDIR)

gen-project: $(PYMODEL)
# added inclusion/exclusion parameters here, in test rule, and in project directories constant
gen-project: $(PYMODEL) src/schema/mixs.yaml
# keep these in sync between PROJECT_FOLDERS and the includes/excludes for gen-project and test-schema
$(RUN) gen-project \
--exclude excel \
--exclude graphql \
--exclude jsonld \
--exclude jsonldcontext \
--exclude markdown \
--exclude proto \
--exclude shacl \
--exclude shex \
--exclude sqlddl \
--include jsonldcontext \
--include jsonschema \
--include owl \
--include prefixmap \
--include python \
--include rdf \
-d $(DEST) $(SOURCE_SCHEMA_PATH) && mv $(DEST)/*.py $(PYMODEL)
cp project/jsonschema/nmdc.schema.json $(PYMODEL)
mv project/prefixmap/nmdc.yaml project/prefixmap/nmdc.json # todo this is too hardcoded and makes assumptions about the file's existence

test: examples-clean site test-python jsonschema-check-all-valid-databases examples/output
only_test: examples-clean test-python jsonschema-check-all-valid-databases examples/output
# jsonschema-check-all-valid-databases
test: examples-clean site accepting-legacy-ids-all test-python examples/output
only-test: examples-clean accepting-legacy-ids-all test-python examples/output

test-schema:
# keep these in sync between PROJECT_FOLDERS and the includes/excludes for gen-project and test-schema
$(RUN) gen-project \
--exclude excel \
--exclude graphql \
--exclude jsonld \
--exclude jsonldcontext \
--exclude markdown \
--exclude prefixmap \
--exclude proto \
--exclude shacl \
--exclude shex \
--exclude sqlddl \
--include jsonldcontext \
--include jsonschema \
--include owl \
--include prefixmap \
--include python \
--include rdf \
-d tmp $(SOURCE_SCHEMA_PATH)

test-python:
Expand All @@ -134,24 +134,13 @@ lint:
check-config:
@(grep my-datamodel about.yaml > /dev/null && printf "\n**Project not configured**:\n\n - Remember to edit 'about.yaml'\n\n" || exit 0)

#convert-examples-to-%:
# $(patsubst %, $(RUN) linkml-convert % -s $(SOURCE_SCHEMA_PATH) -C Person, $(shell find src/data/examples -name "*.yaml"))
#
#examples/%.yaml: src/data/examples/%.yaml
# $(RUN) linkml-convert -s $(SOURCE_SCHEMA_PATH) -C Person $< -o $@
#examples/%.json: src/data/examples/%.yaml
# $(RUN) linkml-convert -s $(SOURCE_SCHEMA_PATH) -C Person $< -o $@
#examples/%.ttl: src/data/examples/%.yaml
# $(RUN) linkml-convert -P EXAMPLE=http://example.org/ -s $(SOURCE_SCHEMA_PATH) -C Person $< -o $@

# Test documentation locally
serve: mkd-serve

# Python datamodel
$(PYMODEL):
mkdir -p $@


$(DOCDIR):
mkdir -p $@

Expand All @@ -160,16 +149,15 @@ gendoc: $(DOCDIR)
cp $(SRC)/docs/*md $(DOCDIR) ; \
cp -r $(SRC)/docs/images $(DOCDIR) ; \
$(RUN) gen-doc -d $(DOCDIR) --template-directory $(SRC)/$(TEMPLATEDIR) $(SOURCE_SCHEMA_PATH)
#mv $(DOCDIR)/TEMP.md $(DOCDIR)/temp.md

testdoc: gendoc serve

MKDOCS = $(RUN) mkdocs
mkd-%:
$(MKDOCS) $*

#PROJECT_FOLDERS = sqlschema shex shacl protobuf prefixmap owl jsonschema jsonld graphql excel
PROJECT_FOLDERS = owl jsonschema
# keep these in sync between PROJECT_FOLDERS and the includes/excludes for gen-project and test-schema
PROJECT_FOLDERS = jsonldcontext jsonschema owl prefixmap python rdf
git-init-add: git-init git-add git-commit git-status
git-init:
git init
Expand All @@ -185,9 +173,9 @@ git-add: .cruft.json
MAINTAINERS.md \
Makefile \
README.md \
RELEASE_NOTES_v7.7.2_to_v7.7.7.md \
about.yaml \
assets \
examples \
images \
mkdocs.yml \
nmdc_schema \
Expand All @@ -196,16 +184,14 @@ git-add: .cruft.json
project.Makefile \
project/ \
pyproject.toml \
reports \
src/ \
sssom \
tests \
util \
utils

git add $(patsubst %, project/%, $(PROJECT_FOLDERS))

git-commit:
git commit -m 'Initial commit' -a

git-status:
git status

Expand All @@ -222,6 +208,14 @@ clean:

include project.Makefile

# custom
site-clean: clean
rm -rf nmdc_schema/*.json
rm -rf nmdc_schema/*.tsv
rm -rf nmdc_schema/*.yaml

squeaky-clean: clean OmicsProcessing-clean accepting-legacy-ids-clean examples-clean mongodb-clean rdf-clean shuttle-clean site-clean # does not include mixs-yaml-clean

project/nmdc_schema_merged.yaml:
$(RUN) gen-linkml \
--format yaml \
Expand All @@ -241,14 +235,8 @@ project/nmdc_materialized_patterns.schema.json: project/nmdc_materialized_patter
--closed \
--top-class Database $< > $@

site-clean:
rm -rf nmdc_schema/*.json
rm -rf nmdc_schema/*.tsv
rm -rf nmdc_schema/*.yaml
rm -rf project/nmdc_*.json
rm -rf project/nmdc_*.yaml

nmdc_schema/gold-to-mixs.sssom.tsv: sssom/gold-to-mixs.sssom.tsv nmdc_schema/nmdc_materialized_patterns.schema.json nmdc_schema/nmdc_materialized_patterns.yaml nmdc_schema/nmdc_schema_merged.yaml
nmdc_schema/gold-to-mixs.sssom.tsv: sssom/gold-to-mixs.sssom.tsv nmdc_schema/nmdc_materialized_patterns.schema.json \
nmdc_schema/nmdc_materialized_patterns.yaml nmdc_schema/nmdc_schema_merged.yaml
# just can't seem to tell pyproject.toml to bundle artifacts like these
# so reverting to copying into the module
cp $< $@
Expand All @@ -261,4 +249,3 @@ nmdc_schema/nmdc_materialized_patterns.yaml: project/nmdc_materialized_patterns.

nmdc_schema/nmdc_schema_merged.yaml: project/nmdc_schema_merged.yaml
cp $< $@

58 changes: 58 additions & 0 deletions RELEASE_NOTES_v7.7.2_to_v7.8.0.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
- More aggressive .gitignore for cleaner merges
- for releases: git add -f examples nmdc_schema project src
- Refactored Makefile and project.Makefile
- doesn't regenerate src/schema/mixs.yaml by default on every `make all`
- routine: `make squeaky-clean all only-test`
- for regenerating `src/schema/mixs.yaml`: `make squeaky-clean mixs-yaml-clean all only-test`
- New `OmicsProcessing-all` meta target for illustrating the automatic generation and execution of SPARQL queries for a
specified class
- New `assets/sparql` folder for SPARQL queries that were generated with the `class-sparql` CLI and which could be edited and
resubmitted to `class-sparql` in `--query-file` mode
- New `make-rdf` meta target dumps MongoDB in the shape of an NMDC Database instance, then performs migrations (including `doi`
migration and CURIE coercion), linkml validation, conversion to RDF/TTL, and casting of anyURI-typed strings to real
CURIEs
- skips functional_annotation_agg and metaproteomics_analysis_activity_set by default
- `dois` migrations can be performed in one `Study`
with `nmdc_schema.migration_recursion:migrate_studies_7_7_2_to_7_8_0`
- New `accepting-legacy-ids-all` meta target for validating MongoDB data that includes legacy `id`s
- Commented out old exploratory NEON targets in project.Makefile
- Commented out old `assets/MIxS_6_term_updates_MIxS6*` targets in project.Makefile
- The prefix map is generated as `project/prefixmap/nmdc.yaml` even though its content is JSON. We're renaming it to `project/prefixmap/nmdc.json` in Makefile's `gen-project` target
- see https://github.com/linkml/linkml/issues/1598
- Created some new redundancy in terms of supplements to the prefixes defined in the merged NMDC schema
- `assets/misc/extra_prefix_expansions.yaml`
- `project/jsonld/nmdc.context.jsonld`
- `project/prefixmap/nmdc.json`
- hardcoded prefix expansions in `nmdc_schema/class_sparql.py`
- makefiles:
- added squeaky-clean target
- harmonization of PROJECT_FOLDERS and the includes/excludes for gen-project and test-schema
- standardize naming of cleanup targets
- use hyphens in target names, not underscores
- Potential sources of breakage for downstream users of the nmdc-schema package:
- Moved several legacy Python scripts from nmdc_schema/ to assets/old_python
- removed several `pyproject.toml` dependencies, or relocated to `tool.poetry.group.dev.dependencies`
- removed several legacy `tool.poetry.scripts` CLI definitions
- Structural schema changes:
- addition of class `FunctionalAnnotationAggMember`, `Database` slot `functional_annotation_agg`
and `FunctionalAnnotationAggMember`
slots `metagenome_annotation_id`, `WorkflowExecutionActivity`, `gene_function_id` and `count`.
- `count` sure is a vague slot name
- example data:
- `src/data/valid/FunctionalAnnotationAggMember-minimal.yaml`
- `src/data/valid/Database-functional_annotation_agg.yaml`
- removal of `doi` slot, along with its assignment to class `Study`. Requires migration. Also switching from scalar
to multivalued and from `TextValue.has_raw_value` to `uriorcurie` ranges
- valid example data like `src/data/valid/Study-exhaustive.yaml` changed to match the schema changes
- addition of `award_dois`, `dataset_dois` and `publication_dois`
- example data: `src/data/valid/Database-study-set-with-dois.yaml`
- removed transient 200-character limit on `funding_sources` slot
- moved `emsl_project_dois` from `external_identifiers.yaml` to `nmdc.yaml` and re-rooted in `dois` including new
pattern
- `abstract` slot has been moved from class `Study`
- Schema annotation changes
- updated `examples`, `comments` and `description` for `jgi_portal_study_identifiers` (
in `src/schema/external_identifiers.yaml`)
- the `description` for class `Study` clarifies that the `description` of `Study` instances should not include
hyperlinks
- the comments on slot `elev` in class `Biosample` have been updated
Binary file not shown.

This file was deleted.

21 changes: 21 additions & 0 deletions assets/misc/extra_prefix_expansions.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# use a more standardized format like JSON-LD contexts,
# or just extract this from the schema?
# project/prefixmap/nmdc.yaml
# although some of the prefixes may not be defined in the schema yet?!

# todo: project/prefixmap/nmdc.yaml is mislabeled JSON
# todo: see also nmdc_schema/class_sparql.py

"KEGG.ORTHOLOGY": "http://identifiers.org/kegg.orthology/" # yes
"cas": "http://identifiers.org/cas/" # yes
"gnps.task": "https://bioregistry.io/gnps.task:" # yes
"jgi.proposal": "https://bioregistry.io/jgi.proposal:" # yes

"MASSIVE": "http://example.com/" # yes
"doi": "https://example.org/doi/" # "doi": "http://example.com/"

"kegg": "https://example.org/kegg/" # no, only uppercase KEGG.*

"biosample": "https://example.org/biosample/" # no
"gold": "https://example.org/gold/" # no
"img.taxon": "https://example.org/img.taxon/" # no
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 19ba665

Please sign in to comment.